Repository: apache/incubator-liminal Branch: master Commit: aee9827f3b33 Files: 266 Total size: 596.4 KB Directory structure: gitextract_29sbg3vw/ ├── .asf.yaml ├── .bumpversion.cfg ├── .github/ │ ├── PULL_REQUEST_TEMPLATE.md │ └── workflows/ │ ├── pre_commits.yml │ ├── unittests.yml │ └── versioning.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── DISCLAIMER-WIP ├── LICENSE ├── MANIFEST.in ├── NOTICE ├── README.md ├── RETIRED.txt ├── dev/ │ ├── LICENSE.txt │ ├── RELEASE.md │ └── sign.sh ├── docs/ │ ├── Makefile │ ├── README.md │ ├── architecture.md │ ├── conf.py │ ├── getting-started/ │ │ ├── hello_world.md │ │ ├── iris_classification.md │ │ └── spark_app_demo.md │ ├── index.rst │ ├── liminal/ │ │ ├── README.md │ │ ├── advanced.liminal.yml.md │ │ ├── executors/ │ │ │ ├── README.md │ │ │ ├── emr.md │ │ │ ├── index.rst │ │ │ └── kubernetes.md │ │ ├── extensibility.md │ │ ├── images/ │ │ │ ├── README.md │ │ │ ├── index.rst │ │ │ ├── python.md │ │ │ └── python_server.md │ │ ├── index.rst │ │ ├── kubernetes/ │ │ │ └── secret_util.md │ │ ├── liminal.yml.md │ │ ├── metrics_backends/ │ │ │ ├── README.md │ │ │ ├── aws_cloudwatch.md │ │ │ └── index.rst │ │ ├── monitoring.md │ │ ├── pipelines.md │ │ ├── services.md │ │ └── tasks/ │ │ ├── README.md │ │ ├── create_cloudformation_stack.md │ │ ├── delete_cloudformation_stack.md │ │ ├── index.rst │ │ ├── python.md │ │ └── spark.md │ ├── make.bat │ └── source/ │ ├── How_to_install_liminal_in_airflow_on_kubernetes.md │ └── liminal_aws_deploy.sh ├── examples/ │ ├── aws-ml-app-demo/ │ │ ├── __init__.py │ │ ├── liminal.yml │ │ ├── manifests/ │ │ │ └── aws-ml-app-demo.yaml │ │ ├── model_store.py │ │ ├── requirements.txt │ │ ├── serving.py │ │ └── training.py │ ├── extensibility/ │ │ ├── __init__.py │ │ ├── executors/ │ │ │ ├── __init__.py │ │ │ └── custom_executor.py │ │ ├── images/ │ │ │ ├── __init__.py │ │ │ ├── custom_image.py │ │ │ └── custom_image_builder/ │ │ │ ├── Dockerfile │ │ │ └── __init__.py │ │ ├── 
liminal.yml │ │ └── tasks/ │ │ ├── __init__.py │ │ └── custom_task.py │ ├── liminal-getting-started/ │ │ ├── __init__.py │ │ ├── hello_world.json │ │ ├── helloworld/ │ │ │ ├── __init__.py │ │ │ └── hello_world.py │ │ ├── liminal.yml │ │ ├── myserver/ │ │ │ ├── __init__.py │ │ │ └── my_server.py │ │ ├── pip.conf │ │ └── requirements.txt │ ├── sagemaker_example/ │ │ ├── README.md │ │ ├── archetype/ │ │ │ └── liminal.yaml │ │ ├── data_preparation/ │ │ │ ├── __init__.py │ │ │ ├── data_preparation.py │ │ │ └── data_uploader.py │ │ ├── data_train/ │ │ │ ├── __init__.py │ │ │ └── train.py │ │ ├── inference.py │ │ ├── liminal.yaml │ │ ├── liminal_sm.py │ │ ├── requirements.txt │ │ └── sm_ops.py │ └── spark-app-demo/ │ └── k8s/ │ ├── __init__.py │ ├── archetype/ │ │ └── liminal.yml │ ├── data/ │ │ └── iris.csv │ ├── data_cleanup.py │ ├── liminal.yml │ ├── manifests/ │ │ └── spark-app-demo.yaml │ ├── model_store.py │ ├── requirements.txt │ ├── serving.py │ └── training.py ├── install.sh ├── liminal/ │ ├── __init__.py │ ├── build/ │ │ ├── __init__.py │ │ ├── image/ │ │ │ ├── __init__.py │ │ │ ├── python/ │ │ │ │ ├── Dockerfile │ │ │ │ ├── __init__.py │ │ │ │ └── python.py │ │ │ ├── python_server/ │ │ │ │ ├── Dockerfile │ │ │ │ ├── __init__.py │ │ │ │ ├── liminal_python_server.py │ │ │ │ ├── python_server.py │ │ │ │ └── python_server_requirements.txt │ │ │ └── spark/ │ │ │ ├── Dockerfile │ │ │ ├── __init__.py │ │ │ └── spark.py │ │ ├── image_builder.py │ │ ├── liminal_apps_builder.py │ │ └── python.py │ ├── core/ │ │ ├── __init__.py │ │ ├── config/ │ │ │ ├── __init__.py │ │ │ ├── config.py │ │ │ └── defaults/ │ │ │ ├── __init__.py │ │ │ ├── base/ │ │ │ │ ├── __init__.py │ │ │ │ └── liminal.yml │ │ │ └── default_configs.py │ │ ├── environment.py │ │ └── util/ │ │ ├── __init__.py │ │ ├── class_util.py │ │ ├── dict_util.py │ │ ├── env_util.py │ │ ├── extensible.py │ │ └── files_util.py │ ├── docker/ │ │ └── __init__.py │ ├── kubernetes/ │ │ ├── __init__.py │ │ ├── secret_util.py 
│ │ └── volume_util.py │ ├── logging/ │ │ ├── __init__.py │ │ └── logging_setup.py │ ├── monitoring/ │ │ └── __init__.py │ ├── runners/ │ │ ├── __init__.py │ │ └── airflow/ │ │ ├── __init__.py │ │ ├── config/ │ │ │ ├── __init__.py │ │ │ └── standalone_variable_backend.py │ │ ├── dag/ │ │ │ ├── __init__.py │ │ │ ├── liminal_dags.py │ │ │ └── liminal_register_dags.py │ │ ├── executors/ │ │ │ ├── __init__.py │ │ │ ├── airflow.py │ │ │ ├── emr.py │ │ │ └── kubernetes.py │ │ ├── model/ │ │ │ ├── __init__.py │ │ │ ├── executor.py │ │ │ └── task.py │ │ ├── operators/ │ │ │ ├── __init__.py │ │ │ ├── cloudformation.py │ │ │ ├── job_status_operator.py │ │ │ └── operator_with_variable_resolving.py │ │ └── tasks/ │ │ ├── __init__.py │ │ ├── airflow.py │ │ ├── containerable.py │ │ ├── create_cloudformation_stack.py │ │ ├── delete_cloudformation_stack.py │ │ ├── hadoop.py │ │ ├── job_end.py │ │ ├── job_start.py │ │ ├── python.py │ │ ├── spark.py │ │ └── sql.py │ ├── settings.py │ └── sql/ │ └── __init__.py ├── liminal-arch.md ├── pyproject.toml ├── requirements.txt ├── run_tests.sh ├── scripts/ │ ├── Dockerfile-airflow │ ├── __init__.py │ ├── docker-compose.yml │ ├── liminal │ ├── requirements-airflow.txt │ └── webserver_config.py ├── setup.py ├── tests/ │ ├── __init__.py │ ├── liminal/ │ │ ├── __init__.py │ │ ├── core/ │ │ │ ├── __init__.py │ │ │ ├── config/ │ │ │ │ ├── __init__.py │ │ │ │ ├── defaults/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── test_apply_variables_substitution.py │ │ │ │ │ ├── test_defaults_pipeline_config.py │ │ │ │ │ ├── test_defaults_service_config.py │ │ │ │ │ └── test_defaults_tasks_config.py │ │ │ │ └── test_config.py │ │ │ └── util/ │ │ │ ├── __init__.py │ │ │ └── test_dict_utils.py │ │ └── kubernetes/ │ │ ├── __init__.py │ │ └── test_volume_util.py │ ├── runners/ │ │ ├── __init__.py │ │ ├── airflow/ │ │ │ ├── __init__.py │ │ │ ├── build/ │ │ │ │ ├── __init__.py │ │ │ │ ├── http/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── python/ │ │ │ │ │ ├── __init__.py │ 
│ │ │ │ └── test_python_server_image_builder.py │ │ │ │ ├── python/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── test_python_image_builder.py │ │ │ │ ├── spark/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── test_spark_image_builder.py │ │ │ │ └── test_liminal_apps_builder.py │ │ │ ├── compiler/ │ │ │ │ └── __init__.py │ │ │ ├── dag/ │ │ │ │ ├── __init__.py │ │ │ │ └── test_liminal_dags.py │ │ │ ├── executors/ │ │ │ │ ├── __init__.py │ │ │ │ ├── test_airflow_executor.py │ │ │ │ └── test_emr.py │ │ │ ├── liminal/ │ │ │ │ ├── __init__.py │ │ │ │ ├── liminal.yml │ │ │ │ ├── myserver/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── my_server.py │ │ │ │ ├── pip.conf │ │ │ │ ├── requirements.txt │ │ │ │ ├── write_inputs/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── write_inputs.py │ │ │ │ └── write_outputs/ │ │ │ │ ├── __init__.py │ │ │ │ └── write_outputs.py │ │ │ ├── operators/ │ │ │ │ ├── __init__.py │ │ │ │ └── test_operator_with_variable_resolving.py │ │ │ └── tasks/ │ │ │ ├── __init__.py │ │ │ ├── test_create_cloudformation_stack.py │ │ │ ├── test_delete_cloudformation_stack.py │ │ │ ├── test_job_end.py │ │ │ ├── test_job_start.py │ │ │ ├── test_python.py │ │ │ └── test_spark_task.py │ │ └── apps/ │ │ ├── __init__.py │ │ ├── test/ │ │ │ └── liminal.yml │ │ ├── test_app/ │ │ │ ├── __init__.py │ │ │ ├── defaults/ │ │ │ │ ├── __init__.py │ │ │ │ └── liminal.yml │ │ │ ├── extra/ │ │ │ │ ├── __init__.py │ │ │ │ └── liminal.yml │ │ │ ├── helloworld/ │ │ │ │ ├── __init__.py │ │ │ │ └── hellp_world.py │ │ │ ├── liminal.yml │ │ │ └── my_server/ │ │ │ ├── __init__.py │ │ │ └── my_server.py │ │ └── test_spark_app/ │ │ ├── __init__.py │ │ ├── liminal.yml │ │ └── wordcount/ │ │ ├── __init__.py │ │ ├── requirements.txt │ │ ├── wordcount.py │ │ └── words.txt │ ├── test_licenses.py │ └── util/ │ ├── __init__.py │ ├── dag_test_utils.py │ ├── test_class_utils.py │ └── test_pkg_1/ │ ├── __init__.py │ ├── test_clazz_base.py │ └── test_pkg_1_1/ │ ├── __init__.py │ ├── test_clazz_child_1.py │ ├── 
test_clazz_child_2.py │ ├── test_pkg_1_1_1/ │ │ ├── __init__.py │ │ └── test_clazz_leaf_1.py │ └── test_pkg_1_1_2/ │ ├── __init__.py │ └── test_clazz_leaf_2.py └── yamllint-config.yml ================================================ FILE CONTENTS ================================================ ================================================ FILE: .asf.yaml ================================================ --- # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. github: description: Apache Liminals goal is to operationalise the machine learning process, allowing data scientists to quickly transition from a successful experiment to an automated pipeline of model training, validation, deployment and inference in production. Liminal provides a Domain Specific Language to build ML workflows on top of Apache Airflow. 
homepage: https://liminal.apache.org labels: - data-science - big-data - machine-learning - airflow - ai - ml - workflows notifications: commits: commits@liminal.apache.org issues: issues@liminal.apache.org pullrequests: dev@liminal.apache.org jira_options: link label worklog ================================================ FILE: .bumpversion.cfg ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. [bumpversion] current_version = 0.0.1rc6 parse = (?P\d+)\.(?P\d+)\.(?P\.?\w*)(\-(?P[a-z]+)(?P\d+))? serialize = {major}.{minor}.{patch}-dev{build} {major}.{minor}.{patch} [bumpversion:part:release] optional_value = prod first_value = dev values = dev prod ================================================ FILE: .github/PULL_REQUEST_TEMPLATE.md ================================================ ## Changes Title (replace this with a logical title for your changes) *Issue #, if available:* ### *Description of changes:* Replace this with the PR description (mention the changes you have made, why you have made them, provide some background and any references to the provider documentation if needed, etc.). For more information on contributing, please see [Contributing](../CONTRIBUTING.md) section of our documentation. 
### Status Replace this: describe the PR status. Examples: - WIP - work in progress - DONE - ready for review ### Checklist (tick everything that applies) - [ ] [PreCommitChecks - Code linting](../CONTRIBUTING.md#InstallRunPreCommit) (required) - [ ] [Tests](../CONTRIBUTING.md#RunningTests) ================================================ FILE: .github/workflows/pre_commits.yml ================================================ --- name: PreCommitChecks on: push: branches: - master # - '!release*' pull_request: branches: - master # - '!release*' jobs: linting: name: Run pre-commit hooks on py3.6 runs-on: ubuntu-20.04 steps: #---------------------------------------------- # Checkout, SetUp Python, Load Cache and Run PreCommitChecks #---------------------------------------------- - uses: actions/checkout@v2 with: fetch-depth: 0 # Install Python - uses: actions/setup-python@v2 with: python-version: 3.6 - name: Check Python ${{ matrix.python-version }} version run: python -V - uses: actions/cache@v2 with: path: ~/.cache/pip key: ${{ runner.os }}-pip restore-keys: ${{ runner.os }}-pip - name: Install python requirements run: | python -m pip install --upgrade pip pip install -r requirements.txt pip install pre-commit # load cached venv if cache exists - name: Load cached venv id: cached-poetry-dependencies uses: actions/cache@v2 with: path: .venv key: venv-${{ runner.os }}-${{ hashFiles('**/requirements.txt') }} - run: pre-commit install - name: Run pre-commit hooks on all the files run: pre-commit run --all-files --show-diff-on-failure --color always --verbose # - uses: pre-commit/action@v2.0.3 # with: # token: ${{ secrets.GITHUB_TOKEN }} ================================================ FILE: .github/workflows/unittests.yml ================================================ --- name: Running unittest on: # Trigger the workflow on push or pull request, but only for the master branch push: branches: - master pull_request: branches: - master # Trigger the workflow on cron 
schedule schedule: - cron: '7 0 * * *' jobs: unittest: runs-on: ubuntu-20.04 timeout-minutes: 20 steps: - name: Checkout uses: actions/checkout@v1 - name: Setup Minikube uses: manusa/actions-setup-minikube@v2.7.1 with: minikube version: 'v1.26.1' kubernetes version: 'v1.25.0' github token: ${{ secrets.GITHUB_TOKEN }} - name: Set up Python uses: actions/setup-python@v1 with: python-version: '3.6' - name: Install python requirements run: | python -m pip install --upgrade pip pip install -r requirements.txt - name: Run unittest - runners run: ./run_tests.sh approve: needs: unittest runs-on: ubuntu-20.04 steps: - run: | # approve the pull request curl --request POST \ --url https://api.github.com/repos/${{github.repository}}/pulls/${{github.event.number}}/reviews \ --header 'authorization: Bearer ${{ secrets.GITHUB_TOKEN }}' \ --header 'content-type: application/json' \ -d '{"event":"APPROVE"}' ================================================ FILE: .github/workflows/versioning.yml ================================================ --- name: Versioning RC on: workflow_run: workflows: [Running unittest] branches: [master] types: - completed jobs: versioning-auto-commit: if: ${{ github.event.workflow_run.conclusion == 'success' }} name: Publish package runs-on: ubuntu-latest steps: - uses: actions/checkout@v1 - name: Set up Python uses: actions/setup-python@v1 with: python-version: '3.6' - name: Install dependencies for setup run: | python -m pip install --upgrade pip pip install bump2version - name: Project versioning run: bumpversion build - name: Commit report run: | git config --global user.name 'Liminal Bot' git commit -am "Increment version" git push ================================================ FILE: .gitignore ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. 
See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. env .idea bin include lib venv /build .Python *.pyc pip-selfcheck.json .DS_Store /build apache_liminal.egg-info scripts/*.tar.gz scripts/*.whl dist ================================================ FILE: .pre-commit-config.yaml ================================================ # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
--- minimum_pre_commit_version: 2.16.0 exclude: > (?x)^( .+/.venv/.+|.+/dist/.+|.+/.autovenv|.+/docs/|.github )$ fail_fast: true default_language_version: python: python3 default_stages: - prepare-commit-msg - commit # - push repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.0.1 hooks: - id: check-case-conflict - id: check-merge-conflict stages: - commit - id: check-added-large-files args: [--maxkb=1000] stages: - commit - id: detect-aws-credentials args: - --allow-missing-credentials - id: fix-encoding-pragma args: - --remove - id: detect-private-key - id: destroyed-symlinks - id: mixed-line-ending - id: trailing-whitespace - id: check-toml # To match test*.py instead # - id: name-tests-test # args: [--django] # exclude: | # - tests/test_licenses - id: end-of-file-fixer description: Ensures that a file is either empty, or ends with one newline. exclude_types: [sql] # types: [text] - id: pretty-format-json args: - --autofix - --no-sort-keys - --indent - '2' # files: pass_filenames: true - repo: https://github.com/Lucas-C/pre-commit-hooks rev: v1.1.10 hooks: - id: insert-license name: Add license for all md files files: \.md$ exclude: ^\.github/.*$ args: - --comment-style - - --license-filepath - dev/LICENSE.txt - --fuzzy-match-generates-todo - id: insert-license name: Add license for all SQL files files: \.sql$ exclude: ^\.github/.*$|^airflow/_vendor/ args: - --comment-style - /*||*/ - --license-filepath - dev/LICENSE.txt - --fuzzy-match-generates-todo - id: insert-license name: Add license for all other files exclude: ^\.github/.*$|^airflow/_vendor/ args: - --comment-style - '|#|' - --license-filepath - dev/LICENSE.txt - --fuzzy-match-generates-todo files: > \.properties$|\.cfg$|\.conf$|\.ini$|\.ldif$|\.readthedocs$|\.service$|\.tf$|Dockerfile.*$ - id: insert-license name: Add license for all JINJA template files files: ^airflow/www/templates/.*\.html$|^docs/templates/.*\.html$.*\.jinja2 exclude: ^\.github/.*$^airflow/_vendor/ args: - 
--comment-style - '{#||#}' - --license-filepath - dev/LICENSE.txt - --fuzzy-match-generates-todo - id: insert-license name: Add license for all shell files exclude: ^\.github/.*$|^airflow/_vendor/ files: ^breeze$|^breeze-complete$|\.sh$|\.bash$|\.bats$ args: - --comment-style - '|#|' - --license-filepath - dev/LICENSE.txt - --fuzzy-match-generates-todo - id: insert-license name: Add license for all Python files exclude: ^\.github/.*$|^airflow/_vendor/ types: [python] args: - --comment-style - '|#|' - --license-filepath - dev/LICENSE.txt - --fuzzy-match-generates-todo - id: insert-license name: Add license for all YAML files exclude: ^\.github/.*$|^airflow/_vendor/ types: [yaml] files: \.yml$|\.yaml$ args: - --comment-style - '|#|' - --license-filepath - dev/LICENSE.txt - --fuzzy-match-generates-todo # Python: Black formatter - repo: https://github.com/psf/black rev: 21.12b0 hooks: - id: black args: [--safe, --quiet, --config=./pyproject.toml] files: \.pyi?$ exclude: .github/ # override until resolved: https://github.com/psf/black/issues/402 types: [] # - repo: https://github.com/jumanjihouse/pre-commit-hook-yamlfmt # rev: 0.1.0 # hooks: # - id: yamlfmt # args: [--mapping, '2', --sequence, '4', --offset, '2'] # Yaml: lint - repo: https://github.com/adrienverge/yamllint rev: v1.26.3 hooks: - id: yamllint name: Check YAML files with yamllint entry: yamllint -c yamllint-config.yml # --strict types: [yaml] # Python - isort to sort imports in Python files - repo: https://github.com/timothycrosley/isort rev: 5.10.1 hooks: - id: isort name: Run isort to sort imports in Python files args: [--profile, black] files: \.py$ # To keep consistent with the global isort skip config defined in setup.cfg exclude: ^build/.*$|^.tox/.*$|^venv/.*$ # PyUpgrade - 3.6 - repo: https://github.com/asottile/pyupgrade rev: v2.29.1 hooks: - id: pyupgrade args: [--py36-plus] exclude: ^scripts/|^docs - repo: https://github.com/asottile/blacken-docs rev: v1.12.0 hooks: - id: blacken-docs alias: 
black additional_dependencies: [black==21.9b0] # - repo: https://github.com/PyCQA/bandit # rev: 1.7.1 # hooks: # - id: bandit # args: [-ll, -r, liminal] # files: .py$ # exclude: tests/test_licenses.py|^tests/runners ================================================ FILE: CONTRIBUTING.md ================================================ # Contributing to Liminal docs Apache Liminal uses Sphinx and readthedocs.org to create and publish documentation to readthedocs.org ## Basic setup Here is what you need to do in order to work and create docs locally ## Installing sphinx and documentation More details about sphinx and readthedocs can be found [here:](https://docs.readthedocs.io/en/stable/intro/getting-started-with-sphinx.html) Here is what you need to do: ``` pip install sphinx ``` Inside your project go to docs lib, update the docs you like and build them: ``` cd docs make html ``` ## Automatically rebuild sphinx docs ``` pip install sphinx-autobuild cd docs sphinx-autobuild source/ build/html/ ``` Now open a browser on http://localhost:8000/ Your docs will automatically update ## Rebuilding the pythondoc liminal code documentation ``` cd docs sphinx-apidoc -o source/ ../liminal ``` ## RunningTests When doing local development and running Liminal unit-tests, make sure to set LIMINAL_STAND_ALONE_MODE=True 1. Set up Minikube 2. Install python requirements: `pip install -r requirements.txt` 3. Run tests: `./run_tests.sh` ## InstallRunPreCommit [pre-commit](https://pre-commit.com/) is used to install Python code linting and formatting tools: ### Getting started **Requires `python >=3.6`, `pre-commit>=1.14` and a `git` repository** 1. Run `pip install pre-commit` or `pip install -r requirements.txt` 2. Install the hooks: Simple install without post hooks: `pre-commit install` OR Install the hooks: `pre-commit install --install-hooks` Optional: Install the post commit hooks: `pre-commit install --hook-type post-commit` 1. To run the pre-commit hooks: 1. 
Either run `git commit` to commit the new configuration files 1. Run `pre-commit run -a` to lint and format your entire project Now on every commit, `pre-commit` will use a git hook to run the tools. **Warning: the first commit will take some time because the tools are being installed by `pre-commit`** `pre-commit run -a` to lint and format your entire project ### Resolving failed commits * If `black` fails, it has reformatted your code. * You should check the changes made. Then simply "git add --update ." and re-commit or `git add` and `git commit` the changes. Example: ![Fixing black failed commits](https://user-images.githubusercontent.com/16241795/146011210-7bc11b24-2033-43f7-8150-5ece4fe7bfea.png) ### EnabledHooks 1. [black](https://black.readthedocs.io/en/stable/): a Python automatic code formatter 1. [yamllint](https://yamllint.readthedocs.io/): A linter for YAML files. yamllint checks not only for syntax validity, but also for weirdnesses like key repetition and cosmetic problems such as line length, trailing spaces, indentation, etc. 1. [OutOfBoxHooks](https://github.com/pre-commit/pre-commit-hooks) - Out-of-the-box hooks for pre-commit, like checking for files that contain merge conflict strings 1. [Bandit](https://bandit.readthedocs.io/en/latest/) is a tool designed to find common security issues in Python code (its hook is currently commented out in `.pre-commit-config.yaml`). 1. [blacken-docs](https://github.com/asottile/blacken-docs) Run `black` on python code blocks in documentation files 1. [pyupgrade](https://github.com/asottile/pyupgrade) A tool (and pre-commit hook) to automatically upgrade syntax for newer versions of the language. 1. [isort](https://pycqa.github.io/isort/) A Python utility / library to sort imports. ================================================ FILE: DISCLAIMER-WIP ================================================ Apache Liminal is an effort undergoing incubation at the Apache Software Foundation (ASF), sponsored by the Apache Incubator PMC. 
Incubation is required of all newly accepted projects until a further review indicates that the infrastructure, communications, and decision making process have stabilized in a manner consistent with other successful ASF projects. While incubation status is not necessarily a reflection of the completeness or stability of the code, it does indicate that the project has yet to be fully endorsed by the ASF. Some of the incubating project’s releases may not be fully compliant with ASF policy. For example, releases may have incomplete or un-reviewed licensing conditions. What follows is a list of known issues the project is currently aware of (note that this list, by definition, is likely to be incomplete): 1. The Liminal source code is distributed under the Apache License, Version 2.0. However building Liminal requires using a transitive required library chardet V 4.0.0, which is under LGPL license, Version 2.1. If you are planning to incorporate this work into your product/project, please be aware that you will need to conduct a thorough licensing review to determine the overall implications of including this work. ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. 
For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. 
For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. 
If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. 
You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: MANIFEST.in ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. include scripts/* requirements.txt requirements-airflow.txt recursive-include liminal/build/ * ================================================ FILE: NOTICE ================================================ Apache Liminal Copyright 2020-2023 The Apache Software Foundation This product includes software developed at The Apache Software Foundation (http://www.apache.org/). Licensed under the Apache License 2.0. 
================================================ FILE: README.md ================================================ # Apache Liminal Apache Liminal is an end-to-end platform for data engineers & scientists, allowing them to build, train and deploy machine learning models in a robust and agile way. The platform provides the abstractions and declarative capabilities for data extraction & feature engineering followed by model training and serving. Liminal's goal is to operationalize the machine learning process, allowing data scientists to quickly transition from a successful experiment to an automated pipeline of model training, validation, deployment and inference in production, freeing them from engineering and non-functional tasks, and allowing them to focus on machine learning code and artifacts. # Basics Using simple YAML configuration, create your own scheduled data pipelines (a sequence of tasks to perform), application servers, and more. ## Getting Started A simple getting started guide for Liminal can be found [here](docs/getting-started/hello_world.md) ## Apache Liminal Documentation Full documentation of Apache Liminal can be found [here](docs/liminal) ## High Level Architecture High level architecture documentation can be found [here](docs/architecture.md) ## Example YAML config file ```yaml --- name: MyLiminalStack owner: Bosco Albert Baracus volumes: - volume: myvol1 local: path: /Users/me/myvol1 images: - image: my_python_task_img type: python source: write_inputs - image: my_parallelized_python_task_img source: write_outputs - image: my_server_image type: python_server source: myserver endpoints: - endpoint: /myendpoint1 module: my_server function: myendpoint1func pipelines: - pipeline: my_pipeline start_date: 1970-01-01 timeout_minutes: 45 schedule: 0 * 1 * * metrics: namespace: TestNamespace backends: [ 'cloudwatch' ] tasks: - task: my_python_task type: python description: static input task image: my_python_task_img env_vars: NUM_FILES: 10 NUM_SPLITS: 3
mounts: - mount: mymount volume: myvol1 path: /mnt/vol1 cmd: python -u write_inputs.py - task: my_parallelized_python_task type: python description: parallelized python task image: my_parallelized_python_task_img env_vars: FOO: BAR executors: 3 mounts: - mount: mymount volume: myvol1 path: /mnt/vol1 cmd: python -u write_inputs.py services: - service: my_python_server description: my python server image: my_server_image ``` # Installation 1. Install this repository (HEAD) ```bash pip install git+https://github.com/apache/incubator-liminal.git ``` 2. Optional: set LIMINAL_HOME to path of your choice (if not set, will default to ~/liminal_home) ```bash echo 'export LIMINAL_HOME=' >> ~/.bash_profile && source ~/.bash_profile ``` # Authoring pipelines This involves at minimum creating a single file called liminal.yml as in the example above. If your pipeline requires custom python code to implement tasks, they should be organized [like this](https://github.com/apache/incubator-liminal/tree/master/tests/runners/airflow/liminal) If your pipeline introduces imports of external packages which are not already a part of the liminal framework (i.e. you had to pip install them yourself), you need to also provide a requirements.txt in the root of your project. # Testing the pipeline locally When your pipeline code is ready, you can test it by running it locally on your machine. 1. Ensure you have The Docker engine running locally, and enable a local Kubernetes cluster: ![Kubernetes configured](https://raw.githubusercontent.com/apache/incubator-liminal/master/images/k8s_running.png) And allocate it at least 3 CPUs (under "Resources" in the Docker preference UI). If you want to execute your pipeline on a remote kubernetes cluster, make sure the cluster is configured using: ```bash kubectl config set-context ``` 2. Build the docker images used by your pipeline. 
In the example pipeline above, you can see that tasks and services have an "image" field - such as "my_python_task_img". This means that the task is executed inside a docker container, and the docker container is created from a docker image where various code and libraries are installed. You can take a look at what the build process looks like, e.g. [here](https://github.com/apache/incubator-liminal/tree/master/liminal/build/image/python) In order for the images to be available for your pipeline, you'll need to build them locally: ```bash cd liminal build ``` You'll see a number of outputs indicating that various docker images were built. 3. Create a kubernetes local volume \ If your YAML file uses [volumes](https://github.com/apache/incubator-liminal/blob/6253f8b2c9dc244af032979ec6d462dc3e07e170/docs/getting_started.md#mounted-volumes), first run the following command: ```bash cd liminal create ``` 4. Deploy the pipeline: ```bash cd liminal deploy ``` Note: after upgrading liminal, it's recommended to issue the command ```bash liminal deploy --clean ``` This will rebuild the airflow docker containers from scratch with a fresh version of liminal, ensuring consistency. 5. Start the server ```bash liminal start ``` 6. Stop the server ```bash liminal stop ``` 7. Display the server logs ```bash liminal logs --follow/--tail Number of lines to show from the end of the log: liminal logs --tail=10 Follow log output: liminal logs --follow ``` 8. Navigate to [http://localhost:8080/admin](http://localhost:8080/admin) 9. You should see your ![pipeline](https://raw.githubusercontent.com/apache/incubator-liminal/master/images/airflow.png) The pipeline is scheduled to run according to the ```json schedule: 0 * 1 * *``` field in the .yml file you provided. 10.
To manually activate your pipeline: - Click your pipeline and then click "trigger DAG" - Click "Graph view" You should see the steps in your pipeline getting executed in "real time" by clicking "Refresh" periodically. ![Pipeline activation](https://raw.githubusercontent.com/apache/incubator-liminal/master/images/airflow_trigger.png) # Contributing More information on contributing can be found [here](CONTRIBUTING.md) # Community The Liminal community holds a public call every Monday - [Liminal Community Calendar](https://calendar.google.com/calendar/u/0/r?cid=jom1i20emghura6s6ookhe2skk@group.calendar.google.com) - [Dev-Mailing-List](https://lists.apache.org/list.html?dev@liminal.apache.org) ## Running Tests (for contributors) When doing local development and running Liminal unit-tests, make sure to set LIMINAL_STAND_ALONE_MODE=True ================================================ FILE: RETIRED.txt ================================================ This podling has been retired. Please see http://incubator.apache.org/projects/index.html#liminal ================================================ FILE: dev/LICENSE.txt ================================================ Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
================================================ FILE: dev/RELEASE.md ================================================ - [Apache Liminal source releases](#apache-liminal-source-releases) - [Apache Liminal Package](#apache-liminal-package) - [Prerequisites for the release manager preparing the release](#prerequisites-for-the-release-manager-preparing-the-release) - [version IDs](#version-ids) - [Testing the release locally](#testing-the-release-locally) - [Upload Public keys to id.apache.org](#upload-public-keys-to-idapacheorg) - [Configure PyPI uploads](#configure-pypi-uploads) - [Hardware used to prepare and verify the packages](#hardware-used-to-prepare-and-verify-the-packages) - [Apache Liminal packages](#apache-liminal-packages) - [Prepare the Apache Liminal Package RC](#prepare-the-apache-liminal-package-rc) - [Build RC artifacts (both source packages and convenience packages)](#build-rc-artifacts-both-source-packages-and-convenience-packages) - [Prepare PyPI convenience "snapshot" packages](#prepare-pypi-convenience-snapshot-packages) - [Vote and verify the Apache Liminal release candidate](#vote-and-verify-the-apache-liminal-release-candidate) - [Prepare Vote email on the Apache Liminal release candidate](#prepare-vote-email-on-the-apache-liminal-release-candidate) - [Verify the release candidate by PMCs (legal)](#verify-the-release-candidate-by-pmcs-legal) - [PMC responsibilities](#pmc-responsibilities) - [SVN check](#svn-check) - [Verify the licences](#verify-the-licences) - [Verify the signatures](#verify-the-signatures) - [Verify the SHA512 sum](#verify-the-sha512-sum) - [Verify if the release candidate "works" by Contributors](#verify-if-the-release-candidate-works-by-contributors) - [Publish the final Apache Liminal release](#publish-the-final-apache-liminal-release) - [Summarize the voting for the Apache Liminal release](#summarize-the-voting-for-the-apache-liminal-release) - [Publish release to SVN](#publish-release-to-svn) - [Prepare PyPI "release" 
packages](#prepare-pypi-release-packages) - [Update CHANGELOG.md](#update-changelogmd) - [Notify developers of release](#notify-developers-of-release) # Apache Liminal source releases ## Apache Liminal Package This package contains sources that allow the user building fully-functional Apache Liminal package. They contain sources for "apache-liminal" python package that installs "liminal" Python package and includes all the assets required to release the webserver UI coming with Apache Liminal The Source releases are the only "official" Apache Software Foundation releases, and they are distributed via [Official Apache Download sources](https://downloads.apache.org/) Following source releases Apache Liminal release manager also distributes convenience packages: * PyPI packages released via https://pypi.org/project/apache-liminal/ * Docker Images released via https://hub.docker.com/repository/docker/apache/liminal (coming soon...) Those convenience packages are not "official releases" of Apache Liminal, but the users who cannot or do not want to build the packages themselves can use them as a convenient way of installing Apache Liminal, however they are not considered as "official source releases". You can read more details about it in the [ASF Release Policy](http://www.apache.org/legal/release-policy.html). This document describes the process of releasing both - official source packages and convenience packages for Apache Liminal packages. # Prerequisites for the release manager preparing the release The person acting as release manager has to fulfill certain pre-requisites. More details and FAQs are available in the [ASF Release Policy](http://www.apache.org/legal/release-policy.html) but here some important pre-requisites are listed below. 
Note that the release manager does not have to be a PMC member - it is enough to be a committer to assume the release manager role, but there are final steps in the process (uploading final releases to SVN) that can only be done by a PMC member. If needed, the release manager can ask the PMC to perform that final step of the release. ## version IDs Should follow https://www.python.org/dev/peps/pep-0440/ ## Testing the release locally - Build a local version of liminal: ```bash # Set Version export LIMINAL_BUILD_VERSION=0.0.1rc1-incubating python setup.py sdist bdist_wheel ``` - Start a test project with a liminal .yml file in it - Clear folder $LIMINAL_HOME - Install liminal in the test project ```bash pip install liminal build liminal deploy --clean liminal start ``` Note: When you run liminal deploy, liminal installs itself inside the airflow container. You can control which version of liminal gets installed inside docker using the LIMINAL_VERSION environment variable. This can be any string which results in a legal call to: ```bash pip install ${LIMINAL_VERSION} ``` This includes - A standard pip version like apache-liminal==0.0.1dev1 available from PyPI - A URL for git e.g. git+https://github.com/apache/incubator-liminal.git - A string indicating where to get the package from like --index apache-liminal==xyz - A local path to a .whl which is available inside the docker (e.g. which you placed in the scripts/ folder) This is useful if you are making changes in liminal locally and want to test them. If you don't specify this variable, liminal attempts to discover how to install itself by running a pip freeze and looking at the result. This covers pip repositories, files and installation from URL. The fallback, in case no string is found, is simply 'apache-liminal', assuming your .pypirc contains an index which has this package.
## Testing a version from testpypi: Installing liminal locally: pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple apache-liminal Tell liminal where to take the version to put inside the airflow docker: export LIMINAL_VERSION='--index-url https://test.pypi.org/simple/ apache-liminal==0.0.1rc1-incubating' ## Upload Public keys to id.apache.org Make sure your public key is on id.apache.org and in KEYS. You will need to sign the release artifacts with your pgp key. After you have created a key, make sure you: - Add your GPG pub key to https://dist.apache.org/repos/dist/release/liminal/KEYS , follow the instructions at the top of that file. Upload your GPG public key to https://pgp.mit.edu - Add your key fingerprint to https://id.apache.org/ (login with your apache credentials, paste your fingerprint into the pgp fingerprint field and hit save). ```shell script # Create PGP Key gpg --gen-key # Checkout ASF dist repo svn checkout https://dist.apache.org/repos/dist/release/incubator/liminal cd liminal # Add your GPG pub key to KEYS file. Replace "Aviem Zur" with your name (gpg --list-sigs "Aviem Zur" && gpg --armor --export "Aviem Zur" ) >> KEYS # Commit the changes svn commit -m "Add PGP keys of Liminal developers" ``` See this for more detail on creating keys and what is required for signing releases. http://www.apache.org/dev/release-signing.html#basic-facts ## Configure PyPI uploads In order to not reveal your password in plain text, it's best if you create and configure API Upload tokens. 
You can add and copy the tokens here: * [Test PyPI](https://test.pypi.org/manage/account/token/) * [Prod PyPI](https://pypi.org/manage/account/token/) Create a `~/.pypirc` file: ```ini [distutils] index-servers = pypi pypitest [pypi] username=__token__ password= [pypitest] repository=https://test.pypi.org/legacy/ username=__token__ password= ``` Set proper permissions for the pypirc file: ```shell script $ chmod 600 ~/.pypirc ``` - Install [twine](https://pypi.org/project/twine/) if you do not have it already (it can be done in a separate virtual environment). ```shell script pip install twine ``` (more details [here](https://peterdowns.com/posts/first-time-with-pypi.html).) - Set proper permissions for the pypirc file: `$ chmod 600 ~/.pypirc` ## Hardware used to prepare and verify the packages The best way to prepare and verify the releases is to prepare them on hardware owned and controlled by the committer acting as release manager. While strictly speaking, releases must only be verified on hardware owned and controlled by the committer, for practical reasons it's best if the packages are prepared using such hardware. More information can be found in this [FAQ](http://www.apache.org/legal/release-policy.html#owned-controlled-hardware) # Apache Liminal packages ## Prepare the Apache Liminal Package RC ### Build RC artifacts (both source packages and convenience packages) The Release Candidate artifacts we vote on must be exactly the ones we release, without any modification other than renaming – i.e. the contents of the files must be the same between the voted release candidate and the final release. Because of this the version in the built artifacts that will become the official Apache releases must not include the rcN suffix.
- Set environment variables ```bash # Set Version export LIMINAL_BUILD_VERSION=0.0.1rc1-incubating # Example after cloning git clone https://github.com/apache/incubator-liminal.git cd incubator-liminal ``` - Mark the next version ```bash # Set Version sed -i '/^current_version /s/=.*$/= '"$LIMINAL_BUILD_VERSION"'/' .bumpversion.cfg ``` - Tag your release ```bash git tag -a ${LIMINAL_BUILD_VERSION} -m "some message" ``` - Clean the checkout: the sdist step below will otherwise include any untracked or changed files ```bash git clean -fxd ``` - Tarball the repo ```bash git archive --format=tar.gz ${LIMINAL_BUILD_VERSION} --prefix=apache-liminal-${LIMINAL_BUILD_VERSION}/ -o apache-liminal-${LIMINAL_BUILD_VERSION}-source.tar.gz ``` - Generate sdist NOTE: Make sure your checkout is clean at this stage - any untracked or changed files will otherwise be included in the file produced. ```bash python setup.py sdist bdist_wheel ``` - Generate SHA512/ASC (If you have not generated a key yet, generate it by following instructions on http://www.apache.org/dev/openpgp.html#key-gen-generate-key) ```bash dev/sign.sh apache-liminal-${LIMINAL_BUILD_VERSION}-source.tar.gz dev/sign.sh dist/apache-liminal-${LIMINAL_BUILD_VERSION}.tar.gz dev/sign.sh dist/apache_liminal-${LIMINAL_BUILD_VERSION}-py3-none-any.whl ``` - Push Tags ```bash git push origin ${LIMINAL_BUILD_VERSION} ``` - Push the artifacts to ASF dev dist repo ``` # First clone the repo svn checkout https://dist.apache.org/repos/dist/dev/liminal liminal-dev # Create new folder for the release cd liminal-dev svn mkdir ${LIMINAL_BUILD_VERSION} # Move the artifacts to svn folder & commit mv apache{-,_}liminal-${LIMINAL_BUILD_VERSION}* ${LIMINAL_BUILD_VERSION}/ cd ${LIMINAL_BUILD_VERSION} svn add * svn commit -m "Add artifacts for Liminal ${LIMINAL_BUILD_VERSION}" ``` ### Prepare PyPI convenience "snapshot" packages At this point we have the artifact that we vote on, but as a convenience to developers we also want to publish "snapshots" of the RC builds to PyPI for installing
via pip. To do this we need to - Verify the artifacts that would be uploaded: ```bash twine check dist/* ``` - Upload the package to PyPi's test environment: ```bash twine upload -r pypitest dist/* ``` - Verify that the test package looks good by downloading it and installing it into a virtual environment. The package download link is available at: https://test.pypi.org/project/apache-liminal/#files - Upload the package to PyPi's production environment: `twine upload -r pypi dist/*` - Again, confirm that the package is available here: https://pypi.python.org/pypi/apache-liminal It is important to stress that this snapshot should not be named "release", and it is not supposed to be used by and advertised to the end-users who do not read the devlist. ## Vote and verify the Apache Liminal release candidate ### Prepare Vote email on the Apache Liminal release candidate - Use the dev/liminal-jira script to generate a list of Liminal JIRAs that were closed in the release. - Send out a vote to the dev@liminal.apache.org mailing list: Subject: ``` [VOTE] Liminal 1.10.2rc3 ``` Body: ``` Hey all, I have cut Liminal 1.10.2 RC3. This email is calling a vote on the release, which will last for 72 hours. Consider this my (binding) +1. Liminal 1.10.2 RC3 is available at: https://dist.apache.org/repos/dist/dev/liminal/1.10.2rc3/ *apache-liminal-1.10.2rc3-source.tar.gz* is a source release that comes with INSTALL instructions. *apache-liminal-1.10.2rc3-bin.tar.gz* is the binary Python "sdist" release. Public keys are available at: https://dist.apache.org/repos/dist/release/liminal/KEYS Only votes from PMC members are binding, but the release manager should encourage members of the community to test the release and vote with "(non-binding)". 
The test procedure for PMCs and Contributors who would like to test this RC are described in https://github.com/apache/liminal/blob/master/dev/README.md#vote-and-verify-the-apache-liminal-release-candidate Please note that the version number excludes the `rcX` string, so it's now simply 1.10.2. This will allow us to rename the artifact without modifying the artifact checksums when we actually release. Changes since 1.10.2rc2: *Bugs*: [LIMINAL-3732] ... ... *Improvements*: [LIMINAL-3302] ... ... *New features*: [LIMINAL-2874] ... ... *Doc-only Change*: [LIMINAL-XXX] Fix Minor issues in Documentation ... Cheers, ``` ### Verify the release candidate by PMCs (legal) #### PMC responsibilities The PMCs should verify the releases in order to make sure the release is following the [Apache Legal Release Policy](http://www.apache.org/legal/release-policy.html). At least 3 (+1) votes should be recorded in accordance to [Votes on Package Releases](https://www.apache.org/foundation/voting.html#ReleaseVotes) The legal checks include: * checking if the packages are present in the right dist folder on svn * verifying if all the sources have correct licences * verifying if release manager signed the releases with the right key * verifying if all the checksums are valid for the release #### SVN check The files should be present in the sub-folder of [Liminal dist](https://dist.apache.org/repos/dist/dev/incubator/liminal/) The following files should be present (9 files): * .tar.gz + .asc + .sha512 * -.whl + .asc + .sha512 As a PMC you should be able to clone the SVN repository: ```shell script svn co https://dist.apache.org/repos/dist/dev/incubator/liminal ``` Or update it if you already checked it out: ```shell script svn update . ``` #### Verify the licences This can be done with the Apache RAT tool. 
* Download the latest jar from https://creadur.apache.org/rat/download_rat.cgi (unpack the sources, the jar is inside) * Unpack the .tar.gz to a folder * Enter the folder and run the check (point to the place where you extracted the .jar) ```shell script java -jar ../../apache-rat-0.13/apache-rat-0.13.jar -E .rat-excludes -d . ``` #### Verify the signatures Make sure you have the key of the person who signed the release imported into your GPG keyring. You can find the valid keys in [KEYS](https://dist.apache.org/repos/dist/release/liminal/KEYS). You can import the whole KEYS file: ```shell script gpg --import KEYS ``` You can also import the keys individually from a keyserver. The below one uses Kaxil's key and retrieves it from the default GPG keyserver [OpenPGP.org](https://keys.openpgp.org): ```shell script gpg --receive-keys 12717556040EEF2EEAF1B9C275FCCD0A25FA0E4B ``` You should choose to import the key when asked. Note that, being the default, the OpenPGP server tends to be overloaded often and might respond with errors or timeouts. Many of the release managers also uploaded their keys to the [GNUPG.net](https://keys.gnupg.net) keyserver, and you can retrieve them from there. ```shell script gpg --keyserver keys.gnupg.net --receive-keys 12717556040EEF2EEAF1B9C275FCCD0A25FA0E4B ``` Once you have the keys, the signatures can be verified by running this: ```shell script for i in *.asc do echo "Checking $i"; gpg --verify `basename $i .sha512 ` done ``` This should produce results similar to the below. The "Good signature from ..." is an indication that the signatures are correct. Do not worry about the "not certified with a trusted signature" warning. Most of the certificates used by release managers are self signed, that's why you get this warning. By importing the key from the server in the previous step, or via its ID from the [KEYS](https://dist.apache.org/repos/dist/release/liminal/KEYS) page, you know that this is a valid key already.
``` Checking apache-liminal-1.10.12rc4-bin.tar.gz.asc gpg: assuming signed data in 'apache-liminal-1.10.12rc4-bin.tar.gz' gpg: Signature made sob, 22 sie 2020, 20:28:28 CEST gpg: using RSA key 12717556040EEF2EEAF1B9C275FCCD0A25FA0E4B gpg: Good signature from "Kaxil Naik " [unknown] gpg: WARNING: This key is not certified with a trusted signature! gpg: There is no indication that the signature belongs to the owner. Primary key fingerprint: 1271 7556 040E EF2E EAF1 B9C2 75FC CD0A 25FA 0E4B Checking apache_liminal-1.10.12rc4-py2.py3-none-any.whl.asc gpg: assuming signed data in 'apache_liminal-1.10.12rc4-py2.py3-none-any.whl' gpg: Signature made sob, 22 sie 2020, 20:28:31 CEST gpg: using RSA key 12717556040EEF2EEAF1B9C275FCCD0A25FA0E4B gpg: Good signature from "Kaxil Naik " [unknown] gpg: WARNING: This key is not certified with a trusted signature! gpg: There is no indication that the signature belongs to the owner. Primary key fingerprint: 1271 7556 040E EF2E EAF1 B9C2 75FC CD0A 25FA 0E4B Checking apache-liminal-1.10.12rc4-source.tar.gz.asc gpg: assuming signed data in 'apache-liminal-1.10.12rc4-source.tar.gz' gpg: Signature made sob, 22 sie 2020, 20:28:25 CEST gpg: using RSA key 12717556040EEF2EEAF1B9C275FCCD0A25FA0E4B gpg: Good signature from "Kaxil Naik " [unknown] gpg: WARNING: This key is not certified with a trusted signature! gpg: There is no indication that the signature belongs to the owner. Primary key fingerprint: 1271 7556 040E EF2E EAF1 B9C2 75FC CD0A 25FA 0E4B ``` #### Verify the SHA512 sum Run this: ```shell script for i in *.sha512 do echo "Checking $i"; gpg --print-md SHA512 `basename $i .sha512 ` | diff - $i done ``` You should get output similar to: ``` Checking apache_liminal-1.10.12rc4-py3-none-any.whl.sha512 Checking apache-liminal-1.10.12rc4.tar.gz.sha512 ``` ### Verify if the release candidate "works" by Contributors This can be done (and we encourage to) by any of the Contributors. 
In fact, it's best if the actual users of Apache Liminal test it in their own staging/test installations. Each release candidate is available on PyPI in addition to the SVN packages, so everyone should be able to install the release candidate version of Liminal simply via the command below (`<VERSION>` is 1.10.12 for example, and `<X>` is the release candidate number 1,2,3,....). ```shell script pip install apache-liminal==<VERSION>rc<X> ``` ### Seek approval from Incubator PMC [Post to the Incubator’s general list requesting approval from the Incubator PMC](https://lists.apache.org/thread.html/06655226ba08c16a8cb273f9b45e0b0a15ebaed0d06783fdd06a03f6@%3Cgeneral.incubator.apache.org%3E). Should the Incubator PMC vote to approve a release, the Podling MAY make that release available to the public under these conditions: * The release archive(s) MUST include the word "incubating" in the filename. * The release archive(s) MUST contain a disclaimer found in DISCLAIMER file. Releases for the Podling MUST be distributed through http://www.apache.org/dist/incubator/Podling https://lists.apache.org/thread.html/06655226ba08c16a8cb273f9b45e0b0a15ebaed0d06783fdd06a03f6@%3Cgeneral.incubator.apache.org%3E ## Publish the final Apache Liminal release ### Summarize the voting for the Apache Liminal release Once the vote has been passed, you will need to send a result vote to dev@liminal.apache.org: Subject: ``` [RESULT][VOTE] Liminal 1.10.2rc3 ``` Message: ``` Hello, Apache Liminal 1.10.2 (based on RC3) has been accepted. 3 “+1” binding votes received: .... 3 "+1" non-binding votes received: ... Vote thread: ... I'll continue with the release process, and the release announcement will follow shortly. Cheers, ``` ### Publish release to SVN You need to migrate the RC artifacts that passed to this repository: https://dist.apache.org/repos/dist/release/liminal/ (The migration should include renaming the files so that they no longer have the RC number in their filenames.)
The best way of doing this is to svn cp between the two repos (this avoids having to upload the binaries again, and gives a clearer history in the svn commit logs): ```shell script # First clone the repo export RC=1.10.4rc5 export VERSION=${RC/rc?/} svn checkout https://dist.apache.org/repos/dist/release/liminal liminal-release # Create new folder for the release cd liminal-release svn mkdir ${VERSION} cd ${VERSION} # Move the artifacts to svn folder & commit for f in ../../liminal-dev/$RC/*; do svn cp $f ${$(basename $f)/rc?/}; done svn commit -m "Release Liminal ${VERSION} from ${RC}" # Remove old release # http://www.apache.org/legal/release-policy.html#when-to-archive cd .. export PREVIOUS_VERSION=1.10.1 svn rm ${PREVIOUS_VERSION} svn commit -m "Remove old release: ${PREVIOUS_VERSION}" ``` Verify that the packages appear in [liminal](https://dist.apache.org/repos/dist/release/liminal/) ### Prepare PyPI "release" packages At this point we release an official package: - Build the package: ```shell script python setup.py sdist bdist_wheel ``` - Verify the artifacts that would be uploaded: ```shell script twine check dist/* ``` - Upload the package to PyPI's test environment: ```shell script twine upload -r pypitest dist/* ``` - Verify that the test package looks good by downloading it and installing it into a virtual environment. The package download link is available at: https://test.pypi.org/project/apache-liminal/#files - Upload the package to PyPI's production environment: ```shell script twine upload -r pypi dist/* ``` - Again, confirm that the package is available here: https://pypi.python.org/pypi/apache-liminal ### Update CHANGELOG.md - Get a diff between the last version and the current version: ```shell script $ git log 1.8.0..1.9.0 --pretty=oneline ``` - Update CHANGELOG.md with the details, and commit it.
### Notify developers of release - Notify users@liminal.apache.org (cc'ing dev@liminal.apache.org and announce@apache.org) that the artifacts have been published: Subject: ```shell script cat < EOF ``` ================================================ FILE: dev/sign.sh ================================================ #!/usr/bin/env bash # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. # Use this to sign the tar balls generated from # python setup.py sdist --formats=gztar # ie. sign.sh # you will still be required to type in your signing key password # or it needs to be available in your keychain NAME="${1}" gpg --armor --output "${NAME}.asc" --detach-sig "${NAME}" gpg --print-md SHA512 "${NAME}" > "${NAME}.sha512" ================================================ FILE: docs/Makefile ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. # Minimal makefile for Sphinx documentation # You can set these variables from the command line. SPHINXOPTS = SPHINXBUILD = sphinx-build SOURCEDIR = . BUILDDIR = build # Put it first so that "make" without argument is like "make help". help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) .PHONY: help Makefile # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) ================================================ FILE: docs/README.md ================================================ # Apache Liminal Apache Liminal is an end-to-end platform for data engineers & scientists, allowing them to build, train and deploy machine learning models in a robust and agile way. The platform provides the abstractions and declarative capabilities for data extraction & feature engineering followed by model training and serving. Liminal's goal is to operationalize the machine learning process, allowing data scientists to quickly transition from a successful experiment to an automated pipeline of model training, validation, deployment and inference in production, freeing them from engineering and non-functional tasks, and allowing them to focus on machine learning code and artifacts. ## Basics Using simple YAML configuration, create your own schedule data pipelines (a sequence of tasks to perform), application servers, and more. 
## Getting Started A simple hello world guide for Liminal can be found [here](getting-started/hello_world.md) \ A more advanced example which demonstrates a simple data-science workflow can be found [here](getting-started/iris_classification.md) ## Apache Liminal Documentation Full documentation of Apache Liminal can be found [here](liminal) ## High Level Architecture High level architecture documentation can be found [here](architecture.md) ================================================ FILE: docs/architecture.md ================================================ # High Level Architecture Liminal is an end-to-end platform for data engineers & scientists, allowing them to build, train and deploy machine learning models in a robust and agile way. The platform provides the abstractions and declarative capabilities for data extraction & feature engineering followed by model training and serving. Apache Liminal's goal is to operationalise the machine learning process, allowing data scientists to quickly transition from a successful experiment to an automated pipeline of model training, validation, deployment and inference in production, freeing them from engineering and non-functional tasks, and allowing them to focus on machine learning code and artifacts. ## Motivation The challenges involved in operationalizing machine learning models are one of the main reasons why many machine learning projects never make it to production. The process involves automating and orchestrating multiple steps which run on heterogeneous infrastructure - different compute environments, data processing platforms, ML frameworks, notebooks, containers and monitoring tools. There are no mature standards for this workflow, and most organizations do not have the experience to build it in-house. 
In the best case, dev-ds-devops teams form in order to accomplish this task together; in many cases, it's the data scientists who try to deal with this themselves without the knowledge or the inclination to become infrastructure experts. As a result, many projects never make it through the cycle. Those who do suffer from a very long lead time from a successful experiment to an operational, refreshable, deployed and monitored model in production. The goal of Apache Liminal is to simplify the creation and management of machine learning pipelines by data engineers & scientists. The platform provides declarative building blocks which define the workflow, orchestrate the underlying infrastructure, take care of non-functional concerns, enabling focus on business logic / algorithm code. Some commercial E2E solutions have started to emerge in the last few years, however, they are limited to specific parts of the workflow, such as Databricks MLFlow. Other solutions are tied to specific environments (e.g. SageMaker on AWS). ## High Level Architecture The platform is aimed to provide data engineers & scientists with a solution for end to end flows from model training to real time inference in production. Its architecture enables and promotes adoption of specific components in existing (non-Liminal) frameworks, as well as seamless integration with other open source projects.
Liminal was created to enable scalability in ML efforts and after a thorough review of available solutions and frameworks, which did not meet our main KPIs: - Provide an opinionated but customizable end-to-end workflow - Abstract away the complexity of underlying infrastructure - Support major open source tools and cloud-native infrastructure to carry out many of the steps - Allow teams to leverage their existing investments or bring in their tools of choice into the workflow We have found that other tech companies in the Hi-Tech ecosystem also have an interest in such a platform, hence decided to share our work with the community. The following diagram depicts these main components and where Apache Liminal comes in: ![](nstatic/liminal_arch_001.png) A classical data scientist workflow includes some base phases: _Train, Deploy and Consume._ **The Train phase includes the following tasks:** 1. Fetch - get the data needed to build a model - usually using SQL 1. Clean - make sure the data is useful for building the model 1. Prepare - split data and encode features from the data according to model needs 1. Train - Build the model and tune it 1. Evaluate - make sure the model is correct - run it on a test set, etc… 1. Validate - make sure the model is up to the standards you need **The Deploy phase includes these tasks:** 1. Deploy - make it available for usage in production 1. Inference - Batch or Real-time - use the model to evaluate data, offline or online, from your applications 1. Consume - The actual use of the models created by applications and ETLs, usually through APIs to the batch or real-time inference that usually rely on Model and Feature stores. Liminal provides its users with declarative composition capabilities to materialize these steps in a robust way, while exploiting existing frameworks and tools. e.g.
Data science frameworks such as scikit-learn, Tensor flow, Keras and such, for running core data science algorithms; as numerous core mechanisms as data stores, processing engines, parallelism, schedulers, code deployment as well as batch and real-time inference. Liminal allows the creation and wiring of these kinds of functional and non functional tasks while making the underlying infrastructure used by these tasks very easy to use and even abstracted away entirely. While handling the non-functional aspects as monitoring (in a standard fashion) deployment, scheduling, resource management and execution. ![](nstatic/liminal_arch_002.png) ================================================ FILE: docs/conf.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. # # -*- coding: utf-8 -*- # # Configuration file for the Sphinx documentation builder. # # This file does only contain a selection of the most common options. For a # full list see the documentation: # http://www.sphinx-doc.org/en/master/config # -- Path setup -------------------------------------------------------------- # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. 
# If the directory is relative to the documentation root, use
# os.path.abspath to make it absolute, like shown here.
#
import os
import sys

# Make modules under ./source importable by Sphinx extensions such as autodoc.
sys.path.insert(0, os.path.abspath('source'))
# NOTE(review): this puts the filesystem root on sys.path; kept as-is, but
# confirm that it is intentional.
sys.path.insert(0, os.path.abspath('/'))


# -- Project information -----------------------------------------------------

# Project name as shown in the rendered documentation
# (spelling corrected from "Apchae").
project = u'Apache Liminal'
copyright = u'Apache Liminal'
author = u'Apache Software Foundation'

# The short X.Y version
version = u''
# The full version, including alpha/beta/rc tags
release = u'0.0.1'


# -- General configuration ---------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'recommonmark',
    'sphinx.ext.autodoc',
]

html_logo = 'nstatic/liminal_logo.png'

# Add any paths that contain templates here, relative to this directory.
templates_path = ['ntemplates']

# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
# source_suffix = ['.rst', '.md']
source_suffix = '.rst'

# The master toctree document.
master_doc = 'index'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ['README.md', '**/README.md']

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = None


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages.  See the documentation for
# a list of builtin themes.
#
html_theme = 'classic'

# Theme options are theme-specific and customize the look and feel of a theme
# further.  For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['nstatic']

# Custom sidebar templates, must be a dictionary that maps document names
# to template names.
#
# The default sidebars (for documents that don't match any pattern) are
# defined by theme itself.  Builtin themes are using these templates by
# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
# 'searchbox.html']``.
#
# html_sidebars = {}


# -- Options for HTMLHelp output ---------------------------------------------

# Output file base name for HTML help builder
# (spelling corrected from "ApchaeLiminaldoc").
htmlhelp_basename = 'ApacheLiminaldoc'


# -- Options for LaTeX output ------------------------------------------------

latex_elements = {
    # The paper size ('letterpaper' or 'a4paper').
    #
    # 'papersize': 'letterpaper',
    # The font size ('10pt', '11pt' or '12pt').
    #
    # 'pointsize': '10pt',
    # Additional stuff for the LaTeX preamble.
    #
    # 'preamble': '',
    # Latex figure (float) alignment
    #
    # 'figure_align': 'htbp',
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
#  author, documentclass [howto, manual, or own class]).
latex_documents = [
    (
        master_doc,
        'ApacheLiminal.tex',
        u'Apache Liminal Documentation',
        u'Apache Software Foundation',
        'manual',
    ),
]


# -- Options for manual page output ------------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [(master_doc, 'apchaeliminal', u'Apchae Liminal Documentation', [author], 1)] # -- Options for Texinfo output ---------------------------------------------- # Grouping the document tree into Texinfo files. List of tuples # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ ( master_doc, 'ApchaeLiminal', u'Apchae Liminal Documentation', author, 'ApchaeLiminal', 'One line description of project.', 'Miscellaneous', ), ] # -- Options for Epub output ------------------------------------------------- # Bibliographic Dublin Core info. epub_title = project # The unique identifier of the text. This can be a ISBN number # or the project homepage. # # epub_identifier = '' # A unique identification for the text. # # epub_uid = '' # A list of files that should not be packed into the epub file. epub_exclude_files = ['search.html'] def on_missing_reference(app, env, node, contnode): if node['reftype'] == 'any': return contnode else: return None def setup(app): app.connect('missing-reference', on_missing_reference) ================================================ FILE: docs/getting-started/hello_world.md ================================================ # Getting started / ***Hello World*** This guide will allow you to set up your first Apache Liminal environment and allow you to create some simple ML pipelines. These will be very similar to the ones you are going to build for real production scenarios. ## Prerequisites Python 3 (3.6 and up) [Docker Desktop](https://www.docker.com/products/docker-desktop) *Note: Make sure kubernetes cluster is running in docker desktop (or custom kubernetes installation on your machine).* ## Deploying the Example In this tutorial, we will go through setting up Liminal for the first time on your local machine. 
### First, let’s build our example project: In the dev folder, just clone the example code from liminal: ```BASH git clone https://github.com/apache/incubator-liminal ``` ***Note:*** *You just cloned the entire Liminal Project, you actually only need the examples folder.* Create a python virtual environment to isolate your runs: ```BASH cd incubator-liminal/examples/liminal-getting-started python3 -m venv env ``` Activate your virtual environment: ```BASH source env/bin/activate ``` Now we are ready to install liminal: ```BASH pip install apache-liminal ``` Let's build the images you need for the example: ```BASH liminal build ``` ##### The build will create docker images based on the liminal.yml file in the `images` section. ```BASH liminal deploy --clean ``` The deploy command deploys a liminal server and deploys any liminal.yml files in your working directory or any of its subdirectories to your liminal home directory. *Note: liminal home directory is located in the path defined in LIMINAL_HOME env variable. If the LIMINAL_HOME environment variable is not defined, home directory defaults to ~/liminal_home directory.* Now let's run liminal: ```BASH liminal start ``` The start command spins up the liminal server containers which will run pipelines based on your deployed liminal.yml files. It runs the following three containers: * liminal-postgress * liminal-webserver * liminal-scheduler Once liminal server has completed starting up, you can navigate to admin UI in your browser: [http://localhost:8080](http://localhost:8080) By default liminal server starts Apache Airflow servers and admin UI will be that of Apache Airflow.
![](../nstatic/hello-world/airflow_main.png) ***Important:** Set off/on toggle to activate your pipeline (DAG), nothing will happen otherwise!* You can go to graph view to see all the tasks configured in the liminal.yml file: [http://localhost:8080/admin/airflow/graph?dag_id=example_pipeline]( http://localhost:8080/admin/airflow/graph?dag_id=example_pipeline ) #### Now lets see what actually happened to our task: ![](../nstatic/hello-world/airflow_view_dag.png) #### Click on “hello_world_example” and you will get this popup: ![](../nstatic/hello-world/airflow_view_log.png) #### Click on “view log” button and you can see the log of the current task run: ![](../nstatic/hello-world/airflow_task_log.png) ## Mounted volumes All tasks use a mounted volume as defined in the pipeline YAML: ```YAML name: GettingStartedPipeline volumes: - volume: gettingstartedvol claim_name: gettingstartedvol-pvc local: path: . ``` In our case the mounted volume will point to the liminal hello world example. \ The hello world task will read the **hello_world.json** file from the mounted volume and will write the **hello_world_output.json** to it. *Note:* Each task will internally mount the volume defined above to an internal representation, described under the task section in the yml: ```YAML task: ... 
mounts: - mount: taskmount volume: gettingstartedvol path: /mnt/vol1 ``` ## Here is the entire list of commands, if you want to start from scratch: ``` git clone https://github.com/apache/incubator-liminal cd incubator-liminal/examples/liminal-getting-started python3 -m venv env source env/bin/activate pip uninstall apache-liminal pip install apache-liminal liminal build liminal deploy --clean liminal start ``` ## Closing up To make sure liminal containers are stopped use: ``` liminal stop ``` To deactivate the python virtual env use: ``` deactivate ``` ================================================ FILE: docs/getting-started/iris_classification.md ================================================ # Getting started / ***Iris Classification*** * [Setup your local environment](#Setup-your-local-environment) * [Setup liminal](#setup-liminal) * [Liminal build](#Liminal-build) * [Liminal create](#Liminal-create) * [Liminal deploy](#Liminal-deploy) * [Liminal start](#Liminal-start) * [Liminal YAML walkthrough](#Liminal-YAML-walkthrough) * [Evaluate the Iris Classification model](#Evaluate-the-iris-classification-model) * [Debugging Kubernetes Deployments](#Debugging-Kubernetes-Deployments) * [Closing up](#Closing-up) In this tutorial, we will guide you through setting up Apache Liminal on your local machine and run a simple machine-learning workflow, based on the classic Iris dataset classification example. \ More details in this [link](https://scikit-learn.org/stable/auto_examples/datasets/plot_iris_dataset.html).
#### Prerequisites * [Python 3 (3.6 and up)](https://www.python.org/downloads) * [Python Virtual Environments](https://pypi.org/project/virtualenv) * [Docker Desktop](https://www.docker.com/products/docker-desktop) * [Kubernetes CLI (kubectl)](https://kubernetes.io/docs/tasks/tools/install-kubectl-macos) *Note: Make sure kubernetes cluster is running in docker desktop* We will define the following steps and services to implement the Iris classification example: \ Train, Validate & Deploy - Training and validation execution is managed by Liminal Airflow extension. The training task trains a regression model using a public dataset. \ We then validate the model and deploy it to a model-store in mounted volume. \ Inference - online inference is done using a Python Flask service running on the local Kubernetes in docker desktop. The service exposes the `/predict` endpoint. It reads the model stored in the mounted drive and uses it to evaluate the request. ## Setup your local environment In the dev folder, clone the example code from liminal: ```BASH git clone https://github.com/apache/incubator-liminal ``` ***Note:*** *You just cloned the entire Liminal Project, you actually only need the examples folder.* Create a python virtual environment to isolate your runs: ```BASH cd incubator-liminal/examples/aws-ml-app-demo python3 -m venv env ``` Activate your virtual environment: ```BASH source env/bin/activate ``` Now we are ready to install liminal: ```BASH pip install apache-liminal ``` ## Setup liminal ### Liminal build The build will create docker images based on the liminal.yml file in the `images` section and will create a kubernetes local volume. Be informed that all tasks use a mounted volume as defined in the pipeline YAML. \ In our case the mounted volume will point to the liminal Iris Classification example. The training task trains a regression model using a public dataset. We then validate the model and deploy it to a model-store in the mounted volume.
```BASH liminal build ``` ### Liminal deploy The deploy command deploys a liminal server and deploys any liminal.yml files in your working directory or any of its subdirectories to your liminal home directory. ```BASH liminal deploy --clean ``` *Note: liminal home directory is located in the path defined in LIMINAL_HOME env variable. If the LIMINAL_HOME environment variable is not defined, home directory defaults to ~/liminal_home directory.* ### Liminal start The start command spins up 3 containers that load the Apache Airflow stack. Liminal's Airflow extension is responsible to execute the workflows defined in the liminal.yml file as standard Airflow DAGs. ```BASH liminal start ``` It runs the following three containers: * liminal-postgress * liminal-webserver * liminal-scheduler Once liminal server has completed starting up, you can navigate to admin UI in your browser: [http://localhost:8080](http://localhost:8080) ![](../nstatic/iris-classification/airflow_main.png) ***Important:** Set off/on toggle to activate your pipeline (DAG), nothing will happen otherwise!* You can go to graph view to see all the tasks configured in the liminal.yml file: [http://localhost:8080/admin/airflow/graph?dag_id=my_datascience_pipeline]( http://localhost:8080/admin/airflow/graph?dag_id=my_datascience_pipeline ) #### Now let's see what actually happened to our task: ![](../nstatic/iris-classification/airflow_view_dag.png) #### Click on “train” and you will get this popup: ![](../nstatic/iris-classification/airflow_view_log.png) #### Click on “view log” button and you can see the log of the current task run: ![](../nstatic/iris-classification/airflow_task_log.png) ## Liminal YAML walkthrough * [Mounted volumes](#Mounted-volumes) * [Pipeline flow](#Pipeline-flow) ### Mounted volumes Declaration of the mounted volume in your liminal YAML: ```YAML name: MyDataScienceApp owner: Bosco Albert Baracus volumes: - volume: gettingstartedvol claim_name: gettingstartedvol-pvc local: path: .
``` ### Pipeline flow Declaration of the pipeline tasks flow in your liminal YAML: ```YAML pipelines: - pipeline: my_datascience_pipeline ... schedule: 0 * 1 * * tasks: - task: train type: python description: train model image: myorg/mydatascienceapp cmd: python -u training.py train ... - task: validate type: python description: validate model and deploy image: myorg/mydatascienceapp cmd: python -u training.py validate ... ``` ##### Each task will internally mount the volume defined above to an internal representation, described under the task section in the yml: ```YAML pipelines: ... tasks: - task: train ... env: MOUNT_PATH: /mnt/gettingstartedvol mounts: - mount: mymount volume: gettingstartedvol path: /mnt/gettingstartedvol ``` ###### We specify the `MOUNT_PATH` in which we store the trained model. ## Evaluate the iris classification model Once the Iris Classification model training is completed and the model is deployed (to the mounted volume), you can launch a pod of the pre-built image which contains a flask server, by applying the following Kubernetes manifest configuration: ```BASH kubectl apply -f manifests/aws-ml-app-demo.yaml ``` Alternatively, create a Kubernetes pod from stdin: ```YAML cat < # Getting started / ***Spark application*** * [Setup your local environment](#Setup-your-local-environment) * [Spark on K8S](#Spark-On-K8S) * [Setup liminal](#setup-liminal) * [Liminal YAML walkthrough](#Liminal-YAML-walkthrough) * [Evaluate the Iris Classification model](#Evaluate-the-iris-classification-model) * [Debugging Kubernetes Deployments](#Debugging-Kubernetes-Deployments) * [Closing up](#Closing-up) In this tutorial, we will guide you through setting up Apache Liminal on your local machine and run a simple machine-learning workflow, based on the classic Iris dataset classification example including a feature engineering with Spark engine. \ More details in this [link](https://scikit-learn.org/stable/auto_examples/datasets/plot_iris_dataset.html).
#### Prerequisites * [Python 3 (3.6 and up)](https://www.python.org/downloads) * [Python Virtual Environments](https://pypi.org/project/virtualenv) * [Docker Desktop](https://www.docker.com/products/docker-desktop) * [Kubernetes CLI (kubectl)](https://kubernetes.io/docs/tasks/tools/install-kubectl-macos) *Note: Make sure kubernetes cluster is running in docker desktop* ## Setup your local environment In the dev folder, clone the example code from liminal: ```BASH git clone https://github.com/apache/incubator-liminal ``` ***Note:*** *You just cloned the entire Liminal Project, you actually only need the examples folder.* Create a python virtual environment to isolate your runs: ```BASH cd incubator-liminal/examples/spark-app-demo/ python3 -m venv env ``` Activate your virtual environment: ```BASH source env/bin/activate ``` Now we are ready to install liminal: ```BASH pip install apache-liminal ``` ## Spark On K8S We will define the following steps and services to implement the Iris classification example including a simple feature engineering with Apache Spark: \ Clean data, Train, Validate & Deploy - Cleaning the input data set and prepare it for training and validation execution is managed by Liminal Airflow extension. The training task trains a regression model using a public dataset. \ We then validate the model and deploy it to a model-store in mounted volume. \ Inference - online inference is done using a Python Flask service running on the local Kubernetes in docker desktop. The service exposes the `/predict` endpoint. It reads the model stored in the mounted drive and uses it to evaluate the request. ### Setup liminal ```BASH cd incubator-liminal/examples/spark-app-demo/k8s ``` #### Liminal build The build will create docker images based on the liminal.yml file in the `images` section and will create a kubernetes local volume. Be informed that all tasks use a mounted volume as defined in the pipeline YAML.
\ In our case the mounted volume will point to the liminal Iris Classification example. The training task trains a regression model using a public dataset. We then validate the model and deploy it to a model-store in the mounted volume. ```BASH liminal build ``` #### Liminal deploy The deploy command deploys a liminal server and deploys any liminal.yml files in your working directory or any of its subdirectories to your liminal home directory. ```BASH liminal deploy --clean ``` *Note: liminal home directory is located in the path defined in LIMINAL_HOME env variable. If the LIMINAL_HOME environment variable is not defined, home directory defaults to ~/liminal_home directory.* #### Liminal start The start command spins up 3 containers that load the Apache Airflow stack. Liminal's Airflow extension is responsible to execute the workflows defined in the liminal.yml file as standard Airflow DAGs. ```BASH liminal start ``` You can go to graph view to see all the tasks configured in the liminal.yml file: [http://localhost:8080/admin/airflow/graph?dag_id=my_first_spark_pipeline]( http://localhost:8080/admin/airflow/graph?dag_id=my_first_spark_pipeline ) You should see the following dag: ![](../nstatic/spark-demo-app/k8s_dag.png) ### Liminal YAML walkthrough * [Local archetype](#Local-archetype) * [Pipeline flow](#Pipeline-flow) #### Local archetype A superliminal for an easy local development ```YAML name: InfraSpark owner: Bosco Albert Baracus type: super executors: - executor: k8s type: kubernetes variables: output_root_dir: /mnt/gettingstartedvol input_root_dir: '' images: - image: my_spark_image type: spark source: . no_cache: True task_defaults: spark: executor: k8s executors: 2 application_source: '{{application}}' mounts: - mount: mymount volume: gettingstartedvol path: /mnt/gettingstartedvol ``` ###### We specify the `MOUNT_PATH` in which we store the trained model.
You can read more about `superliminal` `variables` and `defaults` in [advanced.liminal.yml](../liminal/advanced.liminal.yml.md) #### Pipeline flow Declaration of the pipeline tasks flow in your liminal YAML: ```YAML name: MyFirstLiminalSparkApp super: InfraSpark owner: Bosco Albert Baracus variables: output_path: '{{output_root_dir}}/my_first_liminal_spark_app_outputs/' application: data_cleanup.py task_defaults: python: mounts: - mount: mymount volume: gettingstartedvol path: /mnt/gettingstartedvol pipelines: - pipeline: my_first_spark_pipeline start_date: 1970-01-01 timeout_minutes: 45 schedule: 0 * 1 * * tasks: - task: data_preprocessing type: spark description: prepare the data for training application_arguments: - '{{input_root_dir}}data/iris.csv' - '{{output_path}}' - task: train type: python description: train model image: myorg/mydatascienceapp cmd: python -u training.py train '{{output_path}}' env: MOUNT_PATH: /mnt/gettingstartedvol ... ``` ### Evaluate the iris classification model Once the Iris Classification model training is completed and the model is deployed (to the mounted volume), you can launch a pod of the pre-built image which contains a flask server, by applying the following Kubernetes manifest configuration: ```BASH kubectl apply -f manifests/spark-app-demo.yaml ``` Alternatively, create a Kubernetes pod from stdin: ```YAML cat < # Apache Liminal Documentation ## Overview Apache Liminal is an end-to-end platform for data engineers & scientists, allowing them to build, train and deploy machine learning models in a robust and agile way. The platform provides the abstractions and declarative capabilities for data extraction & feature engineering followed by model training and serving.
Liminal's goal is to operationalize the machine learning process, allowing data scientists to quickly transition from a successful experiment to an automated pipeline of model training, validation, deployment and inference in production, freeing them from engineering and non-functional tasks, and allowing them to focus on machine learning code and artifacts. ## Chapters 1. [liminal.yml](liminal.yml.md) 2. [Images](images) 3. [Pipelines](pipelines.md) 4. [Tasks](tasks) 5. [Services](services.md) 6. [Monitoring](monitoring.md) 7. [Metrics Backends](metrics_backends) 8. [Alerts Backends](alerts_backends) 9. [Advanced liminal.yml](advanced.liminal.yml.md) 10. [Executors](executors) ================================================ FILE: docs/liminal/advanced.liminal.yml.md ================================================ # Advanced liminal.yml In this section you will learn about advanced features of liminal.yml ## Variables Much like in programming languages, you can define variables for re-use across your liminal.yml file. ```yaml variables: myvar1: myvalue1 myvar2: myvalue2 ``` In the `variables` section of your liminal.yml you can define your variables as key value pairs ## Placeholders You can use placeholders of format `{{myplaceholder}}` in most any string value in your liminal.yml Make sure that any string that includes placeholders is surrounded by single or double quotes. For example: ```yaml variables: image_name: myorg/myrepo:myapp images: - image: '{{image_name}}' type: python source: . pipelines: - pipeline: my_pipeline tasks: - task: my_python_task type: python image: "{{image_name}}" cmd: python -u my_module.py ``` ### Placeholders rendering The process of rendering placeholders checks for values in several places, by order. #### Pipeline placeholder rendering When running pipelines placeholders will be rendered by replacing values from sources in the following order: 1. 
Current [DAG run conf](https://airflow.apache.org/docs/apache-airflow/stable/dag-run.html) (Airflow). 2. liminal.yml [variables](#variables) section. 3. [Airflow variables](https://airflow.apache.org/docs/apache-airflow/stable/howto/variable.html) (Airflow). 4. [Airflow macros](https://airflow.apache.org/docs/apache-airflow/stable/macros-ref.html) (Airflow). For example: `"{{yesterday_ds}}"`. For more information see: 5. Airflow [Jinja Templating](https://airflow.apache.org/docs/apache-airflow/stable/concepts.html#jinja-templating) (Airflow). #### Build placeholder rendering When using `liminal build` placeholders will be rendered by replacing values from sources in the following order: 1. Environment variables. 2. liminal.yml [variables](#variables) section. ## Task variables In addition to the variables section you can also set specific variables for specific `task`s using the `variables` attribute of that task. For example: ```yaml - task: my_task type: python cmd: python -u my_module.py variables: myvar1: myvalue1 myvar2: myvalue2 ``` You can also pass a reference to a variable map (dictionary) set in your variables section: ```yaml variables: my_variable_map1: myvar1: myvalue1 myvar2: myvalue2 my_variable_map2: myvar1: myvalue_a myvar2: myvalue_x pipelines: - pipeline: my_pipeline tasks: - task: my_task1 type: python cmd: python -u my_module.py variables: my_variable_map1 - task: my_task2 type: python cmd: python -u my_module.py variables: my_variable_map2 ``` ## Executors For fully detailed information on executors see: [executors](executors). Each `task` in your pipelines has a reference to an executor from the `executors` section of your liminal.yml file. If not specified, the appropriate default executor for that task type is used.
```yaml executors: - executor: my_kubernetes_executor type: kubernetes resources: request_memory: 128Mi pipelines: - pipeline: my_pipeline tasks: - task: my_python_task type: python image: myorg/myrepo:mypythonapp executor: my_kubernetes_executor cmd: python -u my_module.py ``` In the example above we define an `executor` of type `kubernetes` with custom resources configuration. `executors` is a section in the root of your liminal.yml file and is a list of `executor`s defined by the following attributes: ### executor attributes `executor`: name of your executor. `type`: type of the executor. The currently available executor types are: `kubernetes` and `emr`. Different executor types support their own additional configuration. ## Task Defaults `task_defaults` is a section in the root of your liminal.yml file in which default attributes can be set for each `task` type. ```yaml task_defaults: python: executor: my_kubernetes_executor env_vars: env: '{{env}}' foo: bar ``` In the example above we set default attributes for any `python` task in any pipeline in the liminal system defined by our liminal.yml file. Each `python` task that does not set these attributes will default to the setting in `task_defaults`. If the same attribute is defined in both `task_defaults` and in the `task`, definitions from the `task` take precedence. If a map (dictionary) of values (for example `env_vars`) is defined in both `task_defaults` and in the `task`, the two maps will be merged, with definitions in the `task` taking precedence in case a key is defined in both maps. ## Pipeline Defaults `pipeline_defaults` is a section in the root of your liminal.yml file in which default attributes can be set for each `pipeline` in the liminal system defined in your liminal.yml file. ```yaml pipeline_defaults: start_date: 2021-01-01 timeout_minutes: 30 ``` In the example above we set default attributes for any `pipeline` defined in our liminal.yml file.
Each `pipeline` that does not set these attributes will default to the setting in `pipeline_defaults`. If the same attribute is defined in both `pipeline_defaults` and in the `pipeline`, definitions from the `pipeline` take precedence. If `tasks` section is defined in `pipeline_defaults` each pipeline defined in our liminal.yml file will have the tasks defined in `pipeline_defaults`. A special `task` type `pipeline` may be used in `pipeline_defaults` `tasks` section. This task type is interpreted as "tasks defined in pipeline go here". This allows flexibility of defining common tasks to be run before and after the tasks of each pipeline defined in our liminal.yml file. For example: ```yaml pipeline_defaults: before_tasks: - task: my_common_setup_task type: python image: myorg/myrepo:mypythonapp cmd: python -u my_setup_module.py after_tasks: - task: my_common_teardown_task1 type: python image: myorg/myrepo:mypythonapp cmd: python -u my_teardown_module1.py - task: my_common_teardown_task2 type: python image: myorg/myrepo:mypythonapp cmd: python -u my_teardown_module2.py ``` In the example above we set a list of `tasks` in `pipeline_defaults` so that each pipeline defined in our liminal.yml file has `my_common_setup_task` run before its tasks and `my_common_teardown_task1` and `my_common_teardown_task2` after its tasks. ## Inheritance Each liminal.yml defines a subliminal layer of a liminal system. A subliminal layer can inherit attributes of a superliminal layer by specifying `super: name_of_super` in the root of the yml. ```yaml name: my_system super: my_super ``` A super is found by this name if a liminal.yml file in your environment exists with that name. A superliminal layer liminal.yml file needs to define its `type` as `super`: ```yaml name: my_super type: super ``` If not specified, the `type` of a liminal.yml file defaults to `sub` (subliminal). A superliminal can also inherit from another superliminal by defining its own `super` attribute.
```yaml name: my_super type: super super: my_other_super ``` A liminal system can have 1 subliminal layer but many superliminal layers using inheritance. This allows us to chain common behaviors of our systems into several superliminal layers. If no `super` is defined for a liminal.yml file then it defaults to having the [base](https://github.com/incubator-liminal/liminal/core/config/defaults/base/liminal.yml) be its super. ### superliminal attributes A superliminal layer can define the following attributes: ``` variables executors monitoring task_defaults pipeline_defaults ``` If the same attribute section is defined in both a superliminal and a lower layer of the system they will be merged, with key collisions favoring the lower level layer. ================================================ FILE: docs/liminal/executors/README.md ================================================ # Executors Each `task` in your pipelines has a reference to an executor from the `executors` section of your liminal.yml file. If not specified, the appropriate default executor for that task type is used. ```yaml executors: - executor: my_kubernetes_executor type: kubernetes resources: request_memory: 128Mi pipelines: - pipeline: my_pipeline tasks: - task: my_python_task type: python image: myorg/myrepo:mypythonapp executor: my_kubernetes_executor cmd: python -u my_module.py ``` In the example above we define an `executor` of type `kubernetes` with custom resources configuration. `executors` is a section in the root of your liminal.yml file and is a list of `executor`s defined by the following attributes: ## executor attributes `executor`: name of your executor. `type`: type of the executor. The currently available executor types are: `kubernetes` and `emr`. Different executor types support their own additional configuration. ## executor types 1. [kubernetes](kubernetes.md) 2.
[emr](emr.md) ================================================ FILE: docs/liminal/executors/emr.md ================================================ # emr executor The `emr` executor allows you to run tasks on AWS EMR cluster. ```yaml - executor: my_emr_cluster type: emr cluster_name: liminal-cluster ``` ## attributes `cluster_name`: the name of the cluster used by the executor OR `cluster_id`: the unique identifier of the cluster used by the executor. `aws_conn_id`: a reference to the emr connection. default: `aws_default` `cluster_states`: the cluster state filters to apply when searching for an existing cluster. default: `['RUNNING', 'WAITING']` `properties`: any attribute of [aws-resource-emr-step-properties]( https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-emr-step.html#aws-resource-emr-step-properties) (Optional) ## supported task types - [spark](../tasks/spark.md) - [sql](../tasks/sql.md) ================================================ FILE: docs/liminal/executors/index.rst ================================================ .. Apache Liminal documentation master file, created by sphinx-quickstart on Sun Nov 15 08:45:27 2020. You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. .. Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at .. .. http://www.apache.org/licenses/LICENSE-2.0 .. Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and limitations under the License. .. Executors ========= Each ``task`` in your pipelines has a reference to an executor from the ``executors`` section of your liminal.yml file. If not specified, the appropriate default executor for that task type is used. .. code-block:: yaml executors: - executor: my_kubernetes_executor type: kubernetes resources: request_memory: 128Mi pipelines: - pipeline: my_pipeline tasks: - task: my_python_task type: python image: myorg/myrepo:mypythonapp executor: my_kubernetes_executor cmd: python -u my_module.py .. In the example above we define an ``executor`` of type ``kubernetes`` with custom resources configuration. ``executors`` is a section in the root of your liminal.yml file and is a list of ``executor`` s defined by the following attributes: executor attributes ''''''''''''''''''' ``executor``: name of your executor. ``type``: type of the executor. The currently available executor types are: ``kubernetes`` and ``emr``. Different executor types support their own additional configuration. executor types '''''''''''''' .. toctree:: :maxdepth: 1 kubernetes ================================================ FILE: docs/liminal/executors/kubernetes.md ================================================ # kubernetes executor The `kubernetes` executor allows you to run tasks on kubernetes. ```yaml - executor: my_kubernetes_executor type: kubernetes ``` ## attributes If running pipelines on Airflow, any attribute of [KubernetesPodOperator](https://airflow.apache.org/docs/apache-airflow/1.10.12/_api/airflow/contrib/operators/kubernetes_pod_operator/index.html) can be set as an attribute of the executor.
Note that some of these attributes are set in the `task`: ``` image cmd mounts name env_vars ``` For example, see: [python task](../tasks/python.md) ================================================ FILE: docs/liminal/extensibility.md ================================================ # Extensibility The extensibility designed to have an easy way for the users to add their tasks, executors, image builders and extend the library so that it fits the level of abstraction that suits the user environment. ## Location Tasks folder: `{LIMINAL_HOME}/plugins/tasks` Executors folder: `{LIMINAL_HOME}/plugins/executors` Image builders folder: `{LIMINAL_HOME}/plugins/images` ## Example ### Prerequisites Apache Liminal ### Guide Check out the examples for each one of the extensible item in [examples/extensibility](../../examples/extensibility) Copy the extensible items to the plugin location: ```shell cp -r ../../examples/extensibility/executors/* $LIMINAL_HOME/liminal/plugins/executors/ ``` ```shell cp -r ../../examples/extensibility/tasks/* $LIMINAL_HOME/liminal/plugins/tasks/ ``` ```shell cp -r ../../examples/extensibility/images/* $LIMINAL_HOME/liminal/plugins/images/ ``` ```shell liminal build . liminal deploy --clean liminal start ``` ================================================ FILE: docs/liminal/images/README.md ================================================ # Images In the `images` section you can configure how to pack your code into docker images. This can be achieved in liminal easily, without any Docker knowledge needed by setting just a few attributes: ```yaml images: - image: myorg/myrepo:mypythonapp type: python source: . 
- image: myorg/myrepo:myserver type: python_server source: path/to/my/server/code endpoints: - endpoint: /myendpoint module: my_module ``` `images` is a section in the root of your liminal.yml file and is a list of `image`s, defined by the following attributes: ## image attributes `image`: name of your image, usually will be in the form of `user/repository:tag` common in docker image repositories such as Docker Hub, but for local development this can be any string. `type`: type of the image. Usually this will match your coding language, or your coding language followed by an `_` and a specific style of application (such as a server). `source`: location of source files to include in the image, this can be a relative path within your project, or even `.` which means the entire project should be packaged in the image. `build_cmd`: certain languages require to be built or compiled, here you can provide your build command to be executed in order to build your code. Other image types might require additional configuration, for example, servers require `endpoints` to be configured. ## image types 1. [python](python.md) 2. [python server](python_server.md) ================================================ FILE: docs/liminal/images/index.rst ================================================ .. Apache Liminal documentation master file, created by sphinx-quickstart on Sun Nov 15 08:45:27 2020. You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. .. Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at .. .. http://www.apache.org/licenses/LICENSE-2.0 ..
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. .. Images ====== In the ``images`` section you can configure how to pack your code into docker images. This can be achieved in liminal easily, without any Docker knowledge needed by setting just a few attributes: .. code-block:: yaml images: - image: myorg/myrepo:mypythonapp type: python source: . - image: myorg/myrepo:myserver type: python_server source: path/to/my/server/code endpoints: - endpoint: /myendpoint module: my_module .. ``images`` is a section in the root of your liminal.yml file and is a list of ``image`` s, defined by the following attributes: image attributes '''''''''''''''' ``image``: name of your image, usually will be in the form of ``user/repository:tag`` common in docker image repositories such as Docker Hub, but for local development this can be any string. ``type``: type of the image. Usually this will match your coding language, or your coding language followed by an ``_`` and a specific style of application (such as a server). ``source``: location of source files to include in the image, this can be a relative path within your project, or even ``.`` which means the entire project should be packaged in the image. ``build_cmd``: certain languages require to be built or compiled, here you can provide your build command to be executed in order to build your code. Other image types might require additional configuration, for example, servers require ``endpoints`` to be configured. image types ''''''''''' ..
toctree:: :maxdepth: 1 python python_server ================================================ FILE: docs/liminal/images/python.md ================================================ # python image The `python` image builder packages your python code as a docker image. ```yaml - image: myorg/myrepo:mypythonapp type: python source: relative/path/to/source ``` ## attributes `image`: name of your image. `source`: location of source files to include in the image, this can be a relative path within your project, or even `.` which means the entire project should be packaged in the image. ================================================ FILE: docs/liminal/images/python_server.md ================================================ # python_server image The `python_server` image builder packages your python code as a docker image which starts a web server to serve your code via HTTP calls. ```yaml - image: myorg/myrepo:mypythonserver type: python_server source: relative/path/to/source endpoints: - endpoint: /myendpoint module: my_module function: my_function - endpoint: /myotherendpoint module: my_module2 function: my_function2 ``` ## attributes `image`: name of your image. `source`: location of source files to include in the image, this can be a relative path within your project, or even `.` which means the entire project should be packaged in the image. `endpoint`: list of `endpoint`s to expose in the server. ## endpoint attributes `endpoint`: path to your endpoint in the server. `module`: module containing your user code. `function`: name of the function within the module that should be called when handling this endpoint. the function should expect a dictionary as an input (or None if no input) and should return a string to return in the response to the HTTP call. ================================================ FILE: docs/liminal/index.rst ================================================ .. 
Apache Liminal documentation master file, created by sphinx-quickstart on Sun Nov 15 08:45:27 2020. You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. .. Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at .. .. http://www.apache.org/licenses/LICENSE-2.0 .. Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. .. Apache Liminal Documentation ============================ Chapters: .. toctree:: :maxdepth: 1 liminal.yml images/index.rst pipelines tasks/index.rst services monitoring metrics_backends/index.rst advanced.liminal.yml executors/index.rst ================================================ FILE: docs/liminal/kubernetes/secret_util.md ================================================ # Secret Utils Each `task` in your pipelines has a reference to a secret from the `secrets` section of your liminal.yml file.
```yaml --- name: k8s_secret_example secrets: - secret: aws local_path_file: "~/.aws/credentials" executors: - executor: k8s type: kubernetes variables: AWS_CONFIG_FILE: /mnt/credentials task_defaults: python: executor: k8s image: amazon/aws-cli:2.7.23 executors: 2 env_vars: AWS_CONFIG_FILE: "/secret/credentials" secrets: - secret: aws remote_path: "/secret" pipelines: - pipeline: k8s_secret_example owner: Bosco Albert Baracus start_date: 1970-01-01 timeout_minutes: 10 schedule: 0 * 1 * * tasks: - task: my_python_task type: python cmd: aws s3 ls ``` That example manifest defines a Secret Opaque for AWS credentials used. The values are Base64 strings in the manifest; however, when you use the Secret with a Pod then the kubelet provides the decoded data to the Pod and its containers. In order to make use of the AWS credentials we define an environment variable `AWS_CONFIG_FILE` to authenticate our requests. For example, the files generated by the AWS CLI for a default profile configured with aws configure look similar to the following. `~/.aws/credentials` ``` [default] aws_access_key_id= aws_secret_access_key= region = us-east-1 aws_session_token= ``` ================================================ FILE: docs/liminal/liminal.yml.md ================================================ ## liminal.yml Definition of your own liminal system is done via yml configuration file named liminal.yml in your project. This yml is the one place in which you define all characteristics and behavior of your liminal system. In this section we'll go over the anatomy of the liminal.yml file. ### root attributes At the root level of your liminal.yml you can define the most basic root characteristics of your liminal system such as name and owner: ```yaml name: MyLiminalSystem owner: Bosco Albert Baracus ``` ### images For fully detailed information on images see: [images](images). In the `images` section you can configure how to pack your code into docker images.
This can be achieved in liminal easily, without any Docker knowledge needed by setting just a few attributes: ```yaml images: - image: myorg/myrepo:mypythonapp type: python source: . - image: myorg/myrepo:myserver type: python_server source: path/to/my/server/code endpoints: - endpoint: /myendpoint module: my_module function: my_function ``` `images` is a section in the root of your liminal.yml file and is a list of `image`s, defined by the following attributes: #### image attributes `image`: name of your image, usually will be in the form of `user/repository:tag` common in docker image repositories such as Docker Hub, but for local development this can be any string. `type`: type of the image. Usually this will match your coding language, or your coding language followed by an `_` and a specific style of application (such as a server). The current available image types are: `python`. `source`: location of source files to include in the image, this can be a relative path within your project, or even `.` which means the entire project should be packaged in the image. `build_cmd`: certain languages require to be built or compiled, here you can provide your build command to be executed in order to build your code. Other image types might require additional configuration, for example, servers require `endpoints` to be configured. ### pipelines For fully detailed information on pipelines see: [pipelines](pipelines.md).
In the `pipelines` section you can configure scheduled/manually run pipelines of tasks to be executed sequentially: ```yaml pipelines: - pipeline: my_data_pipeline timeout_minutes: 30 start_date: 2020-03-01 schedule: 0 10 * * * tasks: - task: my_sql_task type: sql query: "SELECT * FROM {{my_database_name}}.{{my_table_name}} WHERE event_date_prt >= '{{yesterday_ds}}' AND cms_platform = 'xsite'" output_table: my_db.my_out_table output_path: s3://my_bky/{{env}}/mydir - task: my_python_task type: python image: myorg/myrepo:pythonapp cmd: python my_python_app.py env_vars: env: '{{env}}' fizz: buzz - task: my_python_task type: python image: myorg/myrepo:mypythonapp cmd: python -u my_module.py env_vars: env: '{{env}}' fizz: buzz ``` `pipelines` is a section in the root of your liminal.yml file and is a list of `pipeline`s defined by the following attributes: #### pipeline attributes `pipeline`: name of your pipeline (must be unique per liminal server). `timeout_minutes`: maximum allowed pipeline run time in minutes, if run exceeds this time, pipeline and all running tasks will fail. `start_date`: start date for the pipeline. `schedule`: to be configured if the pipeline should run on a schedule. Format is [cron expression](https://en.wikipedia.org/wiki/Cron#CRON_expression). `tasks`: list of `task`s, defined by the following attributes: ##### task attributes For fully detailed information on tasks see: [tasks](tasks). `task`: name of your task (must be made of alphanumeric, dash and/or underscore characters only). `type`: type of the task. Examples of available task types are: `python` and more. Different task types require their own additional configuration. For example, `python` task requires `image` to be configured. ### services For fully detailed information on services see: [services](services.md). In the `services` section you can configure constantly running applications such as servers.
```yaml services: - service: my_server image: myorg/myrepo:myserver ``` `services` is a section in the root of your liminal.yml file and is a list of `service`s, defined by the following attributes: #### service attributes `service`: name of your service. `image`: the service's docker image. ### monitoring For fully detailed information on monitoring see: [monitoring](monitoring.md). In the `monitoring` section you can configure monitoring for your pipelines and services. ```yaml monitoring: metrics_backends: - metrics_backend: cloudwatch_metrics type: aws_cloudwatch namespace: DataPipeline AWS_REGION_NAME: us-east-1 alerts_backends: - alerts_backend: cloudwatch_alerts type: aws_cloudwatch metrics_backend: cloudwatch_metrics ok_actions: ['arn:aws:sns:...'] alarm_actions: ['arn:aws:sns:...'] ``` `monitoring` is a section in the root of your liminal.yml file and is a list of `metrics_backend`s and a list of `alerts_backend`s. `metrics_backend`s are where metrics for your pipelines are automatically sent. `alerts_backend`s automatically register alerts based on the `metrics_backend` they are paired with. #### metrics_backend attributes `metrics_backend`: name of your metrics backend `type`: type of the metrics backend. The current available metrics backends are: `aws_cloudwatch`. Different metrics backend types require their own additional configuration. For example, `aws_cloudwatch` metrics backend requires `namespace` to be configured. #### alerts_backend attributes `alerts_backend`: name of your alerts backend `type`: type of the alerts backend. The current available alerts backends are: `aws_cloudwatch`. `metrics_backend`: name of the metrics backends to register alerts for. Different alerts backend types require their own additional configuration. For example, `aws_cloudwatch` alerts backend requires `alarm_actions` to be configured.
### advanced topics For documentation of more advanced features of liminal.yml see: [advanced liminal.yml](advanced.liminal.yml.md) ================================================ FILE: docs/liminal/metrics_backends/README.md ================================================ # Metrics Backends `metrics_backend` is the definition of a metrics backend to which all automatically generated metrics from your pipeline are sent to and is part of the `metrics_backends` list in the `monitoring` section of your liminal.yml ```yaml - metrics_backend: cloudwatch_metrics type: aws_cloudwatch namespace: DataPipeline AWS_REGION_NAME: us-east-1 ``` A `metrics_backend` is defined by the following attributes: ## metrics_backend attributes `metrics_backend`: name of your metrics backend `type`: type of the metrics backend. The current available metrics backends are: `aws_cloudwatch`. Different metrics backend types require their own additional configuration. For example, `aws_cloudwatch` metrics backend requires `namespace` to be configured. ## metrics_backend types 1. [aws cloudwatch](aws_cloudwatch.md) ================================================ FILE: docs/liminal/metrics_backends/aws_cloudwatch.md ================================================ # aws_cloudwatch metrics backend The `aws_cloudwatch` metrics backend allows you to send all automatically generated metrics from your pipelines to AWS CloudWatch. ```yaml - metrics_backend: cloudwatch_metrics type: aws_cloudwatch namespace: DataPipeline AWS_REGION_NAME: us-east-1 ``` ## attributes `metrics_backend`: name of your metrics backend. `type`: type of the metrics backend. The current available metrics backends are: `aws_cloudwatch`. `namespace`: target namespace on AWS CloudWatch. `AWS_REGION_NAME`: target AWS region to report metrics to. `AWS_ACCESS_KEY_ID`: AWS access key id (optional). `AWS_SECRET_ACCESS_KEY`: AWS secret access key (optional).
================================================ FILE: docs/liminal/metrics_backends/index.rst ================================================ .. Apache Liminal documentation master file, created by sphinx-quickstart on Sun Nov 15 08:45:27 2020. You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. .. Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at .. .. http://www.apache.org/licenses/LICENSE-2.0 .. Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. .. Metrics Backends ================ ``metrics_backend`` is the definition of a metrics backend to which all automatically generated metrics from your pipeline are sent, and is part of the ``metrics_backends`` list in the ``monitoring`` section of your liminal.yml .. code-block:: yaml - metrics_backend: cloudwatch_metrics type: aws_cloudwatch namespace: DataPipeline AWS_REGION_NAME: us-east-1 .. A ``metrics_backend`` is defined by the following attributes: metrics_backend attributes '''''''''''''''''''''''''' ``metrics_backend``: name of your metrics backend ``type``: type of the metrics backend. The current available metrics backends are: ``aws_cloudwatch``. Different metrics backend types require their own additional configuration. For example, ``aws_cloudwatch`` metrics backend requires ``namespace`` to be configured. metrics_backend types ''''''''''''''''''''' ..
toctree:: :maxdepth: 1 aws_cloudwatch ================================================ FILE: docs/liminal/monitoring.md ================================================ # Monitoring In the `monitoring` section you can configure monitoring for your pipelines and services. ```yaml monitoring: metrics_backends: - metrics_backend: cloudwatch_metrics type: aws_cloudwatch namespace: DataPipeline AWS_REGION_NAME: us-east-1 ``` `monitoring` is a section in the root of your liminal.yml file and is a list of `metrics_backend`s and a list of `alerts_backend`s. `metrics_backend`s are where metrics for your pipelines are automatically sent. `alerts_backend`s automatically register alerts based on the `metrics_backend` they are paired with. ## metrics_backend attributes For fully detailed information on metrics backends see: [metrics backends](metrics_backends). `metrics_backend`: name of your metrics backend `type`: type of the metrics backend. The current available metrics backends are: `aws_cloudwatch`. Different metrics backend types require their own additional configuration. For example, `aws_cloudwatch` metrics backend requires `namespace` to be configured.
================================================ FILE: docs/liminal/pipelines.md ================================================ # Pipelines In the `pipelines` section you can configure scheduled/manually run pipelines of tasks to be executed sequentially: ```yaml pipelines: - pipeline: my_data_pipeline timeout_minutes: 30 start_date: 2020-03-01 schedule: 0 10 * * * tasks: - task: my_sql_task type: sql query: "SELECT * FROM {{my_database_name}}.{{my_table_name}} WHERE event_date_prt >= '{{yesterday_ds}}' AND cms_platform = 'xsite'" output_table: my_db.my_out_table output_path: s3://my_bky/{{env}}/mydir - task: my_python_task type: python image: myorg/myrepo:pythonapp cmd: python my_python_app.py env_vars: env: {{env}} fizz: buzz ``` `pipelines` is a section in the root of your liminal.yml file and is a list of `pipeline`s defined by the following attributes: ## pipeline attributes `pipeline`: name of your pipeline (must be unique per liminal server). `timeout_minutes`: maximum allowed pipeline run time in minutes, if run exceeds this time, pipeline and all running tasks will fail. `start_date`: start date for the pipeline. `schedule`: to be configured if the pipeline should run on a schedule. Format is [cron expression](https://en.wikipedia.org/wiki/Cron#CRON_expression). `tasks`: list of `task`s, defined by the following attributes: ## task attributes For fully detailed information on tasks see: [tasks](tasks). `task`: name of your task (must be made of alphanumeric, dash and/or underscore characters only). `type`: type of the task. Examples of available task types are: `python` and more. Different task types require their own additional configuration. For example, `python` task requires `image` to be configured. ================================================ FILE: docs/liminal/services.md ================================================ # Services In the `services` section you can configure constantly running applications such as servers.
```yaml services: - service: my_server image: myorg/myrepo:myserver ``` `services` is a section in the root of your liminal.yml file and is a list of `service`s, defined by the following attributes: ## service attributes `service`: name of your service. `image`: the service's docker image. ================================================ FILE: docs/liminal/tasks/README.md ================================================ # Tasks `task` is the definition of a specific step in your pipeline, and is part of the `tasks` list in your pipeline definition. For fully detailed information on pipelines see: [pipelines](../pipelines.md). ```yaml - task: my_python_task image: myorg/myrepo:mypythonapp cmd: python -u my_module.py env_vars: env: {{env}} fizz: buzz ``` A `task` is defined by the following attributes: ## task attributes `task`: name of your task (must be made of alphanumeric, dash and/or underscore characters only). `type`: type of the task. Examples of available task types are: `python` and more. Different task types require their own additional configuration. For example, `python` task requires `image` to be configured. ## task types 1. [python](python.md) 2. [spark](spark.md) 3. [create_cloudformation_stack](create_cloudformation_stack.md) 4.
[delete_cloudformation_stack](delete_cloudformation_stack.md) ================================================ FILE: docs/liminal/tasks/create_cloudformation_stack.md ================================================ # create_cloudformation_stack task The `create_cloudformation_stack` task allows you to create a CloudFormation stack on AWS ```yaml - task: create_emr type: create_cloudformation_stack stack_name: liminal_document_emr properties: OnFailure: DO_NOTHING TimeoutInMinutes: 25 Capabilities: [ 'CAPABILITY_NAMED_IAM' ] TemplateURL: s3://liminal-doc/emr/template.yml Parameters: Environment: Production CoreServerCount: '2' ``` ## attributes `task`: name of your task (must be made of alphanumeric, dash and/or underscore characters only). `stack_name`: the name of the stack `properties`: any attribute of [aws-create-stack-request-parameters]( https://docs.aws.amazon.com/AWSCloudFormation/latest/APIReference/API_CreateStack.html#API_CreateStack_RequestParameters) ================================================ FILE: docs/liminal/tasks/delete_cloudformation_stack.md ================================================ # delete_cloudformation_stack task The `delete_cloudformation_stack` task allows you to delete a CloudFormation stack ```yaml - task: delete_emr type: delete_cloudformation_stack stack_name: liminal_emr_stack_name description: delete stack ``` ## attributes `task`: name of your task (must be made of alphanumeric, dash and/or underscore characters only). `stack_name`: the name of the stack to delete ================================================ FILE: docs/liminal/tasks/index.rst ================================================ .. Apache Liminal documentation master file, created by sphinx-quickstart on Sun Nov 15 08:45:27 2020. You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. .. Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements.
See the NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at .. .. http://www.apache.org/licenses/LICENSE-2.0 .. Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. .. Tasks ===== ``task`` is the definition of a specific step in your pipeline, and is part of the ``tasks`` list in your pipeline definition. For fully detailed information on pipelines see: `pipelines`_. .. _pipelines: ../pipelines.html .. code-block:: yaml - task: my_python_task type: python image: myorg/myrepo:mypythonapp cmd: python -u my_module.py env_vars: env: {{env}} fizz: buzz .. A ``task`` is defined by the following attributes: In the ``images`` section you can configure how to pack your code into docker images. This can be achieved in liminal easily, without any Docker knowledge needed by setting just a few attributes: task attributes '''''''''''''''' ``task``: name of your task (must be made of alphanumeric, dash and/or underscore characters only). ``type``: type of the task. Different task types require their own additional configuration. For example, ``python`` task requires ``image`` to be configured. task types '''''''''' .. toctree:: :maxdepth: 1 python ================================================ FILE: docs/liminal/tasks/python.md ================================================ # python task The `python` task allows you to run python code packaged as docker images. 
```yaml - task: my_python_task type: python image: myorg/myrepo:mypythonapp cmd: python my_python_app.py env_vars: env: '{{env}}' fizz: buzz mounts: - mount: mymount volume: myvol1 path: /mnt/vol1 ``` ## attributes `task`: name of your task (must be made of alphanumeric, dash and/or underscore characters only). `image`: name of image to run. `cmd`: command to run when running the image. `env_vars`: environment variables to set when running the image. `mounts`: list of `mount`s defined by the following attributes: ### mount attributes `mount`: name of the mount. `volume`: volume to mount. volumes are defined in the `volumes` section of liminal.yml `path`: path in which to mount the volume. this is the path accessible to user code. ================================================ FILE: docs/liminal/tasks/spark.md ================================================ # spark task The `spark` task allows you to submit a spark application ```yaml - task: my_spark_app type: spark executor: emr_executor application_source: 'my_app.py' class: 'com.mycompany.MySparkApp' master: 'yarn' conf: spark.driver.memory: '1g' spark.driver.maxResultSize: '1g' spark.yarn.executor.memoryOverhead: '500M' application_arguments: --query: "select * from my_db.my_input_table where my_date_col >= '{{yesterday_ds}}'" --output: 'my_output_table' ``` ## attributes `task`: name of your task (must be made of alphanumeric, dash and/or underscore characters only).
`executor`: executor name ([supported executors](#supported-executors)) `application_source`: the location of the spark application (jar location / python file) `class`: the entry point for your application `master`: the cluster manager to connect to.See the list of [allowed master URL's](https://spark.apache.org/docs/latest/submitting-applications.html#master-urls) (Optional) `conf`: arbitrary Spark configuration properties (Optional) `application_arguments`: arguments passed to the main method of your main class (Optional) ## supported executors - [emr](../executors/emr.md) ================================================ FILE: docs/make.bat ================================================ REM REM Licensed to the Apache Software Foundation (ASF) under one REM or more contributor license agreements. See the NOTICE file REM distributed with this work for additional information REM regarding copyright ownership. The ASF licenses this file REM to you under the Apache License, Version 2.0 (the REM "License"); you may not use this file except in compliance REM with the License. You may obtain a copy of the License at REM REM http://www.apache.org/licenses/LICENSE-2.0 REM REM Unless required by applicable law or agreed to in writing, REM software distributed under the License is distributed on an REM "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY REM KIND, either express or implied. See the License for the REM specific language governing permissions and limitations REM under the License. @ECHO OFF pushd %~dp0 REM Command file for Sphinx documentation if "%SPHINXBUILD%" == "" ( set SPHINXBUILD=sphinx-build ) set SOURCEDIR=. set BUILDDIR=build if "%1" == "" goto help %SPHINXBUILD% >NUL 2>NUL if errorlevel 9009 ( echo. echo.The 'sphinx-build' command was not found. Make sure you have Sphinx echo.installed, then set the SPHINXBUILD environment variable to point echo.to the full path of the 'sphinx-build' executable. 
Alternatively you echo.may add the Sphinx directory to PATH. echo. echo.If you don't have Sphinx installed, grab it from echo.http://sphinx-doc.org/ exit /b 1 ) %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% goto end :help %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% :end popd ================================================ FILE: docs/source/How_to_install_liminal_in_airflow_on_kubernetes.md ================================================ # Install Liminal in Airflow * [Workflow](#workflow) * [Prerequisites](#prerequisites) * [One time setup](#One-time-environment-setup) * [Deploying your Yaml files](#Deploying-your-Yaml-files) * [References and other resources](#references-and-other-resources) ## Workflow To deploy and run liminal on airflow, we need to complete two tasks: 1. Ensure that the Liminal python package is installed inside each one of Airflow's pods (schedulers, Workers and Web Server). \ This is a standard airflow management task, which can be achieved in multiple ways depending on your setup (we will cover a few later in this guide). 2. Ensure that the user's Liminal code (i.e. Yamls) are present in each of the pods' DAGs folder (typically located in /opt/airflow/dags). \ Here, our recommended approach is to use a folder on a shared filesystem (EFS) to host the airflow DAGs (and Liminal Yamls). \ This folder will be mounted into Airflow pods' DAGs folder - thus enabling each of the pods to pick the files up and run the Liminal DAGs. ![](assets/liminal_deployment_diagram.png) Below is a description of how to achieve this task using a commonly used Helm chart for Airflow on Kubernetes. ## Prerequisites ### Airflow on Kubernetes This guide assumes you have successfully installed Airflow on Kubernetes. 
If you haven't done so yet, we found the following Helm chart useful for achieving this goal: [airflow-helm-chart-7.16.0] You'll need to add the chart to your Helm installation and follow the instructions from the README.md ```sh helm repo add airflow-stable https://airflow-helm.github.io/charts helm repo update ``` ### A provisioned EFS file system EFS is an AWS solution which offers an NFS as a service. \ You can read up about EFS [here](https://aws.amazon.com/efs/features/) and set it up via AWS console or CLI. ## One time environment setup ### Adding the Apache Liminal package to Apache Airflow pods There are multiple ways to install liminal into the Airflow pods, depending on how you deployed Airflow on Kubernetes. \ In principle, it requires the package to be pip-installed into the Airflow docker - either during build time or in run time. 1. If you are rolling out your own Airflow Docker images based on the [official docker image](https://github.com/apache/airflow), you can add apache-liminal installation during the build: ```docker build --build-arg ADDITIONAL_PYTHON_DEPS="apache-liminal"``` 2. If you already have a running Airflow on Kubernetes, you can run a command which iterates over Airflow pods and executes a ```pip install apache-liminal``` on each of the pods. 3. If you used the [airflow-helm-chart-7.16.0], you just need to specify apache-liminal inside the ```requirements.txt``` file as per the instructions [here][airflow-helm-chart-7.16.0] ### Preparing a "deployment box" and an EFS folder to host the Liminal Yaml files The "deployment box" is the machine which has access to the Yamls you want to deploy. \ This machine will also mount the same EFS folder as Airflow, and use ```liminal deploy``` to push the Yamls into that folder. 1. Provision an EC2 instance which has access to the EFS 2. Mount the EFS onto the EC2 instance ```sudo mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport :/ /mnt/efs``` 3.
Create a folder on an EFS drive. \ This will be our target for deployment ```mkdir -p /mnt/efs/opt/airflow/liminal_home``` 4. ```export LIMINAL_HOME=/mnt/efs/opt/airflow/liminal_home``` ### Mounting the EFS folder into Airflow pods Now that we have an EFS drive, and we've created the shared folder on it, it's time to mount it onto the Airflow pods' DAGs folder. The [guide][airflowInstallation] includes details on how to expose EFS as a Persistent Volume for kubernetes. Make sure to point the pods' PV to the right EFS folder, like so: ``` volumeMounts: - name: efs-data mountPath: /opt/airflow/dags ``` where efs-data is a PVC pointing to `:/opt/airflow/liminal_home` ## Deploying your Yaml files Finally, the one time setup is done, and by this point, you should have: 1. A deployment box which can deploy Liminal to an EFS folder 2. Airflow pods which will pick up the Yamls from that folder automatically. Deploying the actual Yaml files is the simple part. Each time you want to deploy the Liminal Yamls from the deployment box: ``` liminal deploy --path <<>> ``` ## References and other resources ### Setting up Airflow on Kubernetes with EFS [Setting up Airflow on Kubernetes with AWS EFS][airflowInstallation] ### Example script In order to make it easier to understand all the steps, we include a script which performs the Liminal-specific steps in the deployment. This script should be run from an EC2 machine which has access to the EFS you've provisioned. \ The steps below run the example project of [liminal-getting-started][liminal-getting-started].
* Mount the EFS and set environment parameters: * [`liminal_aws_deploy.sh`][liminal-aws-deploy] -o install * Build dockers from your business logic: * [`liminal_aws_deploy.sh`][liminal-aws-deploy] -o build * Deploy the Yaml onto the EFS folder: * [`liminal_aws_deploy.sh`][liminal-aws-deploy] -o deploy ![](assets/liminal_aws_deploy.gif) ##### Setting up AWS ECR with kubernetes [How to configure and use AWS ECR with kubernetes][AWS-ECR-with-kubernetes] 1. `liminal_aws_deploy.sh -o build` will build dockers from the given liminal project path. \ The [liminal.yml][liminal-yml] defines a docker image: ``` tasks: - task: python_hello_world_example type: python description: static input task image: python_hello_world_example_image ``` The Yaml defines a task that will pull the docker image from your predefined registry. \ Therefore, you need to use a docker registry in your kubernetes cluster. \ One way to achieve it is to use [Amazon ECR][amazon-ecr]. [AWS-ECR-with-kubernetes]: [amazon-ecr]: [liminal-yml]: [liminal-getting-started]: [airflow-helm-chart-7.16.0]: [homebrew-kubectl]: [cluster-access-kubeconfig]: [liminal-aws-deploy]: [airflowChart]: [airflowInstallation]: [airflowImage]: ================================================ FILE: docs/source/liminal_aws_deploy.sh ================================================ #! /bin/bash # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# Runs one Liminal deployment step, selected with `-o`:
#   install  mount the EFS under /mnt/efs and pip-install apache-liminal
#   build    run `liminal build` for a project path
#   deploy   run `liminal deploy` into the liminal_home folder on the EFS
# Each step prompts interactively for the value it needs.

# Print usage information.
help() {
  echo "$0: Get Started"
  echo "Usage: $0 -o option"
  echo "Liminal available options:"
  echo "install"
  echo "build"
  echo "deploy"
}

# `-o <option>` is two words, so anything shorter cannot be a valid call.
if [[ "$#" -lt 2 ]]; then
  help
  exit 1
fi

# Start from a known state so an ACTION variable inherited from the caller's
# environment can never select a step on its own.
ACTION=""

# Arguments processing
while getopts ":o:" opt; do
  case $opt in
    o)
      # Only install, build and deploy are accepted; anything else leaves
      # ACTION empty and falls through to the usage message below.
      case $OPTARG in
        install | build | deploy) ACTION="$OPTARG" ;;
      esac
      ;;
  esac
done

# Mount the EFS file system identified by $1 on /mnt/efs.
mount_efs() {
  local efs_id=$1
  if [[ -z "$efs_id" ]]; then
    echo "The EFS ID must not be empty" >&2
    return 1
  fi
  echo "The EFS_ID is: ${efs_id}"
  echo "Create /mnt/efs"
  # -p makes re-runs idempotent; sudo because /mnt is normally root-owned
  # (the mount below already requires sudo).
  sudo mkdir -p /mnt/efs || return 1
  echo "Mount the /mnt/efs"
  sudo mount -t nfs4 \
    -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport \
    "${efs_id}:/" /mnt/efs
}

# pip-install apache-liminal into every running k8s_airflow container and locally.
install_liminal() {
  echo "Installing liminal on Airflow components"
  docker ps | grep k8s_airflow | awk '{print $1}' |
    xargs -I {} docker exec {} bash -c 'pip install apache-liminal'
  echo "Installing liminal locally"
  pip install apache-liminal
}

# Deploy the liminal yaml files found under path $1 to LIMINAL_HOME on the EFS.
deploy_yaml() {
  local deploy_path=$1
  local liminal_home
  echo "The DEPLOY_PATH is: ${deploy_path}"
  echo 'Export the liminal home'
  # Take the first match only: several matches would otherwise produce a
  # multi-line LIMINAL_HOME, and no match an empty one.
  liminal_home=$(find /mnt/efs/ -name "liminal_home" | head -n 1)
  if [[ -z "$liminal_home" ]]; then
    echo "No liminal_home folder found under /mnt/efs/" >&2
    return 1
  fi
  export LIMINAL_HOME="$liminal_home"
  liminal deploy --path "${deploy_path}"
}

# Build the docker images of the liminal project at path $1.
build() {
  local build_path=$1
  echo "The BUILD_PATH is: ${build_path}"
  liminal build --path "${build_path}"
}

# Dispatch. Exit codes are propagated so callers can detect a failed step.
case $ACTION in
  install)
    read -r -p "Please enter the EFS ID: " EFS_ID
    mount_efs "$EFS_ID" || exit 1
    install_liminal
    exit $?
    ;;
  build)
    read -r -p "Please enter the path to the liminal project: " BUILD_PATH
    build "$BUILD_PATH"
    exit $?
    ;;
  deploy)
    read -r -p "Please enter the liminal yaml path: " DEPLOY_PATH
    deploy_yaml "$DEPLOY_PATH"
    exit $?
    ;;
  *)
    help
    exit 1
    ;;
esac
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
---
# Liminal definition of the aws-ml-app-demo example: one python_server image,
# a service running that image, and a two-step train/validate pipeline.
name: MyDataScienceApp
owner: Bosco Albert Baracus
# Volume referenced by the `mounts` of both pipeline tasks below.
volumes:
  - volume: gettingstartedvol
    claim_name: gettingstartedvol-pvc
    # presumably the local directory backing the volume - TODO confirm
    local:
      path: .
images:
  - image: myorg/mydatascienceapp
    type: python_server
    source: .
    # Each endpoint names a module and a function in it; all three point at
    # the `serving` module.
    endpoints:
      - endpoint: /predict
        module: serving
        function: predict
      - endpoint: /healthcheck
        module: serving
        function: healthcheck
      - endpoint: /version
        module: serving
        function: version
services:
  - service: my_datascience_server
    image: myorg/mydatascienceapp
pipelines:
  - pipeline: my_datascience_pipeline
    # NOTE(review): unquoted, so a YAML loader yields a date, not a string -
    # confirm that is what liminal expects before quoting it.
    start_date: 1970-01-01
    timeout_minutes: 45
    # Cron expression: minute 0 of every hour on day 1 of each month.
    schedule: 0 * 1 * *
    metrics:
      namespace: DataScience
      backends: []
    tasks:
      - task: train
        type: python
        description: train model
        image: myorg/mydatascienceapp
        cmd: python -u training.py train
        # NOTE(review): the python task docs name this key `env_vars`; this
        # file uses `env` - confirm which key the task actually reads.
        env:
          MOUNT_PATH: /mnt/gettingstartedvol
        mounts:
          - mount: mymount
            volume: gettingstartedvol
            path: /mnt/gettingstartedvol
      - task: validate
        type: python
        description: validate model and deploy
        image: myorg/mydatascienceapp
        cmd: python -u training.py validate
        env:
          MOUNT_PATH: /mnt/gettingstartedvol
        mounts:
          - mount: mymount
            volume: gettingstartedvol
            path: /mnt/gettingstartedvol
---
# Pod running the demo image with the shared model volume mounted.
apiVersion: v1
kind: Pod
metadata:
  name: aws-ml-app-demo
spec:
  containers:
    - name: task-pv-container
      image: myorg/mydatascienceapp
      # Never pull: the image must already be present on the node.
      imagePullPolicy: Never
      ports:
        - name: http-server
          containerPort: 80
      volumeMounts:
        - name: task-pv-storage
          mountPath: /mnt/gettingstartedvol
      lifecycle:
        postStart:
          exec:
            # Installs curl inside the container right after it starts.
            command:
              - /bin/bash
              - -c
              - apt update && apt install curl -y
  volumes:
    - name: task-pv-storage
      persistentVolumeClaim:
        claimName: gettingstartedvol-pvc
import glob
import os
import pickle
import time

MOUNT_PATH = os.environ.get('MOUNT_PATH', '/mnt/gettingstartedvol')
PRODUCTION = 'production'
CANDIDATE = 'candidate'

_ONE_HOUR = 60 * 60


class ModelStore:
    """File-system backed store of pickled models.

    Models are written to ``{MOUNT_PATH}/{env}/{version}/model.p``. The most
    recently loaded model is cached in memory and re-read from disk at most
    once per hour unless a reload is forced.
    """

    def __init__(self, env):
        """Create a store for ``env`` (e.g. ``PRODUCTION`` or ``CANDIDATE``)."""
        self.env = env
        self._latest_model = None
        self._latest_version = None
        self._last_check = time.time()

    def load_latest_model(self, force=False):
        """Return ``(model, version)`` for the latest stored model.

        The cached model is reused unless nothing is cached yet, the cache is
        older than one hour, or ``force`` is true.

        Raises:
            IndexError: if no model file exists for this environment.
        """
        cache_expired = time.time() - self._last_check > _ONE_HOUR
        if not self._latest_model or cache_expired or force:
            self._latest_model, self._latest_version = self._download_latest_model()
            # Restart the one-hour window; without this every call made after
            # the first hour would hit the disk again.
            self._last_check = time.time()
        return self._latest_model, self._latest_version

    def save_model(self, model, version):
        """Pickle ``model`` to ``{MOUNT_PATH}/{env}/{version}/model.p``."""
        key = 'model.p'
        path = f'{MOUNT_PATH}/{self.env}/{version}'
        os.makedirs(path, exist_ok=True)
        # Context manager so the file is flushed and closed deterministically.
        with open(f'{path}/{key}', 'wb') as model_file:
            pickle.dump(model, model_file)

    def _download_latest_model(self):
        """Load the model whose path sorts last; return ``(model, version)``."""
        objects = glob.glob(f'{MOUNT_PATH}/{self.env}/**/*')
        # Paths are compared as strings, so the "latest" version is the one
        # whose directory name sorts last lexicographically.
        models = sorted((obj for obj in objects if obj.endswith('.p')), reverse=True)
        latest_key = models[0]  # IndexError here means: no model stored yet
        version = os.path.basename(os.path.dirname(latest_key))
        print(f'Loading model version {version}')
        # NOTE(review): pickle.load executes arbitrary code from the file; only
        # safe while the mounted volume is written by trusted jobs.
        with open(latest_key, 'rb') as model_file:
            return pickle.load(model_file), version
import json

import model_store
from model_store import ModelStore

_MODEL_STORE = ModelStore(model_store.PRODUCTION)
_PETAL_WIDTH = 'petal_width'


def predict(input_json):
    """Score one request against the latest production model.

    ``input_json`` is a JSON document with a ``petal_width`` field. On success
    the return value is a JSON string holding ``result`` (the second entry of
    the model's ``predict_proba`` output for the sample, as a string) and
    ``version``. On failure a plain 'Failure...' message is returned instead.
    """
    try:
        request = json.loads(input_json)
        classifier, model_version = _MODEL_STORE.load_latest_model()
        predict_proba = classifier.predict_proba
        petal_width = float(request[_PETAL_WIDTH])
        positive_probability = predict_proba([[petal_width]])[0][1]
        response = {"result": str(positive_probability), "version": model_version}
        return json.dumps(response)
    except IndexError:
        # Reported as "model not ready", e.g. when the store has no model yet.
        return 'Failure: the model is not ready yet'
    except Exception as e:
        print(e)
        return 'Failure'


def healthcheck(self):
    """Liveness endpoint; the argument is not used."""
    return 'Server is up!'
def version(self):
    """Return the version of the latest production model.

    The argument is not used. If the model cannot be loaded, the error message
    is returned as a string; the exception object itself is not returned, so
    callers always get text back.
    """
    try:
        _, version = _MODEL_STORE.load_latest_model()
        print(f'version={version}')
        return version
    except Exception as e:
        return str(e)
def train_model():
    """Fit a logistic regression on iris petal width and save it as a candidate.

    The label is 1 where the iris target equals 2 and 0 otherwise. The model
    is stored in the candidate model store under a version derived from the
    current Unix time in seconds.
    """
    iris = datasets.load_iris()

    X = iris["data"][:, 3:]  # petal width
    # The `np.int` alias was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin `int` is the documented equivalent.
    y = (iris["target"] == 2).astype(int)

    model = LogisticRegression()
    model.fit(X, y)

    version = round(time.time())

    print(f'Saving model with version {version} to candidate model store.')
    _CANDIDATE_MODEL_STORE.save_model(model, version)


def validate_model():
    """Sanity-check the latest candidate model and promote it to production.

    Raises:
        ValueError: if the model's ``predict`` does not return a NumPy array.
    """
    model, version = _CANDIDATE_MODEL_STORE.load_latest_model()
    print(f'Validating model with version {version} to candidate model store.')
    # Smoke test only: one prediction on a single-feature sample must yield an ndarray.
    if not isinstance(model.predict([[1]]), np.ndarray):
        raise ValueError('Invalid model')

    print(f'Deploying model with version {version} to production model store.')
    _PRODUCTION_MODEL_STORE.save_model(model, version)


if __name__ == '__main__':
    # The first command-line argument selects the step: 'train' or 'validate'.
    cmd = sys.argv[1]
    if cmd == 'train':
        train_model()
    elif cmd == 'validate':
        validate_model()
    else:
        raise ValueError(f"Unknown command {cmd}")
See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: examples/extensibility/executors/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: examples/extensibility/executors/custom_executor.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
class CustomExecutor(executor.Executor):
    """Example executor plugin that declares support for ``CustomTask``."""

    supported_task_types = [custom_task.CustomTask]

    def _apply_executor_task_to_dag(self, **kwargs):
        """Print a greeting, then delegate to the task's own logic.

        Expects the task object under the ``task`` keyword and returns
        whatever its ``custom_task_logic()`` returns.
        """
        print("Hello from custom executor")
        task = kwargs['task']
        return task.custom_task_logic()
class CustomImageBuilder(ImageBuilder):
    """Example image builder that points at a Dockerfile stored beside this module."""

    def __init__(self, config, base_path, relative_source_path, tag):
        # No extra state: all arguments are forwarded to ImageBuilder unchanged.
        super().__init__(config, base_path, relative_source_path, tag)

    @staticmethod
    def _dockerfile_path():
        """Return the path of ``custom_image_builder/Dockerfile`` relative to this file's directory."""
        module_dir = os.path.dirname(__file__)
        return os.path.join(module_dir, 'custom_image_builder/Dockerfile')
/app/ ================================================ FILE: examples/extensibility/images/custom_image_builder/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: examples/extensibility/liminal.yml ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
---
# Example liminal configuration that declares one custom image, one custom
# executor and a pipeline with a single custom task.
# NOTE(review): 'Extensability' looks like a misspelling of 'Extensibility';
# left unchanged here because the name may be referenced elsewhere.
name: Extensability
images:
  # presumably resolved to the custom_image plugin module -- TODO confirm
  - image: custom_image_example
    type: custom_image
executors:
  # presumably resolved to the custom_executor plugin module -- TODO confirm
  - executor: my_custom_exec
    type: custom_executor
pipelines:
  - pipeline: getting_started_pipeline_extensibility
    owner: Bosco Albert Baracus
    start_date: 1970-01-01
    timeout_minutes: 10
    schedule: 0 * 1 * *
    tasks:
      # The executor value matches the executor declared above.
      - task: custom_task_example
        type: custom_task
        executor: my_custom_exec
class CustomTask(task.Task):
    """Example task plugin that contributes a single ``BashOperator``."""

    def custom_task_logic(self):
        """Create the hello-world operator, chain it after the parent if any, and return it."""
        greeting = BashOperator(
            task_id='hello_world',
            bash_command='echo "hello from liminal custom task"',
        )

        upstream = self.parent
        if upstream:
            # Only wire a dependency when this task has a parent.
            upstream.set_downstream(greeting)

        return greeting
 See the NOTICE file", "distributed with this work for additional information", "regarding copyright ownership. The ASF licenses this file", "to you under the Apache License, Version 2.0 (the", "\"License\"); you may not use this file except in compliance", "with the License. You may obtain a copy of the License at", "", " http://www.apache.org/licenses/LICENSE-2.0", "", "Unless required by applicable law or agreed to in writing,", "software distributed under the License is distributed on an", "\"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY", "KIND, either express or implied. See the License for the", "specific language governing permissions and limitations", "under the License." ], "hello": "world" } ================================================ FILE: examples/liminal-getting-started/helloworld/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: examples/liminal-getting-started/helloworld/hello_world.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements.
# Greet, then print the process environment.
for banner in ('Hello world!\n', 'Environment:'):
    print(banner)
print(os.environ)

# Write a small JSON document to a fixed path under /mnt/gettingstartedvol.
output_path = '/mnt/gettingstartedvol/hello_world_output.json'
payload = {'hello': 1, 'world': 2}
with open(output_path, 'w') as output_file:
    output_file.write(json.dumps(payload))
variables: myvar: myval images: - image: python_hello_world_example_image type: python source: helloworld - image: liminal_getting_started_server_image type: python_server source: myserver endpoints: - endpoint: /myendpoint1 module: my_server function: myendpoint1func pipelines: - pipeline: getting_started_pipeline owner: Bosco Albert Baracus start_date: 1970-01-01 timeout_minutes: 10 schedule: 0 * 1 * * default_arg_loaded: check default_array_loaded: [2, 3, 4] default_object_loaded: key1: val1 key2: val2 metrics: namespace: TestNamespace backends: [] tasks: - task: python_hello_world_example type: python image: python_hello_world_example_image env_vars: env1: '{{myvar}}' env2: foo mounts: - mount: mymount volume: gettingstartedvol path: /mnt/gettingstartedvol cmd: python -u hello_world.py executors: 2 # cmd: python -u hello_world.py - task: python_hello_world_output_task type: python description: task with input from other task's output image: python_hello_world_example_image env_vars: env1: a env2: b mounts: - mount: mymount volume: gettingstartedvol path: /mnt/gettingstartedvol cmd: python -u hello_world.py services: - service: liminal_getting_started_python_server description: my python server image: liminal_getting_started_server_image ================================================ FILE: examples/liminal-getting-started/myserver/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. 
def myendpoint1func():
    """Example endpoint handler: always responds with the string ``'1'``."""
    response = '1'
    return response
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: examples/liminal-getting-started/requirements.txt ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
================================================ FILE: examples/sagemaker_example/README.md ================================================ # SageMaker Enable access to AWS STS AssumeRole: Go to IAM -> Role of your user and add the following under Trust Relationships: ``` { "Version": "2012-10-17", "Statement": [ { "Effect": "Allow", "Principal": { "Federated": "arn:aws:iam:::saml-provider/Okta" }, "Action": "sts:AssumeRoleWithSAML", "Condition": { "StringEquals": { "SAML:aud": "https://signin.aws.amazon.com/saml" } } }, { "Sid": "", "Effect": "Allow", "Principal": { "Service": "sagemaker.amazonaws.com" }, "Action": "sts:AssumeRole" } ] } ``` ================================================ FILE: examples/sagemaker_example/archetype/liminal.yaml ================================================ # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. --- # superliminal for local development name: InfraSageMaker owner: Bosco Albert Baracus type: super secrets: - secret: aws local_path_file: "~/.aws/credentials" executors: - executor: k8s type: kubernetes variables: output_root_dir: "/mnt/smvolume" input_root_dir: '' images: - image: my_sm_image source: .
type: python no_cache: false task_defaults: python: executor: k8s image: my_sm_image application_source: '{{application}}' env_vars: AWS_CONFIG_FILE: "/secret/credentials" COMM_PATH: "{{output_root_dir}}" secrets: - secret: aws remote_path: "/secret" mounts: - mount: mymount volume: smvolume path: /mnt/smvolume volumes: - volume: smvolume claim_name: smvolume-pvc local: path: . ================================================ FILE: examples/sagemaker_example/data_preparation/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: examples/sagemaker_example/data_preparation/data_preparation.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. 
def transform(data):
    """Split ``data`` into train and test frames that both keep the label column.

    The label column (``LABEL_COLUMN``) is separated from the features,
    a 70/30 split is made with a fixed random state of 42, and the label is
    re-attached to each half.

    Returns:
        A ``(train, test)`` tuple of DataFrames.
    """
    features = data.drop(columns=LABEL_COLUMN)
    labels = data[LABEL_COLUMN]
    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, test_size=0.3, random_state=42
    )

    def _with_label(feature_frame, label_series):
        # Copy first so the frames returned by the split are not modified in place.
        frame = feature_frame.copy()
        frame[LABEL_COLUMN] = label_series
        return frame

    train = _with_label(train_features, train_labels)
    test = _with_label(test_features, test_labels)
    return train, test


def extract(input_uri):
    """Read the CSV found at ``input_uri`` into a DataFrame."""
    frame = pd.read_csv(input_uri)
    return frame


def load(train, test, output_uri_base, data_uploader):
    """Write both frames to local CSV files and hand them to ``data_uploader``.

    The files are named by ``TRAIN_CSV`` / ``TEST_CSV`` and uploaded beneath
    ``<output_uri_base>/train`` and ``<output_uri_base>/test`` respectively.

    Returns:
        A ``(train_path, test_path)`` tuple holding what each ``upload`` call returned.
    """
    for frame, filename in ((train, TRAIN_CSV), (test, TEST_CSV)):
        frame.to_csv(filename)

    uploaded = [
        data_uploader.upload(filename, os.path.join(output_uri_base, split))
        for filename, split in ((TRAIN_CSV, "train"), (TEST_CSV, "test"))
    ]
    return tuple(uploaded)


def data_pipeline(input_uri, output_uri_base, data_uploader):
    """Run extract -> transform -> load and return the result of ``load``."""
    train, test = transform(extract(input_uri))
    return load(train, test, output_uri_base, data_uploader)


if __name__ == "__main__":
    # By default, output goes to a 'data' directory two levels above this file.
    default_output = f"file://{Path(__file__).parent.parent.absolute().joinpath('data')}"

    parser = argparse.ArgumentParser()
    parser.add_argument("--input_uri", default=DATASET_PUBLIC_URL)
    parser.add_argument(
        "--output_uri_base",
        default=default_output,
        help="a uri starting with 's3', 'file' or a relative path " "which will be treated as sagemaker prefix",
    )
    cli_args = parser.parse_args()

    data_pipeline(cli_args.input_uri, cli_args.output_uri_base, get_uploader(cli_args.output_uri_base))
def get_uploader(output_uri_base):
    """Choose an uploader from the URI scheme of ``output_uri_base``.

    ``s3`` selects ``S3DataUploader``, ``file`` selects ``LocalDataUploader``;
    anything else (including a bare relative path) falls back to
    ``SagemakerDataUploader``.
    """
    o = urlparse(output_uri_base)
    if o.scheme == 's3':
        return S3DataUploader()
    elif o.scheme == "file":
        return LocalDataUploader()
    return SagemakerDataUploader()


class DataUploader(ABC):
    """Common interface for the uploaders in this module."""

    def __init__(self):
        pass

    @abstractmethod
    def upload(self, local_path, output_uri_base):
        """Send ``local_path`` to the destination described by ``output_uri_base``."""
        pass


class LocalDataUploader(DataUploader):
    """Uploader that copies files into a directory on the local filesystem."""

    # Unused by this class; kept so the class attributes stay unchanged.
    s3_client = None

    def upload(self, local_path, output_uri_base):
        """Copy ``local_path`` into the directory named by the URI path.

        The directory is created when missing.

        Returns:
            The destination directory path.
        """
        o = urlparse(output_uri_base)
        os.makedirs(o.path, exist_ok=True)
        shutil.copy2(local_path, o.path)
        return o.path


class S3DataUploader(DataUploader):
    """Uploader that sends files to S3 through a boto3 client."""

    s3_client = None

    def __init__(self):
        self.s3_client = boto3.client('s3')

    def upload(self, local_path, output_uri_base):
        """Upload ``local_path`` beneath the ``s3://bucket/prefix`` in ``output_uri_base``.

        The object key is ``<prefix>/<file name>``, so the destination is
        treated as a directory, as the other uploaders do.

        Returns:
            The ``s3://`` URI of the uploaded object.
        """
        o = urlparse(output_uri_base)
        # urlparse keeps the leading '/' of the path; drop it (and any trailing
        # one) so the key does not start with an empty path segment.
        prefix = o.path.strip('/')
        filename = os.path.basename(local_path)
        key = f"{prefix}/{filename}" if prefix else filename
        self.s3_client.upload_file(local_path, o.netloc, key)
        # Return the location so callers get a usable path, like the sibling uploaders.
        return f"s3://{o.netloc}/{key}"


class SagemakerDataUploader(DataUploader):
    """Uploader that delegates to a SageMaker session's ``upload_data``."""

    sm_client = None
    sm_session = None
    region = None
    default_bucket = None

    def __init__(self):
        self.sm_client = boto3.client("sagemaker")
        self.sm_session = sagemaker.Session()
        self.region = self.sm_session.boto_session.region_name
        self.default_bucket = self.sm_session.default_bucket()

    def upload(self, local_path, output_uri_base):
        """Upload ``local_path`` using the URI path as key prefix.

        The bucket is the URI's network location when present, otherwise the
        session's default bucket.

        Returns:
            Whatever ``sm_session.upload_data`` returns.
        """
        o = urlparse(output_uri_base)
        bucket = o.netloc or self.default_bucket
        return self.sm_session.upload_data(path=local_path, bucket=bucket, key_prefix=o.path)
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: examples/sagemaker_example/data_train/train.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
import argparse import os from pathlib import Path import joblib import numpy as np import pandas as pd from data_preparation.data_preparation import LABEL_COLUMN, TEST_CSV, TRAIN_CSV from sklearn.compose import ColumnTransformer from sklearn.linear_model import LinearRegression from sklearn.pipeline import Pipeline from sklearn.preprocessing import OneHotEncoder feature_column_names = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z'] categorical_cols = ['cut', 'clarity', 'color'] # inference functions MODEL_JOBLIB_FILENAME = "model.joblib" preprocessor = ColumnTransformer( transformers=[('categorical', OneHotEncoder(), categorical_cols)], remainder='passthrough' ) def train(train_df, test_df, n_jobs, model_dir): X_train = train_df[feature_column_names] y_train = train_df[LABEL_COLUMN] print(X_train.head(3)) X_test = test_df[feature_column_names] y_test = test_df[LABEL_COLUMN] # train diamond_price_model = LinearRegression(n_jobs=n_jobs) my_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', diamond_price_model)]) my_pipeline.fit(X_train, y_train) train_predictions = my_pipeline.predict(X_train) print(train_predictions) print("validating model") abs_err = np.abs(my_pipeline.predict(X_test) - y_test) print(f"Test prediction error:{abs_err} ") # persist model path = os.path.join(model_dir, MODEL_JOBLIB_FILENAME) joblib.dump(my_pipeline, path) print("model persisted at " + path) return path if __name__ == '__main__': data_path = Path(__file__).parent.parent.absolute().joinpath('data') parser = argparse.ArgumentParser() parser.add_argument('--model-dir', type=str, default=os.getenv('SM_MODEL_DIR', f"file://{data_path}")) parser.add_argument('--n-jobs', default=2) parser.add_argument( "--train", type=str, default=os.getenv("SM_CHANNEL_TRAIN", f"file://{data_path.joinpath('train')}") ) parser.add_argument( "--test", type=str, default=os.getenv("SM_CHANNEL_TEST", f"file://{data_path.joinpath('test')}") ) 
parser.add_argument("--train-file", type=str, default=TRAIN_CSV) parser.add_argument("--test-file", type=str, default=TEST_CSV) args = parser.parse_args() train_df = pd.read_csv(os.path.join(args.train, args.train_file)) test_df = pd.read_csv(os.path.join(args.test, args.test_file)) train(train_df=train_df, test_df=test_df, n_jobs=args.n_jobs, model_dir=args.model_dir) ================================================ FILE: examples/sagemaker_example/inference.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
import argparse import os from io import BytesIO import joblib import numpy as np import pandas as pd feature_column_names = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z'] categorical_cols = ['cut', 'clarity', 'color'] MODEL_JOBLIB_FILENAME = "model.joblib" def model_fn(model_dir): clf = joblib.load(os.path.join(model_dir, MODEL_JOBLIB_FILENAME)) return clf def input_fn(input_data, content_type): if content_type == "application/x-npy": load_bytes = BytesIO(input_data) input_np = np.load(load_bytes, allow_pickle=True) df = pd.DataFrame(data=input_np, columns=feature_column_names) return df else: raise ValueError( f"content type {content_type} is not supported by this inference endpoint. Please send a legal application/x-npy payload" ) def predict_fn(input_data, model): prediction = model.predict(input_data[feature_column_names]) return prediction def df_to_inference_input(): X_train = df[feature_column_names] rows = X_train.head(10) inference_input = rows.to_numpy() np_bytes = BytesIO() np.save(np_bytes, inference_input, allow_pickle=True) input_data = input_fn(np_bytes.getvalue(), "application/x-npy") return input_data if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--model-dir", type=str, default=os.getenv("SM_MODEL_DIR", "../data")) parser.add_argument("--data-path", type=str, default=f"../data/test/diamonds_test.csv") args = parser.parse_args() model = model_fn(args.model_dir) df = pd.read_csv(args.data_path) input_data = df_to_inference_input() predictions = predict_fn(input_data, model) print(predictions) ================================================ FILE: examples/sagemaker_example/liminal.yaml ================================================ # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. 
The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. --- name: SageMakerExample super: InfraSageMaker owner: Bosco Albert Baracus pipelines: - pipeline: sagemaker_diamonds_train_and_inference start_date: 1970-01-01 timeout_minutes: 45 schedule: 0 * 1 * * tasks: - task: data_preprocessing type: python description: prepare the data for training cmd: python -u liminal_sm.py --action data_prep --output_uri_base liminal-sm-example/data - task: train type: python description: train model cmd: python -u liminal_sm.py --action train --base_job_name liminal-base-training-job --train_instance_type ml.m5.large --n_jobs 5 - task: deploy type: python description: Deploy model cmd: python -u liminal_sm.py --action deploy --model_name liminal-sm-diamonds-model --deploy_instance_type ml.m5.large - task: validate type: python description: validate model post deployment cmd: python -u liminal_sm.py --action validate ================================================ FILE: examples/sagemaker_example/liminal_sm.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. import os import tempfile from sagemaker.deserializers import NumpyDeserializer from sagemaker.predictor import Predictor from sagemaker.serializers import NumpySerializer from sm_ops import create_sm_args_parser, sm_data_prep, sm_deploy, sm_train, sm_validate COMM_PATH = os.getenv("COMM_PATH", tempfile.mkdtemp()) os.makedirs(COMM_PATH, exist_ok=True) def read_message(name): with open(os.path.join(COMM_PATH, name)) as f: lines = f.readlines() return [x.strip() for x in lines] def forward_message(name, lines): if not isinstance(lines, list): lines = [lines] lines = '\n'.join(lines) + '\n' with open(os.path.join(COMM_PATH, name), 'w') as f: f.writelines(lines) if __name__ == '__main__': choices = ['data_prep', 'train', 'deploy', 'validate', 'all'] parser = create_sm_args_parser() parser.add_argument("--action", choices=choices, default='all') args = parser.parse_args() steps = [args.action] if args.action == 'all': steps = choices[:-1] for step in steps: if step == 'data_prep': train, test = sm_data_prep(args.input_uri, args.output_uri_base) forward_message('data_prep', [train, test]) elif step == 'train': lines = read_message('data_prep') train, test = lines[0], lines[1] artifact = sm_train( train, test, base_job_name=args.base_job_name, instance_type=args.train_instance_type, n_jobs=args.n_jobs, ) forward_message('train', artifact) elif step == 'deploy': artifact = read_message('train')[0] predictor = sm_deploy( artifact=artifact, model_name=args.model_name, instance_type=args.deploy_instance_type ) forward_message('deploy', predictor.endpoint) else: endpoint = 
read_message('deploy')[0] train, test = read_message('data_prep') predictor = Predictor( endpoint_name=endpoint, serializer=NumpySerializer(), deserializer=NumpyDeserializer() ) result = sm_validate(predictor, test) print(f"avg abs error:{result}") ================================================ FILE: examples/sagemaker_example/requirements.txt ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. sagemaker==2.57.0 scikit-learn==0.23.2 s3fs==2022.5.0 protobuf==3.20.* ================================================ FILE: examples/sagemaker_example/sm_ops.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. import argparse import os import time import boto3 import numpy as np import pandas as pd import sagemaker from data_preparation.data_preparation import ( DATASET_PUBLIC_URL, LABEL_COLUMN, data_pipeline, ) from data_preparation.data_uploader import SagemakerDataUploader from data_train.train import feature_column_names from sagemaker.sklearn.estimator import SKLearn from sagemaker.sklearn.model import SKLearnModel FRAMEWORK_VERSION = "0.23-1" sm_boto3 = boto3.client("sagemaker") def get_role(): try: role = sagemaker.get_execution_role() except ValueError: role = os.getenv("SM_ROLE") return role def sm_train(train_path, test_path, base_job_name="Liminal-sm-training-job", instance_type="ml.m5.large", n_jobs=1): sklearn_estimator = SKLearn( entry_point="data_train/train.py", source_dir="./", role=get_role(), instance_count=1, instance_type=instance_type, framework_version=FRAMEWORK_VERSION, base_job_name=base_job_name, hyperparameters={ "n-jobs": n_jobs, }, ) sklearn_estimator.fit({"train": train_path, "test": test_path}) artifact = sm_boto3.describe_training_job(TrainingJobName=sklearn_estimator.latest_training_job.name)[ "ModelArtifacts" ]["S3ModelArtifacts"] print("Model artifact persisted at " + artifact) return artifact def sm_deploy(artifact, model_name="Liminal-sm-demo-model", instance_type="ml.m5.large"): timestamp = time.strftime("-%Y-%m-%d-%H-%M-%S", time.gmtime()) model = SKLearnModel( name=f"{model_name}-{timestamp}", entry_point="inference.py", model_data=artifact, role=get_role(), framework_version=FRAMEWORK_VERSION, ) predictor = 
model.deploy(instance_type=instance_type, initial_instance_count=1, wait=True) return predictor def sm_validate(predictor, test_path): if not test_path.startswith("s3"): test_path = f"s3://{test_path}" df = pd.read_csv(test_path) X_test_inference = df[feature_column_names].to_numpy() predictions = predictor.predict(data=X_test_inference) print("validating model") avg_err = np.average(np.abs(predictions - df[LABEL_COLUMN])) print(f"average abs error:{avg_err}") print(predictions) return avg_err def sm_data_prep(input_uri, output_uri_base): return data_pipeline( input_uri=input_uri or DATASET_PUBLIC_URL, output_uri_base=output_uri_base, data_uploader=SagemakerDataUploader(), ) def sm_main(args): test_path = 'sagemaker-us-east-1-468326661668/liminal-sm-example/test/diamonds_test.csv' train_path, test_path = sm_data_prep(args.input_uri, args.output_uri_base) artifact = sm_train( train_path, test_path, base_job_name=args.base_job_name, instance_type=args.train_instance_type, n_jobs=args.n_jobs, ) # artifact = 's3://sagemaker-us-east-1-468326661668/Liminal-sm-training-job-2022-06-06-14-30-39-827/output/model.tar.gz' predictor = sm_deploy(artifact, model_name=args.model_name, instance_type=args.deploy_instance_type) # predictor = Predictor(endpoint_name="Liminal-sm-demo-model-2022-06-06-14-35-48-079", serializer=NumpySerializer(), deserializer=NumpyDeserializer()) sm_validate(predictor, test_path) def create_sm_args_parser(): parser = argparse.ArgumentParser() parser.add_argument("--input_uri", default=DATASET_PUBLIC_URL) parser.add_argument("--output_uri_base", default="liminal-sm-example") parser.add_argument("--base_job_name", default="Liminal-sm-training-job") parser.add_argument("--train_instance_type", default="ml.m5.large") parser.add_argument("--n_jobs", default=1) parser.add_argument("--model_name", default="Liminal-sm-demo-model") parser.add_argument("--deploy_instance_type", default="ml.m5.large") return parser if __name__ == '__main__': parser = 
create_sm_args_parser() args = parser.parse_args() sm_main(args) ================================================ FILE: examples/spark-app-demo/k8s/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: examples/spark-app-demo/k8s/archetype/liminal.yml ================================================ # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
--- # superliminal for local development name: InfraSpark owner: Bosco Albert Baracus type: super executors: - executor: k8s type: kubernetes variables: output_root_dir: /mnt/gettingstartedvol input_root_dir: '' images: - image: my_spark_image source: . type: spark no_cache: true task_defaults: spark: executor: k8s image: my_spark_image executors: 2 application_source: '{{application}}' mounts: - mount: mymount volume: gettingstartedvol path: /mnt/gettingstartedvol ================================================ FILE: examples/spark-app-demo/k8s/data/iris.csv ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
# # source: https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data sepallength,sepalwidth,petallength,petalwidth,class 5.1,3.5,1.4,0.2,Iris-setosa 4.9,3.0,1.4,0.2,Iris-setosa 4.7,3.2,1.3,0.2,Iris-setosa 4.6,3.1,1.5,0.2,Iris-setosa 5.0,3.6,1.4,0.2,Iris-setosa 5.4,3.9,1.7,0.4,Iris-setosa 4.6,3.4,1.4,0.3,Iris-setosa 5.0,3.4,1.5,0.2,Iris-setosa 4.4,2.9,1.4,0.2,Iris-setosa 4.9,3.1,1.5,0.1,Iris-setosa 5.4,3.7,1.5,0.2,Iris-setosa 4.8,3.4,1.6,0.2,Iris-setosa 4.8,3.0,1.4,0.1,Iris-setosa 4.3,3.0,1.1,0.1,Iris-setosa 5.8,4.0,1.2,0.2,Iris-setosa 5.7,4.4,1.5,0.4,Iris-setosa 5.4,3.9,1.3,0.4,Iris-setosa 5.1,3.5,1.4,0.3,Iris-setosa 5.7,3.8,1.7,0.3,Iris-setosa 5.1,3.8,1.5,0.3,Iris-setosa 5.4,3.4,1.7,0.2,Iris-setosa 5.1,3.7,1.5,0.4,Iris-setosa 4.6,3.6,1.0,0.2,Iris-setosa 5.1,3.3,1.7,0.5,Iris-setosa 4.8,3.4,1.9,0.2,Iris-setosa 5.0,3.0,1.6,0.2,Iris-setosa 5.0,3.4,1.6,0.4,Iris-setosa 5.2,3.5,1.5,0.2,Iris-setosa 5.2,3.4,1.4,0.2,Iris-setosa 4.7,3.2,1.6,0.2,Iris-setosa 4.8,3.1,1.6,0.2,Iris-setosa 5.4,3.4,1.5,0.4,Iris-setosa 5.2,4.1,1.5,0.1,Iris-setosa 5.5,4.2,1.4,0.2,Iris-setosa 4.9,3.1,1.5,0.1,Iris-setosa 5.0,3.2,1.2,0.2,Iris-setosa 5.5,3.5,1.3,0.2,Iris-setosa 4.9,3.1,1.5,0.1,Iris-setosa 4.4,3.0,1.3,0.2,Iris-setosa 5.1,3.4,1.5,0.2,Iris-setosa 5.0,3.5,1.3,0.3,Iris-setosa 4.5,2.3,1.3,0.3,Iris-setosa 4.4,3.2,1.3,0.2,Iris-setosa 5.0,3.5,1.6,0.6,Iris-setosa 5.1,3.8,1.9,0.4,Iris-setosa 4.8,3.0,1.4,0.3,Iris-setosa 5.1,3.8,1.6,0.2,Iris-setosa 4.6,3.2,1.4,0.2,Iris-setosa 5.3,3.7,1.5,0.2,Iris-setosa 5.0,3.3,1.4,0.2,Iris-setosa 7.0,3.2,4.7,1.4,Iris-versicolor 6.4,3.2,4.5,1.5,Iris-versicolor 6.9,3.1,4.9,1.5,Iris-versicolor 5.5,2.3,4.0,1.3,Iris-versicolor 6.5,2.8,4.6,1.5,Iris-versicolor 5.7,2.8,4.5,1.3,Iris-versicolor 6.3,3.3,4.7,1.6,Iris-versicolor 4.9,2.4,3.3,1.0,Iris-versicolor 6.6,2.9,4.6,1.3,Iris-versicolor 5.2,2.7,3.9,1.4,Iris-versicolor 5.0,2.0,3.5,1.0,Iris-versicolor 5.9,3.0,4.2,1.5,Iris-versicolor 6.0,2.2,4.0,1.0,Iris-versicolor 6.1,2.9,4.7,1.4,Iris-versicolor 
5.6,2.9,3.6,1.3,Iris-versicolor 6.7,3.1,4.4,1.4,Iris-versicolor 5.6,3.0,4.5,1.5,Iris-versicolor 5.8,2.7,4.1,1.0,Iris-versicolor 6.2,2.2,4.5,1.5,Iris-versicolor 5.6,2.5,3.9,1.1,Iris-versicolor 5.9,3.2,4.8,1.8,Iris-versicolor 6.1,2.8,4.0,1.3,Iris-versicolor 6.3,2.5,4.9,1.5,Iris-versicolor 6.1,2.8,4.7,1.2,Iris-versicolor 6.4,2.9,4.3,1.3,Iris-versicolor 6.6,3.0,4.4,1.4,Iris-versicolor 6.8,2.8,4.8,1.4,Iris-versicolor 6.7,3.0,5.0,1.7,Iris-versicolor 6.0,2.9,4.5,1.5,Iris-versicolor 5.7,2.6,3.5,1.0,Iris-versicolor 5.5,2.4,3.8,1.1,Iris-versicolor 5.5,2.4,3.7,1.0,Iris-versicolor 5.8,2.7,3.9,1.2,Iris-versicolor 6.0,2.7,5.1,1.6,Iris-versicolor 5.4,3.0,4.5,1.5,Iris-versicolor 6.0,3.4,4.5,1.6,Iris-versicolor 6.7,3.1,4.7,1.5,Iris-versicolor 6.3,2.3,4.4,1.3,Iris-versicolor 5.6,3.0,4.1,1.3,Iris-versicolor 5.5,2.5,4.0,1.3,Iris-versicolor 5.5,2.6,4.4,1.2,Iris-versicolor 6.1,3.0,4.6,1.4,Iris-versicolor 5.8,2.6,4.0,1.2,Iris-versicolor 5.0,2.3,3.3,1.0,Iris-versicolor 5.6,2.7,4.2,1.3,Iris-versicolor 5.7,3.0,4.2,1.2,Iris-versicolor 5.7,2.9,4.2,1.3,Iris-versicolor 6.2,2.9,4.3,1.3,Iris-versicolor 5.1,2.5,3.0,1.1,Iris-versicolor 5.7,2.8,4.1,1.3,Iris-versicolor 6.3,3.3,6.0,2.5,Iris-virginica 5.8,2.7,5.1,1.9,Iris-virginica 7.1,3.0,5.9,2.1,Iris-virginica 6.3,2.9,5.6,1.8,Iris-virginica 6.5,3.0,5.8,2.2,Iris-virginica 7.6,3.0,6.6,2.1,Iris-virginica 4.9,2.5,4.5,1.7,Iris-virginica 7.3,2.9,6.3,1.8,Iris-virginica 6.7,2.5,5.8,1.8,Iris-virginica 7.2,3.6,6.1,2.5,Iris-virginica 6.5,3.2,5.1,2.0,Iris-virginica 6.4,2.7,5.3,1.9,Iris-virginica 6.8,3.0,5.5,2.1,Iris-virginica 5.7,2.5,5.0,2.0,Iris-virginica 5.8,2.8,5.1,2.4,Iris-virginica 6.4,3.2,5.3,2.3,Iris-virginica 6.5,3.0,5.5,1.8,Iris-virginica 7.7,3.8,6.7,2.2,Iris-virginica 7.7,2.6,6.9,2.3,Iris-virginica 6.0,2.2,5.0,1.5,Iris-virginica 6.9,3.2,5.7,2.3,Iris-virginica 5.6,2.8,4.9,2.0,Iris-virginica 7.7,2.8,6.7,2.0,Iris-virginica 6.3,2.7,4.9,1.8,Iris-virginica 6.7,3.3,5.7,2.1,Iris-virginica 7.2,3.2,6.0,1.8,Iris-virginica 6.2,2.8,4.8,1.8,Iris-virginica 
6.1,3.0,4.9,1.8,Iris-virginica 6.4,2.8,5.6,2.1,Iris-virginica 7.2,3.0,5.8,1.6,Iris-virginica 7.4,2.8,6.1,1.9,Iris-virginica 7.9,3.8,6.4,2.0,Iris-virginica 6.4,2.8,5.6,2.2,Iris-virginica 6.3,2.8,5.1,1.5,Iris-virginica 6.1,2.6,5.6,1.4,Iris-virginica 7.7,3.0,6.1,2.3,Iris-virginica 6.3,3.4,5.6,2.4,Iris-virginica 6.4,3.1,5.5,1.8,Iris-virginica 6.0,3.0,4.8,1.8,Iris-virginica 6.9,3.1,5.4,2.1,Iris-virginica 6.7,3.1,5.6,2.4,Iris-virginica 6.9,3.1,5.1,2.3,Iris-virginica 5.8,2.7,5.1,1.9,Iris-virginica 6.8,3.2,5.9,2.3,Iris-virginica 6.7,3.3,5.7,2.5,Iris-virginica 6.7,3.0,5.2,2.3,Iris-virginica 6.3,2.5,5.0,1.9,Iris-virginica 6.5,3.0,5.2,2.0,Iris-virginica 6.2,3.4,5.4,2.3,Iris-virginica 5.9,3.0,5.1,1.8,Iris-virginica ================================================ FILE: examples/spark-app-demo/k8s/data_cleanup.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
import argparse import pyspark.sql.functions as F from pyspark.ml import Pipeline from pyspark.ml.feature import StandardScaler, StringIndexer, VectorAssembler from pyspark.ml.functions import vector_to_array from pyspark.sql import SparkSession def transform(data): columns_to_scale = data.columns[:-1] vectorizer = VectorAssembler(inputCols=columns_to_scale, outputCol="features") scaler = StandardScaler(inputCol="features", outputCol="scaled_features", withStd=True, withMean=True) labeler = StringIndexer(inputCol=data.columns[-1], outputCol='label') pipeline = Pipeline(stages=[vectorizer, scaler, labeler]) fitted = pipeline.fit(data) transformed = fitted.transform(data) result = transformed.withColumn("feature_arr", vector_to_array("scaled_features")).select( [F.col("feature_arr")[i].alias(columns_to_scale[i]) for i in range(len(columns_to_scale))] + ['label'] ) return result def extract(spark, input_uri): return spark.read.csv(input_uri, header=True, inferSchema=True, comment="#") def load(data, output_uri): data.coalesce(1).write.mode("overwrite").csv(output_uri, header=True) def data_pipeline(input_uri, output_uri): spark = SparkSession.builder.appName("Prepare Iris Data").getOrCreate() input = extract(spark, input_uri) data = transform(input) load(data, output_uri) spark.stop() if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--input_uri") parser.add_argument("--output_uri") args = parser.parse_args() data_pipeline(args.input_uri, args.output_uri) ================================================ FILE: examples/spark-app-demo/k8s/liminal.yml ================================================ # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. 
The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. --- name: MyFirstLiminalSparkApp super: InfraSpark owner: Bosco Albert Baracus variables: training_data_path: '{{output_root_dir}}/iris/' application: data_cleanup.py images: - image: myorg/mydatascienceapp type: python_server source: . endpoints: - endpoint: /predict module: serving function: predict - endpoint: /healthcheck module: serving function: healthcheck - endpoint: /version module: serving function: version task_defaults: python: mounts: - mount: mymount volume: gettingstartedvol path: /mnt/gettingstartedvol pipelines: - pipeline: my_first_spark_pipeline start_date: 1970-01-01 timeout_minutes: 45 schedule: 0 * 1 * * tasks: - task: data_preprocessing type: spark description: prepare the data for training application_arguments: - --input_uri - '{{input_root_dir}}data/iris.csv' - --output_uri - '{{training_data_path}}' - task: train type: python description: train model image: myorg/mydatascienceapp cmd: python -u training.py --action train --input_uri '{{training_data_path}}' env: MOUNT_PATH: /mnt/gettingstartedvol - task: validate type: python description: validate model and deploy image: myorg/mydatascienceapp cmd: python -u training.py --action validate --input_uri '{{training_data_path}}' env: MOUNT_PATH: /mnt/gettingstartedvol volumes: - volume: gettingstartedvol claim_name: gettingstartedvol-pvc local: path: . 
services: - service: name: my_datascience_server type: python_server description: my ds server image: myorg/mydatascienceapp source: . endpoints: - endpoint: /predict module: serving function: predict - endpoint: /healthcheck module: serving function: healthcheck ================================================ FILE: examples/spark-app-demo/k8s/manifests/spark-app-demo.yaml ================================================ # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. --- apiVersion: v1 kind: Pod metadata: name: spark-app-demo spec: volumes: - name: task-pv-storage persistentVolumeClaim: claimName: gettingstartedvol-pvc containers: - name: task-pv-container imagePullPolicy: Never image: myorg/mydatascienceapp lifecycle: postStart: exec: command: [/bin/bash, -c, apt update && apt install curl -y] ports: - containerPort: 80 name: http-server volumeMounts: - mountPath: /mnt/gettingstartedvol name: task-pv-storage ================================================ FILE: examples/spark-app-demo/k8s/model_store.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. 
See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. import glob import os import pickle import time MOUNT_PATH = os.environ.get('MOUNT_PATH', '/mnt/gettingstartedvol') PRODUCTION = 'production' CANDIDATE = 'candidate' _ONE_HOUR = 60 * 60 class ModelStore: def __init__(self, env): self.env = env self._latest_model = None self._latest_version = None self._last_check = time.time() def load_latest_model(self, force=False): if not self._latest_model or time.time() - self._last_check > _ONE_HOUR or force: self._latest_model, self._latest_version = self._download_latest_model() return self._latest_model, self._latest_version def save_model(self, model, version): key = 'model.p' path = f'{MOUNT_PATH}/{self.env}/{version}' os.makedirs(f'{path}', exist_ok=True) pickle.dump(model, open(f'{path}/{key}', "wb")) def _download_latest_model(self): objects = glob.glob(f'{MOUNT_PATH}/{self.env}/**/*') models = list(reversed(sorted(obj for obj in objects if obj.endswith('.p')))) latest_key = models[0] version = latest_key.rsplit('/')[-2] print(f'Loading model version {version}') return pickle.load(open(latest_key, 'rb')), version ================================================ FILE: examples/spark-app-demo/k8s/requirements.txt ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license 
agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. scikit-learn==0.24.1 apache-liminal==0.0.2 pyspark==3.1.3 ================================================ FILE: examples/spark-app-demo/k8s/serving.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
def predict(input_json):
    """
    Score a single request against the latest production model.

    :param input_json: JSON document holding the feature values under the
        ``_PETAL_WIDTH`` key
    :return: JSON string with ``result`` and ``version`` on success, otherwise
        a plain failure message
    """
    try:
        input_dict = json.loads(input_json)
        model, version = _MODEL_STORE.load_latest_model()
        features = [list(input_dict[_PETAL_WIDTH])]
        result = str(model.predict_proba(features)[0][1])
        return json.dumps({"result": result, "version": version})

    except IndexError:
        # The model store raises IndexError while no model has been saved yet.
        return 'Failure: the model is not ready yet'

    except Exception as e:
        print(e)
        return 'Failure'


def healthcheck(self):
    """
    Liveness endpoint.

    :param self: unused; the server passes the raw request body as the single
        positional argument of every endpoint function
    """
    return 'Server is up!'


def version(self):
    """
    Return the version of the model currently being served.

    :param self: unused request body
    :return: the model version, or the error text if the model cannot be loaded
    """
    try:
        model, latest_version = _MODEL_STORE.load_latest_model()
        print(f'version={latest_version}')
        return latest_version
    except Exception as e:
        # An exception instance is not a valid response body; return its text.
        return str(e)
def load_iris_from_csv_file(f):
    """
    Load the iris dataset from a CSV file with a header row.

    Every column except the last is cast to float64 and the ``label`` column
    to int32.

    :param f: path of the CSV file
    :return: pandas DataFrame
    """
    df = pd.read_csv(f, header=0).reset_index(drop=True)
    types = {col: np.float64 for col in df.columns[:-1]}
    types['label'] = np.int32
    df = df.astype(types)

    return df


def get_dataset(d):
    """
    Find the first ``.csv`` file under directory ``d``, searching recursively.

    :param d: directory to search
    :return: path of the first CSV file found, or None if there is none
    """
    print(f"searching for csv files in {d}")
    for root, dirs, files in os.walk(d):
        for file in files:
            if file.endswith(".csv"):
                # Join with the directory being walked, not the search root,
                # so files in sub-directories resolve to a path that exists.
                return os.path.join(root, file)
    return None


def load_and_split(input_uri):
    """
    Load the dataset found under ``input_uri`` and split it 80/20.

    :param input_uri: directory containing the CSV dataset
    :return: (train, test) DataFrames
    :raises FileNotFoundError: if no CSV file exists under ``input_uri``
    """
    csv_file = get_dataset(input_uri)
    if not csv_file:
        # Fail with a clear message instead of returning None and letting the
        # caller crash while unpacking it.
        raise FileNotFoundError(f'no csv dataset found under {input_uri}')

    print(f"found {csv_file} dataset")
    iris = load_iris_from_csv_file(csv_file)
    return train_test_split(iris, test_size=0.2, random_state=8)


def train_model(input_uri):
    """
    Train a logistic regression on the training split and save it to the
    candidate model store under a timestamp version.

    :param input_uri: directory containing the CSV dataset
    """
    train, test = load_and_split(input_uri)

    y = train.pop("label")
    X = train.loc[:, train.columns]

    model = LogisticRegression(max_iter=500)
    model.fit(X, y)

    scoring = 'accuracy'
    results = model_selection.cross_val_score(model, X, y, cv=5, scoring=scoring)
    print(f'Accuracy in cross validation: {results.mean()*100} % ({results.std()} std)')

    version = round(time.time())

    print(f'Saving model with version {version} to candidate model store.')
    _CANDIDATE_MODEL_STORE.save_model(model, version)


def validate_model(input_uri):
    """
    Score the latest candidate model on the test split and promote it to the
    production model store when accuracy is at least 0.85.

    :param input_uri: directory containing the CSV dataset
    :raises ValueError: if the model cannot predict or scores under the threshold
    """
    model, version = _CANDIDATE_MODEL_STORE.load_latest_model()
    print(f'Validating model with version {version} to candidate model store.')
    # Smoke test: the model must accept a four-feature row and return an array.
    if not isinstance(model.predict([[1, 1, 1, 1]]), np.ndarray):
        raise ValueError('Invalid model')

    train, test = load_and_split(input_uri)
    y = test.pop("label")
    X = test.loc[:, test.columns]
    result = model.score(X, y)
    print(f'model accuracy {result*100}')
    if result < 0.85:
        raise ValueError('model accuracy under threshold (0.85 ). Model is not promoted to production')

    print(f'Deploying model with version {version} to production model store.')
    _PRODUCTION_MODEL_STORE.save_model(model, version)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--action", choices=['train', 'validate'])
    parser.add_argument("--input_uri")
    args = parser.parse_args()

    # Any action other than 'train' (including none at all) runs validation.
    if args.action == 'train':
        train_model(args.input_uri)
    else:
        validate_model(args.input_uri)
# Directory containing this script. BASH_SOURCE is used when the shell
# provides it (bash, also when the script is sourced); plain POSIX sh, which
# the shebang requests and which has no arrays, falls back to $0.
DIR="$( cd "$( dirname "${BASH_SOURCE:-$0}" )" >/dev/null 2>&1 && pwd )"

# Remove any previously installed version without prompting.
pip uninstall -y apache-liminal

cd "$DIR" || exit

# Clean previous build artifacts.
rm -rf build
rm -rf dist

# -f: a first-time install has no old wheel (or no liminal home) to remove.
rm -f "${LIMINAL_HOME:-${HOME}/liminal_home}"/*.whl

# Build and install the wheel.
python setup.py sdist bdist_wheel

pip install dist/*.whl

# Keep a copy of the wheel in the liminal home directory.
mkdir -p "${LIMINAL_HOME:-${HOME}/liminal_home}"
cp dist/*.whl "${LIMINAL_HOME:-${HOME}/liminal_home}"

cd - || exit
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: liminal/build/image/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: liminal/build/image/python/Dockerfile ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. 
# Use an official Python runtime as a parent image
FROM {{python}}

# Install the build toolchain, then drop the apt package lists in the same
# layer so they do not end up in the image.
RUN apt-get update && apt-get install -y --reinstall build-essential \
    make \
    gcc \
    && rm -rf /var/lib/apt/lists/*

# Set the working directory to /app
WORKDIR /app

# Order of operations is important here for docker's caching & incremental build performance.
# Be careful when changing this code.

# Install any needed packages specified in requirements.txt
COPY ./requirements.txt /app/

# The image builder replaces the mount placeholder below, either with a secret
# mount for pip.conf or with nothing. Keep that line's text unchanged.
RUN python -m pip install --upgrade pip
RUN {{mount}} pip install -r requirements.txt

# Copy the current directory contents into the container at /app
RUN echo "Copying source code.."
COPY . /app/
class PythonImageBuilder(BasePythonImageBuilder):
    """Image builder whose Dockerfile is the one stored next to this module."""

    def __init__(self, config, base_path, relative_source_path, tag):
        super().__init__(config, base_path, relative_source_path, tag)

    @staticmethod
    def _dockerfile_path():
        """Path of the Dockerfile that sits in this module's directory."""
        module_dir = os.path.dirname(__file__)
        return os.path.join(module_dir, 'Dockerfile')
# Use an official Python runtime as a parent image
FROM {{python}}

# Install aptitude build-essential
#RUN apt-get install -y --reinstall build-essential

# Set the working directory to /app
WORKDIR /app

# Order of operations is important here for docker's caching & incremental build performance.
# Be careful when changing this code.

# Install any needed packages specified in python_server_requirements.txt and requirements.txt
RUN pip install --upgrade pip

COPY ./python_server_requirements.txt /app/
RUN pip install -r python_server_requirements.txt

# The image builder replaces the mount placeholder below, either with a secret
# mount for pip.conf or with nothing. Keep that line's text unchanged.
COPY ./requirements.txt /app/
RUN {{mount}} pip install -r requirements.txt

# Copy the current directory contents into the container at /app
RUN echo "Copying source code.."
COPY . /app/

# Exec form: python runs as PID 1 and receives stop signals directly instead
# of being wrapped in `/bin/sh -c`.
CMD ["python", "-u", "liminal_python_server.py"]
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: liminal/build/image/python_server/liminal_python_server.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
def __get_module(kls):
    """
    Import and return the module named by a dotted path.

    :param kls: dotted module path, e.g. ``package.module``
    :return: the imported module object
    """
    import importlib

    return importlib.import_module(kls)


def __get_endpoint_function(endpoint_config):
    """
    Resolve the callable that serves one endpoint.

    :param endpoint_config: mapping with a ``module`` (dotted path) and a
        ``function`` (attribute name inside that module)
    :return: the function object
    """
    module = __get_module(endpoint_config['module'])
    return getattr(module, endpoint_config['function'])


if __name__ == '__main__':
    with open('service.yml') as stream:
        config = yaml.safe_load(stream)

    # Map each endpoint name (configured path minus its first character) to
    # the function that serves it.
    endpoints = {
        endpoint_config['endpoint'][1:]: __get_endpoint_function(endpoint_config)
        for endpoint_config in config['endpoints']
    }

    blueprint = Blueprint('liminal_python_server_blueprint', __name__)

    @blueprint.route('/favicon.ico', methods=('GET', 'POST'))
    def no_content():
        return '', 204

    # The bare root maps to the empty endpoint name; any other single path
    # segment is captured as the endpoint name.
    @blueprint.route('/', defaults={'endpoint': ''}, methods=('GET', 'POST'))
    @blueprint.route('/<endpoint>', methods=('GET', 'POST'))
    def show(endpoint):
        if endpoint in endpoints:
            # Endpoint functions receive the raw request body.
            return endpoints[endpoint](request.get_data())
        else:
            return 'Page not found.', 404

    app = Flask(__name__)
    app.register_blueprint(blueprint)
    app.run(host='0.0.0.0', threaded=False, port=80)
class PythonServerImageBuilder(BasePythonImageBuilder):
    """
    Image builder for python server images: adds the server entry point, its
    requirements file and a ``service.yml`` dump of the config to the build.
    """

    def __init__(self, config, base_path, relative_source_path, tag):
        super().__init__(config, base_path, relative_source_path, tag)

    @staticmethod
    def _dockerfile_path():
        """Path of the Dockerfile that sits in this module's directory."""
        module_dir = os.path.dirname(__file__)
        return os.path.join(module_dir, 'Dockerfile')

    @staticmethod
    def _additional_files_from_paths():
        """Files from this module's directory that are copied into the build."""
        module_dir = os.path.dirname(__file__)
        file_names = ('liminal_python_server.py', 'python_server_requirements.txt')
        return [os.path.join(module_dir, file_name) for file_name in file_names]

    def _additional_files_from_filename_content_pairs(self):
        """Parent's generated files plus the config serialized as ``service.yml``."""
        generated_files = super()._additional_files_from_filename_content_pairs()
        service_definition = ('service.yml', yaml.safe_dump(self.config))
        return generated_files + [service_definition]
FROM bitnami/spark:3.1.2-debian-10-r23

USER root

WORKDIR /app

# Install dependencies before copying the sources so this layer is rebuilt
# only when requirements.txt changes. The image builder replaces the mount
# placeholder below; keep that line's text unchanged.
COPY ./requirements.txt /app/
RUN {{mount}} pip install -r requirements.txt

# Copy the current directory contents into the container at /app
COPY . /app/
class SparkImageBuilder(BasePythonImageBuilder):
    """Image builder whose Dockerfile is the spark one stored next to this module."""

    def __init__(self, config, base_path, relative_source_path, tag):
        super().__init__(config, base_path, relative_source_path, tag)

    @staticmethod
    def _dockerfile_path():
        """Path of the Dockerfile that sits in this module's directory."""
        module_dir = os.path.dirname(__file__)
        return os.path.join(module_dir, 'Dockerfile')
class ImageBuilder:
    """
    Builds an image from source code
    """

    __NO_CACHE = 'no_cache'

    # Maximum time, in seconds, a single `docker build` may run.
    _BUILD_TIMEOUT = 960

    def __init__(self, config, base_path, relative_source_path, tag):
        """
        :param config: task/service config
        :param base_path: directory containing liminal yml
        :param relative_source_path: source path relative to liminal yml
        :param tag: image tag
        """
        self.base_path = base_path
        self.relative_source_path = relative_source_path
        self.tag = tag
        self.config = config

    def build(self):
        """
        Builds source code into an image.

        :return: the build output, one log line per line
        :raises subprocess.CalledProcessError: if the build exits with a non-zero status
        :raises subprocess.TimeoutExpired: if the build runs longer than the timeout
        """
        logging.info(f'[ ] Building image: {self.tag}')

        temp_dir = self.__temp_dir()
        try:
            self.__copy_source_code(temp_dir)
            self._write_additional_files(temp_dir)

            docker_build_command = self.__docker_build_command(temp_dir)
            logging.info(docker_build_command)

            docker_build_out = self.__run_docker_build(docker_build_command)
        finally:
            # Always clean up the build context, including after a failed build.
            shutil.rmtree(temp_dir, ignore_errors=True)

        logging.info(f'[X] Building image: {self.tag} (Success).')

        return '\n'.join(docker_build_out)

    def __docker_build_command(self, temp_dir):
        """Compose the shell command line that builds the image from temp_dir."""
        no_cache = ''
        if self.__NO_CACHE in self.config and self.config[self.__NO_CACHE]:
            no_cache = '--no-cache=true'

        docker = 'docker' if shutil.which('docker') is not None else '/usr/local/bin/docker'
        docker_build_command = f'{docker} build {no_cache} ' + f'--tag {self.tag} '
        docker_build_command += f'--progress=plain {self._build_flags()} '
        docker_build_command += f'{temp_dir}'

        if self._use_buildkit():
            docker_build_command = f'DOCKER_BUILDKIT=1 {docker_build_command}'

        return docker_build_command

    def __run_docker_build(self, docker_build_command):
        """Run the build, logging its output live, and return the output lines."""
        deadline = time.time() + self._BUILD_TIMEOUT
        docker_build_out = []

        # NOTE(review): the command is interpolated into a shell string; tag,
        # build flags and paths are taken from the liminal config, so that
        # config must be trusted.
        build_process = subprocess.Popen(
            docker_build_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
        )

        logging.info('=' * 80)
        try:
            # Read until EOF so the last lines are kept even when the process
            # has already exited by the time they are read.
            for output in build_process.stdout:
                stdout_log_str = output.strip().decode('utf-8', errors='replace')
                docker_build_out.append(stdout_log_str)
                logging.info(stdout_log_str)
                if time.time() >= deadline:
                    raise subprocess.TimeoutExpired(
                        docker_build_command, self._BUILD_TIMEOUT, output='\n'.join(docker_build_out)
                    )
            # Output is exhausted; allow the remaining time (at least a second)
            # for the process to be reaped.
            return_code = build_process.wait(timeout=max(deadline - time.time(), 1))
        except subprocess.TimeoutExpired:
            # Do not leave a runaway build behind.
            build_process.kill()
            build_process.wait()
            raise
        finally:
            build_process.stdout.close()
        logging.info('=' * 80)

        if return_code != 0:
            # A failed build must not be reported as a success.
            raise subprocess.CalledProcessError(
                return_code, docker_build_command, output='\n'.join(docker_build_out)
            )

        return docker_build_out

    def __copy_source_code(self, temp_dir):
        self.__copy_dir(os.path.join(self.base_path, self.relative_source_path), temp_dir)

    def _write_additional_files(self, temp_dir):
        """Copy the Dockerfile and extra files into temp_dir, then write the generated ones."""
        for file in [self._dockerfile_path()] + self._additional_files_from_paths():
            self.__copy_file(file, temp_dir)

        for filename, content in self._additional_files_from_filename_content_pairs():
            with open(os.path.join(temp_dir, filename), 'w') as file:
                file.write(content)

    def __temp_dir(self):
        temp_dir = tempfile.mkdtemp()
        # Delete dir for shutil.copytree to work
        self.__remove_dir(temp_dir)
        return temp_dir

    @staticmethod
    def __remove_dir(temp_dir):
        shutil.rmtree(temp_dir)

    @staticmethod
    def __copy_dir(source_path, destination_path):
        shutil.copytree(source_path, destination_path)

    @staticmethod
    def __copy_file(source_file_path, destination_file_path):
        shutil.copy2(source_file_path, destination_file_path)

    @staticmethod
    def _dockerfile_path():
        """
        Path to Dockerfile
        """
        raise NotImplementedError()

    @staticmethod
    def _additional_files_from_paths():
        """
        List of paths to additional files
        """
        return []

    def _additional_files_from_filename_content_pairs(self):
        """
        File name and content pairs to create files from
        """
        return []

    def _build_flags(self):
        """
        Additional build flags to add to docker build command.
        """
        return ''

    def _use_buildkit(self):
        """
        overwrite with True to use docker buildkit
        """
        return False
""" config_util = ConfigUtil(path) configs = config_util.safe_load(is_render_variables=True, soft_merge=True) for liminal_config in configs: base_path = os.path.dirname(files_util.resolve_pipeline_source_file(liminal_config['name'])) if 'images' in liminal_config: for image in liminal_config['images']: image_name = image['image'] if 'source' in image: image_type = image['type'] builder_class = __get_image_builder_class(image_type) if builder_class: __build_image(base_path, image, builder_class) else: raise ValueError(f'No such image type: {image_type}') else: logging.warning(f'No source configured for image {image_name}.') def __build_image(base_path, builder_config, builder): builder_instance = builder( config=builder_config, base_path=base_path, relative_source_path=builder_config['source'], tag=builder_config['image'], ) builder_instance.build() def __get_image_builder_class(task_type): return image_builder_types.get(task_type, None) logging.info(f'Loading image builder implementations..') image_builder_types = extensible.load_image_builders() logging.info(f'Finished loading image builder implementations: {image_builder_types}') logging.info(f'Loading service image builder implementations..') ================================================ FILE: liminal/build/python.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. 
class PythonImageVersions:
    """
    Handles the python versions for python images.
    """

    @property
    def default_version(self):
        return '3.7'

    @property
    def supported_versions(self):
        return '3.6', '3.7', '3.8', '3.9'

    def get_image_name(self, python_version):
        """
        :param python_version: The python version that would be installed in
            the docker image. For example '3.8', '3.8.1' etc.
        :type python_version: str
        :return: The name of the base (slim) python image
        :rtype: str
        :raises ValueError: if the major.minor part is not a supported version
        """
        import re

        if not python_version:
            python_version = self.default_version
        else:
            python_version = str(python_version)

        # Compare the complete "major.minor" prefix. A fixed three-character
        # slice would let e.g. '3.75' pass as '3.7'.
        match = re.match(r'\d+\.\d+(?!\d)', python_version)
        major_minor = match.group(0) if match else python_version
        if major_minor not in self.supported_versions:
            raise ValueError(
                f'liminal supports the following python versions: '
                f'{self.supported_versions} but {python_version} '
                f'were passed'
            )
        return f'python:{python_version}-slim'
""" __PIP_CONF = 'pip_conf' __PYTHON_VERSION = 'python_version' def __init__(self, config, base_path, relative_source_path, tag, base_image=PythonImageVersions()): super().__init__(config, base_path, relative_source_path, tag) self._base_image = base_image @staticmethod def _dockerfile_path(): raise NotImplementedError() def _write_additional_files(self, temp_dir): requirements_file_path = os.path.join(temp_dir, 'requirements.txt') if not os.path.exists(requirements_file_path): with open(requirements_file_path, 'w'): pass super()._write_additional_files(temp_dir) def _additional_files_from_filename_content_pairs(self): with open(self._dockerfile_path()) as original: data = original.read() data = self.__mount_pip_conf(data) data = self.__add_python_base_version(data) return [('Dockerfile', data)] def __mount_pip_conf(self, data): new_data = data if self.__PIP_CONF in self.config: new_data = '# syntax = docker/dockerfile:1.0-experimental\n' + data new_data = new_data.replace('{{mount}}', '--mount=type=secret,id=pip_config,dst=/etc/pip.conf \\\n') else: new_data = new_data.replace('{{mount}} ', '') return new_data def __add_python_base_version(self, data): python_version = self.config.get(self.__PYTHON_VERSION) base_image = self._base_image.get_image_name(python_version) return data.replace('{{python}}', base_image) def _build_flags(self): if self.__PIP_CONF in self.config: return f'--secret id=pip_config,src={self.config[self.__PIP_CONF]}' else: return '' def _use_buildkit(self): if self.__PIP_CONF in self.config: return True ================================================ FILE: liminal/core/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. 
The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: liminal/core/config/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: liminal/core/config/config.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. 
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import logging
import os
import traceback

from liminal.core import environment
from liminal.core.config.defaults import base, default_configs
from liminal.core.util import dict_util, files_util


class ConfigUtil:
    """
    Load and enrich config files under configs_path.
    """

    # Keys looked up in liminal config dicts (each defined exactly once).
    __BASE = 'base'
    __PIPELINES = 'pipelines'
    __SUPER = 'super'
    __TYPE = 'type'
    __SUB = 'sub'
    __SERVICES = 'services'
    __TASKS = 'tasks'
    __PIPELINE_DEFAULTS = 'pipeline_defaults'
    __TASK_DEFAULTS = 'task_defaults'
    __BEFORE_TASKS = 'before_tasks'
    __AFTER_TASKS = 'after_tasks'
    __EXECUTORS = 'executors'
    __IMAGES = 'images'

    def __init__(self, configs_path):
        """
        :param configs_path: directory handed to ``files_util.load`` to find
            the config files
        """
        self.configs_path = configs_path
        self.config_files = files_util.load(configs_path)
        self.base = base.BASE
        # Result of the last safe_load(); also used as its cache.
        self.loaded_subliminals = []
        self.snapshot_path = os.path.join(environment.get_airflow_home_dir(), '../liminal_config_files')

    def safe_load(self, is_render_variables, soft_merge=False):
        """
        :param is_render_variables: forwarded to the variable substitution step
        :param soft_merge: when True a missing superliminal is logged as a
            warning instead of raising
        :returns list of config files after enrich with defaults and supers
        """
        # A non-empty previous result is returned as is.
        if self.loaded_subliminals:
            return self.loaded_subliminals

        configs = self.config_files.values()
        enriched_configs = []

        for subliminal in [config for config in configs if self.__is_subliminal(config)]:
            name = subliminal.get('name')
            logging.info(f'Loading yml {name}')
            # noinspection PyBroadException
            try:
                superliminal = self.__get_superliminal(subliminal, soft_merge)
                enriched_config = self.__merge_configs(subliminal, superliminal, is_render_variables, soft_merge)
                enriched_configs.append(enriched_config)
            except Exception:
                # One broken config must not prevent the others from loading.
                logging.error(f'Failed to load yml {name}')
                traceback.print_exc()

        self.loaded_subliminals = enriched_configs

        return self.loaded_subliminals

    def __merge_configs(self, subliminal, superliminal, is_render_variables, soft_merge):
        """Merge ``subliminal`` with ``superliminal``, after merging the latter with its own super."""
        if not superliminal:
            return subliminal

        # Shallow copies: top-level keys are reassigned below.
        sub = subliminal.copy()
        supr = superliminal.copy()

        # Resolve the super's own ancestry first.
        merged_superliminal = self.__merge_configs(
            supr, self.__get_superliminal(supr, soft_merge), is_render_variables, soft_merge
        )

        sub[self.__EXECUTORS] = self.__merge_section(sub, merged_superliminal, self.__EXECUTORS)
        sub[self.__IMAGES] = self.__merge_section(sub, merged_superliminal, self.__IMAGES)

        if self.__is_subliminal(sub):
            return self.__merge_sub_and_super(sub, merged_superliminal, is_render_variables)
        else:
            return self.__merge_superliminals(sub, merged_superliminal)

    def __get_superliminal(self, liminal, soft_merge):
        """
        :return: the config named by ``liminal['super']``, the base config
            when no super is named, or an empty dict for the base config itself
        :raises FileNotFoundError: when the named super is not loaded and
            ``soft_merge`` is False
        """
        superliminal = {}
        if not self.__is_base_config(liminal):
            superliminal_name = liminal.get(self.__SUPER, '')
            if not superliminal_name:
                superliminal = self.base
            else:
                superliminal = self.__get_config(superliminal_name)

                if not superliminal:
                    supr_is_missing_msg = (
                        f"superliminal '{superliminal_name}' " + f"is missing from '{self.configs_path}'"
                    )
                    if soft_merge:
                        logging.warning(supr_is_missing_msg)
                    else:
                        raise FileNotFoundError(supr_is_missing_msg)

        return superliminal

    def __get_base_config(self):
        """:return: the built-in base config"""
        return self.base

    def __is_base_config(self, config):
        """:return: True when the config is named 'base'"""
        return config.get('name', '') == self.__BASE

    def __is_subliminal(self, config):
        """
        :return: True unless the config's type is 'super'. As a side effect
            the type of a subliminal config is set to 'sub'.
        """
        is_subliminal = config.get(self.__TYPE, self.__SUB) != self.__SUPER
        if is_subliminal:
            config[self.__TYPE] = self.__SUB
        return is_subliminal

    def __get_config(self, config_name):
        """:return: the loaded config with that name, or None"""
        return self.config_files.get(config_name)

    def __merge_sub_and_super(self, sub, supr, is_render_variables):
        """Apply pipeline and service defaults to ``sub``, merge it over ``supr``, then handle variables."""
        merged_pipelines = list()

        for pipeline in sub.get(self.__PIPELINES, {}):
            final_pipeline = self.__apply_pipeline_defaults(sub, supr, pipeline)
            merged_pipelines.append(final_pipeline)

        sub[self.__PIPELINES] = merged_pipelines
        sub[self.__SERVICES] = default_configs.apply_service_defaults(sub, supr)

        # Non-recursive merge: top-level keys of sub replace those of supr.
        sub = dict_util.merge_dicts(supr.copy(), sub)

        return default_configs.apply_variable_substitution(sub, supr, is_render_variables)

    def __merge_superliminals(self, super1, super2):
        """
        Merge two superliminal configs. ``before_tasks`` become super2's
        followed by super1's; ``after_tasks`` become super1's followed by
        super2's. Both inputs have their ``pipeline_defaults`` reassigned.
        """
        super1_pipeline_defaults = super1.get(self.__PIPELINE_DEFAULTS, {}).copy()
        super2_pipeline_defaults = super2.get(self.__PIPELINE_DEFAULTS, {}).copy()

        super1[self.__PIPELINE_DEFAULTS] = super1_pipeline_defaults
        super1[self.__PIPELINE_DEFAULTS][self.__BEFORE_TASKS] = super2_pipeline_defaults.pop(
            self.__BEFORE_TASKS, []
        ) + super1_pipeline_defaults.pop(self.__BEFORE_TASKS, [])

        super2[self.__PIPELINE_DEFAULTS] = super2_pipeline_defaults
        super1[self.__PIPELINE_DEFAULTS][self.__AFTER_TASKS] = super1_pipeline_defaults.pop(
            self.__AFTER_TASKS, []
        ) + super2_pipeline_defaults.pop(self.__AFTER_TASKS, [])

        # merge supers tasks
        return dict_util.merge_dicts(super1, super2, True)

    def snapshot_final_liminal_configs(self):
        """Dump the loaded configs under ``self.snapshot_path``."""
        files_util.dump_liminal_configs(liminal_configs=self.loaded_subliminals, path=self.snapshot_path)

    def __merge_section(self, subliminal, superliminal, section):
        """Merge a list section; items are matched on the section name without its last character."""
        # 'executors' -> 'executor', 'images' -> 'image'
        return self.__deep_list_keyword_merge(section[:-1], subliminal.get(section, []), superliminal.get(section, []))

    @staticmethod
    def __apply_pipeline_defaults(subliminal, superliminal, pipeline):
        """Delegate to ``default_configs.apply_pipeline_defaults``."""
        return default_configs.apply_pipeline_defaults(subliminal, superliminal, pipeline)

    @staticmethod
    def __deep_list_keyword_merge(unique_key_name, subliminal_list_conf, superliminal_list_conf):
        """
        Merge two lists of dicts keyed by ``item[unique_key_name]``; every
        item must contain that key.
        """
        subliminal_key_map = {item[unique_key_name]: item for item in subliminal_list_conf}
        superliminal_key_map = {item[unique_key_name]: item for item in superliminal_list_conf}

        return list(dict_util.merge_dicts(superliminal_key_map, subliminal_key_map, recursive=True).values())
================================================ FILE: liminal/core/config/defaults/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: liminal/core/config/defaults/base/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
import os from liminal.core.util import files_util BASE = files_util.load(os.path.dirname(__file__))['base'].copy() ================================================ FILE: liminal/core/config/defaults/base/liminal.yml ================================================ # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. --- name: base type: super executors: - executor: default_k8s type: kubernetes - executor: airflow_executor type: airflow service_defaults: description: add defaults parameters for all services task_defaults: description: add defaults parameters for all tasks separate by task type python: executor: default_k8s spark: executor: default_k8s job_end: executor: airflow_executor job_start: executor: airflow_executor pipeline_defaults: description: add defaults parameters for all pipelines before_tasks: - task: start type: job_start after_tasks: - task: end type: job_end ================================================ FILE: liminal/core/config/defaults/default_configs.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. 
# The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

from liminal.core.util import dict_util
from liminal.core.util.dict_util import merge_dicts

__SERVICES = "services"
__TASKS = "tasks"
__BEFORE_TASKS = "before_tasks"
__AFTER_TASKS = "after_tasks"


def apply_variable_substitution(subliminal, superliminal, is_render_variables=False):
    """
    Combine the ``variables`` of both configs, subliminal values first.

    When ``is_render_variables`` is False the combined variables are stored
    on ``subliminal`` and it is returned with its placeholders untouched.
    Otherwise the variables are resolved against each other and a copy of
    ``subliminal`` with its placeholders replaced is returned.
    """
    keyword = "variables"
    merged_variables = dict_util.merge_dicts(subliminal.get(keyword, {}), superliminal.get(keyword, {}), True)

    if not is_render_variables:
        subliminal[keyword] = merged_variables
        return subliminal

    # Scalars are stringified and resolved in place, one after another, so
    # later entries see the already-resolved earlier ones.
    for name in merged_variables:
        value = merged_variables[name]
        if not isinstance(value, (dict, list)):
            merged_variables[name] = dict_util.replace_placholders_in_string(str(value), merged_variables)

    resolved_variables = dict_util.replace_placeholders(merged_variables, merged_variables)
    return dict_util.replace_placeholders(subliminal, resolved_variables)


def apply_service_defaults(subliminal, superliminal):
    """Apply defaults services
    :param subliminal: subliminal config
    :param superliminal: superliminal config
    :returns: list of subliminal services, each enriched with the merged
      subliminal.service_defaults & superliminal.service_defaults
    """
    keyword = "service_defaults"
    defaults = merge_dicts(subliminal.get(keyword, {}), superliminal.get(keyword, {}), recursive=True)

    enriched_services = []
    for service in subliminal.get(__SERVICES, {}):
        enriched_services.append(merge_dicts(service, defaults, True))
    return enriched_services


def apply_pipeline_defaults(subliminal, superliminal, pipeline):
    """Apply defaults values on given pipeline
    :param subliminal: subliminal config
    :param superliminal: superliminal config
    :param  pipeline: to apply defaults on
    :returns: enriched pipeline with superliminal.pipeline_defaults
      & subliminal.pipeline_defaults
    """
    keyword = "pipeline_defaults"
    from_super = superliminal.get(keyword, {}).copy()
    from_sub = subliminal.get(keyword, {}).copy()

    # The super's before/after task lists are handled separately from the
    # other defaults; they are wrapped around the pipeline's tasks below.
    before_tasks = from_super.pop(__BEFORE_TASKS, [])
    after_tasks = from_super.pop(__AFTER_TASKS, [])

    pipeline_defaults = merge_dicts(from_sub, from_super, True)
    enriched_pipeline = merge_dicts(pipeline, pipeline_defaults, True)

    return apply_task_defaults(
        subliminal,
        superliminal,
        enriched_pipeline,
        superliminal_before_tasks=before_tasks,
        superliminal_after_tasks=after_tasks,
    )


def apply_task_defaults(subliminal, superliminal, pipeline, superliminal_before_tasks, superliminal_after_tasks):
    """Apply defaults task values on given pipeline
    :param subliminal: subliminal config
    :param superliminal: superliminal config
    :param  pipeline: where 'tasks' list can be found it
    :param  superliminal_before_tasks: superliminal before subtasks list
    :param  superliminal_after_tasks: superliminal after subtasks list
    :returns: the same pipeline object, with its 'tasks' replaced by the
      enriched task list
    """
    own_tasks = pipeline.get(__TASKS, [])
    pipeline[__TASKS] = __apply_task_defaults(
        subliminal, superliminal, own_tasks, superliminal_before_tasks, superliminal_after_tasks
    )
    return pipeline


def __apply_task_defaults(
    subliminal, superliminal, subliminal_tasks, superliminal_before_tasks, superliminal_after_tasks
):
    # Combine both task_defaults sections, then hand over to __enrich_tasks.
    keyword = "task_defaults"
    task_defaults = merge_dicts(subliminal.get(keyword, {}), superliminal.get(keyword, {}), recursive=True)
    return __enrich_tasks(subliminal_tasks, superliminal_before_tasks, superliminal_after_tasks, task_defaults)


def __enrich_tasks(sub_tasks, super_before_tasks, super_after_tasks, task_defaults):
    # Order: super before-tasks, the pipeline's own tasks, super after-tasks.
    ordered_tasks = super_before_tasks + sub_tasks + super_after_tasks

    enriched = []
    for task in ordered_tasks:
        # Defaults are looked up by the task's 'type'.
        defaults_for_type = task_defaults.get(task.get('type', ''), {})
        enriched.append(merge_dicts(task, defaults_for_type, recursive=True))
    return enriched


# ================================================
# FILE: liminal/core/environment.py
# ================================================
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
import logging
import os
import subprocess

DEFAULT_DAGS_ZIP_NAME = 'liminal.zip'
DEFAULT_LIMINAL_HOME = os.path.expanduser('~/liminal_home')
DEFAULT_PIPELINES_SUBDIR = "pipelines"
LIMINAL_HOME_PARAM_NAME = "LIMINAL_HOME"
LIMINAL_HOME_CLOUD_PARAM_NAME = "AIRFLOW__CORE__LIMINAL_HOME"
LIMINAL_VERSION_PARAM_NAME = 'LIMINAL_VERSION'


def get_liminal_home():
    """
    :return: the value of AIRFLOW__CORE__LIMINAL_HOME when that variable is
        set, otherwise the value of LIMINAL_HOME. When LIMINAL_HOME is unset
        or empty it is first set to ``DEFAULT_LIMINAL_HOME`` in ``os.environ``.
    """
    if LIMINAL_HOME_CLOUD_PARAM_NAME in os.environ:
        return os.environ.get(LIMINAL_HOME_CLOUD_PARAM_NAME)
    if not os.environ.get(LIMINAL_HOME_PARAM_NAME):
        logging.info("no environment parameter called LIMINAL_HOME detected")
        logging.info(f"registering {DEFAULT_LIMINAL_HOME} as the LIMINAL_HOME directory")
        os.environ[LIMINAL_HOME_PARAM_NAME] = DEFAULT_LIMINAL_HOME
    return os.environ.get(LIMINAL_HOME_PARAM_NAME, DEFAULT_LIMINAL_HOME)


def get_dags_dir():
    """:return: the 'pipelines' sub-directory of the dags folder (or of liminal home)"""
    # if we are inside airflow, we will take it from the configured dags folder
    base_dir = os.environ.get("AIRFLOW__CORE__DAGS_FOLDER", get_liminal_home())
    return os.path.join(base_dir, DEFAULT_PIPELINES_SUBDIR)


def get_liminal_version():
    """
    :return: the value of LIMINAL_VERSION. When that variable is unset or
        empty it is derived and stored in ``os.environ`` first: a ``.whl``
        file found in liminal home wins (as a ``file://`` URL), then the part
        after ' @ ' of the matching ``pip freeze`` line, then that line itself.
    """
    result = os.environ.get(LIMINAL_VERSION_PARAM_NAME, None)
    if not result:
        output = subprocess.run(
            ['pip freeze | grep \'apache-liminal\''], capture_output=True, env=os.environ, shell=True
        )
        pip_res = output.stdout.decode('UTF-8').strip()
        liminal_home = get_liminal_home()
        # liminal home may not have been created yet; treat that as "no wheels"
        # instead of failing. Sorting makes the chosen wheel deterministic.
        if os.path.isdir(liminal_home):
            whl_files = sorted(file for file in os.listdir(liminal_home) if file.endswith(".whl"))
        else:
            whl_files = []
        if whl_files:
            value = 'file://' + os.path.join(liminal_home, whl_files[0])
        elif ' @ ' in pip_res:
            # keep only the location part of a "name @ location" line
            value = pip_res[pip_res.index(' @ ') + 3 :]
        else:
            value = pip_res
        logging.info(f'LIMINAL_VERSION not set. Setting it to currently installed version: {value}')
        os.environ[LIMINAL_VERSION_PARAM_NAME] = value
    return os.environ.get(LIMINAL_VERSION_PARAM_NAME, 'apache-liminal')


def get_airflow_home_dir():
    """:return: the configured dags folder, or liminal home when it is not set"""
    # if we are inside airflow, we will take it from the configured dags folder
    return os.environ.get("AIRFLOW__CORE__DAGS_FOLDER", get_liminal_home())


# ================================================
# FILE: liminal/core/util/class_util.py
# ================================================
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import importlib.util
import pkgutil


def find_subclasses_in_packages(packages, parent_class):
    """
    Finds all subclasses of given parent class within given packages

    :param packages: package names; a class counts as found when one of
        these names occurs in the name of the module that defines it
    :param parent_class: class whose direct and indirect subclasses are searched
    :return: map of module ref -> class, where the module ref is the last
        component of the defining module's dotted name
    """
    # Import everything under the packages so the classes defined there
    # exist before __subclasses__() is consulted.
    for p in packages:
        import_module(p)

    subclasses = set()
    work = [parent_class]
    while work:
        parent = work.pop()
        for child in parent.__subclasses__():
            if child not in subclasses:
                work.append(child)
                # verify that the found class is in the relevant module
                # NOTE(review): this is a substring test, not a package-prefix test.
                for p in packages:
                    if p in child.__module__:
                        subclasses.add(child)
                        break

    return {sc.__module__.split(".")[-1]: sc for sc in subclasses}


def import_module(package, recursive=True):
    """
    Import all submodules of a module

    :param package: package (name or actual module)
    :type package: str | module
    :rtype: dict[str, types.ModuleType]
    :param recursive: search recursively (default: True)
    :type recursive: bool
    """
    if isinstance(package, str):
        package = importlib.import_module(package)
    results = {}
    # iter_modules lists direct children only. walk_packages (used before)
    # would also try to import every sub-package by its bare, unqualified
    # name, which can pick up an unrelated top-level package of the same
    # name. Descending into sub-packages is done explicitly below instead.
    for _finder, name, is_pkg in pkgutil.iter_modules(package.__path__):
        if name == 'liminal_python_server':
            # this module is deliberately never imported here
            continue
        full_name = package.__name__ + '.' + name
        results[full_name] = importlib.import_module(full_name)
        if recursive and is_pkg:
            results.update(import_module(full_name))
    return results


def __get_class(the_module, the_class):
    """Import ``the_module`` (dotted name) and return its attribute ``the_class``."""
    # __import__ returns the top-level package; walk down to the class.
    m = __import__(the_module)
    for comp in the_module.split('.')[1:] + [the_class]:
        m = getattr(m, comp)
    return m


# ================================================
# FILE: liminal/core/util/dict_util.py
# ================================================
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
import re
from collections import OrderedDict

from sqlalchemy.util import OrderedSet

from liminal.runners.airflow.config import standalone_variable_backend


def merge_dicts(dict1, dict2, recursive=False):
    """
    Merge two dicts into a new one.

    Note the precedence differs by mode: without ``recursive``, values of
    ``dict2`` replace those of ``dict1``; with ``recursive``, values of
    ``dict1`` win and only nested dicts present in both are merged.

    :returns dict1 enriched by dict2
    """
    if not recursive:
        return {**dict1, **dict2}
    return __merge_dicts(dict1, dict2)


def __merge_dicts(dict1, dict2):
    # recursive merge
    merged_dicts = OrderedDict()
    dict_1_keys = dict1.keys()
    dict_2_keys = dict2.keys()
    for k in OrderedSet(dict_1_keys).union(dict_2_keys):
        if k in dict1 and k in dict2:
            if isinstance(dict1[k], dict) and isinstance(dict2[k], dict):
                merged_dicts[k] = dict(__merge_dicts(dict1[k], dict2[k]))
            else:
                # conflicting non-dict values: dict1 wins
                merged_dicts[k] = dict1[k]
        elif k in dict_1_keys:
            merged_dicts[k] = dict1[k]
        else:
            merged_dicts[k] = dict2[k]
    return merged_dicts


__PLACE_HOLDER_PATTERN = r"{{\s*([a-zA-Z0-9._-]+)\s*}}"


def replace_placeholders(dct, variables):
    """
    Replace all {{variable.key}} in dct with variable.value variable in variables

    :returns: a new dict; strings are processed at any depth of nested
        dicts and lists, other values are passed through
    """
    return dict(__replace_placeholders(dct, variables))


def replace_placholders_in_string(string_value, variables, pattern=__PLACE_HOLDER_PATTERN):
    """Replace every match of ``pattern`` in ``string_value`` using ``variables``."""
    return re.sub(pattern, lambda m: __repl(m, variables), string_value, flags=re.IGNORECASE)


def __replace_placeholders(dct, variables):
    # Yields (key, processed value) pairs for one dict level.
    dct_items = dct.items()
    for k, v in dct_items:
        if isinstance(v, str):
            yield k, replace_placholders_in_string(v, variables)
        elif isinstance(v, dict):
            yield k, dict(__replace_placeholders(v, variables))
        elif isinstance(v, list):
            yield k, list(__replace_placeholder_in_list(v, variables))
        else:
            yield k, v


def __replace_placeholder_in_list(lst, variables):
    # Yields the processed items of one list level.
    for v in lst:
        if isinstance(v, str):
            yield replace_placholders_in_string(v, variables)
        elif isinstance(v, dict):
            yield dict(__replace_placeholders(v, variables))
        elif isinstance(v, list):
            # materialise nested lists; yielding the generator itself would
            # put a generator object into the result
            yield list(__replace_placeholder_in_list(v, variables))
        else:
            yield v


def __repl(matched, variables):
    # Replacement for one placeholder: local variables first, then the
    # variable backend, and finally the untouched placeholder text.
    origin = matched.group(0)
    key = matched.group(1)
    if key in variables:
        return variables[key]
    # only consult the backend when the key is not defined locally
    return __try_backend_variables(key, default=origin)


def __try_backend_variables(key, default):
    return standalone_variable_backend.get_variable(key, default)


# ================================================
# FILE: liminal/core/util/env_util.py
# ================================================
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import os


def is_running_on_jenkins():
    """:return: True when the JENKINS_URL environment variable is set and non-empty"""
    # bool() keeps the truthiness of the previous result but always
    # returns a real boolean instead of the URL string.
    return bool(os.getenv('JENKINS_URL'))


# ================================================
# FILE: liminal/core/util/extensible.py
# ================================================
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import sys

from liminal.build.image_builder import ImageBuilder
from liminal.core.util import class_util
from liminal.runners.airflow.model import executor
from liminal.runners.airflow.model.task import Task

__PLUGINS = 'plugins'


def __generate_extra_paths(plugin_type, extra_paths):
    # Caller-supplied paths, plus 'plugins.<plugin_type>' when that exact
    # string is an entry of sys.path.
    # NOTE(review): sys.path normally holds directories, not dotted names;
    # confirm the membership test is intended.
    supplied = extra_paths or []
    plugin_package = f'{__PLUGINS}.{plugin_type}'
    plugin_paths = [plugin_package] if plugin_package in sys.path else []
    return supplied + plugin_paths


def load_executors(extra_paths=None):
    """
    Find every Executor subclass in the built-in executors package and in
    the extra packages.
    """
    builtin = ['liminal.runners.airflow.executors']
    search_in = builtin + __generate_extra_paths('executors', extra_paths)
    return class_util.find_subclasses_in_packages(search_in, executor.Executor)


def load_tasks(extra_paths=None):
    """
    Find every Task subclass in the built-in tasks package and in the extra
    packages.
    """
    builtin = ['liminal.runners.airflow.tasks']
    search_in = builtin + __generate_extra_paths('tasks', extra_paths)
    return class_util.find_subclasses_in_packages(search_in, Task)


def load_image_builders(extra_paths=None):
    """
    Find every ImageBuilder subclass in the built-in image package and in
    the extra packages.
    """
    builtin = ['liminal.build.image']
    search_in = builtin + __generate_extra_paths('images', extra_paths)
    return class_util.find_subclasses_in_packages(search_in, ImageBuilder)


# ================================================
# FILE: liminal/core/util/files_util.py
# ================================================
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.
# The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import logging
import os

import yaml

# { path -> { name -> config_file } }
cached_files = dict()
# { config name -> path of the file it was loaded from }
cached_source_files = dict()


def resolve_pipeline_source_file(config_name):
    """
    Return the source (file) path of the given config name

    :raises KeyError: if no config with that name has been loaded
    """
    return cached_source_files[config_name]


def load(path):
    """
    :param path: config path
    :returns dict of {name -> loaded config file}
        from all liminal.y[a]ml files under given path
    """
    # A non-empty earlier result for this path is reused.
    if cached_files.get(path):
        return cached_files[path]

    config_entities = {}

    for file_data in find_config_files(path):
        with open(file_data) as data:
            config_file = yaml.safe_load(data)
            # configs are indexed by their 'name' entry
            config_entities[config_file['name']] = config_file
            cached_source_files[config_file['name']] = file_data

    cached_files[path] = config_entities
    return config_entities


def find_config_files(path):
    """
    :param path: config path
    :returns list of all liminal.y[a]ml files under config path
    """
    files = []
    logging.info(path)
    for r, d, f in os.walk(path):
        for file in f:
            # os.walk already yields bare file names
            if file in ['liminal.yml', 'liminal.yaml']:
                logging.info(os.path.join(r, file))
                files.append(os.path.join(r, file))
    return files


def dump_liminal_configs(liminal_configs, path):
    """
    Write every config to ``<path>/<config name>.yml``.

    :param liminal_configs: iterable of config dicts, each with a 'name'
    :param path: target directory; created (with parents) when missing
    """
    # makedirs also creates missing parents and tolerates the directory
    # already existing, unlike an exists() check followed by mkdir().
    os.makedirs(path, exist_ok=True)

    logging.info(f"Starting to dump liminal configs into {path}")

    for liminal_config in liminal_configs:
        dump_liminal_config(liminal_config, f'{path}/{liminal_config["name"]}.yml')


def dump_liminal_config(liminal_config, file_path):
    """Write one config dict to ``file_path`` as block-style YAML."""
    with open(file_path, 'w') as config_file:
        logging.info(f"Dumping {liminal_config['name']} into {file_path}")
        yaml.dump(liminal_config, config_file, default_flow_style=False)


# ================================================
# FILE: liminal/docker/__init__.py
# ================================================
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# ================================================
# FILE: liminal/kubernetes/__init__.py
# ================================================
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.
See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: liminal/kubernetes/secret_util.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
import base64 import logging import os import sys from pathlib import Path from time import sleep from kubernetes import client, config from kubernetes.client import V1Secret # noinspection PyBroadException try: config.load_kube_config() except Exception: msg = "Kubernetes is not running\n" sys.stdout.write(f"INFO: {msg}") _LOG = logging.getLogger('volume_util') _LOCAL_VOLUMES = set() _kubernetes = client.CoreV1Api() def get_secret_configs(liminal_config): secrets_config = liminal_config.get('secrets', []) for volume_config in secrets_config: if 'secret' in volume_config and 'local_path_file' not in volume_config: secret_path = f"{os.getcwd()}/credentials-{volume_config['secret']}.txt" open(secret_path, 'a').close() volume_config['local_path_file'] = secret_path return secrets_config def create_local_secrets(liminal_config): secrets_config = get_secret_configs(liminal_config) for secret_config in secrets_config: logging.info(f'Creating local kubernetes secret if needed: {secret_config}') create_secret(secret_config) def create_secret(conf, namespace='default') -> None: name = conf['secret'] _LOG.info(f'Requested secret {name}') if name not in _LOCAL_VOLUMES: matching_secrets = _kubernetes.list_namespaced_secret( namespace, field_selector=f'metadata.name={name}' ).to_dict()['items'] if len(matching_secrets) == 0: _create_secret(namespace, conf, name) sleep(5) _LOCAL_VOLUMES.add(name) def _create_secret(namespace, conf, name): _LOG.info(f'Creating kubernetes secret {name} with spec {conf}') _kubernetes.create_namespaced_secret( namespace, V1Secret( api_version='v1', kind='Secret', metadata={ 'name': name, 'labels': {"apache/incubator-liminal": "liminal.apache.org"}, }, data={ 'credentials': base64.b64encode( Path(os.path.expanduser(conf['local_path_file'])).read_text().encode('ascii') ).decode('ascii') }, ), ) def delete_local_secrets(liminal_config, base_dir): secrets_config = get_secret_configs(liminal_config, base_dir) for secret_config in secrets_config: 
logging.info(f'Delete local secret if needed: {secret_config}') delete_local_secret(secret_config) def delete_local_secret(name, namespace='default'): matching_secrets = _kubernetes.list_namespaced_secret(namespace, field_selector=f'metadata.name={name}').to_dict()[ 'items' ] if len(matching_secrets) > 0: _LOG.info(f'Deleting secret {name}') _kubernetes.delete_namespaced_secret(name, namespace) if name in _LOCAL_VOLUMES: _LOCAL_VOLUMES.remove(name) ================================================ FILE: liminal/kubernetes/volume_util.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
import logging import os import sys from time import sleep from kubernetes import client, config from kubernetes.client import V1PersistentVolume, V1PersistentVolumeClaim # noinspection PyBroadException try: config.load_kube_config() except Exception: msg = "Kubernetes is not running\n" sys.stdout.write(f"INFO: {msg}") _LOG = logging.getLogger('volume_util') _LOCAL_VOLUMES = set() _kubernetes = client.CoreV1Api() def get_volume_configs(liminal_config, base_dir): volumes_config = liminal_config.get('volumes', []) for volume_config in volumes_config: if 'local' in volume_config: path = volume_config['local']['path'] if path.startswith(".."): path = os.path.join(base_dir, path) if path.startswith("."): path = os.path.join(base_dir, path[1:]) volume_config['local']['path'] = path return volumes_config def create_local_volumes(liminal_config, base_dir): volumes_config = get_volume_configs(liminal_config, base_dir) for volume_config in volumes_config: logging.info(f'Creating local kubernetes volume if needed: {volume_config}') create_local_volume(volume_config) def create_local_volume(conf, namespace='default') -> None: name = conf['volume'] _LOG.info(f'Requested volume {name}') if name not in _LOCAL_VOLUMES: matching_volumes = _kubernetes.list_persistent_volume(field_selector=f'metadata.name={name}').to_dict()[ 'items' ] while len(matching_volumes) == 0: _create_local_volume(conf, name) sleep(5) matching_volumes = _kubernetes.list_persistent_volume(field_selector=f'metadata.name={name}').to_dict()[ 'items' ] pvc_name = conf.get('claim_name', f'{name}-pvc') matching_claims = _kubernetes.list_persistent_volume_claim_for_all_namespaces( field_selector=f'metadata.name={pvc_name}' ).to_dict()['items'] while len(matching_claims) == 0: _create_persistent_volume_claim(pvc_name, name, namespace) sleep(5) matching_claims = _kubernetes.list_persistent_volume_claim_for_all_namespaces( field_selector=f'metadata.name={pvc_name}' ).to_dict()['items'] _LOCAL_VOLUMES.add(name) def 
delete_local_volumes(liminal_config, base_dir): volumes_config = get_volume_configs(liminal_config, base_dir) for volume_config in volumes_config: logging.info(f'Delete local kubernetes volume if needed: {volume_config}') delete_local_volume(volume_config['volume']) def delete_local_volume(name, namespace='default'): pvc_name = f'{name}-pvc' matching_claims = _list_persistent_volume_claims(pvc_name) if len(matching_claims) > 0: _LOG.info(f'Deleting persistent volume claim {pvc_name}') _kubernetes.delete_namespaced_persistent_volume_claim(pvc_name, namespace) while len(matching_claims) > 0: matching_claims = _list_persistent_volume_claims(pvc_name) matching_volumes = _list_persistent_volumes(name) if len(matching_volumes) > 0: _LOG.info(f'Deleting persistent volume {name}') _kubernetes.delete_persistent_volume(name) while len(matching_volumes) > 0: matching_volumes = _list_persistent_volumes(name) if name in _LOCAL_VOLUMES: _LOCAL_VOLUMES.remove(name) def _list_persistent_volume_claims(name): return _kubernetes.list_persistent_volume_claim_for_all_namespaces( field_selector=f'metadata.name={name}' ).to_dict()['items'] def _list_persistent_volumes(name): return _kubernetes.list_persistent_volume(field_selector=f'metadata.name={name}').to_dict()['items'] def _create_persistent_volume_claim(pvc_name, volume_name, namespace): _LOG.info(f'Creating persistent volume claim {pvc_name} with volume {volume_name}') spec = { 'volumeName': volume_name, 'volumeMode': 'Filesystem', 'storageClassName': 'local-storage', 'accessModes': ['ReadWriteOnce'], 'resources': {'requests': {'storage': '100Gi'}}, } _kubernetes.create_namespaced_persistent_volume_claim( namespace, V1PersistentVolumeClaim( api_version='v1', kind='PersistentVolumeClaim', metadata={'name': pvc_name}, spec=spec ), ) def _create_local_volume(conf, name): _LOG.info(f'Creating persistent volume {name} with spec {conf}') spec = { 'capacity': {'storage': '100Gi'}, 'volumeMode': 'Filesystem', 'accessModes': 
['ReadWriteOnce'], 'persistentVolumeReclaimPolicy': 'Retain', 'storageClassName': 'local-storage', 'nodeAffinity': { 'required': { 'nodeSelectorTerms': [ {'matchExpressions': [{'key': 'kubernetes.io/hostname', 'operator': 'NotIn', 'values': ['']}]} ] } }, } spec.update(conf) _kubernetes.create_persistent_volume( V1PersistentVolume(api_version='v1', kind='PersistentVolume', metadata={'name': name}, spec=spec) ) ================================================ FILE: liminal/logging/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: liminal/logging/logging_setup.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. import logging import os from logging.handlers import RotatingFileHandler from liminal.core import environment LOGS_DIR = 'logs' LOG_FILENAME = 'liminal.log' MAX_FILE_SIZE = 10485760 # 10 MB def logging_initialization(): root_logger = logging.getLogger() log_formatter = logging.Formatter( '[%(asctime)s] [%(filename)s:%(lineno)d] %(levelname)s - %(message)s', '%m-%d %H:%M:%S' ) logs_dir = os.path.join(environment.get_liminal_home(), LOGS_DIR) os.makedirs(logs_dir, exist_ok=True) file_handler = RotatingFileHandler(os.path.join(logs_dir, LOG_FILENAME), maxBytes=MAX_FILE_SIZE, backupCount=3) root_logger.addHandler(file_handler) root_logger.setLevel(logging.INFO) [h.setFormatter(log_formatter) for h in root_logger.handlers] logging.info('Logging initialization completed') ================================================ FILE: liminal/monitoring/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. 
See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: liminal/runners/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: liminal/runners/airflow/__init__.py ================================================ ## # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
from datetime import datetime from unittest.mock import MagicMock import pytz class DummyDagRun: def __init__(self) -> None: self.start_date = pytz.utc.localize(datetime.utcnow()) self.conf = None self.run_id = "run_id" @staticmethod def get_task_instances(): return [] def get_task_instance(self, _): return self from datetime import datetime TASK_ID_SEPARATOR = '.' class DummyDag: def __init__(self, dag_id, task_id): self.dag_id = dag_id self.task_id = task_id self.try_number = 0 self.is_subdag = False self.execution_date = '2017-05-21T00:00:00' self.dag_run_id = 'dag_run_id' self.owner = ['owner1', 'owner2'] self.email = ['email1@test.com'] self.task = MagicMock(name='task', owner=self.owner, email=self.email) self.context = { 'dag': self, 'task': self, 'ti': self, 'ts': datetime.now().timestamp(), 'dag_run': DummyDagRun(), } def get_dagrun(self): return self.context['dag_run'] @staticmethod def xcom_push(key, value): return key, value ================================================ FILE: liminal/runners/airflow/config/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
================================================ FILE: liminal/runners/airflow/config/standalone_variable_backend.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. import logging import os from os import environ from sqlite3 import OperationalError from airflow.models import Variable LIMINAL_STAND_ALONE_MODE_KEY = "LIMINAL_STAND_ALONE_MODE" # noinspection PyBroadException def get_variable(key, default_val): if liminal_local_mode(): return os.environ.get(key, default_val) else: try: return Variable.get(key, default_var=default_val) except OperationalError as e: logging.warning( f'Failed to find variable {key} in Airflow variables table.' f' Error: {e.__class__.__module__}.{e.__class__.__name__}' ) except Exception as e: logging.warning(f'Failed to find variable {key} in Airflow variables table. 
Error: {e}') return default_val def liminal_local_mode(): stand_alone = environ.get(LIMINAL_STAND_ALONE_MODE_KEY, "False") return stand_alone.strip().lower() == "true" ================================================ FILE: liminal/runners/airflow/dag/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. import os from liminal.core import environment as env from liminal.runners.airflow.dag import liminal_register_dags BASE_PATH = os.path.join(env.get_airflow_home_dir(), env.DEFAULT_PIPELINES_SUBDIR) def register_dags(): return liminal_register_dags.register_dags(BASE_PATH) ================================================ FILE: liminal/runners/airflow/dag/liminal_dags.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. 
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

import traceback

# NOTE(review): DAG is not referenced below; presumably the import is kept so Airflow's
# DAG-file detection picks this module up - confirm before removing.
from airflow import DAG

import liminal.runners.airflow.dag as liminal

# (pipeline name, DAG) pairs produced by liminal's registration.
pipelines = liminal.register_dags()

for pipeline, dag in pipelines:
    try:
        # Publish each DAG as a module-level global under its pipeline name.
        globals()[pipeline] = dag
    except Exception:
        # A failure for one pipeline is printed and does not stop the remaining ones.
        traceback.print_exc()


================================================
FILE: liminal/runners/airflow/dag/liminal_register_dags.py
================================================
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
import logging
import os
import traceback
from datetime import datetime, timedelta

from airflow import DAG

from liminal.core.config.config import ConfigUtil
from liminal.core.util import extensible
from liminal.runners.airflow.executors import airflow

__DEPENDS_ON_PAST = 'depends_on_past'


# noinspection PyBroadException
def register_dags(configs_path):
    """
    Registers pipelines in liminal yml files found in given path (recursively) as airflow DAGs.

    :param configs_path: directory handed to ConfigUtil to load liminal configs from.
    :return: list of `(pipeline name, DAG)` tuples. A config that fails to register is
        logged, its traceback printed, and the remaining configs are still processed.
    """
    logging.info(f'Registering DAGs from path: {configs_path}')
    config_util = ConfigUtil(configs_path)
    configs = config_util.safe_load(is_render_variables=False)

    # The snapshot is skipped only when POD_NAMESPACE is "jenkins".
    if os.getenv('POD_NAMESPACE') != "jenkins":
        config_util.snapshot_final_liminal_configs()

    dags = []
    logging.info(f'found {len(configs)} liminal configs in path: {configs_path}')
    for config in configs:
        name = config['name'] if 'name' in config else None
        try:
            if not name:
                raise ValueError('liminal.yml missing field `name`')

            logging.info(f"Registering DAGs for {name}")

            owner = config.get('owner')

            # `always_run` makes each task run once its upstream is done, whatever the outcome.
            trigger_rule = 'all_success'
            if 'always_run' in config and config['always_run']:
                trigger_rule = 'all_done'

            executors = __initialize_executors(config)

            default_executor = airflow.AirflowExecutor("default_executor", liminal_config=config, executor_config={})

            for pipeline in config['pipelines']:
                default_args = __default_args(pipeline)
                dag = __initialize_dag(default_args, pipeline, owner)

                # Tasks are chained in declaration order: each task receives what the
                # executor returned for the previous one as its parent.
                parent = None

                for task in pipeline['tasks']:
                    task_type = task['type']
                    task_instance = get_task_class(task_type)(
                        task_id=task['task'],
                        dag=dag,
                        parent=parent,
                        trigger_rule=trigger_rule,
                        liminal_config=config,
                        pipeline_config=pipeline,
                        task_config=task,
                        variables=config.get('variables', {}),
                    )

                    executor_id = task.get('executor')
                    if executor_id:
                        executor = executors[executor_id]
                    else:
                        # The message used JS-style `${...}`, which printed a stray `$`.
                        logging.info(
                            f"Did not find `executor` in {task['task']} config."
                            f" Using the default executor ({type(default_executor)})"
                            f" instead."
                        )
                        executor = default_executor

                    parent = executor.apply_task_to_dag(task=task_instance)

                logging.info(f'registered DAG {dag.dag_id}: {dag.tasks}')

                dags.append((pipeline['pipeline'], dag))

        except Exception:
            logging.error(f'Failed to register DAGs for {name}')
            traceback.print_exc()

    return dags


def __initialize_executors(liminal_config):
    """
    Builds one executor per entry of the config's `executors` section, keyed by the
    entry's `executor` id.
    """
    executors = {}
    for executor_config in liminal_config.get('executors', {}):
        executors[executor_config['executor']] = get_executor_class(executor_config['type'])(
            executor_config['executor'], liminal_config, executor_config
        )

    return executors


def __initialize_dag(default_args, pipeline, owner):
    """
    Creates the airflow DAG for `pipeline`.

    `default_args` is modified in place: `owner` is added when missing, and the keys
    `tasks`, `schedule`, `monitoring` and `schedule_interval` are removed before it is
    passed to the DAG.
    """
    pipeline_name = pipeline['pipeline']

    # `schedule_interval` wins; `schedule` is used when it is absent or empty.
    schedule_interval = default_args.get('schedule_interval', None)
    if not schedule_interval:
        schedule_interval = default_args.get('schedule', None)

    if owner and 'owner' not in default_args:
        default_args['owner'] = owner

    # NOTE(review): the fallback is a `time` object, which datetime.combine() below does
    # not accept as its first argument; within this module __default_args() already
    # requires `start_date`, so the fallback is not reached from register_dags().
    start_date = pipeline.get('start_date', datetime.min.time())
    if not isinstance(start_date, datetime):
        start_date = datetime.combine(start_date, datetime.min.time())

    default_args.pop('tasks', None)
    default_args.pop('schedule', None)
    default_args.pop('monitoring', None)
    default_args.pop('schedule_interval', None)

    dag = DAG(
        dag_id=pipeline_name,
        default_args=default_args,
        dagrun_timeout=timedelta(minutes=pipeline['timeout_minutes']),
        start_date=start_date,
        schedule_interval=schedule_interval,
        catchup=False,
    )

    return dag


def __default_args(pipeline):
    """
    Returns a copy of the pipeline config to use as DAG default args, with `start_date`
    combined with midnight and `depends_on_past` defaulting to False.

    Raises KeyError when the pipeline has no `start_date`.
    """
    default_args = {k: v for k, v in pipeline.items()}
    override_args = {
        'start_date': datetime.combine(pipeline['start_date'], datetime.min.time()),
        __DEPENDS_ON_PAST: default_args[__DEPENDS_ON_PAST] if __DEPENDS_ON_PAST in default_args else False,
    }
    default_args.update(override_args)
    return default_args


# Task and executor implementations are loaded once, when this module is imported.
logging.info(f'Loading task implementations..')
tasks_by_liminal_name = extensible.load_tasks()
logging.info(f'Finished loading task implementations: {tasks_by_liminal_name.keys()}')
executors_by_liminal_name = extensible.load_executors()
logging.info(f'Finished loading executor implementations: {executors_by_liminal_name.keys()}') def get_task_class(task_type): return tasks_by_liminal_name[task_type] def get_executor_class(executor_type): return executors_by_liminal_name[executor_type] ================================================ FILE: liminal/runners/airflow/executors/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: liminal/runners/airflow/executors/airflow.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. from liminal.runners.airflow.model import executor from liminal.runners.airflow.tasks import airflow class AirflowExecutor(executor.Executor): """ Execute the task steps """ supported_task_types = [airflow.AirflowTask] def _apply_executor_task_to_dag(self, **kwargs): return kwargs['task'].apply_task_to_dag() ================================================ FILE: liminal/runners/airflow/executors/emr.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
class EMRExecutor(executor.Executor):
    """
    Executes a task as a step on an EMR cluster.

    An ``EmrAddStepsOperator`` submits the step and an ``EmrStepSensor``
    watches it; the sensor is the operator returned to the caller.
    """

    supported_task_types = [hadoop.HadoopTask]

    def __init__(self, executor_id, liminal_config, executor_config):
        super().__init__(executor_id, liminal_config, executor_config)

        settings = self.executor_config
        self.aws_conn_id = settings.get('aws_conn_id', 'aws_default')
        # NOTE: the config key is singular ('cluster_state'), the attribute plural.
        self.cluster_states = settings.get('cluster_state', ['RUNNING', 'WAITING'])
        self.job_flow_id = settings.get('cluster_id', None)
        self.job_flow_name = settings.get('cluster_name', None)

    def _apply_executor_task_to_dag(self, **kwargs):
        """
        Add an "add step" operator and a "watch step" sensor for the task.

        :param kwargs: keyword arguments; ``task`` is required
        :returns: the sensor operator watching the submitted step
        """
        task = kwargs['task']
        parent = task.parent

        self._validate_task_type(task)

        # assuming emr already exists
        command_args = [str(part) for part in task.get_runnable_command()]
        submit_operator = EmrAddStepsOperator(
            task_id=f'{task.task_id}_add_step',
            job_flow_id=self.job_flow_id,
            job_flow_name=self.job_flow_name,
            aws_conn_id=self.aws_conn_id,
            steps=self.__generate_emr_step(task.task_id, command_args),
            cluster_states=self.cluster_states,
        )
        add_step = executor.add_variables_to_operator(submit_operator, task)

        if task.parent:
            parent.set_downstream(add_step)

        # Both templates pull values pushed to XCom by the add-step operator.
        xcom_prefix = "{{ task_instance.xcom_pull('" + add_step.task_id
        watch_operator = EmrStepSensor(
            task_id=f'{task.task_id}_watch_step',
            job_flow_id=xcom_prefix + "', key='job_flow_id') }}",
            step_id=xcom_prefix + "', key='return_value')[0] }}",
            aws_conn_id=self.aws_conn_id,
        )
        emr_sensor_step = executor.add_variables_to_operator(watch_operator, task)

        add_step.set_downstream(emr_sensor_step)

        return emr_sensor_step

    def __generate_emr_step(self, task_id, args):
        """
        Build the single-element step list passed to ``EmrAddStepsOperator``.

        Entries from the executor's ``properties`` config may override ``Name``
        but never ``HadoopJarStep``, which is always set last.
        """
        step = {'Name': task_id}
        step.update(self.executor_config.get('properties', {}))
        step['HadoopJarStep'] = {'Jar': 'command-runner.jar', 'Args': args}
        return [step]
_LOG = logging.getLogger(__name__)


class KubernetesPodExecutor(executor.Executor):
    """
    Kubernetes pod executor: runs a container task through a
    ``KubernetesPodOperator``.
    """

    supported_task_types = [containerable.ContainerTask]

    def __init__(self, task_id, liminal_config, executor_config):
        super().__init__(task_id, liminal_config, executor_config)

        self.task_name = self.executor_config['executor']
        self.volumes = self._volumes()

    def _apply_executor_task_to_dag(self, **kwargs):
        """
        Create the pod operator for the task and chain it after the parent.

        :param kwargs: keyword arguments; ``task`` is required
        :returns: the wrapped pod operator
        """
        task = kwargs['task']
        parent = task.parent

        self._validate_task_type(task)

        pod_task = executor.add_variables_to_operator(
            KubernetesPodOperator(trigger_rule=task.trigger_rule, **self.__kubernetes_kwargs(task)), task
        )

        if parent:
            parent.set_downstream(pod_task)

        return pod_task

    def _volumes(self):
        """
        Build the pod volume list from the liminal ``volumes`` and ``secrets``
        configuration sections.

        :returns: list of ``V1Volume``; claim-backed volumes first, then secrets
        """
        volumes_config = self.liminal_config.get('volumes', [])
        secrets_config = self.liminal_config.get('secrets', [])
        volumes = []
        for volume_config in volumes_config:
            name = volume_config['volume']
            claim_name = volume_config.get('claim_name')
            # Local volumes without an explicit claim use the '<name>-pvc' claim.
            if not claim_name and 'local' in volume_config:
                claim_name = f'{name}-pvc'
            volume = V1Volume(name=name, persistent_volume_claim={'claimName': claim_name})
            volumes.append(volume)
        for secret_config in secrets_config:
            name = secret_config['secret']
            secret = V1Volume(name=name, secret={'secretName': name})
            volumes.append(secret)
        return volumes

    @staticmethod
    def __env_flag(name, default=False):
        """
        Read a boolean flag from the environment.

        Environment values are strings, so a raw ``os.environ.get`` result such
        as ``"False"`` is truthy. Only the usual "true" spellings count as True.

        :param name: environment variable name
        :param default: value returned when the variable is not set
        :returns: bool
        """
        raw = os.environ.get(name)
        if raw is None:
            return default
        return raw.strip().lower() in ('1', 't', 'true', 'y', 'yes', 'on')

    def __kubernetes_kwargs(self, task: ContainerTask):
        """
        Assemble the keyword arguments for ``KubernetesPodOperator``.

        Executor config keys that are not consumed or dropped here are passed
        through to the operator and override the defaults computed below.

        :param task: the container task being scheduled
        :returns: dict of operator keyword arguments
        """
        # Work on a copy: keys are popped below and the executor is reused.
        config = copy.deepcopy(self.executor_config)
        kubernetes_kwargs = {
            'task_id': task.task_id,
            'image': task.image,
            'arguments': task.arguments,
            'namespace': os.environ.get('AIRFLOW__KUBERNETES__NAMESPACE', 'default'),
            'name': task.task_id.replace('_', '-'),
            # Parsed as a boolean: the raw string "False" would be truthy.
            'in_cluster': self.__env_flag('AIRFLOW__KUBERNETES__IN_CLUSTER', False),
            'image_pull_policy': get_variable('image_pull_policy', default_val='IfNotPresent'),
            'get_logs': config.pop('get_logs', True),
            'is_delete_operator_pod': config.pop('is_delete_operator_pod', True),
            'startup_timeout_seconds': config.pop('startup_timeout_seconds', 1200),
            'env_vars': [k8s.V1EnvVar(name=x, value=v) for x, v in task.env_vars.items()],
            'do_xcom_push': task.task_config.get('do_xcom_push', False),
            'image_pull_secrets': config.pop('image_pull_secrets', 'regcred'),
            'volumes': self.volumes,
            'config_file': os.environ.get('AIRFLOW__KUBERNETES__CONFIG_FILE'),
            'cluster_context': os.environ.get('AIRFLOW__KUBERNETES__CLUSTER_CONTEXT', None),
            'cmds': task.cmds,
            'volume_mounts': [
                V1VolumeMount(
                    name=mount['volume'],
                    mount_path=mount['path'],
                    sub_path=mount.get('sub_path'),
                    read_only=mount.get('read_only', False),
                )
                for mount in task.mounts
            ]
            + [
                V1VolumeMount(
                    name=secret['secret'],
                    mount_path=secret['remote_path'],
                    sub_path=secret.get('sub_path'),
                    read_only=secret.get('read_only', False),
                )
                for secret in task.secrets
            ],
        }

        # These keys are computed above or are liminal-only; never forward them.
        config.pop('in_cluster', None)
        config.pop('volumes', None)
        config.pop('volume_mounts', None)
        config.pop('executor', None)
        config.pop('type', None)

        kubernetes_kwargs.update(config)

        if env_util.is_running_on_jenkins():
            kubernetes_kwargs['affinity'] = self.__jenkins_kubernetes_affinity()
            kubernetes_kwargs['namespace'] = 'jenkins'

        if not task.dag:
            kubernetes_kwargs.update(
                {
                    'start_date': datetime.datetime(1970, 1, 1),
                }
            )

        return kubernetes_kwargs

    @staticmethod
    def __jenkins_kubernetes_affinity():
        """
        Pod affinity used on Jenkins: requires a node that already hosts a pod
        labelled ``liminal=unittest`` in the ``jenkins`` namespace.
        """
        return {
            "podAffinity": {
                "requiredDuringSchedulingIgnoredDuringExecution": [
                    {
                        "labelSelector": {
                            "matchExpressions": [{"key": "liminal", "operator": "In", "values": ["unittest"]}]
                        },
                        "namespaces": ["jenkins"],
                        "topologyKey": "kubernetes.io/hostname",
                    }
                ]
            }
        }
def add_variables_to_operator(operator, task) -> BaseOperator:
    """
    Wrap an operator so liminal variables are resolved when it runs.

    :param operator: Airflow operator
    :type operator: BaseOperator
    :param task: Task instance
    :returns: OperatorWithVariableResolving wrapping given operator
    """
    wrapper_kwargs = dict(
        dag=task.dag,
        task_config=task.task_config,
        variables=task.variables,
        liminal_task_instance=task,
        operator=operator,
    )
    return OperatorWithVariableResolving(**wrapper_kwargs)


class Executor(ABC):
    """
    Base class for executors, which apply tasks to a DAG.
    """

    # list of task types supported by the executor
    supported_task_types = []

    def __init__(self, executor_id, liminal_config, executor_config):
        self.executor_id = executor_id
        self.executor_config = executor_config
        self.liminal_config = liminal_config

    def apply_task_to_dag(self, **kwargs):
        """
        Validate the task type, then delegate to the concrete executor.

        :param kwargs: keyword arguments; ``task`` is required
        :returns: the value returned by ``_apply_executor_task_to_dag``
        """
        candidate = kwargs['task']
        self._validate_task_type(candidate)
        return self._apply_executor_task_to_dag(task=candidate)

    @abstractmethod
    def _apply_executor_task_to_dag(self, **kwargs):
        """Executor-specific logic; implemented by subclasses."""

    def _validate_task_type(self, task):
        """Assert that ``task`` is an instance of a supported task type."""
        is_supported = any(isinstance(task, task_type) for task_type in self.supported_task_types)
        assert is_supported, f'supported task types: {self.supported_task_types}'


class Task(ABC):
    """
    Base task: holds the configuration and DAG wiring of a pipeline task.
    """

    def __init__(
        self, task_id, dag, parent, trigger_rule, liminal_config, pipeline_config, task_config, variables=None
    ):
        self.task_id = task_id
        self.dag = dag
        self.parent = parent
        self.trigger_rule = trigger_rule
        self.liminal_config = liminal_config
        self.pipeline_config = pipeline_config
        self.task_config = task_config
        self.variables = variables

    def serialize(self) -> str:
        """
        :returns: JSON string representation of this task
        """
        # 'dag' is always emitted as null; values json cannot encode go through str().
        fields = (
            ('task_id', self.task_id),
            ('dag', None),
            ('parent', self.parent),
            ('trigger_rule', self.trigger_rule),
            ('liminal_config', self.liminal_config),
            ('pipeline_config', self.pipeline_config),
            ('task_config', self.task_config),
            ('variables', self.variables),
        )
        return json.dumps(dict(fields), default=str)

    def _add_variables_to_operator(self, operator) -> BaseOperator:
        """
        :param operator: Airflow operator
        :type operator: BaseOperator
        :returns: OperatorWithVariableResolving wrapping given operator
        """
        return executor.add_variables_to_operator(operator, self)
# noinspection PyAbstractClass
class CloudFormationHook(AwsBaseHook):
    """
    Hook for AWS CloudFormation.
    """

    def __init__(self, region_name=None, *args, **kwargs):
        self.region_name = region_name
        self.conn = None
        super().__init__(*args, client_type='cloudformation', **kwargs)

    def get_conn(self):
        """Create a CloudFormation client, remember it on the hook and return it."""
        client = self.get_client_type('cloudformation', self.region_name)
        self.conn = client
        return client


class BaseCloudFormationOperator(BaseOperator):
    """
    Base operator for CloudFormation operations.

    :param params: parameters to be passed to CloudFormation.
    :type params: dict
    :param aws_conn_id: aws connection to use
    :type aws_conn_id: str
    """

    template_fields: List[str] = []
    template_ext = ()
    ui_color = '#1d472b'
    ui_fgcolor = '#FFF'

    def __init__(self, params, aws_conn_id='aws_default', *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.params = params
        self.aws_conn_id = aws_conn_id

    def execute(self, context):
        """Log the parameters and run ``cloudformation_op`` with a new client."""
        self.log.info('Parameters: %s', self.params)

        hook = CloudFormationHook(aws_conn_id=self.aws_conn_id)
        self.cloudformation_op(hook.get_conn())

    def cloudformation_op(self, cloudformation):
        """
        The CloudFormation operation itself; subclasses must override it.

        :param cloudformation: client returned by ``CloudFormationHook.get_conn``
        """
        raise NotImplementedError()


class CloudFormationCreateStackOperator(BaseCloudFormationOperator):
    """
    An operator that creates a CloudFormation stack.

    :param params: parameters to be passed to CloudFormation. For possible arguments see:
            https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/cloudformation.html#CloudFormation.Client.create_stack
    :type params: dict
    :param aws_conn_id: aws connection to use
    :type aws_conn_id: str
    """

    template_fields: List[str] = []
    template_ext = ()
    ui_color = '#6b9659'

    def __init__(self, params, aws_conn_id='aws_default', *args, **kwargs):
        super().__init__(params=params, aws_conn_id=aws_conn_id, *args, **kwargs)

    def cloudformation_op(self, cloudformation):
        """Call ``create_stack`` with the operator's params as keyword arguments."""
        cloudformation.create_stack(**self.params)


class CloudFormationDeleteStackOperator(BaseCloudFormationOperator):
    """
    An operator that deletes a CloudFormation stack.

    :param params: parameters to be passed to CloudFormation. For possible arguments see:
            https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/cloudformation.html#CloudFormation.Client.delete_stack
    :type params: dict
    :param aws_conn_id: aws connection to use
    :type aws_conn_id: str
    """

    template_fields: List[str] = []
    template_ext = ()
    ui_color = '#1d472b'
    ui_fgcolor = '#FFF'

    def __init__(self, params, aws_conn_id='aws_default', *args, **kwargs):
        super().__init__(params=params, aws_conn_id=aws_conn_id, *args, **kwargs)

    def cloudformation_op(self, cloudformation):
        """Call ``delete_stack`` with the operator's params as keyword arguments."""
        cloudformation.delete_stack(**self.params)


class BaseCloudFormationSensor(BaseSensorOperator):
    """
    Waits for a stack operation to complete on AWS CloudFormation.

    :param stack_name: The name of the stack to wait for (templated)
    :type stack_name: str
    :param complete_status: stack status that means the operation is done
    :param in_progress_status: stack status that means the operation is still running
    :param aws_conn_id: ID of the Airflow connection where credentials and extra configuration are
        stored
    :type aws_conn_id: str
    :param poke_interval: Time in seconds that the job should wait between each try
    :type poke_interval: int
    """

    def __init__(
        self,
        stack_name,
        complete_status,
        in_progress_status,
        aws_conn_id='aws_default',
        poke_interval=30,
        *args,
        **kwargs,
    ):
        super().__init__(poke_interval=poke_interval, *args, **kwargs)
        self.stack_name = stack_name
        self.complete_status = complete_status
        self.in_progress_status = in_progress_status
        self.aws_conn_id = aws_conn_id
        self.hook = None

    def poke(self, context):
        """
        Check the current status of the stack.

        :returns: True when the stack reached ``complete_status`` (or does not
            exist and that is allowed), False while it is in ``in_progress_status``
        :raises ValueError: for any other status, or a missing stack when not allowed
        """
        cloudformation = self.get_hook().get_conn()

        self.log.info('Poking for stack %s', self.stack_name)

        try:
            stacks = cloudformation.describe_stacks(StackName=self.stack_name)['Stacks']
            stack_status = stacks[0]['StackStatus']
        except ClientError as error:
            # Only a "does not exist" error is interpreted; the rest propagate.
            if 'does not exist' not in str(error):
                raise
            if self.allow_non_existing_stack_status():
                return True
            raise ValueError(f'Stack {self.stack_name} does not exist')

        if stack_status == self.complete_status:
            return True
        if stack_status == self.in_progress_status:
            return False
        raise ValueError(f'Stack {self.stack_name} in bad state: {stack_status}')

    def get_hook(self):
        """
        Gets the CloudFormationHook, creating it on first use
        """
        if self.hook:
            return self.hook
        self.hook = CloudFormationHook(aws_conn_id=self.aws_conn_id)
        return self.hook

    def allow_non_existing_stack_status(self):
        """
        Whether a "stack does not exist" response counts as completion.
        False here; subclasses may override.
        """
        return False


class CloudFormationCreateStackSensor(BaseCloudFormationSensor):
    """
    Waits for a stack to be created successfully on AWS CloudFormation.

    :param stack_name: The name of the stack to wait for (templated)
    :type stack_name: str
    :param aws_conn_id: ID of the Airflow connection where credentials and extra configuration are
        stored
    :type aws_conn_id: str
    :param poke_interval: Time in seconds that the job should wait between each try
    :type poke_interval: int
    """

    template_fields = ['stack_name']
    ui_color = '#C5CAE9'

    def __init__(self, stack_name, aws_conn_id='aws_default', poke_interval=30, *args, **kwargs):
        super().__init__(
            stack_name=stack_name,
            complete_status='CREATE_COMPLETE',
            in_progress_status='CREATE_IN_PROGRESS',
            aws_conn_id=aws_conn_id,
            poke_interval=poke_interval,
            *args,
            **kwargs,
        )


class CloudFormationDeleteStackSensor(BaseCloudFormationSensor):
    """
    Waits for a stack to be deleted successfully on AWS CloudFormation.

    :param stack_name: The name of the stack to wait for (templated)
    :type stack_name: str
    :param aws_conn_id: ID of the Airflow connection where credentials and extra configuration are
        stored
    :type aws_conn_id: str
    :param poke_interval: Time in seconds that the job should wait between each try
    :type poke_interval: int
    """

    template_fields = ['stack_name']
    ui_color = '#C5CAE9'

    def __init__(self, stack_name, aws_conn_id='aws_default', poke_interval=30, *args, **kwargs):
        super().__init__(
            stack_name=stack_name,
            complete_status='DELETE_COMPLETE',
            in_progress_status='DELETE_IN_PROGRESS',
            aws_conn_id=aws_conn_id,
            poke_interval=poke_interval,
            *args,
            **kwargs,
        )

    def allow_non_existing_stack_status(self):
        """A stack that no longer exists counts as deleted."""
        return True
class JobStatusOperator(BaseOperator):
    """
    Base operator for job status operators.

    Subclasses implement ``metrics``; ``execute`` reports every metric to each
    configured backend.
    """

    template_ext = ()

    @apply_defaults
    def __init__(self, backends, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.backends = backends
        self.cloudwatch = None

    def execute(self, context):
        """
        Report all metrics to all configured backends.

        :raises AirflowException: when a backend has no report function
        """
        for backend in self.backends:
            if backend in self.report_functions:
                for metric in self.metrics(context):
                    self.report_functions[backend](self, metric)
            else:
                raise AirflowException(f'No such metrics backend: {backend}')

    def metrics(self, context):
        """Return the list of ``Metric`` objects to report; subclasses override."""
        raise NotImplementedError

    def send_metric_to_cloudwatch(self, metric):
        """Publish one metric through the CloudWatch hook."""
        self.get_cloudwatch().put_metric_data(metric)

    # Maps backend name -> plain function taking (operator, metric).
    # Must be defined after the functions it references.
    report_functions = {'cloudwatch': send_metric_to_cloudwatch}

    def get_cloudwatch(self):
        """Return the CloudWatch hook, creating it on first use."""
        if not self.cloudwatch:
            self.cloudwatch = CloudWatchHook()
        return self.cloudwatch


class JobStartOperator(JobStatusOperator):
    """Reports a ``JobStarted`` metric with value 1."""

    ui_color = '#c5e5e8'

    def __init__(self, namespace, application_name, backends, *args, **kwargs):
        super().__init__(backends=backends, *args, **kwargs)
        self.namespace = namespace
        self.application_name = application_name

    def metrics(self, context):
        return [Metric(self.namespace, 'JobStarted', 1, [Tag('ApplicationName', self.application_name)])]


class JobEndOperator(JobStatusOperator):
    """
    Reports ``JobResult`` and ``JobDuration`` metrics and marks the DAG run as
    failed when any other task did not succeed or skip.
    """

    ui_color = '#6d8fad'

    def __init__(self, namespace, application_name, backends, *args, **kwargs):
        super().__init__(backends=backends, *args, **kwargs)
        self.namespace = namespace
        self.application_name = application_name
        self.__job_result = 0

    def execute(self, context):
        # The job result must be known before metrics() is called by the base class.
        self.__calculate_job_result(context)
        super().execute(context)

    def metrics(self, context):
        """
        :returns: ``JobResult`` (1 success / 0 failure) and ``JobDuration`` in
            whole seconds since the DAG run's start date
        """
        duration = round(
            (pytz.utc.localize(datetime.utcnow()) - context['ti'].get_dagrun().start_date).total_seconds()
        )

        self.log.info('Elapsed time: %s', duration)
        self.log.info(f'dag final job result: {self.__job_result}')

        return [
            Metric(self.namespace, 'JobResult', self.__job_result, [Tag('ApplicationName', self.application_name)]),
            Metric(self.namespace, 'JobDuration', duration, [Tag('ApplicationName', self.application_name)]),
        ]

    def __log_and_get_state(self, task_instance):
        state = task_instance.state
        self.log.info(f'Task {task_instance.task_id} finished with state = {state}')
        return state

    def __calculate_job_result(self, context):
        """Set the job result to 1 when every other task instance succeeded or was skipped."""
        self.log.info('scanning task instances states.. ')
        task_instances = context['dag_run'].get_task_instances()
        # This operator's own task instance is excluded from the verdict.
        task_states = [
            self.__log_and_get_state(task_instance)
            for task_instance in task_instances
            if task_instance.task_id != context['task_instance'].task_id
        ]

        self.__job_result = 0
        if all((state == State.SUCCESS or state == State.SKIPPED) for state in task_states):
            self.__job_result = 1

    @apply_lineage
    @provide_session
    def post_execute(self, context: Any, result: Any = None, session=None):
        """Mark the DAG run as failed when the computed job result is 0."""
        if self.__job_result == 0:
            self.log.info("Failing this DAG run due to task failure.")

            dag_run = context['ti'].get_dagrun()
            # Timezone-aware UTC, matching metrics(); a naive datetime is not
            # valid for Airflow's UTC datetime columns.
            dag_run.end_date = pytz.utc.localize(datetime.utcnow())
            dag_run.state = State.FAILED

            session.merge(dag_run)


# noinspection PyAbstractClass
class CloudWatchHook(AwsHook):
    """
    Interact with AWS CloudWatch.
    """

    def __init__(self, region_name=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.region_name = region_name
        self.conn = self.get_client_type('cloudwatch', self.region_name)

    def get_conn(self):
        """Return the CloudWatch client created in ``__init__``."""
        return self.conn

    def put_metric_data(self, metric):
        """
        Publish a single metric data point; its tags become dimensions.

        :param metric: metric to publish
        :type metric: Metric
        """
        value = metric.value

        cloudwatch = self.get_conn()

        dimensions = [{'Name': tag.name, 'Value': tag.value} for tag in metric.tags]

        cloudwatch.put_metric_data(
            Namespace=metric.namespace,
            MetricData=[
                {
                    'MetricName': metric.name,
                    'Dimensions': dimensions,
                    'Timestamp': datetime.utcnow(),
                    'Value': value,
                    'Unit': 'None',
                }
            ],
        )

        self.log.info(f'Published metric: {metric.name} with value: {value}')


class Metric:
    """
    Metric.

    :param namespace: namespace.
    :type namespace: str
    :param name: name.
    :type name: str
    :param value: value.
    :type value: float
    :param tags: list of tags.
    :type tags: List[Tag]
    """

    def __init__(self, namespace, name, value, tags):
        self.namespace = namespace
        self.name = name
        self.value = value
        self.tags = tags


class Tag:
    """
    Name/value pair attached to a metric.

    :param name: tag name.
    :param value: tag value.
    """

    def __init__(self, name, value):
        self.name = name
        self.value = value
# Splits a string around a '{{ tag }}' occurrence: (prefix, tag, suffix).
_VAR_REGEX = '(.*){{([^}]*)}}(.*)'

_BASE_OPERATOR_ATTRIBUTES = list(inspect.signature(BaseOperator.__init__).parameters.keys())


class OperatorWithVariableResolving(BaseOperator):
    """
    Operator delegator that handles liminal variable substitution at run time
    """

    def __init__(self, dag, task_config: dict, variables: dict = None, liminal_task_instance=None, **kwargs):
        self.operator_delegate: BaseOperator = kwargs.pop('operator')
        # Stored in serialized (JSON string) form; pushed to XCom in execute().
        self.liminal_task_instance = liminal_task_instance.serialize() if liminal_task_instance else None
        if variables:
            self.variables = variables.copy()
        else:
            self.variables = {}
        self.task_config = task_config
        super().__init__(task_id=self.operator_delegate.task_id, dag=dag)
        self._LOG = logging.getLogger(self.__class__.__name__)

    def execute(self, context):
        """
        Render the delegate's fields, then execute the delegate.

        Fields are rendered twice: first with ``LiminalEnvironment`` (liminal
        variables), then with the delegate's default template environment.

        :returns: the value returned by the delegate's ``execute``
        """
        attributes = self._get_operator_delegate_attributes()
        self._LOG.info(f'task_config: {self.task_config}')
        self._LOG.info(f'variables: {self.variables}')
        # Treat every delegate-specific public attribute as a template field.
        self.operator_delegate.template_fields = set(list(self.operator_delegate.template_fields) + attributes)
        self.operator_delegate.render_template_fields(context, LiminalEnvironment(self.variables, self.task_config))
        self.operator_delegate.render_template_fields(context)

        if 'ti' in context:
            context['ti'].xcom_push(key="liminal_task_instance", value=self.liminal_task_instance)

        return self.operator_delegate.execute(context)

    def post_execute(self, context, result=None):
        self.operator_delegate.post_execute(context, result)

    def _get_operator_delegate_attributes(self):
        """
        :returns: names of public delegate attributes that are neither
            ``BaseOperator`` constructor parameters nor ``BaseOperator`` members
        """
        return [
            attr
            for attr in dir(self.operator_delegate)
            if attr not in _BASE_OPERATOR_ATTRIBUTES
            and attr not in dir(BaseOperator)
            and not attr.startswith('_')
            and attr not in ('args', 'kwargs', 'lineage_data', 'subdag', 'template_fields')
        ]

    def pre_execute(self, context: Any):
        return self.operator_delegate.pre_execute(context)

    def on_kill(self) -> None:
        self.operator_delegate.on_kill()

    def render_template_fields(self, context: Dict, jinja_env: Optional[jinja2.Environment] = None) -> None:
        # Intentionally a no-op: the delegate's fields are rendered in execute().
        pass

    def render_template(
        self,
        content: Any,
        context: Dict,
        jinja_env: Optional[jinja2.Environment] = None,
        seen_oids: Optional[Set] = None,
    ) -> Any:
        """Render ``content`` with liminal variables first, then with ``jinja_env``."""
        value = self.operator_delegate.render_template(
            content, context, LiminalEnvironment(self.variables, self.task_config)
        )
        return self.operator_delegate.render_template(value, context, jinja_env, seen_oids)

    def get_template_env(self) -> jinja2.Environment:
        return self.operator_delegate.get_template_env()

    def prepare_template(self) -> None:
        self.operator_delegate.prepare_template()

    def resolve_template_files(self) -> None:
        self.operator_delegate.resolve_template_files()

    def clear(
        self,
        start_date: Optional[datetime] = None,
        end_date: Optional[datetime] = None,
        upstream: bool = False,
        downstream: bool = False,
        session: Session = None,
    ):
        return self.operator_delegate.clear(start_date, end_date, upstream, downstream, session)

    def run(
        self,
        start_date: Optional[datetime] = None,
        end_date: Optional[datetime] = None,
        ignore_first_depends_on_past: bool = True,
        ignore_ti_state: bool = False,
        mark_success: bool = False,
    ) -> None:
        self.operator_delegate.run(start_date, end_date, ignore_first_depends_on_past, ignore_ti_state, mark_success)


class LiminalEnvironment(Environment):
    """
    Jinja environment stand-in that resolves ``{{ tag }}`` placeholders from,
    in order: the DAG run conf, liminal variables, the variable backend.
    Unresolvable tags are left in place.
    """

    def __init__(self, variables, task_config=None):
        super().__init__()
        self.val = None
        self.variables = variables.copy()
        logging.info(f'variables: {variables}')
        if task_config and 'variables' in task_config:
            task_variables = task_config['variables']
            if isinstance(task_variables, dict):
                self.variables.update(task_variables)
            elif isinstance(task_variables, str):
                # A string names (possibly via a template) a key in `variables`
                # whose value is itself a dict of variables.
                variables_key = self.from_string(task_variables).render()
                if variables_key in variables:
                    self.variables.update(variables[variables_key])

    def from_string(self, val, **kwargs):
        """Remember the template source and return self, which acts as the template."""
        self.val = val
        return self

    def render(self, *_, **kwargs):
        """
        Implements jinja2.environment.Template.render
        """
        conf = kwargs['dag_run'].conf if 'dag_run' in kwargs else {}
        return self.__render(self.val, conf, set())

    def __render(self, val: str, dag_run_conf: dict, unresolved_tags: set):
        """
        Recursively substitute placeholders in ``val``.

        :param val: text to render
        :param dag_run_conf: DAG run conf; takes precedence over variables
        :param unresolved_tags: tags already found unresolvable; stops the recursion
        :returns: rendered text
        """
        # DOTALL: without it '.' stops at a newline, so a multi-line value was
        # rebuilt from its first line only and the remaining lines were lost.
        token = re.match(_VAR_REGEX, val, re.DOTALL)
        if not token or token[2].strip() in unresolved_tags:
            return val

        tag_name = token[2].strip()
        prefix = self.__render(token[1], dag_run_conf, unresolved_tags)
        suffix = self.__render(token[3], dag_run_conf, unresolved_tags)

        if dag_run_conf and tag_name in dag_run_conf:
            replacement = str(dag_run_conf[tag_name])
        elif tag_name in self.variables:
            replacement = str(self.variables[tag_name])
        else:
            backend_value = standalone_variable_backend.get_variable(tag_name, None)
            if backend_value:
                # str() for consistency with the other sources.
                replacement = str(backend_value)
            else:
                # Keep the placeholder verbatim and remember it as unresolvable.
                unresolved_tags.add(tag_name)
                replacement = '{{' + token[2] + '}}'

        # Re-render: a substituted value may contain placeholders itself.
        return self.__render(prefix + replacement + suffix, dag_run_conf, unresolved_tags)


def add_variables_to_operator(operator, task) -> BaseOperator:
    """
    :param operator: Airflow operator
    :type operator: BaseOperator
    :param task: Task instance
    :type task: Task
    :returns: OperatorWithVariableResolving wrapping given operator
    """
    return OperatorWithVariableResolving(
        dag=task.dag,
        task_config=task.task_config,
        variables=task.variables,
        liminal_task_instance=task,
        operator=operator,
    )
See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: liminal/runners/airflow/tasks/airflow.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
class AirflowTask(task.Task, ABC):
    """
    Airflow task.

    Abstract base for Liminal tasks that add their own operators to the DAG.
    """

    @abstractmethod
    def apply_task_to_dag(self):
        """
        Add this task's operator(s) to the DAG.

        Must be implemented by concrete subclasses; this base implementation does nothing
        and returns None.
        """
class ContainerTask(task.Task, ABC):
    """
    K8S Containerable task.

    Gathers what is needed to run the task in a container: image, mounts and secrets from the
    task config, the environment variables to inject, and the command / arguments pair returned
    by ``_kubernetes_cmds_and_arguments`` (which subclasses may override).
    """

    def __init__(
        self, task_id, dag, parent, trigger_rule, liminal_config, pipeline_config, task_config, variables=None
    ):
        super().__init__(task_id, dag, parent, trigger_rule, liminal_config, pipeline_config, task_config, variables)
        # Environment name looked up in the variable backend; DEFAULT is passed as the fallback.
        env = standalone_variable_backend.get_variable(ENV, DEFAULT)
        self.env_vars = self.__env_vars(env)
        self.image = self.task_config['image']
        self.mounts = self.task_config.get('mounts', [])
        self.secrets = self.task_config.get('secrets', [])
        self.cmds, self.arguments = self._kubernetes_cmds_and_arguments()

    def _kubernetes_cmds_and_arguments(self):
        """
        :returns: tuple of (cmds, arguments): the configured ``cmd`` is passed as the single
            argument of ``/bin/sh -c``.
        """
        cmds = ['/bin/sh', '-c']
        arguments = [self.task_config['cmd']]
        return cmds, arguments

    @staticmethod
    def __get_local_env_params_from_env_file():
        """
        Read ``KEY=VALUE`` pairs from the ``env`` file under the liminal home directory.

        Blank lines, lines starting with ``#`` and lines without ``=`` are ignored. Only the
        first ``=`` separates key from value, so values that themselves contain ``=``
        (e.g. base64 padding or connection strings) are kept intact instead of being dropped.

        :returns: dict of the parsed pairs; empty when the file does not exist.
        """
        env_file = f'{environment.get_liminal_home()}/env'
        if not os.path.isfile(env_file):
            return {}

        _LOG.info(f'found env file at {env_file}')
        result = {}
        with open(env_file) as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line or line.startswith('#'):
                    continue
                key, separator, value = line.partition('=')
                if separator:
                    result[key] = value
        return result

    def __env_vars(self, env) -> Dict:
        """
        Build the environment variables for the container.

        Starts from the task's ``env_vars`` (a missing or null entry counts as empty), then
        applies the local env file on top. If the ``<pipeline>_dag_configuration`` variable is
        set and its JSON contains ``<pipeline>_environment_variables``, that mapping is used
        instead. ``env`` is added under the ENV key unless already present.

        :param env: current environment name
        :returns: dict of variable name to value, with every value converted to ``str``
        """
        env_vars = dict(self.task_config.get('env_vars') or {})
        env_vars.update(self.__get_local_env_params_from_env_file())

        pipeline_name = self.pipeline_config['pipeline']
        airflow_configuration_variable = get_variable(f'''{pipeline_name}_dag_configuration''', default_val=None)

        if airflow_configuration_variable:
            airflow_configs = json.loads(airflow_configuration_variable)
            environment_variables_key = f"{pipeline_name}_environment_variables"
            if environment_variables_key in airflow_configs:
                # NOTE(review): this replaces (does not merge with) the values collected above;
                # kept as in the original - confirm that a full override is intended.
                env_vars = airflow_configs[environment_variables_key]

        if ENV not in env_vars:
            env_vars[ENV] = env

        return {k: str(v) for k, v in env_vars.items()}
def apply_task_to_dag(self):
    """
    Add the stack-creation sub-graph to the DAG.

    Wiring, as set up below: a branch operator leads either to the create-stack operator
    (followed by its sensor) or directly to the closing dummy operator; the sensor also
    leads to the closing dummy operator.

    :returns: the closing DummyOperator of the sub-graph
    """
    # Branch operator: its callable returns the task_id of the path to follow.
    check_cloudformation_stack_exists_task = self._add_variables_to_operator(
        BranchPythonOperator(
            templates_dict={'stack_name': self.stack_name},
            task_id=f'is-cloudformation-{self.task_id}-running',
            python_callable=self.__cloudformation_stack_running_branch,
            provide_context=True,
        )
    )
    create_cloudformation_stack_task = self._add_variables_to_operator(
        CloudFormationCreateStackOperator(
            task_id=f'create-cloudformation-{self.task_id}', params={**self.__reformatted_params()}
        )
    )
    create_stack_sensor_task = self._add_variables_to_operator(
        CloudFormationCreateStackSensor(
            task_id=f'cloudformation-watch-{self.task_id}-create',
            stack_name=self.stack_name,
        )
    )
    # trigger_rule='all_done' because one of the two upstream paths is always skipped by the branch.
    stack_creation_end_task = DummyOperator(
        task_id=f'creation-end-{self.task_id}', trigger_rule='all_done', dag=self.dag
    )
    if self.parent:
        self.parent.set_downstream(check_cloudformation_stack_exists_task)
    create_stack_sensor_task.set_downstream(stack_creation_end_task)
    create_cloudformation_stack_task.set_downstream(create_stack_sensor_task)
    check_cloudformation_stack_exists_task.set_downstream(create_cloudformation_stack_task)
    check_cloudformation_stack_exists_task.set_downstream(stack_creation_end_task)
    return stack_creation_end_task
# noinspection PyUnusedLocal
def __reformatted_params(self, **kwargs):
    """
    Build the parameters dict handed to the create-stack operator.

    Takes the task's ``properties`` as-is, adds ``StackName`` and replaces ``Parameters`` by
    a list of ``{'ParameterKey': ..., 'ParameterValue': ...}`` entries built from the
    flattened ``Parameters`` mapping (values converted to ``str``).

    A missing or null ``Parameters`` entry yields an empty list instead of raising, so
    stacks that take no parameters can be configured.

    :returns: dict of stack creation parameters
    """
    properties = self.task_config['properties']
    stack_parameters = properties.get('Parameters') or {}
    return {
        'StackName': self.stack_name,
        **properties,
        'Parameters': [
            {'ParameterKey': key, 'ParameterValue': str(value)}
            for (key, value) in FlatDict(stack_parameters).items()
        ],
    }
""" def __init__( self, task_id, dag, parent, trigger_rule, liminal_config, pipeline_config, task_config, variables=None ): super().__init__(task_id, dag, parent, trigger_rule, liminal_config, pipeline_config, task_config, variables) self.stack_name = task_config['stack_name'] def apply_task_to_dag(self): check_dags_queued_task = BranchPythonOperator( task_id=f'{self.task_id}-is-dag-queue-empty', python_callable=self.__queued_dag_runs_exists, provide_context=True, trigger_rule=TriggerRule.ALL_DONE, dag=self.dag, ) delete_stack_task = self._add_variables_to_operator( CloudFormationDeleteStackOperator( task_id=f'delete-cloudformation-{self.task_id}', params={'StackName': self.stack_name}, ) ) delete_stack_sensor = self._add_variables_to_operator( CloudFormationDeleteStackSensor( task_id=f'cloudformation-watch-{self.task_id}-delete', stack_name=self.stack_name, ) ) stack_delete_end_task = DummyOperator(task_id=f'delete-end-{self.task_id}', dag=self.dag) if self.parent: self.parent.set_downstream(check_dags_queued_task) check_dags_queued_task.set_downstream(stack_delete_end_task) check_dags_queued_task.set_downstream(delete_stack_task) delete_stack_task.set_downstream(delete_stack_sensor) delete_stack_sensor.set_downstream(stack_delete_end_task) return stack_delete_end_task # noinspection PyUnusedLocal def __queued_dag_runs_exists(self, **kwargs): if self.dag.get_num_active_runs() > 1: return f'delete-end-{self.task_id}' else: return f'delete-cloudformation-{self.task_id}' ================================================ FILE: liminal/runners/airflow/tasks/hadoop.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. 
class HadoopTask(task.Task, ABC):
    """
    Hadoop task.

    Abstract base for tasks that can describe themselves as a runnable command.
    """

    @abstractmethod
    def get_runnable_command(self):
        """
        Return the command that runs this task.

        Must be implemented by concrete subclasses; this base implementation does nothing
        and returns None.
        """
def apply_task_to_dag(self):
    """
    Create the 'end' JobEndOperator and attach it downstream of the parent, if there is one.

    :returns: the created JobEndOperator
    """
    operator_kwargs = {
        'task_id': 'end',
        'namespace': self.metrics_namespace,
        'application_name': self.pipeline_config['pipeline'],
        'backends': self.metrics_backends,
        'dag': self.dag,
        'trigger_rule': self.trigger_rule,
    }
    end_operator = JobEndOperator(**operator_kwargs)

    if self.parent:
        self.parent.set_downstream(end_operator)

    return end_operator
def apply_task_to_dag(self):
    """
    Create the 'start' JobStartOperator and attach it downstream of the parent, if there is one.

    :returns: the created JobStartOperator
    """
    operator_kwargs = {
        'task_id': 'start',
        'namespace': self.metrics_namespace,
        'application_name': self.pipeline_config['pipeline'],
        'backends': self.metrics_backends,
        'dag': self.dag,
        'trigger_rule': self.trigger_rule,
    }
    start_operator = JobStartOperator(**operator_kwargs)

    if self.parent:
        self.parent.set_downstream(start_operator)

    return start_operator
class SparkTask(hadoop.HadoopTask, containerable.ContainerTask):
    """
    Executes a Spark application.

    Builds a ``spark-submit`` command line from the task config keys ``master``, ``class``,
    ``conf``, ``application_source`` and ``application_arguments``.
    """

    def __init__(
        self, task_id, dag, parent, trigger_rule, liminal_config, pipeline_config, task_config, variables=None
    ):
        # Fill in the keys the container base class reads, so a spark task may omit them.
        task_config['image'] = task_config.get('image', '')
        task_config['cmd'] = task_config.get('cmd', [])
        # Keep env_vars configured on the task (previously they were discarded) and force
        # SPARK_LOCAL_HOSTNAME to 'localhost' as before.
        task_config['env_vars'] = {
            **(task_config.get('env_vars') or {}),
            'SPARK_LOCAL_HOSTNAME': 'localhost',
        }
        super().__init__(task_id, dag, parent, trigger_rule, liminal_config, pipeline_config, task_config, variables)

    def get_runnable_command(self):
        """
        Return spark-submit runnable command
        """
        return self.__generate_spark_submit()

    def __generate_spark_submit(self):
        """
        :returns: the full command as a list of strings: ``spark-submit``, the spark
            arguments, then the application arguments.
        """
        spark_submit = ['spark-submit']
        spark_submit.extend(self.__spark_args())
        spark_submit.extend(self.__additional_arguments())
        return [str(x) for x in spark_submit]

    def __spark_args(self):
        """
        :returns: list with ``--master`` / ``--class`` (when set), one ``--conf key=value``
            pair per flattened ``conf`` entry, and finally the application source.
        """
        spark_arguments = {
            'master': self.task_config.get('master', None),
            'class': self.task_config.get('class', None),
        }
        spark_conf = self.__parse_spark_arguments(spark_arguments)

        # reformat spark conf; a null `conf` entry is treated as empty
        for key, value in FlatDict(self.task_config.get('conf') or {}).items():
            spark_conf.extend(['--conf', f'{key}={value}'])

        source_code = self.task_config.get("application_source")
        # Without this guard a missing source ended up as the literal string 'None'.
        if source_code is not None:
            spark_conf.append(source_code)

        return spark_conf

    def __additional_arguments(self):
        """
        :returns: ``application_arguments`` as a flat list: a list is returned unchanged, a
            mapping is flattened to ``[key1, value1, key2, value2, ...]``; missing or null
            yields an empty list.
        """
        application_arguments = self.task_config.get('application_arguments') or {}
        if isinstance(application_arguments, list):
            return application_arguments
        return self.__interleaving(application_arguments.keys(), application_arguments.values())

    def __parse_spark_arguments(self, spark_arguments):
        """
        :param spark_arguments: mapping of option name to value
        :returns: flat list ``['--name', value, ...]`` containing only options with truthy values
        """
        present = {name: value for name, value in spark_arguments.items() if value}
        return self.__interleaving([f'--{name}' for name in present], present.values())

    @staticmethod
    def __interleaving(keys, values):
        """
        :returns: list alternating items of ``keys`` and ``values``: ``[k1, v1, k2, v2, ...]``
        """
        return list(chain.from_iterable(zip(keys, values)))

    def _kubernetes_cmds_and_arguments(self):
        """
        :returns: tuple of (cmds, arguments): the whole spark-submit command as cmds and no arguments.
        """
        return self.__generate_spark_submit(), []
def prepare_syspath():
    """
    Make the ``liminal`` directory under LIMINAL_HOME importable.

    Appends the directory to ``sys.path`` only if it is not there yet, so calling
    ``initialize()`` repeatedly no longer grows ``sys.path`` with duplicate entries.
    """
    plugins = os.path.join(LIMINAL_HOME, 'liminal')
    if plugins not in sys.path:
        sys.path.append(plugins)


def initialize():
    """
    Prepare the interpreter for Liminal.
    """
    # making sure all extensible modules are in root path
    prepare_syspath()
Apache Liminal's goal is to operationalize the machine learning process, allowing data scientists to quickly transition from a successful experiment to an automated pipeline of model training, validation, deployment and inference in production, freeing them from engineering and non-functional tasks, and allowing them to focus on machine learning code and artifacts. ## Motivation The challenges involved in operationalizing machine learning models are one of the main reasons why many machine learning projects never make it to production. The process involves automating and orchestrating multiple steps which run on heterogeneous infrastructure - different compute environments, data processing platforms, ML frameworks, notebooks, containers and monitoring tools. There are no mature standards for this workflow, and most organizations do not have the experience to build it in-house. In the best case, dev-ds-devops teams form in order to accomplish this task together; in many cases, it's the data scientists who try to deal with this themselves without the knowledge or the inclination to become infrastructure experts. As a result, many projects never make it through the cycle. Those that do suffer from a very long lead time from a successful experiment to an operational, refreshable, deployed and monitored model in production. The goal of Apache Liminal is to simplify the creation and management of machine learning pipelines by data engineers & scientists. The platform provides declarative building blocks which define the workflow, orchestrate the underlying infrastructure and take care of non-functional concerns, enabling focus on business logic / algorithm code. Some commercial E2E solutions have started to emerge in the last few years; however, they are limited to specific parts of the workflow, such as Databricks MLflow. Other solutions are tied to specific environments (e.g. SageMaker on AWS).
## High Level Architecture The platform aims to provide data engineers & scientists with a solution for end-to-end flows from model training to real-time inference in production. Its architecture enables and promotes adoption of specific components in existing (non-Liminal) frameworks, as well as seamless integration with other open source projects. Liminal was created to enable scalability in ML efforts, after a thorough review of available solutions and frameworks, which did not meet our main KPIs: Provide an opinionated but customizable end-to-end workflow Abstract away the complexity of underlying infrastructure Support major open source tools and cloud-native infrastructure to carry out many of the steps Allow teams to leverage their existing investments or bring their tools of choice into the workflow We have found that other tech companies in the Israeli Hi-Tech ecosystem also have an interest in such a platform, and hence decided to share our work with the community. The following diagram depicts these main components and where Apache Liminal comes in: ![raibow-arch1](https://raw.githubusercontent.com/apache/incubator-liminal/master/images/liminal_001.png) A classical data scientist workflow includes some base phases: _Train, Deploy and Consume._ **The Train phase includes the following tasks:** 1. Fetch - get the data needed to build a model - usually using SQL 1. Clean - make sure the data is useful for building the model 1. Prepare - split data and encode features from the data according to model needs 1. Train - Build the model and tune it 1. Evaluate - make sure the model is correct - run it on a test set, etc… 1. Validate - make sure the model is up to the standards you need **The Deploy phase includes these tasks:** 1. Deploy - make it available for usage in production 1. Inference - Batch or Real-time - use the model to evaluate data, offline or online, from your applications 1.
Consume - The actual use of the models created by applications and ETLs, usually through APIs to the batch or real-time inference, which usually relies on Model and Feature stores. Liminal provides its users with declarative composition capabilities to materialize these steps in a robust way, while exploiting existing frameworks and tools, e.g. data science frameworks such as scikit-learn, TensorFlow, Keras and such, for running core data science algorithms, as well as numerous core mechanisms such as data stores, processing engines, parallelism, schedulers, code deployment as well as batch and real-time inference. Liminal allows the creation and wiring of these kinds of functional and non-functional tasks while making the underlying infrastructure used by these tasks very easy to use and even abstracted away entirely, while also handling non-functional aspects such as monitoring (in a standard fashion), deployment, scheduling, resource management and execution. ![raibow-arch2](https://raw.githubusercontent.com/apache/incubator-liminal/master/images/liminal_002.png) ================================================ FILE: pyproject.toml ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License.
[tool.black] line-length = 119 # TODO: Decide if 119 or 110? target-version = ['py36', 'py37', 'py38', 'py39'] skip-string-normalization = true include = '\.pyi?$' exclude = ''' ( /( \.eggs | \.git | \.mypy_cache | \.tox | _build | build | dist )/ | src/buildstream/_protos ) ''' ================================================ FILE: requirements.txt ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
docker==4.2.0 apache-airflow==2.1.2 click==7.1.1 Flask==1.1.1 pyyaml==5.4.1 boto3==1.17.112 botocore==1.20.112 wheel==0.36.2 termcolor~=1.1.0 docker-pycreds==0.4.0 typing==3.7.4.1 GitPython==3.1.11 moto==1.3.14 diskcache==3.1.1 croniter==0.3.31 pytz==2020.5 pytzdata==2020.1 freezegun==1.1.0 statsd>=3.3.0, <4.0 sqlalchemy~=1.3.15 flatdict==3.4.0 jinja2>=2.10.1, <2.11.0 python-json-logger==2.0.1 requests==2.26.0 apache-airflow-providers-amazon==1.4.0 apache-airflow-providers-cncf-kubernetes==1.0.2 #apache-airflow-providers-google==4.0.0 #apache-airflow[google,amazon,apache.spark]==2.0.0 cfn-lint==0.53.0 pre-commit==2.16.0 Werkzeug==1.0.1 itsdangerous==1.1.0 MarkupSafe==1.1.1 ================================================ FILE: run_tests.sh ================================================ #!/bin/sh # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. python -m unittest ================================================ FILE: scripts/Dockerfile-airflow ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. 
FROM apache/airflow:2.1.2-python3.8

# pip requirement specifier of the Liminal package installed in the last step.
ARG LIMINAL_VERSION='apache-liminal'
# Destination directory for the copied scripts, wheels and DAG file.
ARG DAG_FOLDER='/opt/airflow/dags/'

# COPY rather than ADD: only plain local files are copied here, and COPY has none of
# ADD's implicit behaviors (remote URL fetch, local tar auto-extraction).
COPY scripts/* ${DAG_FOLDER}
COPY *.whl ${DAG_FOLDER}
RUN ls -ls ${DAG_FOLDER}
COPY liminal/runners/airflow/dag/liminal_dags.py ${DAG_FOLDER}
COPY scripts/webserver_config.py /opt/airflow/

USER airflow

RUN pip --disable-pip-version-check install -r /opt/airflow/dags/requirements-airflow.txt --user
RUN pip --disable-pip-version-check install --user ${LIMINAL_VERSION}
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
---
version: '3'

# Settings shared by every Airflow container (merged into webserver and scheduler below).
x-airflow-common: &airflow-common
  image: liminal-airflow
  build:
    context: .
    dockerfile: scripts/Dockerfile-airflow
    args:
      LIMINAL_VERSION: ${LIMINAL_VERSION}
  environment: &airflow-common-env
    # Quoted so no YAML loader can read the bare letter as a boolean.
    LOAD_EX: 'n'
    AIRFLOW__CORE__EXECUTOR: LocalExecutor
    KUBECONFIG: /home/airflow/kube/config
    AWS_DEFAULT_REGION: "${AWS_DEFAULT_REGION:-us-east-1}"
    AIRFLOW__CORE__LOAD_EXAMPLES: 'False'
    AIRFLOW__WEBSERVER__WORKERS: '1'
    AIRFLOW__WEBSERVER__EXPOSE_CONFIG: 'True'
    _AIRFLOW_DB_UPGRADE: 'true'
    LIMINAL_HOME: /opt/airflow/dags
    AIRFLOW__CORE__LOAD_DEFAULT_CONNECTIONS: 'False'
    AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres:5432/airflow
    AIRFLOW_CONN_METADATA_DB: postgresql+psycopg2://airflow:airflow@postgres:5432/airflow
    AIRFLOW_VAR__METADATA_DB_SCHEMA: airflow
  volumes:
    - ${LIMINAL_HOME}/liminal:/opt/airflow/dags/liminal/
    - ${LIMINAL_HOME}:/opt/airflow/dags
    - ${LIMINAL_HOME}/logs:/opt/airflow/logs
    - ${HOME}/.kube:/home/airflow/kube
  depends_on:
    postgres:
      condition: service_healthy
  healthcheck:
    # The image is built from apache/airflow, whose home directory is /opt/airflow
    # (the Dockerfile copies webserver_config.py there), so the webserver pid file is
    # looked up under that directory rather than the legacy /usr/local/airflow path.
    # NOTE(review): the scheduler inherits this webserver-specific probe through the
    # shared anchor - confirm whether it should get its own check.
    test: [CMD-SHELL, '[ -f /opt/airflow/airflow-webserver.pid ]']
    interval: 30s
    timeout: 30s
    retries: 3
  logging:
    options:
      max-size: 10m
      max-file: '3'
  restart: always

services:
  postgres:
    image: postgres:13
    container_name: liminal-postgress
    environment:
      POSTGRES_USER: airflow
      POSTGRES_PASSWORD: airflow
      POSTGRES_DB: ''
    ports:
      # Port mappings are quoted so they are always read as strings.
      - '5432:5432'
    volumes:
      - ${LIMINAL_HOME}/db:/var/lib/postgresql/data
    logging:
      options:
        max-size: 10m
        max-file: '3'
    healthcheck:
      test: [CMD, pg_isready, -U, airflow]
      interval: 30s
      timeout: 30s
      retries: 3
    restart: always

  webserver:
    <<: *airflow-common
    container_name: liminal-webserver
    environment:
      <<: *airflow-common-env
    ports:
      - '8080:8080'
    command: webserver

  scheduler:
    <<: *airflow-common
    container_name: liminal-scheduler
    environment:
      <<: *airflow-common-env
    ports:
      - '8793:8793'
    command: scheduler
def docker_is_running():
    """Check that the local docker daemon answers ``docker info``.

    Returns:
        True when the probe command exits successfully.

    Raises:
        RuntimeError: when the probe exits with a non-zero status; the same message
            is also written to stderr prefixed with ``ERROR: ``.
    """
    # All output is redirected to /dev/null, so a successful probe captures nothing.
    probe = "docker info >/dev/null 2>&1"
    try:
        captured = subprocess.check_output(probe, shell=True)
    except subprocess.CalledProcessError:
        msg = "Docker is not running. Please start docker service on your machine\n"
        sys.stderr.write("ERROR: " + msg)
        raise RuntimeError(msg)
    # An empty capture is falsy, hence the negation yields True on success.
    return not captured
def docker_compose_command(command_name, args):
    """Run a ``docker-compose`` sub-command against the bundled liminal compose file.

    Args:
        command_name: docker-compose sub-command, e.g. ``'up'``, ``'down'``, ``'run'``.
        args: list of argument strings appended verbatim after the sub-command.

    Returns:
        A ``(stdout, stderr)`` tuple. The output is captured when the arguments mention
        ``tail`` or ``-d``, or when the sub-command is ``ps`` or ``down``; otherwise the
        command streams directly to the console and ``('', '')`` is returned.
    """
    docker_compose_path, project_dir = get_docker_compose_paths()
    concated_args = ' '.join(args)

    # Both paths are double-quoted so install locations containing spaces survive the
    # shell's word splitting (previously only the compose file path was quoted).
    # The command is handed to subprocess as a single string, which is the form
    # shell=True expects; a list would only have its first element interpreted.
    run_command = (
        'docker-compose '
        f'-f "{docker_compose_path}" '
        '-p liminal --project-directory '
        f'"{project_dir}" {command_name} {concated_args}'
    )
    logging.info(run_command)

    capture_output = 'tail' in str(args) or '-d' in str(args) or command_name in ('ps', 'down')
    if capture_output:
        output = subprocess.run(
            run_command,
            env=os.environ,
            shell=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
        )
        return output.stdout, output.stderr

    subprocess.call(run_command, env=os.environ, shell=True)
    return '', ''
def is_file_empty(file):
    """Tell whether the YAML file at ``file`` holds no document.

    Args:
        file: path of the YAML file to inspect.

    Returns:
        True when the file parses to ``None`` (empty or comments only), False otherwise.
        A file that fails to parse is logged and reported as not empty, since it does
        have content - it is simply malformed.
    """
    try:
        with open(file, "r", encoding="utf-8") as stream:
            data_loaded = yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        # Return explicitly here: falling through would reference data_loaded before
        # assignment and mask the real parse error with an UnboundLocalError.
        logging.error(f'failed to parse {file}: {exc}')
        return False
    return data_loaded is None
@cli.command("delete", short_help="delete liminal resource registration")
@click.option('--path', default=os.getcwd(), help='Delete within this path.')
@click.option('--clean', is_flag=True, default=False, help="Delete all resource: DAGs, Volumes")
def delete(path, clean):
    """Delete the local volumes declared by the liminal configs found under ``path``.

    Args:
        path: folder whose liminal configuration files are loaded.
        clean: when set, additionally tear down the docker-compose stack (removing
            locally built images) and remove the liminal home directory.
    """
    configs = ConfigUtil(path).safe_load(is_render_variables=True, soft_merge=True)
    for config in configs:
        if config.get('volumes'):
            logging.info('local volume is being deleted')
            volume_util.delete_local_volumes(
                config, os.path.dirname(files_util.resolve_pipeline_source_file(config['name']))
            )

    if clean:
        dir_path = os.path.abspath(environment.get_liminal_home())
        # Stop the containers before removing the directory they bind-mount. Doing it in
        # this order also guarantees the stack is torn down even when the home directory
        # is already gone (rmtree on a missing path raises FileNotFoundError).
        docker_compose_command('down', ['--remove-orphans', '--rmi', 'local'])
        if os.path.isdir(dir_path):
            shutil.rmtree(dir_path)
'__main__': cli() ================================================ FILE: scripts/requirements-airflow.txt ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. click==7.1.1 pyyaml boto3==1.12.10 botocore==1.15.21 kubernetes diskcache==3.1.1 flatdict==4.0.1 kafka-python==2.0.2 influxdb-client ================================================ FILE: scripts/webserver_config.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
"""Default configuration for the Airflow webserver"""
import os

from flask_appbuilder.security.manager import AUTH_DB

# Alternative authentication backends; import the one matching AUTH_TYPE below.
# from flask_appbuilder.security.manager import AUTH_LDAP
# from flask_appbuilder.security.manager import AUTH_OAUTH
# from flask_appbuilder.security.manager import AUTH_OID
# from flask_appbuilder.security.manager import AUTH_REMOTE_USER

# Absolute path of the directory containing this configuration file.
basedir = os.path.abspath(os.path.dirname(__file__))

# Flask-WTF flag for CSRF
WTF_CSRF_ENABLED = True

# ----------------------------------------------------
# AUTHENTICATION CONFIG
# ----------------------------------------------------
# For details on how to set up each of the following authentication methods, see
# http://flask-appbuilder.readthedocs.io/en/latest/security.html#authentication-methods

# The authentication type
# AUTH_OID : Is for OpenID
# AUTH_DB : Is for database
# AUTH_LDAP : Is for LDAP
# AUTH_REMOTE_USER : Is for using REMOTE_USER from web server
# AUTH_OAUTH : Is for OAuth
AUTH_TYPE = AUTH_DB

# Uncomment to setup Full admin role name
# AUTH_ROLE_ADMIN = 'Admin'

# Public role name, i.e. the role used when no authentication is performed.
# NOTE(review): with the value 'Admin', unauthenticated visitors presumably get full
# admin rights - acceptable for a local development stack, confirm before exposing it.
AUTH_ROLE_PUBLIC = 'Admin'

# Will allow user self registration
# AUTH_USER_REGISTRATION = True

# reCAPTCHA is automatically enabled when user self registration is active,
# in which case the keys below are required
# RECAPTCHA_PRIVATE_KEY = PRIVATE_KEY
# RECAPTCHA_PUBLIC_KEY = PUBLIC_KEY

# Config for Flask-Mail, necessary for user self registration
# MAIL_SERVER = 'smtp.gmail.com'
# MAIL_USE_TLS = True
# MAIL_USERNAME = 'yourappemail@gmail.com'
# MAIL_PASSWORD = 'passwordformail'
# MAIL_DEFAULT_SENDER = 'sender@gmail.com'

# The default user self registration role
# AUTH_USER_REGISTRATION_ROLE = "Public"

# When using OAuth Auth, uncomment to setup provider(s) info
# Google OAuth example:
# OAUTH_PROVIDERS = [{
#   'name':'google',
#     'token_key':'access_token',
#     'icon':'fa-google',
#         'remote_app': {
#             'api_base_url':'https://www.googleapis.com/oauth2/v2/',
#             'client_kwargs':{
#                 'scope': 'email profile'
#             },
#             'access_token_url':'https://accounts.google.com/o/oauth2/token',
#             'authorize_url':'https://accounts.google.com/o/oauth2/auth',
#             'request_token_url': None,
#             'client_id': GOOGLE_KEY,
#             'client_secret': GOOGLE_SECRET_KEY,
#         }
# }]

# When using LDAP Auth, setup the ldap server
# AUTH_LDAP_SERVER = "ldap://ldapserver.new"

# When using OpenID Auth, uncomment to setup OpenID providers.
# example for OpenID authentication
# OPENID_PROVIDERS = [
#    { 'name': 'Yahoo', 'url': 'https://me.yahoo.com' },
#    { 'name': 'AOL', 'url': 'http://openid.aol.com/' },
#    { 'name': 'Flickr', 'url': 'http://www.flickr.com/' },
#    { 'name': 'MyOpenID', 'url': 'https://www.myopenid.com' }]

# ----------------------------------------------------
# Theme CONFIG
# ----------------------------------------------------
# Flask App Builder comes with a number of predefined themes
# that you can use for Apache Airflow.
# http://flask-appbuilder.readthedocs.io/en/latest/customizing.html#changing-themes
# Please make sure to remove "navbar_color" configuration from airflow.cfg
# in order to fully utilize the theme. (or use that property in conjunction with theme)
# APP_THEME = "bootstrap-theme.css"  # default bootstrap
# APP_THEME = "amelia.css"
# APP_THEME = "cerulean.css"
# APP_THEME = "cosmo.css"
# APP_THEME = "cyborg.css"
# APP_THEME = "darkly.css"
# APP_THEME = "flatly.css"
# APP_THEME = "journal.css"
# APP_THEME = "lumen.css"
# APP_THEME = "paper.css"
# APP_THEME = "readable.css"
# APP_THEME = "sandstone.css"
# APP_THEME = "simplex.css"
# APP_THEME = "slate.css"
# APP_THEME = "solar.css"
# APP_THEME = "spacelab.css"
# APP_THEME = "superhero.css"
# APP_THEME = "united.css"
# APP_THEME = "yeti.css"
import logging
import os

import setuptools

# Read text files as UTF-8 explicitly so the build does not depend on the platform's
# default encoding.
with open("README.md", encoding="utf-8") as fh:
    long_description = fh.read()

with open('requirements.txt', encoding="utf-8") as f:
    # Keep only real requirement specifiers: the file also carries a license header,
    # commented-out entries and possibly blank lines.
    requirements = [
        line.strip() for line in f.read().splitlines() if line.strip() and not line.strip().startswith('#')
    ]
    logging.info(requirements)

setuptools.setup(
    name="apache-liminal",
    # The version is injected by the build environment; LIMINAL_BUILD_VERSION wins over LIMINAL_VERSION.
    version=os.environ.get("LIMINAL_BUILD_VERSION", os.environ.get('LIMINAL_VERSION', None)),
    author="dev@liminal.apache.org",
    author_email='dev@liminal.apache.org',
    description="A package for authoring and deploying machine learning workflows",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/apache/incubator-liminal",
    packages=setuptools.find_packages(),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: Apache Software License",
        "Operating System :: OS Independent",
    ],
    license='Apache License, Version 2.0',
    python_requires='>=3.6',
    install_requires=requirements,
    scripts=['scripts/liminal'],
    include_package_data=True,
    package_data={
        'liminal': [
            # Glob so the disclaimer is picked up whatever its suffix is
            # (the repository ships it as DISCLAIMER-WIP).
            '../DISCLAIMER*',
            '../LICENSE',
            '../NOTICE',
            '../README.md',
            '../liminal/core/config/defaults/base/liminal.yml',
        ],
    },
)
agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: tests/liminal/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: tests/liminal/core/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. 
The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: tests/liminal/core/config/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: tests/liminal/core/config/defaults/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. 
The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: tests/liminal/core/config/defaults/test_apply_variables_substitution.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
from unittest import TestCase

from liminal.core.config.defaults import default_configs


class TestApplyVariablesSubstitution(TestCase):
    """Checks how ``apply_variable_substitution`` merges sub- and super-liminal variables."""

    def _assert_merged(self, subliminal, superliminal, expected):
        """Run the substitution on the two configs and compare with ``expected``."""
        merged = default_configs.apply_variable_substitution(subliminal, superliminal)
        self.assertEqual(expected, merged)

    def test_apply(self):
        # 'two' is defined on both levels; the expectation keeps the subliminal value.
        self._assert_merged(
            subliminal={'variables': {'one': 'one_value', 'two': 'two_value'}},
            superliminal={'variables': {'three': 'three_value', 'two': 'two_super_value'}},
            expected={'variables': {'one': 'one_value', 'two': 'two_value', 'three': 'three_value'}},
        )

    def test_apply_superliminal_is_missing_variables(self):
        self._assert_merged(
            subliminal={'variables': {'one': 'one_value', 'two': 'two_value'}},
            superliminal={},
            expected={'variables': {'one': 'one_value', 'two': 'two_value'}},
        )

    def test_apply_subliminal_is_missing_variables(self):
        self._assert_merged(
            subliminal={},
            superliminal={'variables': {'one': 'one_value', 'two': 'two_value'}},
            expected={'variables': {'one': 'one_value', 'two': 'two_value'}},
        )
from unittest import TestCase

from liminal.core.config.defaults import default_configs


class TestDefaultsPipelineConfig(TestCase):
    """Checks ``apply_pipeline_defaults`` against a pipeline with sub- and super-liminal defaults."""

    def test_apply(self):
        def resolved_task(task_name):
            # Every task in the expected pipeline carries the superliminal python defaults.
            return {
                "task": task_name,
                "type": "python",
                "env_vars": {"env1": "env1super", "env2": "env2super"},
                "default_task_super": "default_task_super_value",
            }

        expected = {
            "name": "mypipe",
            "param": "constant",
            "param1": "param1_value",
            "param2": "param2super_value",
            "param3": "param3super_value",
            "tasks": [resolved_task(name) for name in ("first_task", "middle_task", "end_task")],
        }

        superliminal = {
            "pipeline_defaults": {
                "param2": "param2super_value",
                "param3": "param3super_value",
                "before_tasks": [{"task": "first_task", "type": "python"}],
                "after_tasks": [{"task": "end_task", "type": "python"}],
            },
            "task_defaults": {
                "python": {
                    "default_task_super": "default_task_super_value",
                    "env_vars": {"env1": "env1super", "env2": "env2super"},
                }
            },
        }
        subliminal = {
            "name": "my_subliminal_test",
            "pipelines": [
                {
                    "name": "mypipe",
                    "param": "constant",
                    "tasks": [{"task": "middle_task", "type": "python", "env_vars": {"env1": "env1"}}],
                }
            ],
            "pipeline_defaults": {"param1": "param1_value"},
        }
        # The pipeline handed to the function is a separate dict without task-level env_vars.
        pipeline = {"name": "mypipe", "param": "constant", "tasks": [{"task": "middle_task", "type": "python"}]}

        actual = default_configs.apply_pipeline_defaults(subliminal, superliminal, pipeline)
        self.assertEqual(expected, actual)
from unittest import TestCase

from liminal.core.config.defaults import default_configs


class TestDefaultsServiceConfig(TestCase):
    """Checks that ``apply_service_defaults`` adds the superliminal defaults to each service."""

    def test_apply(self):
        service_defaults = {"param": "param1", "param2": "param2"}
        declared_services = [
            {"name": "my_python_server", "type": "python_server", "image": "default_image"},
            {"name": "my_python_server_for_stg", "type": "python_server", "image": "test_image"},
        ]

        # Expected result: each declared service enriched with every default.
        expected = [{**service, **service_defaults} for service in declared_services]

        # The inputs get their own copies so the expectation cannot be affected by the call.
        subliminal = {"services": [dict(service) for service in declared_services]}
        superliminal = {"service_defaults": dict(service_defaults)}

        actual = default_configs.apply_service_defaults(subliminal, superliminal)
        self.assertEqual(expected, actual)
from unittest import TestCase

from liminal.core.config.defaults import default_configs


class TestDefaultsTaskConfig(TestCase):
    """Checks ``apply_task_defaults`` with and without before/after tasks from the superliminal."""

    @staticmethod
    def _spark_then_python_pipeline():
        """Build a fresh pipeline holding a spark task followed by a python task."""
        return {
            'pipeline': 'mypipe',
            "tasks": [
                {"task": "middle", "type": "spark", "task_param": "task_middle_param"},
                {"task": "end", "type": "python", "task_param": "task_end_param"},
            ],
        }

    def _assert_task_defaults(self, pipeline, task_defaults, before_tasks, after_tasks, expected):
        """Wrap ``pipeline`` / ``task_defaults`` into configs, apply defaults and compare."""
        actual = default_configs.apply_task_defaults(
            {'pipelines': [pipeline]},
            {"task_defaults": task_defaults},
            pipeline=pipeline,
            superliminal_before_tasks=before_tasks,
            superliminal_after_tasks=after_tasks,
        )
        self.assertEqual(expected, actual)

    def test_apply(self):
        expected = {
            'pipeline': 'mypipe',
            'tasks': [
                {'env_vars': {'env1': 'env1value', 'env2': 'env2value'}, 'task': 'start', 'type': 'python'},
                {'executor': 'emr', 'task': 'middle', 'task_param': 'task_middle_param', 'type': 'spark'},
                {'env_vars': {'env2': 'env2value'}, 'task': 'end', 'task_param': 'task_end_param', 'type': 'python'},
            ],
        }
        self._assert_task_defaults(
            pipeline=self._spark_then_python_pipeline(),
            task_defaults={
                "python": {"env_vars": {"env2": "env2value"}},
                "spark": {"task_param": "task_spark_param", "executor": "emr"},
            },
            before_tasks=[{"task": "start", "type": "python", "env_vars": {"env1": "env1value"}}],
            after_tasks=[],
            expected=expected,
        )

    def test_missing_tasks_from_supr(self):
        expected = {
            'pipeline': 'mypipe',
            'tasks': [
                {'task': 'middle', 'task_param': 'task_middle_param', 'type': 'spark'},
                {'env_vars': {'env2': 'env2value'}, 'task': 'end', 'task_param': 'task_end_param', 'type': 'python'},
            ],
        }
        self._assert_task_defaults(
            pipeline=self._spark_then_python_pipeline(),
            task_defaults={
                "python": {"env_vars": {"env2": "env2value"}},
                "spark": {"task_param": "task_spark_param"},
            },
            before_tasks=[],
            after_tasks=[],
            expected=expected,
        )

    def test_missing_tasks_from_sub(self):
        # The pipeline itself declares no tasks; only the before/after tasks are passed in.
        expected = {
            'pipeline': 'mypipe',
            'tasks': [
                {'task': 'start', 'task_param': 'task_middle_param', 'type': 'spark'},
                {'env_vars': {'env2': 'env2value'}, 'task': 'end', 'task_param': 'task_end_param', 'type': 'end'},
            ],
        }
        self._assert_task_defaults(
            pipeline={'pipeline': 'mypipe'},
            task_defaults={
                "python": {"env_vars": {"env2": "env2value"}},
                "spark": {
                    "task_param": "task_start_param",
                },
            },
            before_tasks=[{"task": "start", "task_param": "task_middle_param", "type": "spark"}],
            after_tasks=[
                {"env_vars": {"env2": "env2value"}, "task": "end", "task_param": "task_end_param", "type": "end"}
            ],
            expected=expected,
        )
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. import os from unittest import TestCase, mock from liminal.core.config.config import ConfigUtil # noinspection PyUnresolvedReferences,DuplicatedCode class TestConfigUtil(TestCase): @mock.patch('liminal.core.util.files_util.load') def test_safe_load(self, find_config_files_mock): subliminal = { 'name': 'my_subliminal_test', 'type': 'sub', 'super': 'my_superliminal_test', 'images': [{'image': 'my_image'}], 'pipelines': [{'name': 'mypipe1', 'param': 'constant'}, {'name': 'mypipe2', 'param': 'constant'}], 'pipeline_defaults': {'param1': 'param1_value'}, 'task_defaults': {'job_start': {'task_sub_def': 'task_sub_def_value'}}, } superliminal = { 'name': 'my_superliminal_test', 'type': 'super', 'super': 'super_superliminal', 'images': [{'image': 'my_image', 'source': '.'}], 'pipeline_defaults': {'param2': 'param2super_value', 'param3': 'param3super_value'}, 'task_defaults': { 'job_start': { 'task_def1': 'task_def1_value', 'task_def2': { 'task_def2_1': 'task_def2_1_value', }, } }, } super_superliminal = { 'name': 'super_superliminal', 'type': 'super', 'pipeline_defaults': { 'param2': 'param2super_value', 'param3': 'param3hyper_value', 'param4': 'param4hyper_value', }, } expected = [ { 'executors': [{'executor': 'default_k8s', 'type': 'kubernetes'}], 'name': 'my_subliminal_test', 'pipeline_defaults': {'param1': 'param1_value'}, 'pipelines': [ { 'description': 'add defaults parameters for all pipelines', 'name': 'mypipe1', 'param': 'constant', 'param1': 'param1_value', 'param2': 'param2super_value', 'param3': 'param3super_value', 'param4': 'param4hyper_value', 'tasks': [ { 
'task': 'start', 'task_def1': 'task_def1_value', 'task_def2': {'task_def2_1': 'task_def2_1_value'}, 'task_sub_def': 'task_sub_def_value', 'type': 'job_start', }, {'task': 'end', 'type': 'job_end'}, ], }, { 'description': 'add defaults parameters for all pipelines', 'name': 'mypipe2', 'param': 'constant', 'param1': 'param1_value', 'param2': 'param2super_value', 'param3': 'param3super_value', 'param4': 'param4hyper_value', 'tasks': [ { 'task': 'start', 'task_def1': 'task_def1_value', 'task_def2': {'task_def2_1': 'task_def2_1_value'}, 'task_sub_def': 'task_sub_def_value', 'type': 'job_start', }, {'task': 'end', 'type': 'job_end'}, ], }, ], 'service_defaults': {'description': 'add defaults parameters for all ' 'services'}, 'images': [{'image': 'my_image', 'source': '.'}], 'services': [], 'super': 'my_superliminal_test', 'task_defaults': {'job_start': {'task_sub_def': 'task_sub_def_value'}}, 'type': 'sub', } ] expected = [ { 'executors': [ {'executor': 'default_k8s', 'type': 'kubernetes'}, {'executor': 'airflow_executor', 'type': 'airflow'}, ], 'images': [{'image': 'my_image', 'source': '.'}], 'name': 'my_subliminal_test', 'pipeline_defaults': {'param1': 'param1_value'}, 'pipelines': [ { 'description': 'add defaults parameters for all pipelines', 'name': 'mypipe1', 'param': 'constant', 'param1': 'param1_value', 'param2': 'param2super_value', 'param3': 'param3super_value', 'param4': 'param4hyper_value', 'tasks': [ { 'executor': 'airflow_executor', 'task': 'start', 'task_def1': 'task_def1_value', 'task_def2': {'task_def2_1': 'task_def2_1_value'}, 'task_sub_def': 'task_sub_def_value', 'type': 'job_start', }, {'executor': 'airflow_executor', 'task': 'end', 'type': 'job_end'}, ], }, { 'description': 'add defaults parameters for all pipelines', 'name': 'mypipe2', 'param': 'constant', 'param1': 'param1_value', 'param2': 'param2super_value', 'param3': 'param3super_value', 'param4': 'param4hyper_value', 'tasks': [ { 'executor': 'airflow_executor', 'task': 'start', 'task_def1': 
'task_def1_value', 'task_def2': {'task_def2_1': 'task_def2_1_value'}, 'task_sub_def': 'task_sub_def_value', 'type': 'job_start', }, {'executor': 'airflow_executor', 'task': 'end', 'type': 'job_end'}, ], }, ], 'service_defaults': {'description': 'add defaults parameters for all ' 'services'}, 'services': [], 'super': 'my_superliminal_test', 'task_defaults': {'job_start': {'task_sub_def': 'task_sub_def_value'}}, 'type': 'sub', } ] find_config_files_mock.return_value = { 'my_subliminal_test': subliminal, 'my_superliminal_test': superliminal, 'super_superliminal': super_superliminal, } config_util = ConfigUtil('') self.assertEqual(expected, config_util.safe_load(is_render_variables=True)) # validate cache self.assertEqual(expected, config_util.loaded_subliminals) @mock.patch('liminal.core.util.files_util.load') def test_get_config(self, find_config_files_mock): find_config_files_mock.return_value = { 'my_subliminal_test': {'type': 'sub'}, 'my_superliminal_test': {'type': 'super'}, } config_util = ConfigUtil('') self.assertEqual({'type': 'sub'}, config_util._ConfigUtil__get_config('my_subliminal_test')) self.assertEqual({'type': 'super'}, config_util._ConfigUtil__get_config('my_superliminal_test')) @mock.patch('liminal.core.util.files_util.load') def test_get_superliminal(self, find_config_files_mock): base = { 'executors': [ {'executor': 'default_k8s', 'type': 'kubernetes'}, {'executor': 'airflow_executor', 'type': 'airflow'}, ], 'name': 'base', 'pipeline_defaults': { 'after_tasks': [{'task': 'end', 'type': 'job_end'}], 'before_tasks': [{'task': 'start', 'type': 'job_start'}], 'description': 'add defaults parameters for all ' 'pipelines', }, 'service_defaults': {'description': 'add defaults parameters for all ' 'services'}, 'task_defaults': { 'description': 'add defaults parameters for all tasks ' 'separate by task type', 'job_end': {'executor': 'airflow_executor'}, 'job_start': {'executor': 'airflow_executor'}, 'python': {'executor': 'default_k8s'}, 'spark': 
{'executor': 'default_k8s'}, }, 'type': 'super', } subliminal = {'name': 'subliminal_test', 'type': 'sub'} find_config_files_mock.return_value = {'subliminal_test': subliminal} config_util = ConfigUtil('') self.assertEqual(base, config_util._ConfigUtil__get_superliminal(subliminal, False)) self.assertEqual({}, config_util._ConfigUtil__get_superliminal(base, False)) liminal = {'name': 'subliminal_test', 'type': 'sub', 'super': 'my_superliminal'} with self.assertRaises(FileNotFoundError): config_util._ConfigUtil__get_superliminal(liminal, False) @mock.patch('liminal.core.util.files_util.load') def test_merge_superliminals(self, find_config_files_mock): superliminal = { 'name': 'my_superliminal_test', 'type': 'super', 'super': 'super_superliminal', 'pipeline_defaults': { 'before_tasks': [ {'task': 'start-2', 'type': 'spark'}, ], 'after_tasks': [{'task': 'end-1', 'type': 'spark'}], }, 'task_defaults': {'task_def1': 'task_def1_value'}, } super_superliminal = { 'name': 'super_superliminal', 'type': 'super', 'pipeline_defaults': { 'before_tasks': [{'task': 'start-1', 'type': 'spark'}], 'after_tasks': [{'task': 'end-2', 'type': 'spark'}], }, } config_util = ConfigUtil('') find_config_files_mock.return_value = {'super_superliminal': super_superliminal, 'superliminal': superliminal} expected = { 'name': 'my_superliminal_test', 'pipeline_defaults': { 'after_tasks': [{'task': 'end-1', 'type': 'spark'}, {'task': 'end-2', 'type': 'spark'}], 'before_tasks': [{'task': 'start-1', 'type': 'spark'}, {'task': 'start-2', 'type': 'spark'}], }, 'super': 'super_superliminal', 'task_defaults': {'task_def1': 'task_def1_value'}, 'type': 'super', } self.assertEqual( expected, dict(config_util._ConfigUtil__merge_superliminals(superliminal, super_superliminal)) ) @mock.patch('liminal.core.util.files_util.load') @mock.patch.dict(os.environ, {'env': 'myenv', 'LIMINAL_STAND_ALONE_MODE': 'True'}) def test_safe_load_with_variables(self, find_config_files_mock): subliminal = { 'name': 
'my_subliminal_test', 'type': 'sub', 'super': 'my_superliminal_test', 'variables': { 'var': 'simple case', 'var-2': '-case', 'var_2': '_case', 'image': 'prod image', 'a': '{{env}}1', 'b': '{{a}}2', 'c': '{{a}}{{b}}2', }, 'pipelines': [ { 'name': 'mypipe1', 'param': '{{var}}', 'tasks': [ {'task': 'sub_tasks', 'type': 'dummy'}, ], }, { 'name': 'mypipe2', 'param': '{{var-2 }}', 'tasks': [ {'task': 'sub_tasks', 'type': 'dummy'}, ], }, ], 'pipeline_defaults': {'param1': '{{var-2}}'}, 'task_defaults': {'job_start': {'task_def1:': 'task_sub_def_value'}}, 'services': [ {'name': 'my_python_server', 'type': 'python_server', 'image': '{{image}}'}, {'name': 'my_python_server_for_stg', 'type': 'python_server', 'image': '{{default_image}}'}, ], } superliminal = { 'name': 'my_superliminal_test', 'type': 'super', 'variables': { 'var-2': 'override', 'var3': 'super_var', 'default_image': 'default_image_value', 'image': 'default_image_value', }, 'super': 'super_superliminal', 'pipeline_defaults': { 'param2': '{{pipe-var}}', 'param3': 'param3super_value', 'before_tasks': [ {'task': 'second_task', 'type': 'dummy'}, ], }, 'task_defaults': { 'pipeline': { 'path': '{{var-2}}', 'task_def1': 'task_def1_value', 'task_def2': { 'task_def2_1': 'task_def2_1_value', }, } }, } super_superliminal = { 'name': 'super_superliminal', 'type': 'super', 'variables': {'default_image': 'def_default_image_value'}, 'pipeline_defaults': { 'global_conf': '{{var3}}', 'param2': 'param2super_value', 'param3': 'param3hyper_value', 'param4': 'param4hyper_value', 'after_tasks': [ {'task': 'before_last_task', 'type': 'dummy'}, ], }, } expected = [ { 'executors': [ {'executor': 'default_k8s', 'type': 'kubernetes'}, {'executor': 'airflow_executor', 'type': 'airflow'}, ], 'name': 'my_subliminal_test', 'pipeline_defaults': {'param1': '-case'}, 'pipelines': [ { 'description': 'add defaults parameters for all pipelines', 'global_conf': 'super_var', 'name': 'mypipe1', 'param': 'simple case', 'param1': '-case', 'param2': 
'{{pipe-var}}', 'param3': 'param3super_value', 'param4': 'param4hyper_value', 'tasks': [ { 'executor': 'airflow_executor', 'task': 'start', 'task_def1:': 'task_sub_def_value', 'type': 'job_start', }, {'task': 'second_task', 'type': 'dummy'}, {'task': 'sub_tasks', 'type': 'dummy'}, {'task': 'before_last_task', 'type': 'dummy'}, {'executor': 'airflow_executor', 'task': 'end', 'type': 'job_end'}, ], }, { 'description': 'add defaults parameters for all pipelines', 'global_conf': 'super_var', 'name': 'mypipe2', 'param': '-case', 'param1': '-case', 'param2': '{{pipe-var}}', 'param3': 'param3super_value', 'param4': 'param4hyper_value', 'tasks': [ { 'executor': 'airflow_executor', 'task': 'start', 'task_def1:': 'task_sub_def_value', 'type': 'job_start', }, {'task': 'second_task', 'type': 'dummy'}, {'task': 'sub_tasks', 'type': 'dummy'}, {'task': 'before_last_task', 'type': 'dummy'}, {'executor': 'airflow_executor', 'task': 'end', 'type': 'job_end'}, ], }, ], 'service_defaults': {'description': 'add defaults parameters for all ' 'services'}, 'images': [], 'services': [ { 'description': 'add defaults parameters for all services', 'image': 'prod image', 'name': 'my_python_server', 'type': 'python_server', }, { 'description': 'add defaults parameters for all services', 'image': 'default_image_value', 'name': 'my_python_server_for_stg', 'type': 'python_server', }, ], 'super': 'my_superliminal_test', 'task_defaults': {'job_start': {'task_def1:': 'task_sub_def_value'}}, 'type': 'sub', 'variables': { 'a': 'myenv1', 'b': 'myenv12', 'c': 'myenv1myenv122', 'image': 'prod image', 'var': 'simple case', 'var-2': '-case', 'var_2': '_case', }, } ] find_config_files_mock.return_value = { 'my_subliminal_test': subliminal, 'my_superliminal_test': superliminal, 'super_superliminal': super_superliminal, } config_util = ConfigUtil('') self.assertEqual(expected, config_util.safe_load(is_render_variables=True)) # validate cache self.assertEqual(expected, config_util.loaded_subliminals) 
@mock.patch('os.path.exists') @mock.patch('liminal.core.environment.get_airflow_home_dir') @mock.patch('liminal.core.util.files_util.load') @mock.patch.dict(os.environ, {'LIMINAL_STAND_ALONE_MODE': 'True', 'POD_NAMESPACE': 'my_pod_ns'}) def test_liminal_config_snapshot(self, find_config_files_mock, get_airflow_dir_mock, path_exists_mock): subliminal = { 'name': 'my_subliminal_test', 'type': 'sub', 'variables': {'var': 1, 'var-2': True}, 'pipelines': [{'name': 'mypipe1', 'param': '{{var}}'}, {'name': 'mypipe2', 'param': '{{var-2 }}'}], } expected = { 'name': 'my_subliminal_test', 'type': 'sub', 'executors': [ {'executor': 'default_k8s', 'type': 'kubernetes'}, {'executor': 'airflow_executor', 'type': 'airflow'}, ], 'service_defaults': {'description': 'add defaults parameters for all services'}, 'task_defaults': { 'description': 'add defaults parameters for all tasks separate by task type', 'python': {'executor': 'default_k8s'}, 'spark': {'executor': 'default_k8s'}, 'job_end': {'executor': 'airflow_executor'}, 'job_start': {'executor': 'airflow_executor'}, }, 'pipeline_defaults': { 'description': 'add defaults parameters for all pipelines', 'before_tasks': [{'task': 'start', 'type': 'job_start'}], 'after_tasks': [{'task': 'end', 'type': 'job_end'}], }, 'variables': {'var': 1, 'var-2': True}, 'pipelines': [ { 'name': 'mypipe1', 'param': '1', 'description': 'add defaults parameters for all pipelines', 'tasks': [ {'task': 'start', 'type': 'job_start', 'executor': 'airflow_executor'}, {'task': 'end', 'type': 'job_end', 'executor': 'airflow_executor'}, ], }, { 'name': 'mypipe2', 'param': 'True', 'description': 'add defaults parameters for all pipelines', 'tasks': [ {'task': 'start', 'type': 'job_start', 'executor': 'airflow_executor'}, {'task': 'end', 'type': 'job_end', 'executor': 'airflow_executor'}, ], }, ], 'images': [], 'services': [], } find_config_files_mock.return_value = {'my_subliminal_test': subliminal} get_airflow_dir_mock.return_value = '/tmp' 
path_exists_mock.return_value = True with mock.patch('builtins.open', mock.mock_open()) as m: with mock.patch('yaml.dump') as ydm: config_util = ConfigUtil('') config_util.safe_load(is_render_variables=True) config_util.snapshot_final_liminal_configs() m.assert_called_once_with(os.path.join('/tmp', '../liminal_config_files/my_subliminal_test.yml'), 'w') ydm.assert_called_once_with(expected, m.return_value, default_flow_style=False) @mock.patch('liminal.core.util.files_util.load') def test_soft_merge_load(self, find_config_files_mock): subliminal = {'name': 'my_name', 'type': 'sub', 'super': 'my_super'} find_config_files_mock.return_value = {'my_subliminal_test': subliminal} config_util = ConfigUtil('') self.assertEqual([subliminal], config_util.safe_load(is_render_variables=True, soft_merge=True)) def test_non_soft_merge_load(self): subliminal = {'name': 'my_name', 'type': 'sub', 'super': 'my_super'} config_util = ConfigUtil('') self.assertRaises(FileNotFoundError, config_util._ConfigUtil__get_superliminal, subliminal, False) ================================================ FILE: tests/liminal/core/util/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
================================================ FILE: tests/liminal/core/util/test_dict_utils.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. import os from unittest import TestCase, mock from liminal.core.util import dict_util class TestDictUtils(TestCase): def setUp(self) -> None: self.dict1 = {"env": "env1", "env_dict": {"env3": "env3"}, "env4": "env4"} self.dict2 = {"env": "env1", "env_dict": {"env2": "env2"}} def test_merge_dicts(self): expected = {"env": "env1", "env4": "env4", "env_dict": {"env2": "env2"}} self.assertEqual(expected, dict_util.merge_dicts(self.dict1, self.dict2)) def test_recursive_merge_dicts(self): expected = {"env": "env1", "env4": "env4", "env_dict": {"env2": "env2", "env3": "env3"}} self.assertEqual(expected, dict_util.merge_dicts(self.dict1, self.dict2, True)) def test_merge_with_empty(self): self.assertEqual(self.dict2, dict_util.merge_dicts({}, self.dict2, True)) self.assertEqual(self.dict2, dict_util.merge_dicts({}, self.dict2)) self.assertEqual(self.dict2, dict_util.merge_dicts(self.dict2, {}, True)) self.assertEqual(self.dict2, dict_util.merge_dicts(self.dict2, {})) def test_replace_variables_simple_case(self): dct = { "env": "{{env_var}}", "env_dict": {"env3": 
"{{ var1 }}"}, "env4": "{{var2 }}", "env5": "{{{var2}}", "env6": "{{var3}}", } variables = {"env_var": "env value", "var1": "value1", "var2": "value2"} expected = { "env": "env value", "env_dict": {"env3": "value1"}, "env4": "value2", "env5": "{value2", "env6": "{{var3}}", } self.assertEqual(expected, dict_util.replace_placeholders(dct, variables)) def test_replace_variables_empty_var(self): dct = { "env": "{{env_var}}", "env_dict": {"env3": "{{ var1 }}"}, "env4": "{{var2 }}", "env5": "{{{var2}}", "env6": "{{var3}}", } self.assertEqual(dct, dict_util.replace_placeholders(dct, {})) @mock.patch.dict(os.environ, {"LIMINAL_STAND_ALONE_MODE": "False"}) @mock.patch('airflow.models.Variable.get') def test_replace_variables_flat_replace(self, airflow_variable_mock): def airflow_variable_values(key, default_var): return 'liminal playground' if key == 'playground' else default_var airflow_variable_mock.side_effect = airflow_variable_values dct = { "query": "select * from my_table " "where event_type = {{event_type}} and region = {{region}}", "env": "{{prod}}, {{stg}}, {{playground}}", "optional": "{{optionals}}", } variables = { "region": "us_east_1", "event_type": "subscription", "prod": "liminal production", "stg": "liminal staging", } expected = { 'query': 'select * from my_table ' 'where event_type = subscription and region = us_east_1', "env": "liminal production, liminal staging, liminal playground", "optional": "{{optionals}}", } self.assertEqual(expected, dict_util.replace_placeholders(dct, variables)) @mock.patch.dict(os.environ, {"LIMINAL_STAND_ALONE_MODE": "False"}) @mock.patch('airflow.models.Variable.get') def test_replace_variables_with_nested_list(self, airflow_variable_mock): def airflow_variable_values(key, default_var): return 'liminal playground' if key == 'playground' else default_var airflow_variable_mock.side_effect = airflow_variable_values dct = { "query": "select * from my_table " "where event_type = {{event_type}} and region = {{region}}", "env": 
['{{prod}}', '{{stg}}', '{{playground}}'], "tasks": [{'id': 'id1', 'image': '{{image}}'}], "optional": "{{optionals}}", } variables = { "region": "us_east_1", "event_type": "subscription", "prod": "liminal production", "stg": "liminal staging", "image": "my_image_name", } expected = { 'env': ['liminal production', 'liminal staging', 'liminal playground'], 'optional': '{{optionals}}', 'query': 'select * from my_table where event_type = subscription and region = ' 'us_east_1', 'tasks': [{'id': 'id1', 'image': 'my_image_name'}], } self.assertEqual(expected, dict_util.replace_placeholders(dct, variables)) @mock.patch.dict(os.environ, {"table": "my_table", "LIMINAL_STAND_ALONE_MODE": "True"}) def test_replace_variables_from_env(self): dct = { "query": "select * from my_table " "where event_type = {{event_type}} and region = {{region}} from {{table}}" } variables = {} expected = { 'query': 'select * from my_table ' 'where event_type = {{event_type}} ' 'and region = {{region}} from my_table' } self.assertEqual(expected, dict_util.replace_placeholders(dct, variables)) @mock.patch.dict(os.environ, {"table": "my_table", "LIMINAL_STAND_ALONE_MODE": "True"}) def test_replace_variables_from_variable_and_not_env(self): dct = { "query": "select * from my_table " "where event_type = {{event_type}} and region = {{region}} from {{table}}" } variables = {"table": "my_variable_table"} expected = { 'query': 'select * from my_table ' 'where event_type = {{event_type}} ' 'and region = {{region}} from my_variable_table' } self.assertEqual(expected, dict_util.replace_placeholders(dct, variables)) @mock.patch.dict(os.environ, {"table": "my_table", "LIMINAL_STAND_ALONE_MODE": "False"}) @mock.patch('airflow.models.Variable.get') def test_replace_variables_from_airflow_and_not_enc(self, airflow_variable_mock): def airflow_variable_values(key, default_var): return 'my_airflow_table' if key == 'table' else default_var airflow_variable_mock.side_effect = airflow_variable_values dct = { "query": 
"select * from my_table " "where event_type = {{event_type}} and region = {{region}} from {{table}}" } variables = {} expected = { 'query': 'select * from my_table ' 'where event_type = {{event_type}} ' 'and region = {{region}} from my_airflow_table' } self.assertEqual(expected, dict_util.replace_placeholders(dct, variables)) ================================================ FILE: tests/liminal/kubernetes/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: tests/liminal/kubernetes/test_volume_util.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. import sys import unittest from kubernetes import config from liminal.kubernetes import volume_util try: config.load_kube_config() except Exception: msg = "Kubernetes is not running\n" sys.stdout.write(f"INFO: {msg}") class TestKubernetesVolume(unittest.TestCase): def setUp(self) -> None: self.config = { 'volumes': [ { 'volume': 'gettingstartedvol-test', 'claim_name': 'gettingstartedvol-test-pvc', 'local': {'path': '.'}, } ] } def test_volume_config(self): volumes_config = volume_util.get_volume_configs(self.config, ".") self.assertEqual(str(self.config['volumes'][0]), str(volumes_config[0])) def test_create_volume(self): self._delete_volumes() self._create_volumes() matching_volumes = volume_util._list_persistent_volumes(self.config['volumes'][0]['volume']) self.assertTrue( self.config['volumes'][0]['volume'] in matching_volumes[0]['metadata']['name'], self.config['volumes'][0]['claim_name'] in matching_volumes[0]['spec']['claim_ref']['name'], ) def test_delete_volume(self): self._create_volumes() self._delete_volumes() matching_volumes = volume_util._list_persistent_volumes(self.config['volumes'][0]['volume']) self.assertEqual([], matching_volumes) def _create_volumes(self): volume_util.create_local_volumes(self.config, ".") def _delete_volumes(self): volume_util.delete_local_volumes(self.config, ".") ================================================ FILE: tests/runners/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. 
See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: tests/runners/airflow/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: tests/runners/airflow/build/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. 
The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: tests/runners/airflow/build/http/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: tests/runners/airflow/build/http/python/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. 
The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: tests/runners/airflow/build/http/python/test_python_server_image_builder.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
import json import logging import os import threading import time import unittest import urllib.request from unittest import TestCase import docker from liminal.build.image.python_server.python_server import PythonServerImageBuilder from liminal.build.python import PythonImageVersions IMAGE_NAME = 'liminal_server_image' class TestPythonServer(TestCase): def setUp(self) -> None: super().setUp() self.docker_client = docker.from_env() self.config = self.__create_conf() self.__remove_containers() def tearDown(self) -> None: self.__remove_containers() self.docker_client.close() def test_build_python_server(self): versions = list(PythonImageVersions().supported_versions) for version in versions: build_out = self.__test_build_python_server(python_version=version) self.assertTrue('RUN pip install -r requirements.txt' in build_out, 'Incorrect pip command') def test_build_python_server_with_pip_conf(self): build_out = self.__test_build_python_server(use_pip_conf=True) self.assertTrue( 'RUN --mount=type=secret,id=pip_config,dst=/etc/pip.conf pip install' in build_out, 'Incorrect pip command', ) def __test_build_python_server(self, use_pip_conf=False, python_version=None): base_path = os.path.join(os.path.dirname(__file__), '../../../liminal') config = self.__create_conf() if use_pip_conf: config['pip_conf'] = os.path.join(base_path, 'pip.conf') if python_version: config['python_version'] = python_version builder = PythonServerImageBuilder( config=config, base_path=base_path, relative_source_path='myserver', tag=IMAGE_NAME ) build_out = str(builder.build()) thread = threading.Thread(target=self.__run_container) thread.start() time.sleep(20) logging.info('Sending request to server') json_string = '{"key1": "val1", "key2": "val2"}' encoding = 'ascii' server_response = str( urllib.request.urlopen('http://localhost:9294/myendpoint1', data=json_string.encode(encoding)) .read() .decode(encoding) ) logging.info(f'Response from server: {server_response}') self.assertEqual(f'Input was: 
{json.loads(json_string)}', server_response) server_favicon_response = urllib.request.urlopen('http://localhost:9294/favicon.ico') self.assertEqual(204, server_favicon_response.status) self.__remove_containers() return build_out def __remove_containers(self): logging.info(f'Stopping containers with image: {IMAGE_NAME}') matching_containers = self.__get_docker_containers() for container in matching_containers: container_id = container.id logging.info(f'Stopping container {container_id}') self.docker_client.api.stop(container_id) logging.info(f'Removing container {container_id}') self.docker_client.api.remove_container(container_id) while len(matching_containers) > 0: matching_containers = self.__get_docker_containers() def __get_docker_containers(self): return self.docker_client.containers.list(filters={'ancestor': IMAGE_NAME}) def __run_container(self): try: logging.info(f'Running container for image: {IMAGE_NAME}') self.docker_client.containers.run(IMAGE_NAME, ports={'80/tcp': 9294}, detach=True) except Exception as err: logging.exception(err) pass @staticmethod def __create_conf(): return { 'image': IMAGE_NAME, 'cmd': 'foo bar', 'source': 'baz', 'input_type': 'my_input_type', 'input_path': 'my_input', 'output_path': '/my_output.json', 'no_cache': True, 'endpoints': [{'endpoint': '/myendpoint1', 'module': 'my_server', 'function': 'myendpoint1func'}], } if __name__ == '__main__': unittest.main() ================================================ FILE: tests/runners/airflow/build/python/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: tests/runners/airflow/build/python/test_python_image_builder.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
import logging import os import shutil import tempfile from unittest import TestCase import docker from liminal.build.image.python.python import PythonImageBuilder from liminal.build.python import PythonImageVersions class TestPythonImageBuilder(TestCase): __IMAGE_NAME = 'my_python_task_img' def setUp(self) -> None: super().setUp() os.environ['TMPDIR'] = '/tmp' self.temp_dir = self.__temp_dir() self.temp_airflow_dir = self.__temp_dir() def tearDown(self) -> None: super().tearDown() self.__remove_dir(self.temp_dir) self.__remove_dir(self.temp_airflow_dir) def test_build(self): for python_version in [None, PythonImageVersions().supported_versions[0]]: build_out = self.__test_build(python_version=python_version) self.assertTrue('RUN pip install -r requirements.txt' in build_out, 'Incorrect pip command') self.__test_image() def test_build_with_pip_conf(self): build_out = self.__test_build(use_pip_conf=True) self.assertTrue( 'RUN --mount=type=secret,id=pip_config,dst=/etc/pip.conf pip install' in build_out, 'Incorrect pip command', ) self.__test_image() def test_with_unsupported_python_version(self): with self.assertRaises(ValueError): self.__test_build(python_version='3.5.2') def __test_build(self, use_pip_conf=False, python_version=None): config = self.__create_conf('my_task') base_path = os.path.join(os.path.dirname(__file__), '../../liminal') if use_pip_conf: config['pip_conf'] = os.path.join(base_path, 'pip.conf') if python_version: config['python_version'] = python_version builder = PythonImageBuilder( config=config, base_path=base_path, relative_source_path='write_inputs', tag=self.__IMAGE_NAME ) build_out = str(builder.build()) return build_out def __test_image(self): docker_client = docker.from_env() docker_client.images.get(self.__IMAGE_NAME) cmds = ['/bin/bash', '-c', 'python write_inputs.py'] container_log = docker_client.containers.run( self.__IMAGE_NAME, cmds, volumes={self.temp_dir: {'bind': '/mnt/vol1', 'mode': 'rw'}}, environment={'NUM_FILES': 10, 
'NUM_SPLITS': 3}, ) docker_client.close() logging.info(container_log) self.assertEqual( "b'" "Writing input file /mnt/vol1/inputs/0/input0.json\\n" "Writing input file /mnt/vol1/inputs/1/input1.json\\n" "Writing input file /mnt/vol1/inputs/2/input2.json\\n" "Writing input file /mnt/vol1/inputs/0/input3.json\\n" "Writing input file /mnt/vol1/inputs/1/input4.json\\n" "Writing input file /mnt/vol1/inputs/2/input5.json\\n" "Writing input file /mnt/vol1/inputs/0/input6.json\\n" "Writing input file /mnt/vol1/inputs/1/input7.json\\n" "Writing input file /mnt/vol1/inputs/2/input8.json\\n" "Writing input file /mnt/vol1/inputs/0/input9.json\\n" "'", str(container_log), ) def __create_conf(self, task_id): return { 'task': task_id, 'cmd': 'foo bar', 'image': self.__IMAGE_NAME, 'source': 'baz', 'no_cache': True, } @staticmethod def __temp_dir(): temp_dir = tempfile.mkdtemp() return temp_dir @staticmethod def __remove_dir(temp_dir): shutil.rmtree(temp_dir, ignore_errors=True) ================================================ FILE: tests/runners/airflow/build/spark/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
================================================ FILE: tests/runners/airflow/build/spark/test_spark_image_builder.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. import logging import os import shutil import tempfile from unittest import TestCase import docker from liminal.build.image.spark.spark import SparkImageBuilder class TestSparkImageBuilder(TestCase): __IMAGE_NAME = 'my_spark_image' def setUp(self) -> None: super().setUp() os.environ['TMPDIR'] = '/tmp' self.temp_dir = self.__temp_dir() self.temp_airflow_dir = self.__temp_dir() def tearDown(self) -> None: super().tearDown() self.__remove_dir(self.temp_dir) self.__remove_dir(self.temp_airflow_dir) def test_build(self): build_out = self.__test_build() self.assertTrue('RUN pip install -r requirements.txt' in build_out, 'Incorrect pip command') self.__test_image() def __test_build(self): config = self.__create_conf('my_task') base_path = os.path.join(os.path.dirname(__file__), '../../../apps/test_spark_app') builder = SparkImageBuilder( config=config, base_path=base_path, relative_source_path='wordcount', tag=self.__IMAGE_NAME ) build_out = str(builder.build()) return build_out def __test_image(self): docker_client = docker.from_env() 
docker_client.images.get(self.__IMAGE_NAME) cmds = ['spark-submit', 'wordcount.py', 'words.txt', '/mnt/vol1/outputs'] container_log = docker_client.containers.run( self.__IMAGE_NAME, cmds, volumes={self.temp_dir: {'bind': '/mnt/vol1', 'mode': 'rw'}} ) docker_client.close() logging.info(container_log) self.assertEqual( "b'\\n" "my: 1\\n" "first: 1\\n" "liminal: 1\\n" "spark: 1\\n" "task: 1\\n" "writing the results to /mnt/vol1/outputs\\n'", str(container_log), ) def __create_conf(self, task_id): return { 'task': task_id, 'cmd': 'foo bar', 'image': self.__IMAGE_NAME, 'source': 'baz', 'no_cache': True, } @staticmethod def __temp_dir(): temp_dir = tempfile.mkdtemp() return temp_dir @staticmethod def __remove_dir(temp_dir): shutil.rmtree(temp_dir, ignore_errors=True) ================================================ FILE: tests/runners/airflow/build/test_liminal_apps_builder.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
import os import unittest from unittest import TestCase import docker from liminal.build import liminal_apps_builder class TestLiminalAppsBuilder(TestCase): __image_names = ['my_python_task_img', 'my_parallelized_python_task_img'] def setUp(self) -> None: self.docker_client = docker.client.from_env() self.__remove_images() def tearDown(self) -> None: self.__remove_images() self.docker_client.close() def __remove_images(self): for image_name in self.__image_names: if len(self.docker_client.images.list(image_name)) > 0: self.docker_client.images.remove(image=image_name, force=True) def test_build_liminal(self): liminal_apps_builder.build_liminal_apps(os.path.join(os.path.dirname(__file__), '../liminal')) for image in self.__image_names: self.docker_client.images.get(image) if __name__ == '__main__': unittest.main() ================================================ FILE: tests/runners/airflow/compiler/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: tests/runners/airflow/dag/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. 
See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: tests/runners/airflow/dag/test_liminal_dags.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
import os import unittest from unittest import TestCase, mock from liminal.core.config.config import ConfigUtil from liminal.runners.airflow.dag import liminal_register_dags from liminal.runners.airflow.operators.job_status_operator import ( JobEndOperator, JobStartOperator, ) class Test(TestCase): def test_register_dags(self): dags = self.get_register_dags() self.assertEqual(len(dags), 1) test_pipeline = dags[0][1] # TODO: elaborate tests to assert all dags have correct tasks self.assertEqual(test_pipeline.dag_id, 'my_pipeline') def test_default_start_task(self): dags = self.get_register_dags()[0] task_dict = dags[1].task_dict self.assertIsInstance(task_dict['start'], JobStartOperator) def test_default_end_task(self): dags = self.get_register_dags()[0] task_dict = dags[1].task_dict self.assertIsInstance(task_dict['end'], JobEndOperator) def test_default_args(self): dag = self.get_register_dags()[0] default_args = dag[1].default_args keys = default_args.keys() self.assertIn('default_arg_loaded', keys) self.assertIn('default_array_loaded', keys) self.assertIn('default_object_loaded', keys) @staticmethod @mock.patch.object(ConfigUtil, "snapshot_final_liminal_configs") def get_register_dags(mock_snapshot_final_liminal_configs): mock_snapshot_final_liminal_configs.side_effect = None base_path = os.path.join(os.path.dirname(__file__), '../../apps/test_app') return liminal_register_dags.register_dags(base_path) if __name__ == '__main__': unittest.main() ================================================ FILE: tests/runners/airflow/executors/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: tests/runners/airflow/executors/test_airflow_executor.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
from unittest import TestCase from unittest.mock import MagicMock from liminal.runners.airflow.executors import airflow from liminal.runners.airflow.tasks.airflow import AirflowTask class TestAirflowExecutorTask(TestCase): """ Test AirflowExecutor task """ def test_apply_task_to_dag(self): task0 = MagicMock(spec=AirflowTask) task0.apply_task_to_dag = MagicMock() airflow.AirflowExecutor("executor-task", {}, {}).apply_task_to_dag(task=task0) task0.apply_task_to_dag.assert_called_once() ================================================ FILE: tests/runners/airflow/executors/test_emr.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
from copy import deepcopy
from unittest import TestCase, mock
from unittest.mock import MagicMock

import boto3
from airflow.contrib.hooks.emr_hook import EmrHook
from airflow.contrib.operators.emr_add_steps_operator import EmrAddStepsOperator
from airflow.contrib.sensors.emr_step_sensor import EmrStepSensor
from moto import mock_emr

from liminal.runners.airflow import DummyDag
from liminal.runners.airflow.executors.emr import EMRExecutor
from liminal.runners.airflow.operators.operator_with_variable_resolving import (
    OperatorWithVariableResolving,
)
from liminal.runners.airflow.tasks import hadoop
from tests.util import dag_test_utils


# The whole class is wrapped in moto's mock_emr decorator.
# NOTE(review): presumably this keeps the boto3 EMR calls below off real AWS - confirm against moto docs.
@mock_emr
class TestEMRExecutorTask(TestCase):
    """
    Test EMRExecutor task
    """

    def setUp(self) -> None:
        """Create an EMR job flow, a test DAG, a mocked HadoopTask and the EMRExecutor under test."""
        self.run_job_flow_args = dict(
            Instances={
                "InstanceCount": 1,
                "KeepJobFlowAliveWhenNoSteps": True,
                "MasterInstanceType": "c3.medium",
                "Placement": {"AvailabilityZone": "us-east-1"},
                "SlaveInstanceType": "c3.xlarge",
            },
            JobFlowRole="EMR_EC2_DefaultRole",
            LogUri="s3://liminal/log",
            Name="test-emr-cluster",
            ServiceRole="EMR_DefaultRole",
            VisibleToAllUsers=True,
        )

        self.client = boto3.client("emr", region_name="us-east-1")

        # run_job_flow receives a deep copy, so self.run_job_flow_args stays untouched.
        args = deepcopy(self.run_job_flow_args)

        self.cluster_id = self.client.run_job_flow(**args)["JobFlowId"]

        self.dag = dag_test_utils.create_dag()
        self.dag.context = DummyDag(dag_id=self.dag.dag_id, task_id="").context
        # Same value as the Name of the job flow created above.
        self.executor_name = 'test-emr-cluster'
        executor_config = {
            'executor': self.executor_name,
            'cluster_name': self.executor_name,
            'aws_conn_id': 'us-east-1',
            'type': 'emr',
            'properties': {'ActionOnFailure': 'CONTINUE'},
        }

        # HadoopTask stand-in; only the attributes assigned below are given explicit values.
        self.hadoop_task = MagicMock(spec=hadoop.HadoopTask)
        self.hadoop_task.get_runnable_command.return_value = ['spark-submit', 'test', 'params', '--param']
        self.hadoop_task.task_id = 'spark-task'
        self.hadoop_task.dag = self.dag
        self.hadoop_task.trigger_rule = 'all_done'
        self.hadoop_task.parent = None
        self.hadoop_task.task_config = {}
        self.hadoop_task.variables = {}

        self.emr = EMRExecutor(self.executor_name, liminal_config={}, executor_config=executor_config)

    def test_apply_task_to_dag(self):
        """Applying the task adds two DAG tasks: one wrapping EmrAddStepsOperator, one wrapping EmrStepSensor."""
        self.emr.apply_task_to_dag(task=self.hadoop_task)

        self.assertEqual(len(self.dag.tasks), 2)

        self.assertIsInstance(self.dag.tasks[0], OperatorWithVariableResolving)
        self.assertIsInstance(self.dag.tasks[0].operator_delegate, EmrAddStepsOperator)
        self.assertIsInstance(self.dag.tasks[1], OperatorWithVariableResolving)
        self.assertIsInstance(self.dag.tasks[1].operator_delegate, EmrStepSensor)

    @mock.patch.object(EmrHook, 'get_conn')
    def test_add_step(self, mock_emr_hook_get_conn):
        """Execute the add-steps task and check the step description returned by describe_step."""
        aws_region = 'us-east-1'

        # EmrHook.get_conn is patched to hand back a boto3 EMR client for the same region.
        mock_emr_hook_get_conn.return_value = boto3.client('emr', region_name=aws_region)

        self.emr.apply_task_to_dag(task=self.hadoop_task)

        emr_add_step_task = self.dag.tasks[0]

        self.assertIsInstance(emr_add_step_task, OperatorWithVariableResolving)
        self.assertIsInstance(emr_add_step_task.operator_delegate, EmrAddStepsOperator)

        emr_add_step_task.render_template_fields({})
        # The first element of execute()'s result is used as the step id.
        step_id = emr_add_step_task.execute(self.dag.context)[0]

        desc_step = self.client.describe_step(ClusterId=self.cluster_id, StepId=step_id)['Step']

        # Args must match the command returned by the mocked get_runnable_command.
        self.assertEqual(
            desc_step['Config'],
            {'Jar': 'command-runner.jar', 'Properties': {}, 'Args': ['spark-submit', 'test', 'params', '--param']},
        )

        self.assertEqual(desc_step['Name'], 'spark-task')

        # 'CONTINUE' comes from the 'properties' entry of executor_config in setUp.
        self.assertEqual(desc_step['ActionOnFailure'], 'CONTINUE')

    def test_watch_step(self):
        """The second DAG task wraps an EmrStepSensor."""
        self.emr.apply_task_to_dag(task=self.hadoop_task, parents=[])

        emr_watch_step_task = self.dag.tasks[1]

        self.assertIsInstance(emr_watch_step_task, OperatorWithVariableResolving)
        self.assertIsInstance(emr_watch_step_task.operator_delegate, EmrStepSensor)

        # todo - elaborate tests
================================================ FILE: tests/runners/airflow/liminal/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements.
See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: tests/runners/airflow/liminal/liminal.yml ================================================ # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
---
# Test application definition: volumes, secrets, images, one pipeline, one service.
name: MyPipeline

volumes:
  - volume: myvol1
    local:
      path: /tmp/liminal_tests

secrets:
  - secret: aws
    local_path_file: '~/.aws/credentials'

images:
  - image: my_python_task_img
    type: python
    source: write_inputs
    no_cache: true
  - image: my_parallelized_python_task_img
    type: python
    source: write_outputs
    no_cache: true
  - image: my_server_image
    type: python_server
    source: myserver
    no_cache: true
    endpoints:
      - endpoint: /myendpoint1
        module: my_server
        function: myendpoint1func

pipelines:
  - pipeline: my_pipeline
    owner: Bosco Albert Baracus
    # Left unquoted on purpose so the parsed type of this value does not change.
    # NOTE(review): YAML 1.1 loaders read this as a date - confirm the consumer expects that.
    start_date: 1970-01-01
    timeout_minutes: 45
    schedule: '0 * 1 * *'
    default_arg_loaded: check
    default_array_loaded:
      - 2
      - 3
      - 4
    default_object_loaded:
      key1: val1
      key2: val2
    metrics:
      namespace: TestNamespace
      backends: []
    tasks:
      - task: my_python_task
        type: python
        description: static input task
        image: my_python_task_img
        env_vars:
          NUM_FILES: 10
          NUM_SPLITS: 3
          AWS_CONFIG_FILE: '/mnt/credentials'
          AWS_PROFILE: 'dev'
        secrets:
          - secret: aws
            remote_path: '/mnt'
        mounts:
          - mount: mymount
            volume: myvol1
            path: /mnt/vol1
        cmd: python -u write_inputs.py
      - task: my_parallelized_python_task
        type: python
        description: parallelized python task
        image: my_parallelized_python_task_img
        env_vars:
          FOO: BAR
        executors: 3
        mounts:
          - mount: mymount
            volume: myvol1
            path: /mnt/vol1
        cmd: python -u write_inputs.py

services:
  - service: my_python_server
    description: my python server
    image: my_server_image
================================================ FILE: tests/runners/airflow/liminal/myserver/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License.
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: tests/runners/airflow/liminal/myserver/my_server.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. import json def myendpoint1func(input_json): input_dict = json.loads(input_json) if input_json else {} return f'Input was: {input_dict}' ================================================ FILE: tests/runners/airflow/liminal/pip.conf ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: tests/runners/airflow/liminal/requirements.txt ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. pillow ================================================ FILE: tests/runners/airflow/liminal/write_inputs/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: tests/runners/airflow/liminal/write_inputs/write_inputs.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
import json import os inputs_dir = '/mnt/vol1/inputs/' num_files = int(os.environ['NUM_FILES']) num_splits = int(os.environ['NUM_SPLITS']) # create input files - split by round robin for i in range(0, num_files): split_id = i % num_splits split_dir = os.path.join(inputs_dir, str(split_id)) if not os.path.exists(split_dir): os.makedirs(split_dir) filename = os.path.join(split_dir, f'input{i}.json') with open(filename, 'w') as f: print(f'Writing input file {filename}') f.write(json.dumps({'mykey': f'myval{i}'})) ================================================ FILE: tests/runners/airflow/liminal/write_outputs/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: tests/runners/airflow/liminal/write_outputs/write_outputs.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. import json import logging import os # Needs to be fixed in the following Jira: # https://issues.apache.org/jira/browse/LIMINAL-76 inputs_dir = f'/mnt/vol1/inputs/' outputs_dir = '/mnt/vol1/outputs/' if not os.path.exists(outputs_dir): os.makedirs(outputs_dir) for directory in os.listdir(inputs_dir): logging.info(f'Running write_outputs for split id {directory}') inputs_dir_file = os.path.join(inputs_dir, directory) for filename in os.listdir(inputs_dir_file): with open(os.path.join(inputs_dir_file, filename)) as infile, open( os.path.join(outputs_dir, filename.replace('input', 'output').replace('.json', '.txt')), 'w' ) as outfile: logging.info(f'Writing output file: {outfile.name}') data = json.loads(infile.read()) outfile.write(data['mykey']) ================================================ FILE: tests/runners/airflow/operators/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. 
See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: tests/runners/airflow/operators/test_operator_with_variable_resolving.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
import os from unittest import TestCase from unittest.mock import MagicMock, patch from airflow.models import BaseOperator from liminal.runners.airflow.operators.operator_with_variable_resolving import ( OperatorWithVariableResolving, ) class TestKubernetesPodOperatorWithAutoImage(TestCase): @patch.dict(os.environ, {'env': 'myenv', 'LIMINAL_STAND_ALONE_MODE': 'True', 'nested': 'value1'}) def test_value_from_environment_and_liminal_config(self): variables = {'my-image': 'my-image', 'something-value1': 'stg'} operator = TestOperator(task_id='abc', field='{{env}}', expected='{{my-image}}') ret_val = OperatorWithVariableResolving( dag=None, operator=operator, task_id='my_task', task_config={}, variables=variables ).execute({}) self.assertEqual(operator.field, 'myenv') self.assertEqual(ret_val, 'my-image') @patch.dict(os.environ, {'env': 'myenv', 'LIMINAL_STAND_ALONE_MODE': 'True', 'nested': 'value1'}) def test_value_from_task_variables(self): variables = {'my-image': 'my-image', 'my_dict_var': {'env': 'myenv2'}} operator = TestOperator(task_id='abc', field='{{env}}', expected='{{my-image}}') ret_val = OperatorWithVariableResolving( dag=None, operator=operator, task_id='my_task', task_config={'variables': 'my_dict_var'}, variables=variables, ).execute({}) self.assertEqual(operator.field, 'myenv2') self.assertEqual(ret_val, 'my-image') @patch.dict(os.environ, {'env': 'myenv', 'LIMINAL_STAND_ALONE_MODE': 'True', 'nested': 'value1'}) def test_value_from_inline_task_variables(self): variables = {'my-image': 'my-image'} operator = TestOperator(task_id='abc', field='{{env}}', expected='{{my-image}}') ret_val = OperatorWithVariableResolving( dag=None, operator=operator, task_id='my_task', task_config={'variables': {'env': 'myenv2'}}, variables=variables, ).execute({}) self.assertEqual(operator.field, 'myenv2') self.assertEqual(ret_val, 'my-image') @patch.dict(os.environ, {'env': 'staging', 'LIMINAL_STAND_ALONE_MODE': 'True'}) def test_nested_variables(self): variables = { 
'nested-the-s3-location': 'good', 'my-image': 'my-image', 'something-value1': 'stg', 'env_val-staging': 'the-s3-location', 'nested': 'value1', } operator = TestOperator( task_id='abc', field='something-{{nested-{{env_val-{{env}}}}}}', expected='{{my-image}}' ) OperatorWithVariableResolving( dag=None, operator=operator, task_id='my_task', task_config={}, variables=variables ).execute({}) self.assertEqual(operator.field, 'something-good') @patch.dict(os.environ, {'env': 'staging', 'LIMINAL_STAND_ALONE_MODE': 'True'}) def test_dag_run_conf_parameters(self): dag_run = MagicMock() dag_run.conf = { 'nested-the-s3-location': 'good', 'my-image': 'my-image', 'something-value1': 'stg', 'env_val-staging': 'the-s3-location', 'nested': 'value1', } operator = TestOperator( task_id='abc', field='something-{{nested-{{env_val-{{env}}}}}}', expected='{{my-image}}' ) OperatorWithVariableResolving( dag=None, operator=operator, task_id='my_task', task_config={}, variables={} ).execute({'dag_run': dag_run}) self.assertEqual(operator.field, 'something-good') @patch.dict(os.environ, {'env': 'staging', 'LIMINAL_STAND_ALONE_MODE': 'True'}) def test_duplicated_params(self): dag_run = MagicMock() dag_run.conf = { 'nested-the-s3-location': 'good', 'my-image': 'my-image', 'something-value1': 'stg', 'env_val-staging': 'the-s3-location', 'nested': 'value1', } operator = TestOperator( task_id='abc', field='something-{{nested-{{env_val-{{env}}}}}}-' '{{nested-{{env_val-{{env}}}}}}', expected='{{my-image}}', ) OperatorWithVariableResolving( dag=None, operator=operator, task_id='my_task', task_config={}, variables={} ).execute({'dag_run': dag_run}) self.assertEqual(operator.field, 'something-good-good') operator = TestOperator(task_id='abc', field='something-{{env}} {{env}}', expected='{{my-image}}') OperatorWithVariableResolving( dag=None, operator=operator, task_id='my_task', task_config={}, variables={} ).execute({'dag_run': dag_run}) self.assertEqual(operator.field, 'something-staging staging') 
dag_run.conf = {'var1': '{{var2}}', 'var2': 'bla', 'my-image': 'my-image'} operator = TestOperator(task_id='abc', field='{{ var1 }}', expected='{{my-image}}') OperatorWithVariableResolving( dag=None, operator=operator, task_id='my_task', task_config={}, variables={} ).execute({'dag_run': dag_run}) self.assertEqual(operator.field, 'bla') @patch.dict(os.environ, {'env': 'staging', 'LIMINAL_STAND_ALONE_MODE': 'True'}) def test_non_existing_param(self): dag_run = MagicMock() dag_run.conf = {'var1': '{{var2}}', 'var2': 'bla'} operator = TestOperator(task_id='abc', field='{{myimage}}', expected='{{myimage}}') OperatorWithVariableResolving( dag=None, operator=operator, task_id='my_task', task_config={}, variables={} ).execute({'dag_run': dag_run}) self.assertEqual(operator.field, '') class BaseTestOperator(BaseOperator): def execute(self, context): raise NotImplementedError() def __init__(self, field, expected, *args, **kwargs): self.field = field self.expected = expected super().__init__(*args, **kwargs) class TestOperator(BaseTestOperator): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) def execute(self, context): return self.expected ================================================ FILE: tests/runners/airflow/tasks/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. 
See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: tests/runners/airflow/tasks/test_create_cloudformation_stack.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
from datetime import datetime from unittest import TestCase, mock from unittest.mock import MagicMock from airflow.operators.dummy import DummyOperator from airflow.operators.python import BranchPythonOperator from moto import mock_cloudformation from liminal.runners.airflow.executors import airflow from liminal.runners.airflow.operators.cloudformation import ( CloudFormationCreateStackOperator, CloudFormationCreateStackSensor, CloudFormationHook, ) from liminal.runners.airflow.operators.operator_with_variable_resolving import ( OperatorWithVariableResolving, ) from liminal.runners.airflow.tasks.create_cloudformation_stack import ( CreateCloudFormationStackTask, ) from tests.util import dag_test_utils # noinspection DuplicatedCode @mock_cloudformation class TestCreateCloudFormationStackTask(TestCase): """ Test CreateCloudFormationStackTask """ def setUp(self) -> None: self.dag = dag_test_utils.create_dag() self.dag.context = { 'ti': self.dag, 'ts': datetime.now().timestamp(), 'execution_date': datetime.now().timestamp(), } self.dag.get_dagrun = MagicMock() self.cluster_name = "liminal-cluster-for-tests" self.config = { 'task': 'create_emr', 'type': 'create_cloudformation_stack', 'description': 'create emr', 'stack_name': self.cluster_name, 'properties': { 'OnFailure': 'DO_NOTHING', 'TimeoutInMinutes': 25, 'Capabilities': ['CAPABILITY_NAMED_IAM'], 'TemplateURL': 'https://s3.amazonaws.com/liminal-tests/emr_cluster_creation.yml', 'Parameters': { 'Environment': 'Staging', 'OwnerTeam': 'liminal-team', 'Tenancy': 'Ephemeral', 'MasterServerCount': '1', 'CoreServerCount': '1', 'EmrApplicationRelease': '5.28', 'InstanceTypeMaster': 'm5.xlarge', 'InstanceTypeCore': 'm5.xlarge', }, }, } self.create_cloudformation_task = CreateCloudFormationStackTask( self.config['task'], self.dag, [], trigger_rule='all_success', liminal_config={}, pipeline_config={}, task_config=self.config, ) airflow.AirflowExecutor("airflow-executor", {}, 
{}).apply_task_to_dag(task=self.create_cloudformation_task) def test_apply_task_to_dag(self): self.assertEqual(len(self.dag.tasks), 4) self.assertIsInstance(self.dag.tasks[0], OperatorWithVariableResolving) self.assertIsInstance(self.dag.tasks[0].operator_delegate, BranchPythonOperator) self.assertIsInstance(self.dag.tasks[1], OperatorWithVariableResolving) self.assertIsInstance(self.dag.tasks[1].operator_delegate, CloudFormationCreateStackOperator) self.assertIsInstance(self.dag.tasks[2], OperatorWithVariableResolving) self.assertIsInstance(self.dag.tasks[2].operator_delegate, CloudFormationCreateStackSensor) self.assertIsInstance(self.dag.tasks[3], DummyOperator) def test_cloudformation_does_not_exist(self): with mock.patch.object(CloudFormationHook, 'get_conn') as mock_conn: mock_cf_conn = MagicMock() mock_cf_conn.describe_stacks.return_value.raiseError.side_effect = Exception() mock_conn.return_value = mock_cf_conn is_cloudformation_exists = self.dag.tasks[0].operator_delegate print(is_cloudformation_exists) self.assertEqual( is_cloudformation_exists.python_callable(stack_name=self.cluster_name), 'create-cloudformation-create_emr', ) def test_cloudformation_exist_and_running(self): is_cloudformation_exists = self.dag.tasks[0].operator_delegate for status in ['CREATE_COMPLETE', 'DELETE_FAILED']: with mock.patch.object(CloudFormationHook, 'get_conn') as mock_conn: mock_cloudformation_conn = MagicMock() mock_cloudformation_conn.describe_stacks.return_value = {'Stacks': [{'StackStatus': status}]} mock_conn.return_value = mock_cloudformation_conn self.assertEqual( is_cloudformation_exists.python_callable(stack_name=self.cluster_name), 'creation-end-create_emr' ) def test_cloudformation_exists_and_not_running(self): is_cloudformation_exists = self.dag.tasks[0].operator_delegate for status in ['DELETED']: with mock.patch.object(CloudFormationHook, 'get_conn') as mock_conn: mock_cloudformation_conn = MagicMock() mock_cloudformation_conn.describe_stacks.return_value = 
{'Stacks': [{'StackStatus': status}]} mock_conn.return_value = mock_cloudformation_conn self.assertEqual( is_cloudformation_exists.python_callable(stack_name=self.cluster_name), 'create-cloudformation-create_emr', ) def test_cloudformation_create_stack_operator_task(self): create_cloudformation_stack = self.dag.tasks[1] self.assertEqual(create_cloudformation_stack.task_id, 'create-cloudformation-create_emr') ================================================ FILE: tests/runners/airflow/tasks/test_delete_cloudformation_stack.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
from datetime import datetime from unittest import TestCase from unittest.mock import MagicMock from airflow.operators.dummy_operator import DummyOperator from airflow.operators.python_operator import BranchPythonOperator from airflow.utils.trigger_rule import TriggerRule from moto import mock_cloudformation from liminal.runners.airflow.operators.cloudformation import ( CloudFormationDeleteStackOperator, CloudFormationDeleteStackSensor, ) from liminal.runners.airflow.operators.operator_with_variable_resolving import ( OperatorWithVariableResolving, ) from liminal.runners.airflow.tasks.delete_cloudformation_stack import ( DeleteCloudFormationStackTask, ) from tests.util import dag_test_utils # noinspection DuplicatedCode @mock_cloudformation class TestDeleteCloudFormationStackTask(TestCase): """ Test DeleteCloudFormationStackTask """ def setUp(self) -> None: self.dag = dag_test_utils.create_dag() self.dag.context = { 'ti': self.dag, 'ts': datetime.now().timestamp(), 'execution_date': datetime.now().timestamp(), } self.dag.get_dagrun = MagicMock() self.cluster_name = "liminal-cluster-for-tests" self.delete_cloudformation_task = DeleteCloudFormationStackTask( 'delete-emr', self.dag, [], trigger_rule='all_success', liminal_config={}, pipeline_config={}, task_config={'stack_name': self.cluster_name}, ) self.delete_cloudformation_task.apply_task_to_dag() def test_apply_task_to_dag(self): self.assertEqual(len(self.dag.tasks), 4) self.assertIsInstance(self.dag.tasks[0], BranchPythonOperator) self.assertIsInstance(self.dag.tasks[1], OperatorWithVariableResolving) self.assertIsInstance(self.dag.tasks[1].operator_delegate, CloudFormationDeleteStackOperator) self.assertIsInstance(self.dag.tasks[2], OperatorWithVariableResolving) self.assertIsInstance(self.dag.tasks[2].operator_delegate, CloudFormationDeleteStackSensor) self.assertIsInstance(self.dag.tasks[3], DummyOperator) def test_check_dags_queued_task(self): self.dag.get_num_active_runs = MagicMock() 
self.dag.get_num_active_runs.return_value = 2 check_dags_queued_task = self.dag.tasks[0] self.assertEqual(check_dags_queued_task.trigger_rule, TriggerRule.ALL_DONE) self.assertEqual(check_dags_queued_task.python_callable(), f'delete-end-delete-emr') self.dag.get_num_active_runs.return_value = 1 self.assertEqual(check_dags_queued_task.python_callable(), f'delete-cloudformation-delete-emr') ================================================ FILE: tests/runners/airflow/tasks/test_job_end.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
import unittest from unittest import TestCase from liminal.runners.airflow.executors import airflow from liminal.runners.airflow.tasks import job_end from tests.util import dag_test_utils # noinspection DuplicatedCode class TestJobEndTask(TestCase): def test_apply_task_to_dag(self): dag = dag_test_utils.create_dag() task0 = job_end.JobEndTask( task_id='job_end', dag=dag, pipeline_config={'pipeline': 'my_end_pipeline'}, task_config={}, parent=None, trigger_rule='all_done', liminal_config={'metrics': {'namespace': 'EndJobNameSpace', 'backends': ['cloudwatch']}}, ) airflow.AirflowExecutor("airflow-executor", {}, {}).apply_task_to_dag(task=task0) self.assertEqual(len(dag.tasks), 1) dag_task0 = dag.tasks[0] self.assertEqual(dag_task0.namespace, 'EndJobNameSpace') self.assertEqual(dag_task0.backends, ['cloudwatch']) self.assertEqual(dag_task0.task_id, 'end') def test_apply_task_to_dag_missing_metrics(self): conf = {'pipeline': 'my_pipeline'} dag = dag_test_utils.create_dag() task0 = job_end.JobEndTask( task_id="job_end", dag=dag, pipeline_config={'pipeline': 'my_end_pipeline'}, liminal_config=conf, parent=None, trigger_rule='all_done', task_config={}, ) airflow.AirflowExecutor("airflow-executor", {}, {}).apply_task_to_dag(task=task0) self.assertEqual(len(dag.tasks), 1) dag_task0 = dag.tasks[0] self.assertEqual(dag_task0.namespace, '') self.assertEqual(dag_task0.backends, []) self.assertEqual(dag_task0.trigger_rule, 'all_done') def test_apply_task_to_dag_with_partial_configuration(self): dag = dag_test_utils.create_dag() task0 = job_end.JobEndTask( task_id="job_enc", dag=dag, liminal_config={'metrics': {'namespace': 'EndJobNameSpace'}}, pipeline_config={'pipeline': 'my_end_pipeline'}, task_config={}, parent=None, trigger_rule='all_done', ) task0.apply_task_to_dag() self.assertEqual(len(dag.tasks), 1) dag_task0 = dag.tasks[0] self.assertEqual(dag_task0.namespace, 'EndJobNameSpace') self.assertEqual(dag_task0.backends, []) self.assertEqual(dag_task0.trigger_rule, 
'all_done') if __name__ == '__main__': unittest.main() ================================================ FILE: tests/runners/airflow/tasks/test_job_start.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. import unittest from unittest import TestCase from liminal.runners.airflow.executors import airflow from liminal.runners.airflow.tasks import job_start from tests.util import dag_test_utils # noinspection DuplicatedCode class TestJobStartTask(TestCase): def test_apply_task_to_dag(self): dag = dag_test_utils.create_dag() task0 = job_start.JobStartTask( task_id="start_task", dag=dag, liminal_config={'metrics': {'namespace': 'StartJobNameSpace', 'backends': ['cloudwatch']}}, pipeline_config={'pipeline': 'my_start_pipeline'}, task_config={}, parent=None, trigger_rule='all_success', ) task0.apply_task_to_dag() self.assertEqual(len(dag.tasks), 1) dag_task0 = dag.tasks[0] self.assertEqual(dag_task0.namespace, 'StartJobNameSpace') self.assertEqual(dag_task0.backends, ['cloudwatch']) self.assertEqual(dag_task0.task_id, 'start') def test_apply_task_to_dag_missing_metrics(self): conf = {'pipeline': 'my_pipeline'} dag = dag_test_utils.create_dag() task0 = job_start.JobStartTask( task_id="start_task", dag=dag, 
liminal_config=conf, task_config={}, pipeline_config={'pipeline': 'my_end_pipeline'}, parent=None, trigger_rule='all_success', ) airflow.AirflowExecutor("airflow-executor", {}, {}).apply_task_to_dag(task=task0) self.assertEqual(len(dag.tasks), 1) dag_task0 = dag.tasks[0] self.assertEqual(dag_task0.namespace, '') self.assertEqual(dag_task0.backends, []) self.assertEqual(dag_task0.trigger_rule, 'all_success') def test_apply_task_to_dag_with_partial_configuration(self): dag = dag_test_utils.create_dag() task0 = job_start.JobStartTask( task_id="start_task", dag=dag, liminal_config={'metrics': {'namespace': 'StartJobNameSpace'}}, pipeline_config={'pipeline': 'my_start_pipeline'}, task_config={}, parent=None, trigger_rule='all_success', ) airflow.AirflowExecutor("airflow-executor", {}, {}).apply_task_to_dag(task=task0) self.assertEqual(len(dag.tasks), 1) dag_task0 = dag.tasks[0] self.assertEqual(dag_task0.namespace, 'StartJobNameSpace') self.assertEqual(dag_task0.backends, []) if __name__ == '__main__': unittest.main() ================================================ FILE: tests/runners/airflow/tasks/test_python.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
class TestPythonTask(TestCase):
    """Runs two PythonTasks through KubernetesPodExecutor and checks the files they leave on a local volume."""

    _VOLUME_NAME = 'myvol1'
    _SECRET_NAME = 'aws'

    def _volume_configs(self):
        """Return a fresh ``volumes`` list that maps the test volume onto ``self.temp_dir``."""
        # NOTE(review): the /var/folders -> /private/var/folders rewrite looks like
        # macOS temp-path handling - confirm.
        host_path = self.temp_dir.replace("/var/folders", "/private/var/folders")
        return [{'volume': self._VOLUME_NAME, 'local': {'path': host_path}}]

    def setUp(self) -> None:
        # Remove the volume and secret by name before creating them again below.
        volume_util.delete_local_volume(self._VOLUME_NAME)
        secret_util.delete_local_secret(self._SECRET_NAME)

        os.environ['TMPDIR'] = '/tmp'
        self.temp_dir = tempfile.mkdtemp()

        self.liminal_config = {
            'volumes': self._volume_configs(),
            'secrets': [{'secret': self._SECRET_NAME, 'remote_path': "/mnt"}],
        }
        volume_util.create_local_volumes(self.liminal_config, None)
        secret_util.create_local_secrets(self.liminal_config)

        apps_dir = os.path.join(os.path.dirname(__file__), '../liminal')
        liminal_apps_builder.build_liminal_apps(apps_dir)

    def test_apply_task_to_dag(self):
        dag = dag_test_utils.create_dag()

        input_task = self.__create_python_task(
            dag,
            'my_input_task',
            None,
            'my_python_task_img',
            'python -u write_inputs.py',
            env_vars={'NUM_FILES': 10, 'NUM_SPLITS': 3, 'AWS_CONFIG_FILE': '/mnt/credentials', 'AWS_PROFILE': 'dev'},
        )
        pod_executor = KubernetesPodExecutor(
            task_id='k8s', liminal_config=self.liminal_config, executor_config={'executor': 'k8s', 'name': 'mypod'}
        )
        pod_executor.apply_task_to_dag(task=input_task)

        # The output task is chained after whatever the first apply put at dag.tasks[0].
        output_task = self.__create_python_task(
            dag,
            'my_output_task',
            dag.tasks[0],
            'my_parallelized_python_task_img',
            'python -u write_outputs.py',
            executors=3,
        )
        pod_executor.apply_task_to_dag(task=output_task)

        for dag_task in dag.tasks:
            print(f'Executing task {dag_task.task_id}')
            dag_task.execute(DummyDag('my_dag', dag_task.task_id).context)

        inputs_dir = os.path.join(self.temp_dir, 'inputs')
        outputs_dir = os.path.join(self.temp_dir, 'outputs')

        self.assertListEqual(sorted(os.listdir(self.temp_dir)), ['inputs', 'outputs'])

        # 10 input files dealt round-robin over 3 split directories: split s holds s, s+3, s+6, ...
        self.assertListEqual(sorted(os.listdir(inputs_dir)), ['0', '1', '2'])
        for split in range(3):
            self.assertListEqual(
                sorted(os.listdir(os.path.join(inputs_dir, str(split)))),
                [f'input{index}.json' for index in range(split, 10, 3)],
            )

        self.assertListEqual(sorted(os.listdir(outputs_dir)), [f'output{index}.txt' for index in range(10)])

        # Each outputN.txt is expected to contain exactly 'myvalN'.
        for output_name in os.listdir(outputs_dir):
            with open(os.path.join(outputs_dir, output_name)) as output_file:
                expected_text = output_name.replace('output', 'myval').replace('.txt', '')
                self.assertEqual(output_file.read(), expected_text)

    def __create_python_task(self, dag, task_id, parent, image, cmd, env_vars=None, executors=None):
        """Build a PythonTask on ``dag`` that mounts the test volume at /mnt/vol1 and the test secret at /mnt.

        Also resets the ``volumes`` and ``executors`` entries of ``self.liminal_config``.
        ``executors`` is added to the task config only when truthy.
        """
        self.liminal_config['volumes'] = self._volume_configs()
        self.liminal_config['executors'] = [{'executor': 'k8s', 'type': 'kubernetes'}]

        task_config = {
            'task': task_id,
            'cmd': cmd,
            'image': image,
            'env_vars': {} if env_vars is None else env_vars,
            'mounts': [{'mount': 'mymount', 'volume': self._VOLUME_NAME, 'path': '/mnt/vol1'}],
            'secrets': [{'secret': self._SECRET_NAME, 'remote_path': '/mnt'}],
        }
        if executors:
            task_config['executors'] = executors

        return python.PythonTask(
            task_id=task_id,
            dag=dag,
            liminal_config=self.liminal_config,
            pipeline_config={'pipeline': 'my_pipeline'},
            task_config=task_config,
            parent=parent,
            trigger_rule='all_success',
        )
tests/runners/airflow/tasks/test_spark_task.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. import os import tempfile from unittest import TestCase from liminal.build import liminal_apps_builder from liminal.kubernetes import volume_util from liminal.runners.airflow import DummyDag from liminal.runners.airflow.executors.kubernetes import KubernetesPodExecutor from liminal.runners.airflow.tasks.spark import SparkTask from tests.util import dag_test_utils class TestSparkTask(TestCase): """ Test Spark Task """ _VOLUME_NAME = 'myvol1' def test_spark_on_k8s(self): volume_util.delete_local_volume(self._VOLUME_NAME) os.environ['TMPDIR'] = '/tmp' self.temp_dir = tempfile.mkdtemp() self.liminal_config = { 'volumes': [ { 'volume': self._VOLUME_NAME, 'local': {'path': self.temp_dir.replace("/var/folders", "/private/var/folders")}, } ] } volume_util.create_local_volumes(self.liminal_config, None) # build spark image liminal_apps_builder.build_liminal_apps(os.path.join(os.path.dirname(__file__), '../../apps/test_spark_app')) outputs_dir = os.path.join(self.temp_dir, 'outputs') task_config = { 'task': "my_spark_task", 'image': "my_spark_image", 'application_source': 'wordcount.py', 'application_arguments': 
['words.txt', '/mnt/vol1/outputs/'], 'env_vars': {}, 'mounts': [{'mount': 'mymount', 'volume': self._VOLUME_NAME, 'path': '/mnt/vol1'}], } dag = dag_test_utils.create_dag() task1 = SparkTask( task_id="my_spark_task", dag=dag, liminal_config=self.liminal_config, pipeline_config={'pipeline': 'my_pipeline'}, task_config=task_config, parent=None, trigger_rule='all_success', ) executor = KubernetesPodExecutor( task_id='k8s', liminal_config=self.liminal_config, executor_config={'executor': 'k8s', 'name': 'mypod'} ) executor.apply_task_to_dag(task=task1) for task in dag.tasks: print(f'Executing task {task.task_id}') task.execute(DummyDag('my_dag', task.task_id).context) expected_output = ( '{"word":"my","count":1}\n' '{"word":"first","count":1}\n' '{"word":"liminal","count":1}\n' '{"word":"spark","count":1}\n' '{"word":"task","count":1}\n'.split("\n") ) actual = '' for filename in os.listdir(outputs_dir): if filename.endswith(".json"): with open(os.path.join(outputs_dir, filename)) as f: actual = f.read() self.assertEqual(actual.split("\n"), expected_output) def test_get_runnable_command(self): task_config = { 'application_source': 'my_app.py', 'master': 'yarn', 'class': 'org.apache.liminal.MySparkApp', 'conf': { 'spark.driver.memory': '1g', 'spark.driver.maxResultSize': '1g', 'spark.yarn.executor.memoryOverhead': '500M', }, 'application_arguments': { '--query': "select * from " "my_table where date_prt >= " "'{{yesterday_ds}}'", '--output': 'mytable', }, } expected = [ 'spark-submit', '--master', 'yarn', '--class', 'org.apache.liminal.MySparkApp', '--conf', 'spark.driver.maxResultSize=1g', '--conf', 'spark.driver.memory=1g', '--conf', 'spark.yarn.executor.memoryOverhead=500M', 'my_app.py', '--query', "select * from my_table where " "date_prt >= '{{yesterday_ds}}'", '--output', 'mytable', ] actual = SparkTask( 'my_spark_task', DummyDag('dag-id', 'my_spark_task'), [], trigger_rule='all_success', liminal_config={}, pipeline_config={'pipeline': 'pipeline'}, 
task_config=task_config, ).get_runnable_command() self.assertEqual(actual.sort(), expected.sort()) def test_missing_spark_arguments(self): task_config = { 'application_source': 'my_app.py', 'application_arguments': { '--query': "select * from my_table where" " date_prt >= '{{yesterday_ds}}'", '--output': 'myoutputtable', }, } expected = [ 'spark-submit', 'my_app.py', '--query', "select * from my_table where " "date_prt >= '{{yesterday_ds}}'", '--output', 'myoutputtable', ] actual = SparkTask( 'my_spark_task', DummyDag('dag-id', 'my_spark_task'), [], trigger_rule='all_success', liminal_config={}, pipeline_config={'pipeline': 'pipeline'}, task_config=task_config, ).get_runnable_command() self.assertEqual(actual.sort(), expected.sort()) def test_partially_missing_spark_arguments(self): task_config = { 'application_source': 'my_app.py', 'class': 'org.apache.liminal.MySparkApp', 'conf': { 'spark.driver.memory': '1g', 'spark.driver.maxResultSize': '1g', 'spark.yarn.executor.memoryOverhead': '500M', }, 'application_arguments': { '--query': "select * from my_table where " "date_prt >= '{{yesterday_ds}}'", '--output': 'myoutputtable', }, } expected = [ 'spark-submit', '--class', 'org.apache.liminal.MySparkApp', '--conf', 'spark.driver.maxResultSize=1g', '--conf', 'spark.driver.memory=1g', '--conf', 'spark.yarn.executor.memoryOverhead=500M', 'my_app.py', '--query', 'select * from my_table where ' "date_prt >= '{{yesterday_ds}}'", '--output', 'myoutputtable', ] actual = SparkTask( 'my_spark_task', DummyDag('dag-id', 'my_spark_task'), [], trigger_rule='all_success', liminal_config={}, pipeline_config={'pipeline': 'pipeline'}, task_config=task_config, ).get_runnable_command() self.assertEqual(actual.sort(), expected.sort()) ================================================ FILE: tests/runners/apps/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. 
See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: tests/runners/apps/test/liminal.yml ================================================ # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: tests/runners/apps/test_app/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. 
The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: tests/runners/apps/test_app/defaults/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: tests/runners/apps/test_app/defaults/liminal.yml ================================================ # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. 
The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. --- name: default_liminal type: super service_defaults: one: two task_defaults: env_vars: env1: a pipeline_defaults: one: one two: twoww ================================================ FILE: tests/runners/apps/test_app/extra/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: tests/runners/apps/test_app/extra/liminal.yml ================================================ # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. 
The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. --- name: extra_liminal type: super super: default_liminal service_defaults: endpoints: - endpoint: /myendpoint1 module: myserver.my_server function: myendpoint1func images: - image: my_static_input_task_image type: python source: helloworld no_cache: true - image: my_task_output_input_task_image type: python source: helloworld no_cache: true pipeline_defaults: tasks: - task: my_static_input_task type: python description: static input task image: my_static_input_task_image env_vars: env2: b cmd: python -u hello_world.py - task: extenable type: pipeline description: optional sub tasks - task: my_task_output_input_task type: python no_cache: true description: task with input from other task's output image: my_task_output_input_task_image env_vars: env1: c env2: b cmd: python -u hello_world.py ================================================ FILE: tests/runners/apps/test_app/helloworld/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. 
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. print('Hello world!\n') ================================================ FILE: tests/runners/apps/test_app/helloworld/hellp_world.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
# Writes NUM_FILES small JSON input files below /mnt/vol1/inputs/, spread
# round-robin over NUM_SPLITS numbered sub-directories.
# Both counts are read from the environment; a missing variable raises KeyError.
inputs_dir = '/mnt/vol1/inputs/'

num_files = int(os.environ['NUM_FILES'])
num_splits = int(os.environ['NUM_SPLITS'])

# create input files - split by round robin
for i in range(num_files):
    split_id = i % num_splits
    split_dir = os.path.join(inputs_dir, str(split_id))

    # exist_ok=True replaces the former exists()-then-makedirs() pair, which could
    # raise FileExistsError when another process created the directory in between.
    os.makedirs(split_dir, exist_ok=True)

    filename = os.path.join(split_dir, f'input{i}.json')
    with open(filename, 'w') as f:
        print(f'Writing input file {filename}')
        f.write(json.dumps({'mykey': f'myval{i}'}))
--- name: MyPipeline super: extra_liminal images: - image: my_static_input_task_image type: python source: helloworld no_cache: true - image: my_server_image type: python_server source: myserver no_cache: true pipelines: - pipeline: my_pipeline owner: Bosco Albert Baracus start_date: 1970-01-01 timeout_minutes: 45 schedule: 0 * 1 * * default_arg_loaded: check default_array_loaded: [2, 3, 4] default_object_loaded: key1: val1 key2: val2 tasks: - task: my_parallelized_static_input_task type: python description: parallelized static input task image: my_static_input_task_image executors: 2 cmd: python -u hello_world.py services: - service: my_python_server description: my python server image: my_server_image ================================================ FILE: tests/runners/apps/test_app/my_server/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: tests/runners/apps/test_app/my_server/my_server.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. 
def myendpoint1func(input_json):
    """Echo the request payload.

    :param input_json: JSON text of the request; any falsy value (``None``, ``''``)
        is treated as an empty payload.
    :return: ``'Input was: '`` followed by the string form of the parsed payload.
    """
    if input_json:
        payload = json.loads(input_json)
    else:
        payload = {}
    return 'Input was: {}'.format(payload)
See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. --- name: MyPipeline images: - image: my_spark_image type: spark source: wordcount no_cache: true pipelines: - pipeline: my_pipeline owner: Bosco Albert Baracus start_date: 1970-01-01 timeout_minutes: 45 schedule: 0 * 1 * * tasks: - task: my_test_spark_task type: spark description: spark task on k8s image: my_spark_image executors: 2 application_source: wordcount.py application_arguments: - words.txt ================================================ FILE: tests/runners/apps/test_spark_app/wordcount/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. 
See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: tests/runners/apps/test_spark_app/wordcount/requirements.txt ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. pyyaml==5.4 ================================================ FILE: tests/runners/apps/test_spark_app/wordcount/wordcount.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. 
if __name__ == "__main__":
    # Exactly two command-line arguments are required: the text file to read
    # and the location the JSON result is written to.
    if len(sys.argv) != 3:
        print("Usage: wordcount output ", file=sys.stderr)
        sys.exit(-1)

    input_path, output_path = sys.argv[1], sys.argv[2]

    session = SparkSession.builder.appName("PythonWordCount").getOrCreate()

    # One row per text line; rows whose text starts with '#' are dropped.
    rows = session.read.text(input_path).rdd
    text_lines = rows.filter(lambda row: not row[0].startswith('#')).map(lambda row: row[0])

    # Split each line on single spaces and sum a 1 per occurrence of every word.
    words = text_lines.flatMap(lambda line: line.split(' '))
    word_counts = words.map(lambda word: (word, 1)).reduceByKey(add)

    for word, count in word_counts.collect():
        print("%s: %i" % (word, count))

    print(f"writing the results to {output_path}")
    # coalesce(1) before the write; any existing output is overwritten.
    result_frame = word_counts.toDF(["word", "count"]).coalesce(1)
    result_frame.write.mode("overwrite").json(output_path)

    session.stop()
See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License.
import os
import pathlib
from unittest import TestCase

from termcolor import colored

# File-name suffixes exempt from the license check (matched with str.endswith on the base name).
EXCLUDED_EXTENSIONS = ['.gif', '.png', '.pyc', 'LICENSE', 'DISCLAIMER', 'DISCLAIMER-WIP', 'NOTICE', '.whl']
# Directory prefixes, relative to the repository root, that are skipped entirely
# (matched with str.startswith on the relative path).
EXCLUDED_DIRS = [
    'docs/build',
    'build',
    'dist',
    '.git',
    '.idea',
    'venv',
    '.venv',
    'apache_liminal.egg-info',
    '.pytest_cache',
]
# Exact base names exempt from the license check.
EXCLUDED_FILES = ['DISCLAIMER-WIP', 'LICENSE.txt', '.autovenv', 'credentials-aws.txt']
# Each *_LICENSE_HEADER below is the expected header split into a list of lines;
# check_license requires every one of those lines to occur in the checked file.
# This one is the default, used for any file whose extension has no dedicated header.
PYTHON_LICENSE_HEADER = """ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. """.strip().split( "\n" )
# Header for '.md' files.
# NOTE(review): the literal reads as empty in this view; presumably a markup comment block
# belongs here and was lost in transit - confirm against the repository before relying on it.
MD_LICENSE_HEADER = """ """.strip().split( "\n" )
# Header for '.rst' files.
RST_LICENSE_HEADER = """ .. Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at .. .. http://www.apache.org/licenses/LICENSE-2.0 .. Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. .. """.strip().split( "\n" )
# Header for '.bat' files.
BAT_LICENSE_HEADER = """ REM REM Licensed to the Apache Software Foundation (ASF) under one REM or more contributor license agreements. See the NOTICE file REM distributed with this work for additional information REM regarding copyright ownership. The ASF licenses this file REM to you under the Apache License, Version 2.0 (the REM "License"); you may not use this file except in compliance REM with the License. You may obtain a copy of the License at REM REM http://www.apache.org/licenses/LICENSE-2.0 REM REM Unless required by applicable law or agreed to in writing, REM software distributed under the License is distributed on an REM "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY REM KIND, either express or implied. See the License for the REM specific language governing permissions and limitations REM under the License. """.strip().split( "\n" )
# Header for '.json' files; there it is stored as an array of strings under a JSON key.
JSON_LICENSE_HEADER = """ "": [ "Licensed to the Apache Software Foundation (ASF) under one", "or more contributor license agreements. See the NOTICE file", "distributed with this work for additional information", "regarding copyright ownership.
The ASF licenses this file", "to you under the Apache License, Version 2.0 (the", "\\"License\\"); you may not use this file except in compliance", "with the License. You may obtain a copy of the License at", "", " http://www.apache.org/licenses/LICENSE-2.0", "", "Unless required by applicable law or agreed to in writing,", "software distributed under the License is distributed on an", "\\"AS IS\\"BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY", "KIND, either express or implied. See the License for the", "specific language governing permissions and limitations", "under the License." ], """.strip().split( "\n" ) class TestLicenses(TestCase): def test_licenses(self): files = [] base_dir = os.path.join(pathlib.Path(__file__).parent.parent.absolute()) print(f'Checking licenses for files in {base_dir}') for r, d, f in os.walk(base_dir): if not any(os.path.relpath(r, base_dir).startswith(excluded) for excluded in EXCLUDED_DIRS): for file in f: if ( not any(os.path.basename(file).endswith(ext) for ext in EXCLUDED_EXTENSIONS) and not os.path.basename(file) in EXCLUDED_FILES ): files.append(os.path.join(r, file)) output = '' files_missing_license = [] success = True for file in files: print(f'Checking license for file {file}') has_license = self.check_license(file) if not has_license: output += colored(f'Missing License: {file}\n', 'red') files_missing_license.append(file) success = success and has_license print(output) if not success: self.assertListEqual([], files_missing_license) @staticmethod def check_license(file): header_lines = PYTHON_LICENSE_HEADER if file.endswith('.md'): header_lines = MD_LICENSE_HEADER elif file.endswith('.rst'): header_lines = RST_LICENSE_HEADER elif file.endswith('.bat'): header_lines = BAT_LICENSE_HEADER elif file.endswith('.json'): header_lines = JSON_LICENSE_HEADER header_lines[0] = f' {header_lines[0]}' with open(file) as f: file_lines = f.readlines() return all(f'{line}\n' in file_lines for line in header_lines) 
================================================ FILE: tests/util/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: tests/util/dag_test_utils.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. from datetime import datetime from airflow import DAG def create_dag(): """ Test util to create a basic DAG for testing. 
""" return DAG(dag_id='test_dag', default_args={'start_date': datetime(1970, 1, 1)}) ================================================ FILE: tests/util/test_class_utils.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. from unittest import TestCase from liminal.core.util import class_util from tests.util.test_pkg_1.test_clazz_base import A from tests.util.test_pkg_1.test_pkg_1_1.test_clazz_child_1 import B from tests.util.test_pkg_1.test_pkg_1_1.test_clazz_child_2 import C from tests.util.test_pkg_1.test_pkg_1_1.test_pkg_1_1_1.test_clazz_leaf_1 import D from tests.util.test_pkg_1.test_pkg_1_1.test_pkg_1_1_2.test_clazz_leaf_2 import E class Test(TestCase): def test_find_full_hierarchy_from_root(self): expected_set = { 'test_clazz_child_1': B, 'test_clazz_child_2': C, 'test_clazz_leaf_1': D, 'test_clazz_leaf_2': E, } self.hierarchy_check(A, expected_set) def hierarchy_check(self, clazz, expected_dict): pkg_root = 'tests.util.test_pkg_1' result_dict = class_util.find_subclasses_in_packages([pkg_root], clazz) self.assertDictEqual(result_dict, expected_dict) ================================================ FILE: tests/util/test_pkg_1/__init__.py ================================================ # # 
Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: tests/util/test_pkg_1/test_clazz_base.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License.


# Empty fixture class: the base that test_class_utils passes to the subclass lookup.
class A:
    pass
================================================ FILE: tests/util/test_pkg_1/test_pkg_1_1/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements.
See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: tests/util/test_pkg_1/test_pkg_1_1/test_clazz_child_1.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License.
from tests.util.test_pkg_1.test_clazz_base import A


# Empty fixture class deriving directly from A.
class B(A):
    pass


# Empty fixture class with no relation to A; it is absent from the expected
# result in test_class_utils.
class M:
    pass
================================================ FILE: tests/util/test_pkg_1/test_pkg_1_1/test_clazz_child_2.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements.
See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License.
from tests.util.test_pkg_1.test_clazz_base import A


# Empty fixture class deriving directly from A.
class C(A):
    pass
================================================ FILE: tests/util/test_pkg_1/test_pkg_1_1/test_pkg_1_1_1/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: tests/util/test_pkg_1/test_pkg_1_1/test_pkg_1_1_1/test_clazz_leaf_1.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements.
See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License.
from tests.util.test_pkg_1.test_pkg_1_1.test_clazz_child_1 import B


# Empty fixture class deriving from B, which is imported from test_clazz_child_1.
class D(B):
    pass
================================================ FILE: tests/util/test_pkg_1/test_pkg_1_1/test_pkg_1_1_2/__init__.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. ================================================ FILE: tests/util/test_pkg_1/test_pkg_1_1/test_pkg_1_1_2/test_clazz_leaf_2.py ================================================ # # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements.
See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License.
from tests.util.test_pkg_1.test_pkg_1_1.test_pkg_1_1_1.test_clazz_leaf_1 import D


# Empty fixture class deriving from D, which is imported from test_clazz_leaf_1.
class E(D):
    pass
================================================ FILE: yamllint-config.yml ================================================ # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License.
---
# https://yamllint.readthedocs.io/en/stable/
# yamllint settings for this repository, layered on the preset named by `extends`.
extends: default

locale: en_US.UTF-8

# Glob patterns for the file names covered by this configuration.
yaml-files:
  - '*.yaml'
  - '*.yml'
  - .yamllint

# Per-rule overrides; each rule is switched on or off, or given options.
rules:
  line-length:
    max: 140
    # level: warning
  brackets: enable
  colons: enable
  commas: enable
  empty-lines: enable
  empty-values: disable
  key-duplicates: disable
  hyphens: enable
  key-ordering: disable
  new-line-at-end-of-file: enable
  new-lines: enable
  # quoted-strings: disable
  truthy:
    level: warning
  indentation:
    spaces: 2
    indent-sequences: true

# Path patterns, one per line, excluded from linting.
# NOTE(review): nesting was reconstructed from a flattened copy; `ignore` is placed at the
# top level here, but it could equally have sat under the `indentation` rule - confirm.
ignore: |
  docs/
  .asf.yaml