Repository: intro-to-ml-with-kubeflow/intro-to-ml-with-kubeflow-examples Branch: master Commit: b00b44a88011 Files: 147 Total size: 501.5 KB Directory structure: gitextract_v2ir0_h2/ ├── .circleci/ │ └── config.yml ├── .gitignore ├── .travis.yaml ├── LICENSE ├── README.md ├── autopep_stuff.sh ├── ch03/ │ ├── example_secret.yaml │ ├── linux_install.sh │ ├── mac_install.sh │ └── minio.sh ├── ch04/ │ ├── code/ │ │ ├── ControlStructures.ipynb │ │ ├── ControlStructures.py │ │ ├── Lightweight Pipeline.ipynb │ │ ├── Lightweight Pipeline.py │ │ ├── RecommenderPipeline.ipynb │ │ ├── RecommenderPipeline.py │ │ └── download_components.sh │ └── install/ │ ├── deployment.yaml │ └── virtualservice.yaml ├── ch06/ │ ├── MLflow.ipynb │ ├── MLflow.py │ ├── Metadata.ipynb │ ├── Metadata.py │ ├── docker/ │ │ ├── Dockerfile │ │ ├── build.sh │ │ └── run.sh │ └── install/ │ └── mlflowchart/ │ ├── .helmignore │ ├── Chart.yaml │ ├── templates/ │ │ ├── NOTES.txt │ │ ├── _helpers.tpl │ │ └── mlflow.yaml │ └── values.yaml ├── ch10/ │ ├── experiment.yaml │ ├── hptuning.py │ └── random.yaml ├── ch2/ │ ├── Dockerfile │ ├── build-and-push.sh │ └── query-endpoint.py ├── ch2_seldon_examples/ │ ├── pipeline_role.yaml │ ├── pipeline_rolebinding.yaml │ ├── pv-claim.yaml │ ├── pv-volume.yaml │ ├── request_example.ipynb │ ├── run_example.sh │ ├── setup_example.sh │ ├── tf_mnist_no_seldon_pipeline.py │ ├── tiller_rbac.yaml │ └── train_pipeline.py ├── ch9/ │ └── ctscans/ │ ├── DICOM Denoising Pipeline.ipynb │ ├── calculate-basis-vectors/ │ │ ├── Dockerfile │ │ ├── build-component.sh │ │ ├── pom.xml │ │ └── src/ │ │ └── main/ │ │ └── scala/ │ │ └── org/ │ │ └── rawkintrevo/ │ │ └── covid/ │ │ └── App.scala │ ├── download-dicom/ │ │ ├── Dockerfile │ │ ├── build-component.sh │ │ └── run.sh │ ├── process-dicoms-into-vectors/ │ │ ├── Dockerfile │ │ ├── build-component.sh │ │ ├── data/ │ │ │ └── s.150.csv │ │ ├── process-dicoms-into-vectors.yaml │ │ └── src/ │ │ └── program.py │ └── visualize-basis-vectors/ │ ├── Dockerfile │ ├── build-component.sh │ └── src/ │ └── program.py ├── ci.sh ├── convert_notebooks.sh ├── data-extraction/ │ ├── README.md │ ├── github_comments_query.bsql │ ├── github_issues_query.bsql │ ├── iot/ │ │ ├── basic.yaml │ │ └── build.sh │ ├── python-notebook/ │ │ ├── AddSpamassassinDockerfile │ │ ├── MailingListDataPrep.ipynb │ │ ├── MailingListDataPrep.py │ │ └── RunNBDockerfile │ ├── python-spark/ │ │ ├── Dockerfile │ │ ├── LaunchSparkJobs.ipynb │ │ ├── LaunchSparkJobs.py │ │ ├── fake_job.py │ │ └── requirements.txt │ ├── python-spark-notebook/ │ │ ├── AddGCSDockerfile │ │ ├── AddPython3.6Dockerfile │ │ ├── Dockerfile │ │ ├── SparkMailingListForKF.ipynb │ │ ├── SparkMailingListForKF.py │ │ ├── build.sh │ │ ├── dr.yaml │ │ ├── no-saprk-tls.yaml │ │ ├── spark-driver-service.yaml │ │ └── virt_service.yaml │ ├── spark-hello-world/ │ │ ├── Dockerfile │ │ ├── README.md │ │ ├── hello_world_pipeline.py │ │ ├── lr_demo/ │ │ │ ├── .gitignore │ │ │ ├── .travis.yml │ │ │ ├── README.md │ │ │ ├── build.sbt │ │ │ ├── project/ │ │ │ │ ├── build.properties │ │ │ │ └── plugins.sbt │ │ │ ├── sample.csv │ │ │ ├── sbt/ │ │ │ │ └── sbt │ │ │ └── src/ │ │ │ ├── main/ │ │ │ │ └── scala/ │ │ │ │ └── com/ │ │ │ │ └── introtomlwithkubeflow/ │ │ │ │ └── spark/ │ │ │ │ └── demo/ │ │ │ │ └── lr/ │ │ │ │ ├── TrainingApp.scala │ │ │ │ └── TrainingPipeline.scala │ │ │ └── test/ │ │ │ └── scala/ │ │ │ └── com/ │ │ │ └── introtomlwithkubeflow/ │ │ │ └── spark/ │ │ │ └── demo/ │ │ │ └── lr/ │ │ │ └── TrainingPipelineTest.scala │ │ ├── setup.sh │ │ 
├── spark-pi-min.yaml │ │ └── spark-pi.yaml │ ├── stack_overflow_questions.bsql │ └── tfx/ │ ├── TFDV.ipynb │ ├── TFDV.py │ ├── install_tfx.sh │ ├── requirements.txt │ └── run_on_dataflow_ex.py ├── dev-setup/ │ ├── install-argo.sh │ ├── install-kf-pipeline-sdk.sh │ ├── install-kf.sh │ ├── install-kubectl.sh │ ├── install-kustomize.sh │ ├── install-microk8s.sh │ └── jsonnet.sh ├── feature-prep/ │ ├── README.md │ ├── spark/ │ │ ├── SparkMailingListFeaturePrep.ipynb │ │ └── SparkMailingListFeaturePrep.py │ └── tft/ │ ├── requirements.txt │ └── transform.py ├── gcp-setup/ │ ├── cloudshell_scrip.sh │ └── setup-gcp.sh ├── kfctl_gcp_iap.v1.0.1.yaml ├── pipelines/ │ ├── ControlStructures.ipynb │ ├── Lightweight Pipeline.ipynb │ ├── RecommenderPipeline.ipynb │ └── download_components.sh ├── recommender/ │ ├── Dockerfile │ ├── Recommender_Kubeflow.ipynb │ ├── Recommender_Kubeflow.py │ ├── docker/ │ │ ├── Dockerfile │ │ └── build.sh │ └── tfservingchart/ │ ├── .helmignore │ ├── Chart.yaml │ ├── templates/ │ │ ├── NOTES.txt │ │ ├── _helpers.tpl │ │ ├── minioaccess.yaml │ │ ├── tfserving.yaml │ │ └── tfserving1.yaml │ └── values.yaml ├── runthrough.sh └── scikitLearn/ └── python/ └── IncomePrediction.ipynb ================================================ FILE CONTENTS ================================================ ================================================ FILE: .circleci/config.yml ================================================ version: 2 apt-run: &apt-install name: Install apt packages command: | sudo apt-get -qq update sudo apt-get install -y \ shellcheck jobs: build: working_directory: ~/mermaid-starter docker: - image: circleci/python:3.6-jessie-node-browsers-legacy steps: - checkout - run: *apt-install - run: name: Run our basic shell CI command: ./ci.sh ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *$py.class # C extensions *.so # Distribution / packaging .idea .Python build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ # Emacs *~ # Ignore kfctl's downloaded kfctl*.t*z ================================================ FILE: .travis.yaml ================================================ language: generic sudo: true addons: apt: packages: - shellcheck script: - ./ci.sh ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. 
For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. 
You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) 
The text should be enclosed in the appropriate comment syntax for the file
   format. We also recommend that a file or class name and description of
   purpose be included on the same "printed page" as the copyright notice for
   easier identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.

================================================
FILE: README.md
================================================
# intro-to-ml-with-kubeflow-examples
Examples for the Intro to ML with Kubeflow book

================================================
FILE: autopep_stuff.sh
================================================
#!/bin/bash
# autopep8 a bunch of things that we can
autopep8 -i -r ./ \
  --select E101,E202,E201,E203,E211,E221,E222,E223,E224,E225,E226,E227,\
E228,E231,E241,E242,E251,E252,E262,E271,E272,E273,E274,E301,E302,E303,\
E304,E305,E306,E501,E502,E711,E712,E713,E714,E721,E722,E731,W291,W293,\
W391,W601,W602,W603,W604,W690\
  -j 0 --exclude "*venv*"
# Then we use YAPF because it does a better job on long-lines
yapf -i -r ./ --exclude "*venv*"

================================================
FILE: ch03/example_secret.yaml
================================================
apiVersion: v1
kind: Secret
metadata:
  name: minioaccess
  namespace: mynamespace
data:
  AWS_ACCESS_KEY_ID: xxxxxxxxxx
  AWS_SECRET_ACCESS_KEY: xxxxxxxxxxxxxxxxxxxxx

================================================
FILE: ch03/linux_install.sh
================================================
#!/bin/bash
#tag::installMCLinux[]
pushd ~/bin
wget https://dl.min.io/client/mc/release/linux-amd64/mc
chmod a+x mc
#end::installMCLinux[]

================================================
FILE: ch03/mac_install.sh
================================================
#!/bin/bash
#tag::installMCMac[]
brew install minio/stable/minio
#end::installMCMac[]

================================================
FILE: ch03/minio.sh
================================================
#!/bin/bash
set -ex
# Minio runs on port 9000 (both UI and service) so expose locally to use cli or UI
#tag::fwdMinio[]
kubectl port-forward -n kubeflow svc/minio-service 9000:9000 &
#end::fwdMinio[]
# Give it a spell to settle
sleep 10
# Kubeflow creates a minio user with password minio123 at install
#tag::configMC[]
mc config host add minio http://localhost:9000 minio minio123
#end::configMC[]
#tag::listMC[]
mc ls minio
#end::listMC[]
# Output [2018-12-13 18:23:41 CST]     0B mlpipeline/
# Make a new bucket for our work
#tag::makeBucket[]
mc mb minio/kf-book-examples
#end::makeBucket[]

================================================
FILE: ch04/code/ControlStructures.ipynb
================================================
{ "cells": [
  { "cell_type": "markdown", "metadata": {}, "source": [
    "# Simple Control structure\n",
    "\n",
    "Shows how to use conditional execution" ] },
  { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [
    { "name": "stdout", "output_type": "stream", "text": [
      "Requirement already up-to-date: kfp in
./.local/lib/python3.6/site-packages (0.2.2.1)\n", "Requirement already satisfied, skipping upgrade: jsonschema>=3.0.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (3.2.0)\n", "Requirement already satisfied, skipping upgrade: google-cloud-storage>=1.13.0 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.25.0)\n", "Requirement already satisfied, skipping upgrade: python-dateutil in /usr/local/lib/python3.6/dist-packages (from kfp) (2.8.1)\n", "Requirement already satisfied, skipping upgrade: google-auth>=1.6.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.11.0)\n", "Requirement already satisfied, skipping upgrade: argo-models==2.2.1a in ./.local/lib/python3.6/site-packages (from kfp) (2.2.1a0)\n", "Requirement already satisfied, skipping upgrade: urllib3<1.25,>=1.15 in ./.local/lib/python3.6/site-packages (from kfp) (1.24.3)\n", "Requirement already satisfied, skipping upgrade: cloudpickle==1.1.1 in ./.local/lib/python3.6/site-packages (from kfp) (1.1.1)\n", "Requirement already satisfied, skipping upgrade: kubernetes<=10.0.0,>=8.0.0 in ./.local/lib/python3.6/site-packages (from kfp) (10.0.0)\n", "Requirement already satisfied, skipping upgrade: click==7.0 in ./.local/lib/python3.6/site-packages (from kfp) (7.0)\n", "Requirement already satisfied, skipping upgrade: certifi in /usr/local/lib/python3.6/dist-packages (from kfp) (2019.11.28)\n", "Requirement already satisfied, skipping upgrade: PyYAML in /usr/local/lib/python3.6/dist-packages (from kfp) (5.3)\n", "Requirement already satisfied, skipping upgrade: kfp-server-api<=0.1.40,>=0.1.18 in ./.local/lib/python3.6/site-packages (from kfp) (0.1.40)\n", "Requirement already satisfied, skipping upgrade: six>=1.10 in /usr/lib/python3/dist-packages (from kfp) (1.11.0)\n", "Requirement already satisfied, skipping upgrade: PyJWT>=1.6.4 in ./.local/lib/python3.6/site-packages (from kfp) (1.7.1)\n", "Requirement already satisfied, skipping upgrade: Deprecated in ./.local/lib/python3.6/site-packages (from kfp) (1.2.7)\n", "Requirement already satisfied, skipping upgrade: requests-toolbelt>=0.8.0 in ./.local/lib/python3.6/site-packages (from kfp) (0.9.1)\n", "Requirement already satisfied, skipping upgrade: cryptography>=2.4.2 in ./.local/lib/python3.6/site-packages (from kfp) (2.8)\n", "Requirement already satisfied, skipping upgrade: tabulate==0.8.3 in ./.local/lib/python3.6/site-packages (from kfp) (0.8.3)\n", "Requirement already satisfied, skipping upgrade: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (1.4.0)\n", "Requirement already satisfied, skipping upgrade: attrs>=17.4.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (19.3.0)\n", "Requirement already satisfied, skipping upgrade: pyrsistent>=0.14.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (0.15.7)\n", "Requirement already satisfied, skipping upgrade: setuptools in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (45.1.0)\n", "Requirement already satisfied, skipping upgrade: google-resumable-media<0.6dev,>=0.5.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (0.5.0)\n", "Requirement already satisfied, skipping upgrade: google-cloud-core<2.0dev,>=1.2.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (1.3.0)\n", "Requirement already satisfied, skipping upgrade: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) 
(4.0.0)\n", "Requirement already satisfied, skipping upgrade: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (0.2.8)\n", "Requirement already satisfied, skipping upgrade: rsa<4.1,>=3.1.4 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0)\n", "Requirement already satisfied, skipping upgrade: requests-oauthlib in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (1.3.0)\n", "Requirement already satisfied, skipping upgrade: requests in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (2.22.0)\n", "Requirement already satisfied, skipping upgrade: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (0.57.0)\n", "Requirement already satisfied, skipping upgrade: wrapt<2,>=1.10 in /usr/local/lib/python3.6/dist-packages (from Deprecated->kfp) (1.11.2)\n", "Requirement already satisfied, skipping upgrade: cffi!=1.11.3,>=1.8 in ./.local/lib/python3.6/site-packages (from cryptography>=2.4.2->kfp) (1.14.0)\n", "Requirement already satisfied, skipping upgrade: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata; python_version < \"3.8\"->jsonschema>=3.0.1->kfp) (2.1.0)\n", "Requirement already satisfied, skipping upgrade: google-api-core<2.0.0dev,>=1.16.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.16.0)\n", "Requirement already satisfied, skipping upgrade: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.6/dist-packages (from pyasn1-modules>=0.2.1->google-auth>=1.6.1->kfp) (0.4.8)\n", "Requirement already satisfied, skipping upgrade: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib->kubernetes<=10.0.0,>=8.0.0->kfp) (3.1.0)\n", "Requirement already satisfied, skipping upgrade: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->kubernetes<=10.0.0,>=8.0.0->kfp) (3.0.4)\n", "Requirement already satisfied, skipping upgrade: idna<2.9,>=2.5 in /usr/lib/python3/dist-packages (from requests->kubernetes<=10.0.0,>=8.0.0->kfp) (2.6)\n", "Requirement already satisfied, skipping upgrade: pycparser in ./.local/lib/python3.6/site-packages (from cffi!=1.11.3,>=1.8->cryptography>=2.4.2->kfp) (2.19)\n", "Requirement already satisfied, skipping upgrade: protobuf>=3.4.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (3.11.2)\n", "Requirement already satisfied, skipping upgrade: pytz in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (2019.3)\n", "Requirement already satisfied, skipping upgrade: googleapis-common-protos<2.0dev,>=1.6.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.51.0)\n" ] } ], "source": [ "!pip install kfp --upgrade --user\n", "\n", "import kfp\n", "from kfp import dsl\n", "from kfp.components import func_to_container_op, InputPath, OutputPath" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Functions" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "@func_to_container_op\n", "def get_random_int_op(minimum: int, maximum: int) -> int:\n", " \"\"\"Generate a random number 
between minimum and maximum (inclusive).\"\"\"\n", " import random\n", " result = random.randint(minimum, maximum)\n", " print(result)\n", " return result\n", "\n", "@func_to_container_op\n", "def process_small_op(data: int):\n", " \"\"\"Process small numbers.\"\"\"\n", " print(\"Processing small result\", data)\n", " return\n", "\n", "@func_to_container_op\n", "def process_medium_op(data: int):\n", " \"\"\"Process medium numbers.\"\"\"\n", " print(\"Processing medium result\", data)\n", " return\n", "\n", "@func_to_container_op\n", "def process_large_op(data: int):\n", " \"\"\"Process large numbers.\"\"\"\n", " print(\"Processing large result\", data)\n", " return" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Conditional pipeline" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "@dsl.pipeline(\n", " name='Conditional execution pipeline',\n", " description='Shows how to use dsl.Condition().'\n", ")\n", "def conditional_pipeline():\n", " number = get_random_int_op(0, 100).output\n", " with dsl.Condition(number < 10):\n", " process_small_op(number)\n", " with dsl.Condition(number > 10 and number < 50):\n", " process_medium_op(number)\n", " with dsl.Condition(number > 50):\n", " process_large_op(number)\n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Submit the pipeline for execution:" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "Experiment link here" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Run link here" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "RunPipelineResult(run_id=293a92c5-50b2-4a96-bbd4-ebc85106f337)" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kfp.Client().create_run_from_pipeline_func(conditional_pipeline, arguments={})" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: ch04/code/ControlStructures.py ================================================ #!/usr/bin/env python # coding: utf-8 # # Simple Control structure # # Shows how to use conditional execution # In[1]: get_ipython().system('pip install kfp --upgrade --user') import kfp from kfp import dsl from kfp.components import func_to_container_op, InputPath, OutputPath # # Functions # In[2]: @func_to_container_op def get_random_int_op(minimum: int, maximum: int) -> int: """Generate a random number between minimum and maximum (inclusive).""" import random result = random.randint(minimum, maximum) print(result) return result @func_to_container_op def process_small_op(data: int): """Process small numbers.""" print("Processing small result", data) return @func_to_container_op def process_medium_op(data: int): """Process medium numbers.""" print("Processing medium result", data) return @func_to_container_op def process_large_op(data: int): """Process large numbers.""" print("Processing large result", data) return # # Conditional pipeline # In[3]: 
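# Illustrative aside (an assumption, not part of the original notebook export):
# dsl.Condition only gates the ops created inside its `with` block, and the
# comparison is evaluated at run time against an upstream task's output.
# A minimal sketch reusing the ops defined above (names kept, the pipeline
# itself is hypothetical):
#
#   @dsl.pipeline(name='Condition sketch')
#   def condition_sketch():
#       flip = get_random_int_op(0, 1)
#       with dsl.Condition(flip.output == 1):
#           process_small_op(flip.output)
#
# The book's conditional pipeline follows.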
@dsl.pipeline(name='Conditional execution pipeline', description='Shows how to use dsl.Condition().') def conditional_pipeline(): number = get_random_int_op(0, 100).output with dsl.Condition(number < 10): process_small_op(number) with dsl.Condition(number > 10 and number < 50): process_medium_op(number) with dsl.Condition(number > 50): process_large_op(number) # # Submit the pipeline for execution: # In[4]: kfp.Client().create_run_from_pipeline_func(conditional_pipeline, arguments={}) # In[ ]: ================================================ FILE: ch04/code/Lightweight Pipeline.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Setup" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already up-to-date: kfp in ./.local/lib/python3.6/site-packages (0.2.2.1)\n", "Requirement already satisfied, skipping upgrade: PyYAML in /usr/local/lib/python3.6/dist-packages (from kfp) (5.3)\n", "Requirement already satisfied, skipping upgrade: requests-toolbelt>=0.8.0 in ./.local/lib/python3.6/site-packages (from kfp) (0.9.1)\n", "Requirement already satisfied, skipping upgrade: click==7.0 in ./.local/lib/python3.6/site-packages (from kfp) (7.0)\n", "Requirement already satisfied, skipping upgrade: Deprecated in ./.local/lib/python3.6/site-packages (from kfp) (1.2.7)\n", "Requirement already satisfied, skipping upgrade: kubernetes<=10.0.0,>=8.0.0 in ./.local/lib/python3.6/site-packages (from kfp) (10.0.0)\n", "Requirement already satisfied, skipping upgrade: six>=1.10 in /usr/lib/python3/dist-packages (from kfp) (1.11.0)\n", "Requirement already satisfied, skipping upgrade: certifi in /usr/local/lib/python3.6/dist-packages (from kfp) (2019.11.28)\n", "Requirement already satisfied, skipping upgrade: google-cloud-storage>=1.13.0 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.25.0)\n", "Requirement already satisfied, skipping upgrade: PyJWT>=1.6.4 in ./.local/lib/python3.6/site-packages (from kfp) (1.7.1)\n", "Requirement already satisfied, skipping upgrade: cryptography>=2.4.2 in ./.local/lib/python3.6/site-packages (from kfp) (2.8)\n", "Requirement already satisfied, skipping upgrade: google-auth>=1.6.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.11.0)\n", "Requirement already satisfied, skipping upgrade: cloudpickle==1.1.1 in ./.local/lib/python3.6/site-packages (from kfp) (1.1.1)\n", "Requirement already satisfied, skipping upgrade: kfp-server-api<=0.1.40,>=0.1.18 in ./.local/lib/python3.6/site-packages (from kfp) (0.1.40)\n", "Requirement already satisfied, skipping upgrade: argo-models==2.2.1a in ./.local/lib/python3.6/site-packages (from kfp) (2.2.1a0)\n", "Requirement already satisfied, skipping upgrade: tabulate==0.8.3 in ./.local/lib/python3.6/site-packages (from kfp) (0.8.3)\n", "Requirement already satisfied, skipping upgrade: python-dateutil in /usr/local/lib/python3.6/dist-packages (from kfp) (2.8.1)\n", "Requirement already satisfied, skipping upgrade: urllib3<1.25,>=1.15 in ./.local/lib/python3.6/site-packages (from kfp) (1.24.3)\n", "Requirement already satisfied, skipping upgrade: jsonschema>=3.0.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (3.2.0)\n", "Requirement already satisfied, skipping upgrade: requests<3.0.0,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from requests-toolbelt>=0.8.0->kfp) (2.22.0)\n", "Requirement already satisfied, skipping upgrade: wrapt<2,>=1.10 in 
/usr/local/lib/python3.6/dist-packages (from Deprecated->kfp) (1.11.2)\n", "Requirement already satisfied, skipping upgrade: setuptools>=21.0.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (45.1.0)\n", "Requirement already satisfied, skipping upgrade: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (0.57.0)\n", "Requirement already satisfied, skipping upgrade: requests-oauthlib in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (1.3.0)\n", "Requirement already satisfied, skipping upgrade: google-cloud-core<2.0dev,>=1.2.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (1.3.0)\n", "Requirement already satisfied, skipping upgrade: google-resumable-media<0.6dev,>=0.5.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (0.5.0)\n", "Requirement already satisfied, skipping upgrade: cffi!=1.11.3,>=1.8 in ./.local/lib/python3.6/site-packages (from cryptography>=2.4.2->kfp) (1.14.0)\n", "Requirement already satisfied, skipping upgrade: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0.0)\n", "Requirement already satisfied, skipping upgrade: rsa<4.1,>=3.1.4 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0)\n", "Requirement already satisfied, skipping upgrade: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (0.2.8)\n", "Requirement already satisfied, skipping upgrade: attrs>=17.4.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (19.3.0)\n", "Requirement already satisfied, skipping upgrade: pyrsistent>=0.14.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (0.15.7)\n", "Requirement already satisfied, skipping upgrade: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (1.4.0)\n", "Requirement already satisfied, skipping upgrade: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.0.1->requests-toolbelt>=0.8.0->kfp) (3.0.4)\n", "Requirement already satisfied, skipping upgrade: idna<2.9,>=2.5 in /usr/lib/python3/dist-packages (from requests<3.0.0,>=2.0.1->requests-toolbelt>=0.8.0->kfp) (2.6)\n", "Requirement already satisfied, skipping upgrade: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib->kubernetes<=10.0.0,>=8.0.0->kfp) (3.1.0)\n", "Requirement already satisfied, skipping upgrade: google-api-core<2.0.0dev,>=1.16.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.16.0)\n", "Requirement already satisfied, skipping upgrade: pycparser in ./.local/lib/python3.6/site-packages (from cffi!=1.11.3,>=1.8->cryptography>=2.4.2->kfp) (2.19)\n", "Requirement already satisfied, skipping upgrade: pyasn1>=0.1.3 in /usr/local/lib/python3.6/dist-packages (from rsa<4.1,>=3.1.4->google-auth>=1.6.1->kfp) (0.4.8)\n", "Requirement already satisfied, skipping upgrade: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata; python_version < \"3.8\"->jsonschema>=3.0.1->kfp) (2.1.0)\n", "Requirement already satisfied, skipping upgrade: googleapis-common-protos<2.0dev,>=1.6.0 in /usr/local/lib/python3.6/dist-packages (from 
google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.51.0)\n", "Requirement already satisfied, skipping upgrade: pytz in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (2019.3)\n", "Requirement already satisfied, skipping upgrade: protobuf>=3.4.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (3.11.2)\n" ] } ], "source": [ "!pip install kfp --upgrade --user\n", "\n", "import kfp \n", "from kfp import compiler\n", "import kfp.dsl as dsl\n", "import kfp.notebook\n", "import kfp.components as comp\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Simple function that just add two numbers:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "#Define a Python function\n", "def add(a: float, b: float) -> float:\n", " '''Calculates sum of two arguments'''\n", " return a + b" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Convert the function to a pipeline operation" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "add_op = comp.func_to_container_op(add)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A bit more advanced function which demonstrates how to use imports, helper functions and produce multiple outputs." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from typing import NamedTuple\n", "def my_divmod(dividend: float, divisor:float) -> NamedTuple('MyDivmodOutput', [('quotient', float), ('remainder', float)]):\n", " '''Divides two numbers and calculate the quotient and remainder'''\n", " #Imports inside a component function:\n", " import numpy as np\n", "\n", " #This function demonstrates how to use nested functions inside a component function:\n", " def divmod_helper(dividend, divisor):\n", " return np.divmod(dividend, divisor)\n", "\n", " (quotient, remainder) = divmod_helper(dividend, divisor)\n", "\n", " from collections import namedtuple\n", " divmod_output = namedtuple('MyDivmodOutput', ['quotient', 'remainder'])\n", " return divmod_output(quotient, remainder)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Test running the python function directly" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "MyDivmodOutput(quotient=14, remainder=2)" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "my_divmod(100, 7)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Convert the function to a pipeline operation" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "divmod_op = comp.func_to_container_op(my_divmod, base_image='tensorflow/tensorflow:1.14.0-py3')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Define the pipeline\n", "Pipeline function has to be decorated with the @dsl.pipeline decorator" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "@dsl.pipeline(\n", " name='Calculation pipeline',\n", " description='A toy pipeline that performs arithmetic calculations.'\n", ")\n", "def calc_pipeline(\n", " a='a',\n", " b='7',\n", " c='17',\n", "):\n", " #Passing pipeline parameter and a constant value as operation arguments\n", " add_task = 
add_op(a, 4) #Returns a dsl.ContainerOp class instance. \n", " \n", " #Passing a task output reference as operation arguments\n", " #For an operation with a single return value, the output reference can be accessed using `task.output` or `task.outputs['output_name']` syntax\n", " divmod_task = divmod_op(add_task.output, b)\n", "\n", " #For an operation with a multiple return values, the output references can be accessed using `task.outputs['output_name']` syntax\n", " result_task = add_op(divmod_task.outputs['quotient'], c)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Submit the pipeline for execution" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "Experiment link here" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Run link here" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "RunPipelineResult(run_id=87276776-0c3a-4d4e-99d0-4563b7f42fa5)" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "client = kfp.Client()\n", "\n", "#Specify pipeline argument values\n", "arguments = {'a': '7', 'b': '8'}\n", "\n", "#Submit a pipeline run\n", "client.create_run_from_pipeline_func(calc_pipeline, arguments=arguments)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: ch04/code/Lightweight Pipeline.py ================================================ #!/usr/bin/env python # coding: utf-8 # # Setup # In[1]: get_ipython().system('pip install kfp --upgrade --user') import kfp from kfp import compiler import kfp.dsl as dsl import kfp.notebook import kfp.components as comp # Simple function that just add two numbers: # In[2]: #Define a Python function def add(a: float, b: float) -> float: '''Calculates sum of two arguments''' return a + b # Convert the function to a pipeline operation # In[3]: add_op = comp.func_to_container_op(add) # A bit more advanced function which demonstrates how to use imports, helper functions and produce multiple outputs. 
# In[4]:

from typing import NamedTuple


def my_divmod(
    dividend: float, divisor: float
) -> NamedTuple('MyDivmodOutput', [('quotient', float), ('remainder', float)]):
    '''Divides two numbers and calculate the quotient and remainder'''
    #Imports inside a component function:
    import numpy as np

    #This function demonstrates how to use nested functions inside a component function:
    def divmod_helper(dividend, divisor):
        return np.divmod(dividend, divisor)

    (quotient, remainder) = divmod_helper(dividend, divisor)

    from collections import namedtuple
    divmod_output = namedtuple('MyDivmodOutput', ['quotient', 'remainder'])
    return divmod_output(quotient, remainder)


# Test running the python function directly

# In[5]:

my_divmod(100, 7)

# Convert the function to a pipeline operation

# In[6]:

divmod_op = comp.func_to_container_op(
    my_divmod, base_image='tensorflow/tensorflow:1.14.0-py3')

# Define the pipeline
# Pipeline function has to be decorated with the @dsl.pipeline decorator

# In[7]:


@dsl.pipeline(
    name='Calculation pipeline',
    description='A toy pipeline that performs arithmetic calculations.')
def calc_pipeline(
    a='a',
    b='7',
    c='17',
):
    #Passing pipeline parameter and a constant value as operation arguments
    add_task = add_op(a, 4)  # Returns a dsl.ContainerOp class instance.

    #Passing a task output reference as operation arguments
    #For an operation with a single return value, the output reference can be accessed using `task.output` or `task.outputs['output_name']` syntax
    divmod_task = divmod_op(add_task.output, b)

    #For an operation with a multiple return values, the output references can be accessed using `task.outputs['output_name']` syntax
    result_task = add_op(divmod_task.outputs['quotient'], c)


# Submit the pipeline for execution

# In[8]:

client = kfp.Client()

#Specify pipeline argument values
arguments = {'a': '7', 'b': '8'}

#Submit a pipeline run
client.create_run_from_pipeline_func(calc_pipeline, arguments=arguments)

# In[ ]:

================================================
FILE: ch04/code/RecommenderPipeline.ipynb
================================================
{ "cells": [
  { "cell_type": "markdown", "metadata": {}, "source": [
    "# Kubeflow pipeline\n",
    "This is a fairly simple pipeline, containing sequential steps:\n",
    "\n",
    "1. Update data - implemented by lightbend/recommender-data-update-publisher:0.2 image\n",
    "2. Run model training. Ideally we would run TFJob, but due to the current limitations for pipelines, we will directly use an image implementing training lightbend/ml-tf-recommender:0.1\n",
    "3.
Update serving model - implemented by lightbend/recommender-model-publisher:0.2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Setup" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already up-to-date: kubernetes in ./.local/lib/python3.6/site-packages (10.0.1)\n", "Requirement already satisfied, skipping upgrade: pyyaml>=3.12 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (5.3)\n", "Requirement already satisfied, skipping upgrade: six>=1.9.0 in /usr/lib/python3/dist-packages (from kubernetes) (1.11.0)\n", "Requirement already satisfied, skipping upgrade: urllib3>=1.24.2 in ./.local/lib/python3.6/site-packages (from kubernetes) (1.24.3)\n", "Requirement already satisfied, skipping upgrade: requests-oauthlib in /usr/local/lib/python3.6/dist-packages (from kubernetes) (1.3.0)\n", "Requirement already satisfied, skipping upgrade: certifi>=14.05.14 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (2019.11.28)\n", "Requirement already satisfied, skipping upgrade: python-dateutil>=2.5.3 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (2.8.1)\n", "Requirement already satisfied, skipping upgrade: setuptools>=21.0.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (45.1.0)\n", "Requirement already satisfied, skipping upgrade: requests in /usr/local/lib/python3.6/dist-packages (from kubernetes) (2.22.0)\n", "Requirement already satisfied, skipping upgrade: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (0.57.0)\n", "Requirement already satisfied, skipping upgrade: google-auth>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (1.11.0)\n", "Requirement already satisfied, skipping upgrade: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib->kubernetes) (3.1.0)\n", "Requirement already satisfied, skipping upgrade: idna<2.9,>=2.5 in /usr/lib/python3/dist-packages (from requests->kubernetes) (2.6)\n", "Requirement already satisfied, skipping upgrade: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->kubernetes) (3.0.4)\n", "Requirement already satisfied, skipping upgrade: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.0.1->kubernetes) (0.2.8)\n", "Requirement already satisfied, skipping upgrade: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.0.1->kubernetes) (4.0.0)\n", "Requirement already satisfied, skipping upgrade: rsa<4.1,>=3.1.4 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.0.1->kubernetes) (4.0)\n", "Requirement already satisfied, skipping upgrade: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.6/dist-packages (from pyasn1-modules>=0.2.1->google-auth>=1.0.1->kubernetes) (0.4.8)\n", "Requirement already up-to-date: kfp in ./.local/lib/python3.6/site-packages (0.2.2.1)\n", "Requirement already satisfied, skipping upgrade: PyJWT>=1.6.4 in ./.local/lib/python3.6/site-packages (from kfp) (1.7.1)\n", "Requirement already satisfied, skipping upgrade: requests-toolbelt>=0.8.0 in ./.local/lib/python3.6/site-packages (from kfp) (0.9.1)\n", "Requirement already satisfied, skipping upgrade: python-dateutil in /usr/local/lib/python3.6/dist-packages (from kfp) (2.8.1)\n", "Requirement already satisfied, skipping upgrade: PyYAML in /usr/local/lib/python3.6/dist-packages (from kfp) (5.3)\n", "Requirement already 
satisfied, skipping upgrade: kfp-server-api<=0.1.40,>=0.1.18 in ./.local/lib/python3.6/site-packages (from kfp) (0.1.40)\n", "Requirement already satisfied, skipping upgrade: google-cloud-storage>=1.13.0 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.25.0)\n", "Requirement already satisfied, skipping upgrade: jsonschema>=3.0.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (3.2.0)\n", "Requirement already satisfied, skipping upgrade: Deprecated in ./.local/lib/python3.6/site-packages (from kfp) (1.2.7)\n", "Requirement already satisfied, skipping upgrade: google-auth>=1.6.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.11.0)\n", "Collecting kubernetes<=10.0.0,>=8.0.0\n", " Using cached kubernetes-10.0.0-py2.py3-none-any.whl (1.5 MB)\n", "Requirement already satisfied, skipping upgrade: argo-models==2.2.1a in ./.local/lib/python3.6/site-packages (from kfp) (2.2.1a0)\n", "Requirement already satisfied, skipping upgrade: urllib3<1.25,>=1.15 in ./.local/lib/python3.6/site-packages (from kfp) (1.24.3)\n", "Requirement already satisfied, skipping upgrade: certifi in /usr/local/lib/python3.6/dist-packages (from kfp) (2019.11.28)\n", "Requirement already satisfied, skipping upgrade: tabulate==0.8.3 in ./.local/lib/python3.6/site-packages (from kfp) (0.8.3)\n", "Requirement already satisfied, skipping upgrade: click==7.0 in ./.local/lib/python3.6/site-packages (from kfp) (7.0)\n", "Requirement already satisfied, skipping upgrade: cloudpickle==1.1.1 in ./.local/lib/python3.6/site-packages (from kfp) (1.1.1)\n", "Requirement already satisfied, skipping upgrade: six>=1.10 in /usr/lib/python3/dist-packages (from kfp) (1.11.0)\n", "Requirement already satisfied, skipping upgrade: cryptography>=2.4.2 in ./.local/lib/python3.6/site-packages (from kfp) (2.8)\n", "Requirement already satisfied, skipping upgrade: requests<3.0.0,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from requests-toolbelt>=0.8.0->kfp) (2.22.0)\n", "Requirement already satisfied, skipping upgrade: google-resumable-media<0.6dev,>=0.5.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (0.5.0)\n", "Requirement already satisfied, skipping upgrade: google-cloud-core<2.0dev,>=1.2.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (1.3.0)\n", "Requirement already satisfied, skipping upgrade: setuptools in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (45.1.0)\n", "Requirement already satisfied, skipping upgrade: attrs>=17.4.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (19.3.0)\n", "Requirement already satisfied, skipping upgrade: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (1.4.0)\n", "Requirement already satisfied, skipping upgrade: pyrsistent>=0.14.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (0.15.7)\n", "Requirement already satisfied, skipping upgrade: wrapt<2,>=1.10 in /usr/local/lib/python3.6/dist-packages (from Deprecated->kfp) (1.11.2)\n", "Requirement already satisfied, skipping upgrade: rsa<4.1,>=3.1.4 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0)\n", "Requirement already satisfied, skipping upgrade: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0.0)\n", "Requirement already satisfied, skipping upgrade: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) 
(0.2.8)\n", "Requirement already satisfied, skipping upgrade: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (0.57.0)\n", "Requirement already satisfied, skipping upgrade: requests-oauthlib in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (1.3.0)\n", "Requirement already satisfied, skipping upgrade: cffi!=1.11.3,>=1.8 in ./.local/lib/python3.6/site-packages (from cryptography>=2.4.2->kfp) (1.14.0)\n", "Requirement already satisfied, skipping upgrade: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.0.1->requests-toolbelt>=0.8.0->kfp) (3.0.4)\n", "Requirement already satisfied, skipping upgrade: idna<2.9,>=2.5 in /usr/lib/python3/dist-packages (from requests<3.0.0,>=2.0.1->requests-toolbelt>=0.8.0->kfp) (2.6)\n", "Requirement already satisfied, skipping upgrade: google-api-core<2.0.0dev,>=1.16.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.16.0)\n", "Requirement already satisfied, skipping upgrade: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata; python_version < \"3.8\"->jsonschema>=3.0.1->kfp) (2.1.0)\n", "Requirement already satisfied, skipping upgrade: pyasn1>=0.1.3 in /usr/local/lib/python3.6/dist-packages (from rsa<4.1,>=3.1.4->google-auth>=1.6.1->kfp) (0.4.8)\n", "Requirement already satisfied, skipping upgrade: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib->kubernetes<=10.0.0,>=8.0.0->kfp) (3.1.0)\n", "Requirement already satisfied, skipping upgrade: pycparser in ./.local/lib/python3.6/site-packages (from cffi!=1.11.3,>=1.8->cryptography>=2.4.2->kfp) (2.19)\n", "Requirement already satisfied, skipping upgrade: googleapis-common-protos<2.0dev,>=1.6.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.51.0)\n", "Requirement already satisfied, skipping upgrade: protobuf>=3.4.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (3.11.2)\n", "Requirement already satisfied, skipping upgrade: pytz in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (2019.3)\n", "Installing collected packages: kubernetes\n", " Attempting uninstall: kubernetes\n", " Found existing installation: kubernetes 10.0.1\n", " Uninstalling kubernetes-10.0.1:\n", " Successfully uninstalled kubernetes-10.0.1\n", "Successfully installed kubernetes-10.0.0\n" ] } ], "source": [ "!pip install kubernetes --upgrade --user\n", "!pip install kfp --upgrade --user\n", "\n", "\n", "import kfp # the Pipelines SDK. This library is included with the notebook image.\n", "from kfp import compiler\n", "import kfp.dsl as dsl\n", "import kfp.notebook\n", "from kubernetes import client as k8s_client" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Create/Get an Experiment in the Kubeflow Pipeline System\n", "The Kubeflow Pipeline system requires an \"Experiment\" to group pipeline runs. You can create a new experiment, or call client.list_experiments() to get existing ones." 
] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "client = kfp.Client()\n", "client.list_experiments()\n", "#exp = client.create_experiment(name='mdupdate')\n", "exp = client.get_experiment(experiment_name ='mdupdate')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Define a Pipeline\n", "Authoring a pipeline is like authoring a normal Python function. The pipeline function describes the topology of the pipeline.\n", "\n", "Each step in the pipeline is typically a ContainerOp --- a simple class or function describing how to interact with a docker container image. In the pipeline, all the container images referenced in the pipeline are already built." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "@dsl.pipeline(\n", " name='Recommender model update',\n", " description='Demonstrate usage of pipelines for multi-step model update'\n", ")\n", "def recommender_pipeline():\n", " # Load new data\n", " data = dsl.ContainerOp(\n", " name='updatedata',\n", " image='lightbend/recommender-data-update-publisher:0.2') \\\n", " .add_env_variable(k8s_client.V1EnvVar(name='MINIO_URL',value='http://minio-service.kubeflow.svc.cluster.local:9000')) \\\n", " .add_env_variable(k8s_client.V1EnvVar(name='MINIO_KEY', value='minio')) \\\n", " .add_env_variable(k8s_client.V1EnvVar(name='MINIO_SECRET', value='minio123'))\n", " # Train the model\n", " train = dsl.ContainerOp(\n", " name='trainmodel',\n", " image='lightbend/ml-tf-recommender:0.1') \\\n", " .add_env_variable(k8s_client.V1EnvVar(name='MINIO_URL',value='minio-service.kubeflow.svc.cluster.local:9000')) \\\n", " .add_env_variable(k8s_client.V1EnvVar(name='MINIO_KEY', value='minio')) \\\n", " .add_env_variable(k8s_client.V1EnvVar(name='MINIO_SECRET', value='minio123'))\n", " train.after(data)\n", " # Publish new model model\n", " publish = dsl.ContainerOp(\n", " name='publishmodel',\n", " image='lightbend/recommender-model-publisher:0.2') \\\n", " .add_env_variable(k8s_client.V1EnvVar(name='MINIO_URL',value='http://minio-service.kubeflow.svc.cluster.local:9000')) \\\n", " .add_env_variable(k8s_client.V1EnvVar(name='MINIO_KEY', value='minio')) \\\n", " .add_env_variable(k8s_client.V1EnvVar(name='MINIO_SECRET', value='minio123')) \\\n", " .add_env_variable(k8s_client.V1EnvVar(name='KAFKA_BROKERS', value='cloudflow-kafka-brokers.cloudflow.svc.cluster.local:9092')) \\\n", " .add_env_variable(k8s_client.V1EnvVar(name='DEFAULT_RECOMMENDER_URL', value='http://recommendermodelserver.kubeflow.svc.cluster.local:8501')) \\\n", " .add_env_variable(k8s_client.V1EnvVar(name='ALTERNATIVE_RECOMMENDER_URL', value='http://recommendermodelserver1.kubeflow.svc.cluster.local:8501'))\n", " publish.after(train)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Compile pipeline" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "compiler.Compiler().compile(recommender_pipeline, 'pipeline.tar.gz')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Submit an experiment run" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "Run link here" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "run = client.run_pipeline(exp.id, 'pipeline1', 'pipeline.tar.gz')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", 
"language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: ch04/code/RecommenderPipeline.py ================================================ #!/usr/bin/env python # coding: utf-8 # # Kubeflow pipeline # This is a fairly simple pipeline, containing sequential steps: # # 1. Update data - implemented by lightbend/recommender-data-update-publisher:0.2 image # 2. Run model training. Ideally we would run TFJob, but due to the current limitations for pipelines, we will directly use an image implementing training lightbend/ml-tf-recommender:0.1 # 3. Update serving model - implemented by lightbend/recommender-model-publisher:0.2 # # Setup # In[1]: get_ipython().system('pip install kubernetes --upgrade --user') get_ipython().system('pip install kfp --upgrade --user') # the Pipelines SDK. This library is included with the notebook image. import kfp from kfp import compiler import kfp.dsl as dsl import kfp.notebook from kubernetes import client as k8s_client # # Create/Get an Experiment in the Kubeflow Pipeline System # The Kubeflow Pipeline system requires an "Experiment" to group pipeline runs. You can create a new experiment, or call client.list_experiments() to get existing ones. # In[3]: client = kfp.Client() client.list_experiments() #exp = client.create_experiment(name='mdupdate') exp = client.get_experiment(experiment_name='mdupdate') # # Define a Pipeline # Authoring a pipeline is like authoring a normal Python function. The pipeline function describes the topology of the pipeline. # # Each step in the pipeline is typically a ContainerOp --- a simple class or function describing how to interact with a docker container image. In the pipeline, all the container images referenced in the pipeline are already built. 
# In[4]: @dsl.pipeline( name='Recommender model update', description='Demonstrate usage of pipelines for multi-step model update') def recommender_pipeline(): # Load new data data = dsl.ContainerOp( name='updatedata', image='lightbend/recommender-data-update-publisher:0.2') \ .add_env_variable(k8s_client.V1EnvVar(name='MINIO_URL', value='http://minio-service.kubeflow.svc.cluster.local:9000')) \ .add_env_variable(k8s_client.V1EnvVar(name='MINIO_KEY', value='minio')) \ .add_env_variable(k8s_client.V1EnvVar(name='MINIO_SECRET', value='minio123')) # Train the model train = dsl.ContainerOp( name='trainmodel', image='lightbend/ml-tf-recommender:0.1') \ .add_env_variable(k8s_client.V1EnvVar(name='MINIO_URL', value='minio-service.kubeflow.svc.cluster.local:9000')) \ .add_env_variable(k8s_client.V1EnvVar(name='MINIO_KEY', value='minio')) \ .add_env_variable(k8s_client.V1EnvVar(name='MINIO_SECRET', value='minio123')) train.after(data) # Publish new model model publish = dsl.ContainerOp( name='publishmodel', image='lightbend/recommender-model-publisher:0.2') \ .add_env_variable(k8s_client.V1EnvVar(name='MINIO_URL', value='http://minio-service.kubeflow.svc.cluster.local:9000')) \ .add_env_variable(k8s_client.V1EnvVar(name='MINIO_KEY', value='minio')) \ .add_env_variable(k8s_client.V1EnvVar(name='MINIO_SECRET', value='minio123')) \ .add_env_variable(k8s_client.V1EnvVar(name='KAFKA_BROKERS', value='cloudflow-kafka-brokers.cloudflow.svc.cluster.local:9092')) \ .add_env_variable(k8s_client.V1EnvVar(name='DEFAULT_RECOMMENDER_URL', value='http://recommendermodelserver.kubeflow.svc.cluster.local:8501')) \ .add_env_variable(k8s_client.V1EnvVar(name='ALTERNATIVE_RECOMMENDER_URL', value='http://recommendermodelserver1.kubeflow.svc.cluster.local:8501')) publish.after(train) # # Compile pipeline # In[5]: compiler.Compiler().compile(recommender_pipeline, 'pipeline.tar.gz') # # Submit an experiment run # In[6]: run = client.run_pipeline(exp.id, 'pipeline1', 'pipeline.tar.gz') # In[ ]: ================================================ FILE: ch04/code/download_components.sh ================================================ #!/bin/bash #tag::dlPipelineRelease[] wget https://github.com/kubeflow/pipelines/archive/0.2.5.tar.gz tar -xvf 0.2.5.tar.gz #end::dlPipelineRelease[] ================================================ FILE: ch04/install/deployment.yaml ================================================ apiVersion: extensions/v1beta1 kind: Deployment metadata: labels: app: argo-ui app.kubernetes.io/component: argo app.kubernetes.io/instance: argo-v2.3.0 app.kubernetes.io/managed-by: kfctl app.kubernetes.io/name: argo app.kubernetes.io/part-of: kubeflow app.kubernetes.io/version: v2.3.0 kustomize.component: argo name: argo-ui namespace: kubeflow spec: progressDeadlineSeconds: 600 replicas: 1 revisionHistoryLimit: 10 selector: matchLabels: app: argo-ui app.kubernetes.io/component: argo app.kubernetes.io/instance: argo-v2.3.0 app.kubernetes.io/managed-by: kfctl app.kubernetes.io/name: argo app.kubernetes.io/part-of: kubeflow app.kubernetes.io/version: v2.3.0 kustomize.component: argo strategy: rollingUpdate: maxSurge: 25% maxUnavailable: 25% type: RollingUpdate template: metadata: annotations: sidecar.istio.io/inject: "false" creationTimestamp: null labels: app: argo-ui app.kubernetes.io/component: argo app.kubernetes.io/instance: argo-v2.3.0 app.kubernetes.io/managed-by: kfctl app.kubernetes.io/name: argo app.kubernetes.io/part-of: kubeflow app.kubernetes.io/version: v2.3.0 kustomize.component: argo spec: containers: - env: 
- name: ARGO_NAMESPACE valueFrom: fieldRef: apiVersion: v1 fieldPath: metadata.namespace - name: IN_CLUSTER value: "true" - name: ENABLE_WEB_CONSOLE value: "true" - name: BASE_HREF value: / image: argoproj/argoui:v2.3.0 imagePullPolicy: IfNotPresent name: argo-ui ports: - containerPort: 8001 name: ui protocol: TCP readinessProbe: failureThreshold: 3 httpGet: path: / port: 8001 scheme: HTTP periodSeconds: 10 successThreshold: 1 timeoutSeconds: 1 resources: {} terminationMessagePath: /dev/termination-log terminationMessagePolicy: File dnsPolicy: ClusterFirst restartPolicy: Always schedulerName: default-scheduler securityContext: {} serviceAccount: argo-ui serviceAccountName: argo-ui terminationGracePeriodSeconds: 30 ================================================ FILE: ch04/install/virtualservice.yaml ================================================ apiVersion: networking.istio.io/v1alpha3 kind: VirtualService metadata: name: argo-ui namespace: kubeflow spec: gateways: - kubeflow-gateway hosts: - '*' http: - match: - uri: prefix: /argo/ rewrite: uri: / route: - destination: host: argo-ui.kubeflow.svc.cluster.local port: number: 80 ================================================ FILE: ch06/MLflow.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# mlflow-energyforecast\n", "\n", "This is a showcase for ML Flow capabilities, based on the article\n", "http://the-odd-dataguy.com/be-more-efficient-to-produce-ml-models-with-mlflow\n", "and a github https://github.com/jeanmidevacc/mlflow-energyforecast\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting pandas\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/12/d1/a6502c2f5c15b50f5dd579fc1c52b47edf6f2e9f682aed917dd7565b3e60/pandas-1.0.0-cp36-cp36m-manylinux1_x86_64.whl (10.1MB)\n", "\u001b[K |████████████████████████████████| 10.1MB 3.2MB/s eta 0:00:01\n", "\u001b[?25hRequirement already satisfied, skipping upgrade: numpy>=1.13.3 in ./.local/lib/python3.6/site-packages (from pandas) (1.18.1)\n", "Requirement already satisfied, skipping upgrade: python-dateutil>=2.6.1 in /usr/local/lib/python3.6/dist-packages (from pandas) (2.8.0)\n", "Requirement already satisfied, skipping upgrade: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas) (2019.2)\n", "Requirement already satisfied, skipping upgrade: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.6.1->pandas) (1.11.0)\n", "Installing collected packages: pandas\n", " Found existing installation: pandas 0.25.3\n", " Uninstalling pandas-0.25.3:\n", " Successfully uninstalled pandas-0.25.3\n", "Successfully installed pandas-1.0.0\n", "\u001b[33mWARNING: You are using pip version 19.1.1, however version 20.0.2 is available.\n", "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n", "Collecting mlflow\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/65/33/5fe1559f7eb95e1fa2077df747ada7fd225045bad4e76bcdb53605e4b937/mlflow-1.6.0.tar.gz (15.9MB)\n", "\u001b[K |████████████████████████████████| 15.9MB 3.0MB/s eta 0:00:01\n", "\u001b[?25hRequirement already satisfied, skipping upgrade: alembic in ./.local/lib/python3.6/site-packages (from mlflow) (1.3.2)\n", "Requirement already satisfied, skipping upgrade: click>=7.0 in /usr/local/lib/python3.6/dist-packages (from mlflow) (7.0)\n", "Requirement already satisfied, skipping upgrade: 
cloudpickle in ./.local/lib/python3.6/site-packages (from mlflow) (1.1.1)\n", "Requirement already satisfied, skipping upgrade: databricks-cli>=0.8.7 in ./.local/lib/python3.6/site-packages (from mlflow) (0.9.1)\n", "Requirement already satisfied, skipping upgrade: requests>=2.17.3 in /usr/local/lib/python3.6/dist-packages (from mlflow) (2.22.0)\n", "Requirement already satisfied, skipping upgrade: six>=1.10.0 in /usr/lib/python3/dist-packages (from mlflow) (1.11.0)\n", "Requirement already satisfied, skipping upgrade: Flask in ./.local/lib/python3.6/site-packages (from mlflow) (1.1.1)\n", "Requirement already satisfied, skipping upgrade: numpy in ./.local/lib/python3.6/site-packages (from mlflow) (1.18.1)\n", "Requirement already satisfied, skipping upgrade: pandas in ./.local/lib/python3.6/site-packages (from mlflow) (1.0.0)\n", "Requirement already satisfied, skipping upgrade: python-dateutil in /usr/local/lib/python3.6/dist-packages (from mlflow) (2.8.0)\n", "Requirement already satisfied, skipping upgrade: protobuf>=3.6.0 in /usr/local/lib/python3.6/dist-packages (from mlflow) (3.8.0)\n", "Requirement already satisfied, skipping upgrade: gitpython>=2.1.0 in ./.local/lib/python3.6/site-packages (from mlflow) (3.0.5)\n", "Requirement already satisfied, skipping upgrade: pyyaml in /usr/local/lib/python3.6/dist-packages (from mlflow) (5.1.2)\n", "Requirement already satisfied, skipping upgrade: querystring_parser in ./.local/lib/python3.6/site-packages (from mlflow) (1.2.4)\n", "Requirement already satisfied, skipping upgrade: simplejson in ./.local/lib/python3.6/site-packages (from mlflow) (3.17.0)\n", "Requirement already satisfied, skipping upgrade: docker>=4.0.0 in /usr/local/lib/python3.6/dist-packages (from mlflow) (4.0.2)\n", "Requirement already satisfied, skipping upgrade: entrypoints in /usr/local/lib/python3.6/dist-packages (from mlflow) (0.3)\n", "Requirement already satisfied, skipping upgrade: sqlparse in ./.local/lib/python3.6/site-packages (from mlflow) (0.3.0)\n", "Requirement already satisfied, skipping upgrade: sqlalchemy in ./.local/lib/python3.6/site-packages (from mlflow) (1.3.12)\n", "Requirement already satisfied, skipping upgrade: gorilla in ./.local/lib/python3.6/site-packages (from mlflow) (0.3.0)\n", "Requirement already satisfied, skipping upgrade: prometheus-flask-exporter in ./.local/lib/python3.6/site-packages (from mlflow) (0.12.1)\n", "Requirement already satisfied, skipping upgrade: gunicorn in ./.local/lib/python3.6/site-packages (from mlflow) (20.0.4)\n", "Requirement already satisfied, skipping upgrade: Mako in ./.local/lib/python3.6/site-packages (from alembic->mlflow) (1.1.0)\n", "Requirement already satisfied, skipping upgrade: python-editor>=0.3 in ./.local/lib/python3.6/site-packages (from alembic->mlflow) (1.0.4)\n", "Requirement already satisfied, skipping upgrade: configparser>=0.3.5 in ./.local/lib/python3.6/site-packages (from databricks-cli>=0.8.7->mlflow) (4.0.2)\n", "Requirement already satisfied, skipping upgrade: tabulate>=0.7.7 in /usr/local/lib/python3.6/dist-packages (from databricks-cli>=0.8.7->mlflow) (0.8.3)\n", "Requirement already satisfied, skipping upgrade: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests>=2.17.3->mlflow) (3.0.4)\n", "Requirement already satisfied, skipping upgrade: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests>=2.17.3->mlflow) (2019.9.11)\n", "Requirement already satisfied, skipping upgrade: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in 
/usr/local/lib/python3.6/dist-packages (from requests>=2.17.3->mlflow) (1.24.3)\n", "Requirement already satisfied, skipping upgrade: idna<2.9,>=2.5 in /usr/lib/python3/dist-packages (from requests>=2.17.3->mlflow) (2.6)\n", "Requirement already satisfied, skipping upgrade: Jinja2>=2.10.1 in /usr/local/lib/python3.6/dist-packages (from Flask->mlflow) (2.10.1)\n", "Requirement already satisfied, skipping upgrade: itsdangerous>=0.24 in ./.local/lib/python3.6/site-packages (from Flask->mlflow) (1.1.0)\n", "Requirement already satisfied, skipping upgrade: Werkzeug>=0.15 in /usr/local/lib/python3.6/dist-packages (from Flask->mlflow) (0.15.4)\n", "Requirement already satisfied, skipping upgrade: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas->mlflow) (2019.2)\n", "Requirement already satisfied, skipping upgrade: setuptools in /usr/local/lib/python3.6/dist-packages (from protobuf>=3.6.0->mlflow) (41.0.1)\n", "Requirement already satisfied, skipping upgrade: gitdb2>=2.0.0 in ./.local/lib/python3.6/site-packages (from gitpython>=2.1.0->mlflow) (2.0.6)\n", "Requirement already satisfied, skipping upgrade: websocket-client>=0.32.0 in /usr/local/lib/python3.6/dist-packages (from docker>=4.0.0->mlflow) (0.56.0)\n", "Requirement already satisfied, skipping upgrade: prometheus-client in /usr/local/lib/python3.6/dist-packages (from prometheus-flask-exporter->mlflow) (0.7.1)\n", "Requirement already satisfied, skipping upgrade: MarkupSafe>=0.9.2 in /usr/local/lib/python3.6/dist-packages (from Mako->alembic->mlflow) (1.1.1)\n", "Requirement already satisfied, skipping upgrade: smmap2>=2.0.0 in ./.local/lib/python3.6/site-packages (from gitdb2>=2.0.0->gitpython>=2.1.0->mlflow) (2.0.5)\n", "Building wheels for collected packages: mlflow\n", " Building wheel for mlflow (setup.py) ... 
\u001b[?25ldone\n", "\u001b[?25h Stored in directory: /home/jovyan/.cache/pip/wheels/46/4e/83/e58b14b6d2d494783e31690de9572c5777882f675f480374b6\n", "Successfully built mlflow\n", "Installing collected packages: mlflow\n", " Found existing installation: mlflow 1.5.0\n", " Uninstalling mlflow-1.5.0:\n", " Successfully uninstalled mlflow-1.5.0\n", "\u001b[33m WARNING: The script mlflow is installed in '/home/jovyan/.local/bin' which is not on PATH.\n", " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\n", "Successfully installed mlflow-1.6.0\n", "\u001b[33mWARNING: You are using pip version 19.1.1, however version 20.0.2 is available.\n", "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n", "Requirement already up-to-date: joblib in ./.local/lib/python3.6/site-packages (0.14.1)\n", "\u001b[33mWARNING: You are using pip version 19.1.1, however version 20.0.2 is available.\n", "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n", "Requirement already up-to-date: numpy in ./.local/lib/python3.6/site-packages (1.18.1)\n", "\u001b[33mWARNING: You are using pip version 19.1.1, however version 20.0.2 is available.\n", "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n", "Requirement already up-to-date: scipy in ./.local/lib/python3.6/site-packages (1.4.1)\n", "Requirement already satisfied, skipping upgrade: numpy>=1.13.3 in ./.local/lib/python3.6/site-packages (from scipy) (1.18.1)\n", "\u001b[33mWARNING: You are using pip version 19.1.1, however version 20.0.2 is available.\n", "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n", "Requirement already up-to-date: scikit-learn in ./.local/lib/python3.6/site-packages (0.22.1)\n", "Requirement already satisfied, skipping upgrade: numpy>=1.11.0 in ./.local/lib/python3.6/site-packages (from scikit-learn) (1.18.1)\n", "Requirement already satisfied, skipping upgrade: scipy>=0.17.0 in ./.local/lib/python3.6/site-packages (from scikit-learn) (1.4.1)\n", "Requirement already satisfied, skipping upgrade: joblib>=0.11 in ./.local/lib/python3.6/site-packages (from scikit-learn) (0.14.1)\n", "\u001b[33mWARNING: You are using pip version 19.1.1, however version 20.0.2 is available.\n", "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n", "Collecting boto3\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/d5/57/e9675a5a8d0ee586594ff19cb9a601334fbf24fa2fb29052d2a900ee5d23/boto3-1.11.9-py2.py3-none-any.whl (128kB)\n", "\u001b[K |████████████████████████████████| 133kB 3.5MB/s eta 0:00:01\n", "\u001b[?25hCollecting botocore<1.15.0,>=1.14.9 (from boto3)\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/64/4c/b0b0d3b6f84a05f9135051b56d3eb8708012a289c4b82ee21c8c766f47b5/botocore-1.14.9-py2.py3-none-any.whl (5.9MB)\n", "\u001b[K |████████████████████████████████| 5.9MB 11.6MB/s eta 0:00:01\n", "\u001b[?25hRequirement already satisfied, skipping upgrade: jmespath<1.0.0,>=0.7.1 in ./.local/lib/python3.6/site-packages (from boto3) (0.9.4)\n", "Requirement already satisfied, skipping upgrade: s3transfer<0.4.0,>=0.3.0 in ./.local/lib/python3.6/site-packages (from boto3) (0.3.0)\n", "Requirement already satisfied, skipping upgrade: python-dateutil<3.0.0,>=2.1 in /usr/local/lib/python3.6/dist-packages (from botocore<1.15.0,>=1.14.9->boto3) (2.8.0)\n", "Requirement already satisfied, 
skipping upgrade: docutils<0.16,>=0.10 in ./.local/lib/python3.6/site-packages (from botocore<1.15.0,>=1.14.9->boto3) (0.15.2)\n", "Requirement already satisfied, skipping upgrade: urllib3<1.26,>=1.20 in /usr/local/lib/python3.6/dist-packages (from botocore<1.15.0,>=1.14.9->boto3) (1.24.3)\n", "Requirement already satisfied, skipping upgrade: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.15.0,>=1.14.9->boto3) (1.11.0)\n", "Installing collected packages: botocore, boto3\n", " Found existing installation: botocore 1.14.4\n", " Uninstalling botocore-1.14.4:\n", " Successfully uninstalled botocore-1.14.4\n", " Found existing installation: boto3 1.11.4\n", " Uninstalling boto3-1.11.4:\n", " Successfully uninstalled boto3-1.11.4\n", "Successfully installed boto3-1.11.9 botocore-1.14.9\n", "\u001b[33mWARNING: You are using pip version 19.1.1, however version 20.0.2 is available.\n", "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n" ] } ], "source": [ "!pip install pandas --upgrade --user\n", "!pip install mlflow --upgrade --user\n", "!pip install joblib --upgrade --user\n", "!pip install numpy --upgrade --user \n", "!pip install scipy --upgrade --user \n", "!pip install scikit-learn --upgrade --user\n", "!pip install boto3 --upgrade --user" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import time\n", "import json\n", "import os\n", "from joblib import Parallel, delayed\n", "\n", "import pandas as pd\n", "import numpy as np\n", "import scipy\n", "\n", "from sklearn.model_selection import train_test_split, KFold\n", "from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score\n", "from sklearn.exceptions import ConvergenceWarning\n", "\n", "import mlflow\n", "import mlflow.sklearn\n", "from mlflow.tracking import MlflowClient\n", "\n", "from warnings import simplefilter\n", "simplefilter(action='ignore', category = FutureWarning)\n", "simplefilter(action='ignore', category = ConvergenceWarning)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# Ensure Minio access\n", "os.environ['MLFLOW_S3_ENDPOINT_URL'] = 'http://minio-service.kubeflow.svc.cluster.local:9000'\n", "os.environ['AWS_ACCESS_KEY_ID'] = 'minio'\n", "os.environ['AWS_SECRET_ACCESS_KEY'] = 'minio123'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Data preparation" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# Collect the data \n", "df_nationalconsumption_electricity_daily = pd.read_csv(\"https://raw.githubusercontent.com/jeanmidevacc/mlflow-energyforecast/master/data/rtu_data.csv\")\n", "df_nationalconsumption_electricity_daily.set_index([\"day\"], inplace = True)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Size of the training set : 1081\n", "Size of the testing set : 233\n" ] } ], "source": [ "# Prepare the training set and the testing set\n", "df_trainvalidate_energyconsumption = df_nationalconsumption_electricity_daily[df_nationalconsumption_electricity_daily[\"datastatus\"] == \"Définitif\"]\n", "del df_trainvalidate_energyconsumption[\"datastatus\"]\n", "\n", "df_test_energyconsumption = df_nationalconsumption_electricity_daily[df_nationalconsumption_electricity_daily[\"datastatus\"] == \"Consolidé\"]\n", "del df_test_energyconsumption[\"datastatus\"]\n", "\n", 
"print(\"Size of the training set : \",len(df_trainvalidate_energyconsumption))\n", "print(\"Size of the testing set : \",len(df_test_energyconsumption))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Output to predict : dailyconsumption\n", "Inputs for the prediction : ['weekday', 'week', 'month', 'year', 'avg_min_temperature', 'avg_max_temperature', 'avg_mean_temperature', 'wavg_min_temperature', 'wavg_max_temperature', 'wavg_mean_temperature', 'is_holiday']\n" ] } ], "source": [ "# Define the inputs and the output\n", "output = \"dailyconsumption\"\n", "allinputs = list(df_trainvalidate_energyconsumption.columns)\n", "allinputs.remove(output)\n", "\n", "print(\"Output to predict : \", output)\n", "print(\"Inputs for the prediction : \", allinputs)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# Build different set of featurws for the model\n", "possible_inputs = {\n", " \"all\" : allinputs,\n", " \"only_allday_inputs\" : [\"weekday\", \"month\", \"is_holiday\", \"week\"],\n", " \"only_allweatheravg_inputs\" : [\"avg_min_temperature\", \"avg_max_temperature\", \"avg_mean_temperature\",\"wavg_min_temperature\", \"wavg_max_temperature\", \"wavg_mean_temperature\"],\n", " \"only_meanweather_inputs_avg\" : [\"avg_mean_temperature\"],\n", " \"only_meanweather_inputs_wavg\" : [\"wavg_mean_temperature\"],\n", "}" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "# Prepare the output of the model\n", "array_output_train = np.array(df_trainvalidate_energyconsumption[output])\n", "array_output_test = np.array(df_test_energyconsumption[output])" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# connect to remote server\n", "remote_server_uri = \"http://mlflowserver.kubeflow.svc.cluster.local:5000\"\n", "mlflow.set_tracking_uri(remote_server_uri)\n", "# Launch the experiment on mlflow\n", "experiment_name = \"electricityconsumption-forecast\"\n", "mlflow.set_experiment(experiment_name)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# Define the evaluation function that will do the computation of the different metrics of accuracy (RMSE,MAE,R2)\n", "def evaluation_model(y_test, y_pred):\n", "\n", " rmse = np.sqrt(mean_squared_error(y_test, y_pred))\n", " mae = mean_absolute_error(y_test, y_pred)\n", " r2 = r2_score(y_test, y_pred)\n", "\n", " metrics = {\n", " \"rmse\" : rmse,\n", " \"r2\" : r2,\n", " \"mae\" : mae,\n", " }\n", " \n", " return metrics" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# KNN regressor" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "from sklearn.neighbors import KNeighborsRegressor\n", "\n", "def train_knnmodel(parameters, inputs, tags, log = False):\n", " with mlflow.start_run(nested = True):\n", " \n", " # Prepare the data\n", " array_inputs_train = np.array(df_trainvalidate_energyconsumption[inputs])\n", " array_inputs_test = np.array(df_test_energyconsumption[inputs])\n", " \n", " \n", " # Build the model\n", " tic = time.time()\n", " model = KNeighborsRegressor(parameters[\"nbr_neighbors\"], weights = parameters[\"weight_method\"])\n", " model.fit(array_inputs_train, array_output_train)\n", " duration_training = time.time() - tic\n", "\n", " # Make the prediction\n", " tic1 = time.time()\n", " prediction = 
model.predict(array_inputs_test)\n", " duration_prediction = time.time() - tic1\n", "\n", " # Evaluate the model prediction\n", " metrics = evaluation_model(array_output_test, prediction)\n", "\n", " # Log in the console\n", " if log:\n", " print(f\"KNN regressor:\")\n", " print(parameters)\n", " print(metrics)\n", "\n", " # Log in mlflow (parameter)\n", " mlflow.log_params(parameters)\n", "\n", " # Log in mlflow (metrics)\n", " metrics[\"duration_training\"] = duration_training\n", " metrics[\"duration_prediction\"] = duration_prediction\n", " mlflow.log_metrics(metrics)\n", "\n", " # log in mlflow (model)\n", " mlflow.sklearn.log_model(model, f\"model\")\n", " \n", " # Tag the model\n", " mlflow.set_tags(tags)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# Test the different combinations\n", "configurations = []\n", "for nbr_neighbors in [1,2,5,10]:\n", " for weight_method in ['uniform','distance']:\n", " for field in possible_inputs:\n", " parameters = {\n", " \"nbr_neighbors\" : nbr_neighbors,\n", " \"weight_method\" : weight_method\n", " }\n", "\n", " tags = {\n", " \"model\" : \"knn\",\n", " \"inputs\" : field\n", " }\n", " \n", " configurations.append([parameters, tags])\n", "\n", " train_knnmodel(parameters, possible_inputs[field], tags)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# MLP regressor" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "from sklearn.neural_network import MLPRegressor\n", "\n", "def train_mlpmodel(parameters, inputs, tags, log = False):\n", " with mlflow.start_run(nested = True):\n", " \n", " # Prepare the data\n", " array_inputs_train = np.array(df_trainvalidate_energyconsumption[inputs])\n", " array_inputs_test = np.array(df_test_energyconsumption[inputs])\n", " \n", " # Build the model\n", " tic = time.time()\n", "\n", " model = MLPRegressor(\n", " hidden_layer_sizes = parameters[\"hidden_layers\"],\n", " activation = parameters[\"activation\"],\n", " solver = parameters[\"solver\"],\n", " max_iter = parameters[\"nbr_iteration\"],\n", " random_state = 0)\n", " \n", " model.fit(array_inputs_train, array_output_train)\n", " duration_training = time.time() - tic\n", "\n", " # Make the prediction\n", " tic1 = time.time()\n", " prediction = model.predict(array_inputs_test)\n", " duration_prediction = time.time() - tic1\n", "\n", " # Evaluate the model prediction\n", " metrics = evaluation_model(array_output_test, prediction)\n", "\n", " # Log in the console\n", " if log:\n", " print(f\"Random forest regressor:\")\n", " print(parameters)\n", " print(metrics)\n", " \n", " # Log in mlflow (parameter)\n", " mlflow.log_params(parameters)\n", "\n", " # Log in mlflow (metrics)\n", " metrics[\"duration_training\"] = duration_training\n", " metrics[\"duration_prediction\"] = duration_prediction\n", " mlflow.log_metrics(metrics)\n", "\n", " # log in mlflow (model)\n", " mlflow.sklearn.log_model(model, f\"model\")\n", " \n", " # Tag the model\n", " mlflow.set_tags(tags) " ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "for hiddenlayers in [4,8,16]:\n", " for activation in [\"identity\",\"logistic\",]:\n", " for solver in [\"lbfgs\"]:\n", " for nbriteration in [10,100,1000]:\n", " for field in possible_inputs:\n", " parameters = {\n", " \"hidden_layers\" : hiddenlayers,\n", " \"activation\" : activation,\n", " \"solver\" : solver,\n", " \"nbr_iteration\" : nbriteration\n", " }\n", "\n", " tags = {\n", " 
\"model\" : \"mlp\",\n", " \"inputs\" : field\n", " }\n", "\n", " train_mlpmodel(parameters, possible_inputs[field], tags)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Use a handmade model (scipy approach)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "class PTG:\n", " def __init__(self, thresholds_x0, thresholds_a, thresholds_b):\n", " self.thresholds_x0 = thresholds_x0\n", " self.thresholds_a = thresholds_a\n", " self.thresholds_b = thresholds_b\n", " \n", " def get_ptgmodel(self, x, a, b, x0):\n", " return np.piecewise(x, [x < x0, x >= x0], [lambda x: a*x + b , lambda x : a*x0 + b])\n", " \n", " def fit(self, dfx, y):\n", " x = np.array(dfx)\n", " \n", " # Define the bounds\n", " bounds_min = [thresholds_a[0], thresholds_b[0], thresholds_x0[0]]\n", " bounds_max = [thresholds_a[1], thresholds_b[1], thresholds_x0[1]]\n", " bounds = (bounds_min, bounds_max)\n", "\n", " # Fit a model\n", " popt, pcov = scipy.optimize.curve_fit(self.get_ptgmodel, x, y, bounds = bounds)\n", "\n", " # Get the parameter of the model\n", " a = popt[0]\n", " b = popt[1]\n", " x0 = popt[2]\n", " \n", " self.coefficients = [a, b, x0]\n", " \n", " def predict(self,dfx):\n", " x = np.array(dfx)\n", " predictions = []\n", " for elt in x:\n", " forecast = self.get_ptgmodel(elt, self.coefficients[0], self.coefficients[1], self.coefficients[2])\n", " predictions.append(forecast)\n", " return np.array(predictions)\n", " \n", "def train_ptgmodel(parameters, inputs, tags, log = False):\n", " with mlflow.start_run(nested = True):\n", " \n", " # Prepare the data\n", " df_inputs_train = df_trainvalidate_energyconsumption[inputs[0]]\n", " df_inputs_test = df_test_energyconsumption[inputs[0]]\n", " \n", " \n", " # Build the model\n", " tic = time.time()\n", " \n", " model = PTG(parameters[\"thresholds_x0\"], parameters[\"thresholds_a\"], parameters[\"thresholds_b\"])\n", " \n", " model.fit(df_inputs_train, array_output_train)\n", " duration_training = time.time() - tic\n", "\n", " # Make the prediction\n", " tic1 = time.time()\n", " prediction = model.predict(df_inputs_test)\n", " duration_prediction = time.time() - tic1\n", "\n", " # Evaluate the model prediction\n", " metrics = evaluation_model(array_output_test, prediction)\n", "\n", " # Log in the console\n", " if log:\n", " print(f\"PTG:\")\n", " print(parameters)\n", " print(metrics)\n", " \n", " # Log in mlflow (parameter)\n", " mlflow.log_params(parameters) \n", "\n", " # Log in mlflow (metrics)\n", " metrics[\"duration_training\"] = duration_training\n", " metrics[\"duration_prediction\"] = duration_prediction\n", " mlflow.log_metrics(metrics)\n", "\n", " # log in mlflow (model)\n", " mlflow.sklearn.log_model(model, f\"model\")\n", " \n", " # Tag the model\n", " mlflow.set_tags(tags) " ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "# Define the parameters of the model\n", "thresholds_x0 = [0, 20]\n", "thresholds_a = [-200000, -50000]\n", "thresholds_b = [1000000, 3000000]\n", "\n", "parameters = {\n", " \"thresholds_x0\" : thresholds_x0,\n", " \"thresholds_a\" : thresholds_a,\n", " \"thresholds_b\" : thresholds_b\n", "}\n", "\n", "for field in [\"only_meanweather_inputs_avg\", \"only_meanweather_inputs_wavg\"]:\n", " \n", " tags = {\n", " \"model\" : \"ptg\",\n", " \"inputs\" : field\n", " }\n", " \n", " train_ptgmodel(parameters, possible_inputs[field], tags, log = False)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Evaluate mlflow 
results" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of runs done : 272\n" ] } ], "source": [ "# Select the run of the experiment\n", "df_runs = mlflow.search_runs(experiment_ids=\"0\")\n", "print(\"Number of runs done : \", len(df_runs))" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
run_idexperiment_idstatusartifact_uristart_timeend_timemetrics.r2metrics.maemetrics.duration_predictionmetrics.rmse...params.activationparams.nbr_iterationparams.hidden_layersparams.nbr_neighborsparams.weight_methodtags.modeltags.mlflow.source.typetags.inputstags.mlflow.usertags.mlflow.source.name
23850ee6409ad3a4778bb9d8cb59034df5d0FINISHEDs3://mlflow/mlflow/artifacts/0/50ee6409ad3a477...2020-01-17 18:17:47.448000+00:002020-01-17 18:17:47.929000+00:000.935956104040.3398090.003205134649.399348...NoneNoneNone5distanceknnLOCALalljovyan/usr/local/lib/python3.6/dist-packages/ipykern...
106614bcf7042ca465c8d86296f12ac9c090FINISHEDs3://mlflow/mlflow/artifacts/0/614bcf7042ca465...2020-01-31 15:21:29.978000+00:002020-01-31 15:21:30.503000+00:000.935956104040.3398090.003404134649.399348...NoneNoneNone5distanceknnLOCALalljovyan/usr/local/lib/python3.6/dist-packages/ipykern...
96b05667486f7d45779d23519eb0dbe24f0FINISHEDs3://mlflow/mlflow/artifacts/0/b05667486f7d457...2020-01-31 15:21:35.424000+00:002020-01-31 15:21:35.922000+00:000.935111105833.3586810.002732135534.759873...NoneNoneNone10distanceknnLOCALalljovyan/usr/local/lib/python3.6/dist-packages/ipykern...
228d279d728946e4b74811203a842d79df30FINISHEDs3://mlflow/mlflow/artifacts/0/d279d728946e4b7...2020-01-17 18:17:52.555000+00:002020-01-17 18:17:53.029000+00:000.935111105833.3586810.002863135534.759873...NoneNoneNone10distanceknnLOCALalljovyan/usr/local/lib/python3.6/dist-packages/ipykern...
11188af21719e0a408b91448f7ddd27e84c0FINISHEDs3://mlflow/mlflow/artifacts/0/88af21719e0a408...2020-01-31 15:21:27.338000+00:002020-01-31 15:21:27.947000+00:000.934465105793.7278970.002668136207.422483...NoneNoneNone5uniformknnLOCALalljovyan/usr/local/lib/python3.6/dist-packages/ipykern...
\n", "

5 rows × 25 columns

\n", "
" ], "text/plain": [ " run_id experiment_id status \\\n", "238 50ee6409ad3a4778bb9d8cb59034df5d 0 FINISHED \n", "106 614bcf7042ca465c8d86296f12ac9c09 0 FINISHED \n", "96 b05667486f7d45779d23519eb0dbe24f 0 FINISHED \n", "228 d279d728946e4b74811203a842d79df3 0 FINISHED \n", "111 88af21719e0a408b91448f7ddd27e84c 0 FINISHED \n", "\n", " artifact_uri \\\n", "238 s3://mlflow/mlflow/artifacts/0/50ee6409ad3a477... \n", "106 s3://mlflow/mlflow/artifacts/0/614bcf7042ca465... \n", "96 s3://mlflow/mlflow/artifacts/0/b05667486f7d457... \n", "228 s3://mlflow/mlflow/artifacts/0/d279d728946e4b7... \n", "111 s3://mlflow/mlflow/artifacts/0/88af21719e0a408... \n", "\n", " start_time end_time \\\n", "238 2020-01-17 18:17:47.448000+00:00 2020-01-17 18:17:47.929000+00:00 \n", "106 2020-01-31 15:21:29.978000+00:00 2020-01-31 15:21:30.503000+00:00 \n", "96 2020-01-31 15:21:35.424000+00:00 2020-01-31 15:21:35.922000+00:00 \n", "228 2020-01-17 18:17:52.555000+00:00 2020-01-17 18:17:53.029000+00:00 \n", "111 2020-01-31 15:21:27.338000+00:00 2020-01-31 15:21:27.947000+00:00 \n", "\n", " metrics.r2 metrics.mae metrics.duration_prediction metrics.rmse \\\n", "238 0.935956 104040.339809 0.003205 134649.399348 \n", "106 0.935956 104040.339809 0.003404 134649.399348 \n", "96 0.935111 105833.358681 0.002732 135534.759873 \n", "228 0.935111 105833.358681 0.002863 135534.759873 \n", "111 0.934465 105793.727897 0.002668 136207.422483 \n", "\n", " ... params.activation params.nbr_iteration params.hidden_layers \\\n", "238 ... None None None \n", "106 ... None None None \n", "96 ... None None None \n", "228 ... None None None \n", "111 ... None None None \n", "\n", " params.nbr_neighbors params.weight_method tags.model \\\n", "238 5 distance knn \n", "106 5 distance knn \n", "96 10 distance knn \n", "228 10 distance knn \n", "111 5 uniform knn \n", "\n", " tags.mlflow.source.type tags.inputs tags.mlflow.user \\\n", "238 LOCAL all jovyan \n", "106 LOCAL all jovyan \n", "96 LOCAL all jovyan \n", "228 LOCAL all jovyan \n", "111 LOCAL all jovyan \n", "\n", " tags.mlflow.source.name \n", "238 /usr/local/lib/python3.6/dist-packages/ipykern... \n", "106 /usr/local/lib/python3.6/dist-packages/ipykern... \n", "96 /usr/local/lib/python3.6/dist-packages/ipykern... \n", "228 /usr/local/lib/python3.6/dist-packages/ipykern... \n", "111 /usr/local/lib/python3.6/dist-packages/ipykern... 
\n", "\n", "[5 rows x 25 columns]" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Quick sorting to get the best models based on the RMSE metric\n", "df_runs.sort_values([\"metrics.rmse\"], ascending = True, inplace = True)\n", "df_runs.head()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'50ee6409ad3a4778bb9d8cb59034df5d'" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Get the best one\n", "runid_selected = df_runs.head(1)[\"run_id\"].values[0]\n", "runid_selected" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "celltoolbar": "Raw Cell Format", "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.8" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: ch06/MLflow.py ================================================ #!/usr/bin/env python # coding: utf-8 # # mlflow-energyforecast # # This is a showcase for ML Flow capabilities, based on the article # http://the-odd-dataguy.com/be-more-efficient-to-produce-ml-models-with-mlflow # and a github https://github.com/jeanmidevacc/mlflow-energyforecast # # In[2]: get_ipython().system('pip install pandas --upgrade --user') get_ipython().system('pip install mlflow --upgrade --user') get_ipython().system('pip install joblib --upgrade --user') get_ipython().system('pip install numpy --upgrade --user ') get_ipython().system('pip install scipy --upgrade --user ') get_ipython().system('pip install scikit-learn --upgrade --user') get_ipython().system('pip install boto3 --upgrade --user') # In[3]: import time import json import os from joblib import Parallel, delayed import pandas as pd import numpy as np import scipy from sklearn.model_selection import train_test_split, KFold from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score from sklearn.exceptions import ConvergenceWarning import mlflow import mlflow.sklearn from mlflow.tracking import MlflowClient from warnings import simplefilter simplefilter(action='ignore', category=FutureWarning) simplefilter(action='ignore', category=ConvergenceWarning) # In[4]: # Ensure Minio access os.environ[ 'MLFLOW_S3_ENDPOINT_URL'] = 'http://minio-service.kubeflow.svc.cluster.local:9000' os.environ['AWS_ACCESS_KEY_ID'] = 'minio' os.environ['AWS_SECRET_ACCESS_KEY'] = 'minio123' # # Data preparation # In[5]: # Collect the data df_nationalconsumption_electricity_daily = pd.read_csv( "https://raw.githubusercontent.com/jeanmidevacc/mlflow-energyforecast/master/data/rtu_data.csv" ) df_nationalconsumption_electricity_daily.set_index(["day"], inplace=True) # In[6]: # Prepare the training set and the testing set df_trainvalidate_energyconsumption = df_nationalconsumption_electricity_daily[ df_nationalconsumption_electricity_daily["datastatus"] == "Définitif"] del df_trainvalidate_energyconsumption["datastatus"] df_test_energyconsumption = df_nationalconsumption_electricity_daily[ df_nationalconsumption_electricity_daily["datastatus"] == "Consolidé"] del df_test_energyconsumption["datastatus"] print("Size of the training set : ", 
len(df_trainvalidate_energyconsumption)) print("Size of the testing set : ", len(df_test_energyconsumption)) # In[7]: # Define the inputs and the output output = "dailyconsumption" allinputs = list(df_trainvalidate_energyconsumption.columns) allinputs.remove(output) print("Output to predict : ", output) print("Inputs for the prediction : ", allinputs) # In[8]: # Build different set of featurws for the model possible_inputs = { "all": allinputs, "only_allday_inputs": ["weekday", "month", "is_holiday", "week"], "only_allweatheravg_inputs": [ "avg_min_temperature", "avg_max_temperature", "avg_mean_temperature", "wavg_min_temperature", "wavg_max_temperature", "wavg_mean_temperature" ], "only_meanweather_inputs_avg": ["avg_mean_temperature"], "only_meanweather_inputs_wavg": ["wavg_mean_temperature"], } # In[9]: # Prepare the output of the model array_output_train = np.array(df_trainvalidate_energyconsumption[output]) array_output_test = np.array(df_test_energyconsumption[output]) # In[10]: # connect to remote server remote_server_uri = "http://mlflowserver.kubeflow.svc.cluster.local:5000" mlflow.set_tracking_uri(remote_server_uri) # Launch the experiment on mlflow experiment_name = "electricityconsumption-forecast" mlflow.set_experiment(experiment_name) # In[11]: # Define the evaluation function that will do the computation of the different metrics of accuracy (RMSE,MAE,R2) def evaluation_model(y_test, y_pred): rmse = np.sqrt(mean_squared_error(y_test, y_pred)) mae = mean_absolute_error(y_test, y_pred) r2 = r2_score(y_test, y_pred) metrics = { "rmse": rmse, "r2": r2, "mae": mae, } return metrics # # KNN regressor # In[12]: from sklearn.neighbors import KNeighborsRegressor def train_knnmodel(parameters, inputs, tags, log=False): with mlflow.start_run(nested=True): # Prepare the data array_inputs_train = np.array( df_trainvalidate_energyconsumption[inputs]) array_inputs_test = np.array(df_test_energyconsumption[inputs]) # Build the model tic = time.time() model = KNeighborsRegressor(parameters["nbr_neighbors"], weights=parameters["weight_method"]) model.fit(array_inputs_train, array_output_train) duration_training = time.time() - tic # Make the prediction tic1 = time.time() prediction = model.predict(array_inputs_test) duration_prediction = time.time() - tic1 # Evaluate the model prediction metrics = evaluation_model(array_output_test, prediction) # Log in the console if log: print(f"KNN regressor:") print(parameters) print(metrics) # Log in mlflow (parameter) mlflow.log_params(parameters) # Log in mlflow (metrics) metrics["duration_training"] = duration_training metrics["duration_prediction"] = duration_prediction mlflow.log_metrics(metrics) # log in mlflow (model) mlflow.sklearn.log_model(model, f"model") # Tag the model mlflow.set_tags(tags) # In[13]: # Test the different combinations configurations = [] for nbr_neighbors in [1, 2, 5, 10]: for weight_method in ['uniform', 'distance']: for field in possible_inputs: parameters = { "nbr_neighbors": nbr_neighbors, "weight_method": weight_method } tags = {"model": "knn", "inputs": field} configurations.append([parameters, tags]) train_knnmodel(parameters, possible_inputs[field], tags) # # MLP regressor # In[14]: from sklearn.neural_network import MLPRegressor def train_mlpmodel(parameters, inputs, tags, log=False): with mlflow.start_run(nested=True): # Prepare the data array_inputs_train = np.array( df_trainvalidate_energyconsumption[inputs]) array_inputs_test = np.array(df_test_energyconsumption[inputs]) # Build the model tic = time.time() model = 
MLPRegressor(hidden_layer_sizes=parameters["hidden_layers"], activation=parameters["activation"], solver=parameters["solver"], max_iter=parameters["nbr_iteration"], random_state=0) model.fit(array_inputs_train, array_output_train) duration_training = time.time() - tic # Make the prediction tic1 = time.time() prediction = model.predict(array_inputs_test) duration_prediction = time.time() - tic1 # Evaluate the model prediction metrics = evaluation_model(array_output_test, prediction) # Log in the console if log: print(f"Random forest regressor:") print(parameters) print(metrics) # Log in mlflow (parameter) mlflow.log_params(parameters) # Log in mlflow (metrics) metrics["duration_training"] = duration_training metrics["duration_prediction"] = duration_prediction mlflow.log_metrics(metrics) # log in mlflow (model) mlflow.sklearn.log_model(model, f"model") # Tag the model mlflow.set_tags(tags) # In[15]: for hiddenlayers in [4, 8, 16]: for activation in [ "identity", "logistic", ]: for solver in ["lbfgs"]: for nbriteration in [10, 100, 1000]: for field in possible_inputs: parameters = { "hidden_layers": hiddenlayers, "activation": activation, "solver": solver, "nbr_iteration": nbriteration } tags = {"model": "mlp", "inputs": field} train_mlpmodel(parameters, possible_inputs[field], tags) # # Use a handmade model (scipy approach) # In[16]: class PTG: def __init__(self, thresholds_x0, thresholds_a, thresholds_b): self.thresholds_x0 = thresholds_x0 self.thresholds_a = thresholds_a self.thresholds_b = thresholds_b def get_ptgmodel(self, x, a, b, x0): return np.piecewise(x, [x < x0, x >= x0], [lambda x: a * x + b, lambda x: a * x0 + b]) def fit(self, dfx, y): x = np.array(dfx) # Define the bounds bounds_min = [thresholds_a[0], thresholds_b[0], thresholds_x0[0]] bounds_max = [thresholds_a[1], thresholds_b[1], thresholds_x0[1]] bounds = (bounds_min, bounds_max) # Fit a model popt, pcov = scipy.optimize.curve_fit(self.get_ptgmodel, x, y, bounds=bounds) # Get the parameter of the model a = popt[0] b = popt[1] x0 = popt[2] self.coefficients = [a, b, x0] def predict(self, dfx): x = np.array(dfx) predictions = [] for elt in x: forecast = self.get_ptgmodel(elt, self.coefficients[0], self.coefficients[1], self.coefficients[2]) predictions.append(forecast) return np.array(predictions) def train_ptgmodel(parameters, inputs, tags, log=False): with mlflow.start_run(nested=True): # Prepare the data df_inputs_train = df_trainvalidate_energyconsumption[inputs[0]] df_inputs_test = df_test_energyconsumption[inputs[0]] # Build the model tic = time.time() model = PTG(parameters["thresholds_x0"], parameters["thresholds_a"], parameters["thresholds_b"]) model.fit(df_inputs_train, array_output_train) duration_training = time.time() - tic # Make the prediction tic1 = time.time() prediction = model.predict(df_inputs_test) duration_prediction = time.time() - tic1 # Evaluate the model prediction metrics = evaluation_model(array_output_test, prediction) # Log in the console if log: print(f"PTG:") print(parameters) print(metrics) # Log in mlflow (parameter) mlflow.log_params(parameters) # Log in mlflow (metrics) metrics["duration_training"] = duration_training metrics["duration_prediction"] = duration_prediction mlflow.log_metrics(metrics) # log in mlflow (model) mlflow.sklearn.log_model(model, f"model") # Tag the model mlflow.set_tags(tags) # In[17]: # Define the parameters of the model thresholds_x0 = [0, 20] thresholds_a = [-200000, -50000] thresholds_b = [1000000, 3000000] parameters = { "thresholds_x0": thresholds_x0, 
"thresholds_a": thresholds_a, "thresholds_b": thresholds_b } for field in ["only_meanweather_inputs_avg", "only_meanweather_inputs_wavg"]: tags = {"model": "ptg", "inputs": field} train_ptgmodel(parameters, possible_inputs[field], tags, log=False) # # Evaluate mlflow results # In[18]: # Select the run of the experiment df_runs = mlflow.search_runs(experiment_ids="0") print("Number of runs done : ", len(df_runs)) # In[19]: # Quick sorting to get the best models based on the RMSE metric df_runs.sort_values(["metrics.rmse"], ascending=True, inplace=True) df_runs.head() # In[20]: # Get the best one runid_selected = df_runs.head(1)["run_id"].values[0] runid_selected # In[ ]: ================================================ FILE: ch06/Metadata.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Installation and imports" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already up-to-date: kfmd in ./.local/lib/python3.6/site-packages (0.1.8)\n", "Requirement already up-to-date: pandas in ./.local/lib/python3.6/site-packages (1.0.1)\n", "Requirement already satisfied, skipping upgrade: python-dateutil>=2.6.1 in /usr/local/lib/python3.6/dist-packages (from pandas) (2.8.1)\n", "Requirement already satisfied, skipping upgrade: numpy>=1.13.3 in /usr/local/lib/python3.6/dist-packages (from pandas) (1.18.1)\n", "Requirement already satisfied, skipping upgrade: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas) (2019.3)\n", "Requirement already satisfied, skipping upgrade: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.6.1->pandas) (1.11.0)\n" ] } ], "source": [ "!pip install kfmd --upgrade --user\n", "!pip install pandas --upgrade --user\n", "\n", "from kfmd import metadata\n", "import pandas\n", "from datetime import datetime\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Create a workspace, run and execution" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "ws1 = metadata.Workspace(\n", " # Connect to metadata-service in namesapce kubeflow in k8s cluster.\n", " backend_url_prefix=\"metadata-service.kubeflow.svc.cluster.local:8080\",\n", " name=\"ws1\",\n", " description=\"a workspace for testing\",\n", " labels={\"n1\": \"v1\"})\n", "r = metadata.Run(\n", " workspace=ws1,\n", " name=\"run-\" + datetime.utcnow().isoformat(\"T\") ,\n", " description=\"a run in ws_1\",\n", ")\n", "exec = metadata.Execution(\n", " name = \"execution\" + datetime.utcnow().isoformat(\"T\") ,\n", " workspace=ws1,\n", " run=r,\n", " description=\"execution example\",\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Log data set, model and its evaluation" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "data_set = exec.log_input(\n", " metadata.DataSet(\n", " description=\"an example data\",\n", " name=\"mytable-dump\",\n", " owner=\"owner@my-company.org\",\n", " uri=\"file://path/to/dataset\",\n", " version=\"v1.0.0\",\n", " query=\"SELECT * FROM mytable\"))\n", "model = exec.log_output(\n", " metadata.Model(\n", " name=\"MNIST\",\n", " description=\"model to recognize handwritten digits\",\n", " owner=\"someone@kubeflow.org\",\n", " uri=\"gcs://my-bucket/mnist\",\n", " model_type=\"neural network\",\n", " training_framework={\n", " \"name\": \"tensorflow\",\n", " \"version\": \"v1.0\"\n", " 
},\n", " hyperparameters={\n", " \"learning_rate\": 0.5,\n", " \"layers\": [10, 3, 1],\n", " \"early_stop\": True\n", " },\n", " version=\"v0.0.1\",\n", " labels={\"mylabel\": \"l1\"}))\n", "metrics = exec.log_output(\n", " metadata.Metrics(\n", " name=\"MNIST-evaluation\",\n", " description=\"validating the MNIST model to recognize handwritten digits\",\n", " owner=\"someone@kubeflow.org\",\n", " uri=\"gcs://my-bucket/mnist-eval.csv\",\n", " data_set_id=data_set.id,\n", " model_id=model.id,\n", " metrics_type=metadata.Metrics.VALIDATION,\n", " values={\"accuracy\": 0.95},\n", " labels={\"mylabel\": \"l1\"}))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "List all the models in the workspace" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idworkspaceruncreate_timedescriptionmodel_typenameownerversionuritraining_frameworkhyperparameterslabels
08ws1run-2020-02-18T00:48:10.7349392020-02-18T00:48:13.273533Zmodel to recognize handwritten digitsneural networkMNISTsomeone@kubeflow.orgv0.0.1gcs://my-bucket/mnist{'name': 'tensorflow', 'version': 'v1.0'}{'learning_rate': 0.5, 'layers': [10, 3, 1], '...{'mylabel': 'l1'}
\n", "
" ], "text/plain": [ " id workspace run create_time \\\n", "0 8 ws1 run-2020-02-18T00:48:10.734939 2020-02-18T00:48:13.273533Z \n", "\n", " description model_type name \\\n", "0 model to recognize handwritten digits neural network MNIST \n", "\n", " owner version uri \\\n", "0 someone@kubeflow.org v0.0.1 gcs://my-bucket/mnist \n", "\n", " training_framework \\\n", "0 {'name': 'tensorflow', 'version': 'v1.0'} \n", "\n", " hyperparameters labels \n", "0 {'learning_rate': 0.5, 'layers': [10, 3, 1], '... {'mylabel': 'l1'} " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pandas.DataFrame.from_dict(ws1.list(metadata.Model.ARTIFACT_TYPE_NAME))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Get basic lineage" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "model id is 8\n", "\n" ] } ], "source": [ "print(\"model id is %s\\n\" % model.id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Find the execution that produces this model." ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "3\n" ] } ], "source": [ "output_events = ws1.client.list_events2(model.id).events\n", "assert len(output_events) == 1\n", "execution_id = output_events[0].execution_id\n", "print(execution_id)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Find all events related to that execution." ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "All events related to this model:\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
artifact_idexecution_idpathtypemilliseconds_since_epoch
073NoneINPUT1581986893248
183NoneOUTPUT1581986893273
293NoneOUTPUT1581986893298
\n", "
" ], "text/plain": [ " artifact_id execution_id path type milliseconds_since_epoch\n", "0 7 3 None INPUT 1581986893248\n", "1 8 3 None OUTPUT 1581986893273\n", "2 9 3 None OUTPUT 1581986893298" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_events = ws1.client.list_events(execution_id).events\n", "assert len(all_events) == 3\n", "print(\"\\nAll events related to this model:\")\n", "pandas.DataFrame.from_dict([e.to_dict() for e in all_events])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: ch06/Metadata.py ================================================ #!/usr/bin/env python # coding: utf-8 # # Installation and imports # In[1]: get_ipython().system('pip install kfmd --upgrade --user') get_ipython().system('pip install pandas --upgrade --user') from kfmd import metadata import pandas from datetime import datetime # Create a workspace, run and execution # In[2]: ws1 = metadata.Workspace( # Connect to metadata-service in namesapce kubeflow in k8s cluster. backend_url_prefix="metadata-service.kubeflow.svc.cluster.local:8080", name="ws1", description="a workspace for testing", labels={"n1": "v1"}) r = metadata.Run( workspace=ws1, name="run-" + datetime.utcnow().isoformat("T"), description="a run in ws_1", ) exec = metadata.Execution( name="execution" + datetime.utcnow().isoformat("T"), workspace=ws1, run=r, description="execution example", ) # Log data set, model and its evaluation # In[3]: data_set = exec.log_input( metadata.DataSet(description="an example data", name="mytable-dump", owner="owner@my-company.org", uri="file://path/to/dataset", version="v1.0.0", query="SELECT * FROM mytable")) model = exec.log_output( metadata.Model(name="MNIST", description="model to recognize handwritten digits", owner="someone@kubeflow.org", uri="gcs://my-bucket/mnist", model_type="neural network", training_framework={ "name": "tensorflow", "version": "v1.0" }, hyperparameters={ "learning_rate": 0.5, "layers": [10, 3, 1], "early_stop": True }, version="v0.0.1", labels={"mylabel": "l1"})) metrics = exec.log_output( metadata.Metrics( name="MNIST-evaluation", description= "validating the MNIST model to recognize handwritten digits", owner="someone@kubeflow.org", uri="gcs://my-bucket/mnist-eval.csv", data_set_id=data_set.id, model_id=model.id, metrics_type=metadata.Metrics.VALIDATION, values={"accuracy": 0.95}, labels={"mylabel": "l1"})) # List all the models in the workspace # In[4]: pandas.DataFrame.from_dict(ws1.list(metadata.Model.ARTIFACT_TYPE_NAME)) # Get basic lineage # In[5]: print("model id is %s\n" % model.id) # Find the execution that produces this model. # In[6]: output_events = ws1.client.list_events2(model.id).events assert len(output_events) == 1 execution_id = output_events[0].execution_id print(execution_id) # Find all events related to that execution. 
# In[7]: all_events = ws1.client.list_events(execution_id).events assert len(all_events) == 3 print("\nAll events related to this model:") pandas.DataFrame.from_dict([e.to_dict() for e in all_events]) # In[ ]: ================================================ FILE: ch06/docker/Dockerfile ================================================ # from https://github.com/flmu/mlflow-tracking-server FROM python:3.7 RUN pip3 install --upgrade pip && \ pip3 install mlflow --upgrade && \ pip3 install awscli --upgrade && \ pip3 install boto3 --upgrade ENV PORT 5000 ENV AWS_BUCKET bucket ENV AWS_ACCESS_KEY_ID aws_id ENV AWS_SECRET_ACCESS_KEY aws_key ENV FILE_DIR /tmp/mlflow RUN mkdir -p /opt/mlflow COPY run.sh /opt/mlflow RUN chmod -R 777 /opt/mlflow/ ENTRYPOINT ["/opt/mlflow/run.sh"] ================================================ FILE: ch06/docker/build.sh ================================================ #!/bin/bash img='lightbend/mlflow' tag='0.1' docker build -t $img:$tag . ================================================ FILE: ch06/docker/run.sh ================================================ #!/bin/sh set -e if [ -z "${AWS_BUCKET}" ]; then echo >&2 "AWS_BUCKET must be set" exit 1 fi if [ -z "${AWS_ACCESS_KEY_ID}" ]; then echo >&2 "AWS_ACCESS_KEY_ID must be set" exit 1 fi if [ -z "${AWS_SECRET_ACCESS_KEY}" ]; then echo >&2 "AWS_SECRET_ACCESS_KEY must be set" exit 1 fi mkdir -p "${FILE_DIR}" mlflow server \ --backend-store-uri "file://$FILE_DIR" \ --default-artifact-root "s3://$AWS_BUCKET/mlflow/artifacts" \ --host 0.0.0.0 \ --port "$PORT" ================================================ FILE: ch06/install/mlflowchart/.helmignore ================================================ # Patterns to ignore when building packages. # This supports shell glob matching, relative path matching, and # negation (prefixed with !). Only one pattern per line. .DS_Store # Common VCS dirs .git/ .gitignore .bzr/ .bzrignore .hg/ .hgignore .svn/ # Common backup files *.swp *.bak *.tmp *~ # Various IDEs .project .idea/ *.tmproj ================================================ FILE: ch06/install/mlflowchart/Chart.yaml ================================================ apiVersion: v1 appVersion: 0.1 description: MLFlow maintainers: - name: Boris Lublinsky name: MLFLOW tracking server version: 0.1 ================================================ FILE: ch06/install/mlflowchart/templates/NOTES.txt ================================================ ML Flow tracking server is installed ================================================ FILE: ch06/install/mlflowchart/templates/_helpers.tpl ================================================ {{/* vim: set filetype=mustache: */}} {{/* Expand the name of the chart. */}} {{- define "modelserverchart.name" -}} {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} {{- end -}} {{/* Create a default fully qualified app name. We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 
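The fully qualified name joins the release name and the chart name with a dash, so
multiple releases of this chart can coexist without resource-name clashes.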
*/}} {{- define "modelserverchart.fullname" -}} {{- $name := default .Chart.Name .Values.nameOverride -}} {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} {{- end -}} ================================================ FILE: ch06/install/mlflowchart/templates/mlflow.yaml ================================================ apiVersion: apps/v1 kind: Deployment metadata: namespace: kubeflow name: mlflowserver labels: app: mlflowserver spec: replicas: 1 selector: matchLabels: app: mlflowserver strategy: type: RollingUpdate template: metadata: labels: app: mlflowserver spec: containers: - name: server image: "{{ .Values.image.server }}:{{ .Values.image.version }}" imagePullPolicy: "{{ .Values.image.pullPolicy }}" ports: - containerPort: 5000 name: serving protocol: TCP env: - name: "MLFLOW_S3_ENDPOINT_URL" value: "http://minio-service.kubeflow.svc.cluster.local:9000" - name: "AWS_ACCESS_KEY_ID" valueFrom: { secretKeyRef: { name: "minioaccess", key: "AWS_ACCESS_KEY_ID" } } - name: "AWS_SECRET_ACCESS_KEY" valueFrom: { secretKeyRef: { name: "minioaccess", key: "AWS_SECRET_ACCESS_KEY" } } - name: "AWS_BUCKET" value: "mlflow" volumes: - name: secret-volume secret: secretName: minioaccess --- apiVersion: v1 kind: Service metadata: namespace: kubeflow name: mlflowserver spec: selector: app: mlflowserver ports: - protocol: TCP port: 5000 targetPort: 5000 --- apiVersion: networking.istio.io/v1alpha3 kind: VirtualService metadata: name: mlflow-server namespace: kubeflow spec: gateways: - kubeflow-gateway hosts: - '*' http: - match: - uri: prefix: /mlflow/ rewrite: uri: / route: - destination: host: mlflowserver.kubeflow.svc.cluster.local port: number: 5000 ================================================ FILE: ch06/install/mlflowchart/values.yaml ================================================ # application name is a namespace # docker images image: server: lightbend/mlflow pullPolicy: Always version: 0.1 ================================================ FILE: ch10/experiment.yaml ================================================ Name: random-example Namespace: kubeflow Labels: controller-tools.k8s.io=1.0 Annotations: API Version: kubeflow.org/v1alpha3 Kind: Experiment Metadata: Creation Timestamp: 2019-12-22T22:53:25Z Finalizers: update-prometheus-metrics Generation: 2 Resource Version: 720692 Self Link: /apis/kubeflow.org/v1alpha3/namespaces/kubeflow/experiments/random-example UID: dc6bc15a-250d-11ea-8cae-42010a80010f Spec: Algorithm: Algorithm Name: random Algorithm Settings: Max Failed Trial Count: 3 Max Trial Count: 12 Metrics Collector Spec: Collector: Kind: StdOut Objective: Additional Metric Names: accuracy Goal: 0.99 Objective Metric Name: Validation-accuracy Type: maximize Parallel Trial Count: 3 Parameters: Feasible Space: Max: 0.03 Min: 0.01 Name: --lr Parameter Type: double Feasible Space: Max: 5 Min: 2 Name: --num-layers Parameter Type: int Feasible Space: List: sgd adam ftrl Name: --optimizer Parameter Type: categorical Trial Template: Go Template: Raw Template: apiVersion: batch/v1 kind: Job metadata: name: {{.Trial}} namespace: {{.NameSpace}} spec: template: spec: containers: - name: {{.Trial}} image: docker.io/kubeflowkatib/mxnet-mnist-example command: - "python" - "/mxnet/example/image-classification/train_mnist.py" - "--batch-size=64" {{- with .HyperParameters}} {{- range .}} - "{{.Name}}={{.Value}}" {{- end}} {{- end}} restartPolicy: Never Status: Conditions: Last Transition Time: 2019-12-22T22:53:25Z Last Update Time: 2019-12-22T22:53:25Z Message: Experiment is 
created Reason: ExperimentCreated Status: True Type: Created Last Transition Time: 2019-12-22T22:55:10Z Last Update Time: 2019-12-22T22:55:10Z Message: Experiment is running Reason: ExperimentRunning Status: True Type: Running Current Optimal Trial: Observation: Metrics: Name: Validation-accuracy Value: 0.981091 Parameter Assignments: Name: --lr Value: 0.025139701133432946 Name: --num-layers Value: 4 Name: --optimizer Value: sgd Start Time: 2019-12-22T22:53:25Z Trials: 12 Trials Running: 2 Trials Succeeded: 10 Events: Type something here! ================================================ FILE: ch10/hptuning.py ================================================ # Initialize search space # Initialize model while not objective_reached and not bugdget_exhausted: # Obtain new hyperparameters suggestion = GetSuggestions() # Run trial with new hyperparameters; collect metrics metrics = RunTrial(suggestion) # Report metrics Report(metrics) ================================================ FILE: ch10/random.yaml ================================================ apiVersion: "kubeflow.org/v1alpha3" kind: Experiment metadata: namespace: kubeflow labels: controller-tools.k8s.io: "1.0" name: random-example spec: objective: type: maximize goal: 0.99 objectiveMetricName: Validation-accuracy additionalMetricNames: - Train-accuracy algorithm: algorithmName: random parallelTrialCount: 3 maxTrialCount: 12 maxFailedTrialCount: 3 parameters: - name: --lr parameterType: double feasibleSpace: min: "0.01" max: "0.03" - name: --num-layers parameterType: int feasibleSpace: min: "2" max: "5" - name: --optimizer parameterType: categorical feasibleSpace: list: - sgd - adam - ftrl trialTemplate: goTemplate: rawTemplate: |- apiVersion: batch/v1 kind: Job metadata: name: {{.Trial}} namespace: {{.NameSpace}} spec: template: spec: containers: - name: {{.Trial}} image: docker.io/kubeflowkatib/mxnet-mnist command: - "python3" - "/opt/mxnet-mnist/mnist.py" - "--batch-size=64" {{- with .HyperParameters}} {{- range .}} - "{{.Name}}={{.Value}}" {{- end}} {{- end}} restartPolicy: NeverType something here! ================================================ FILE: ch2/Dockerfile ================================================ FROM gcr.io/kubeflow-images-public/tensorflow-2.1.0-notebook-cpu:1.0.0 ================================================ FILE: ch2/build-and-push.sh ================================================ #!/bin/bash #tag::buildandpush[] IMAGE="${CONTAINER_REGISTRY}/kubeflow/test:v1" docker build -t "${IMAGE}" -f Dockerfile . docker push "${IMAGE}" #end::buildandpush[] ================================================ FILE: ch2/query-endpoint.py ================================================ # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
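# This script sends a prediction request to the Seldon-served MNIST classifier
# exposed through the Ambassador gateway: it downloads MNIST, renders one digit
# with matplotlib, flattens it into a 1x784 row, and POSTs it to the Seldon REST
# predictions endpoint.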
#tag::scriptSetup[] import requests import numpy as np from tensorflow.examples.tutorials.mnist import input_data from matplotlib import pyplot as plt def download_mnist(): return input_data.read_data_sets("MNIST_data/", one_hot=True) def gen_image(arr): two_d = (np.reshape(arr, (28, 28)) * 255).astype(np.uint8) plt.imshow(two_d, cmap=plt.cm.gray_r, interpolation='nearest') return plt #end::scriptSetup[] AMBASSADOR_API_IP = "10.53.148.167:30134" #tag::scriptGuts[] mnist = download_mnist() batch_xs, batch_ys = mnist.train.next_batch(1) chosen = 0 gen_image(batch_xs[chosen]).show() data = batch_xs[chosen].reshape((1, 784)) features = ["X" + str(i + 1) for i in range(0, 784)] request = {"data": {"names": features, "ndarray": data.tolist()}} deploymentName = "mnist-classifier" uri = "http://" + AMBASSADOR_API_IP + "/seldon/" + \ deploymentName + "/api/v0.1/predictions" response = requests.post(uri, json=request) #end::scriptGuts[] print(response.status_code) ================================================ FILE: ch2_seldon_examples/pipeline_role.yaml ================================================ apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: namespace: kubeflow name: pipeline-runner rules: - apiGroups: ["machinelearning.seldon.io"] resources: ["seldondeployments"] verbs: ["*"] ================================================ FILE: ch2_seldon_examples/pipeline_rolebinding.yaml ================================================ apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: name: pipeline-runner namespace: kubeflow subjects: - kind: ServiceAccount name: pipeline-runner namespace: kubeflow roleRef: kind: Role name: pipeline-runner apiGroup: rbac.authorization.k8s.io ================================================ FILE: ch2_seldon_examples/pv-claim.yaml ================================================ kind: PersistentVolumeClaim apiVersion: v1 metadata: name: "nfs-1" spec: storageClassName: manual accessModes: - ReadWriteOnce resources: requests: storage: 3Gi ================================================ FILE: ch2_seldon_examples/pv-volume.yaml ================================================ kind: PersistentVolume apiVersion: v1 metadata: name: task-pv-volume labels: type: local spec: storageClassName: manual capacity: storage: 10Gi accessModes: - ReadWriteOnce hostPath: path: "/mnt/data" ================================================ FILE: ch2_seldon_examples/request_example.ipynb ================================================ { "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting matplotlib\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/57/4f/dd381ecf6c6ab9bcdaa8ea912e866dedc6e696756156d8ecc087e20817e2/matplotlib-3.1.1-cp36-cp36m-manylinux1_x86_64.whl (13.1MB)\n", "\u001b[K 100% |████████████████████████████████| 13.1MB 2.7MB/s eta 0:00:01\n", "\u001b[?25hRequirement already satisfied: python-dateutil>=2.1 in /opt/conda/lib/python3.6/site-packages (from matplotlib) (2.8.0)\n", "Collecting cycler>=0.10 (from matplotlib)\n", " Downloading https://files.pythonhosted.org/packages/f7/d2/e07d3ebb2bd7af696440ce7e754c59dd546ffe1bbe732c8ab68b9c834e61/cycler-0.10.0-py2.py3-none-any.whl\n", "Collecting kiwisolver>=1.0.1 (from matplotlib)\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/f8/a1/5742b56282449b1c0968197f63eae486eca2c35dcd334bab75ad524e0de1/kiwisolver-1.1.0-cp36-cp36m-manylinux1_x86_64.whl (90kB)\n", "\u001b[K 
100% |████████████████████████████████| 92kB 32.5MB/s ta 0:00:01\n", "\u001b[?25hCollecting pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 (from matplotlib)\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/11/fa/0160cd525c62d7abd076a070ff02b2b94de589f1a9789774f17d7c54058e/pyparsing-2.4.2-py2.py3-none-any.whl (65kB)\n", "\u001b[K 100% |████████████████████████████████| 71kB 25.6MB/s ta 0:00:01\n", "\u001b[?25hRequirement already satisfied: numpy>=1.11 in /opt/conda/lib/python3.6/site-packages (from matplotlib) (1.16.2)\n", "Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.6/site-packages (from python-dateutil>=2.1->matplotlib) (1.12.0)\n", "Requirement already satisfied: setuptools in /opt/conda/lib/python3.6/site-packages (from kiwisolver>=1.0.1->matplotlib) (40.9.0)\n", "Installing collected packages: cycler, kiwisolver, pyparsing, matplotlib\n", "Successfully installed cycler-0.10.0 kiwisolver-1.1.0 matplotlib-3.1.1 pyparsing-2.4.2\n", "\u001b[33mYou are using pip version 19.0.1, however version 19.2.3 is available.\n", "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n" ] } ], "source": [ "!pip install matplotlib" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import requests\n", "import numpy as np\n", "\n", "from tensorflow.examples.tutorials.mnist import input_data\n", "from matplotlib import pyplot as plt\n", "\n", "\n", "def download_mnist():\n", " return input_data.read_data_sets(\"MNIST_data/\", one_hot = True)\n", "\n", "def gen_image(arr):\n", " two_d = (np.reshape(arr, (28, 28)) * 255).astype(np.uint8)\n", " plt.imshow(two_d,cmap=plt.cm.gray_r, interpolation='nearest')\n", " return plt\n", "\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "WARNING:tensorflow:From :9: read_data_sets (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Please use alternatives such as official/mnist/dataset.py from tensorflow/models.\n", "WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/mnist.py:260: maybe_download (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Please write your own downloading logic.\n", "WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:252: _internal_retry..wrap..wrapped_fn (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Please use urllib or similar directly.\n", "Successfully downloaded train-images-idx3-ubyte.gz 9912422 bytes.\n", "WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/mnist.py:262: extract_images (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Please use tf.data to implement this functionality.\n", "Extracting MNIST_data/train-images-idx3-ubyte.gz\n", "Successfully downloaded train-labels-idx1-ubyte.gz 28881 bytes.\n", "WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/mnist.py:267: extract_labels (from 
tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Please use tf.data to implement this functionality.\n", "Extracting MNIST_data/train-labels-idx1-ubyte.gz\n", "WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/mnist.py:110: dense_to_one_hot (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Please use tf.one_hot on tensors.\n", "Successfully downloaded t10k-images-idx3-ubyte.gz 1648877 bytes.\n", "Extracting MNIST_data/t10k-images-idx3-ubyte.gz\n", "Successfully downloaded t10k-labels-idx1-ubyte.gz 4542 bytes.\n", "Extracting MNIST_data/t10k-labels-idx1-ubyte.gz\n", "WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/mnist.py:290: DataSet.__init__ (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Please use alternatives such as official/mnist/dataset.py from tensorflow/models.\n" ] } ], "source": [ "mnist = download_mnist()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAPsAAAD4CAYAAAAq5pAIAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAANMklEQVR4nO3dXaxV9ZnH8d9Ppr0REmE4ORDAgakYo2Ok5EhMahonZIgvIdgYTblATMxQXyCtaeIYJ1ovvMAJ0BQzklAlpaRDbWwVYohTB5uY3hCPBoUjaX0JBghyDhqiqFiVZy7Osjni2Wsf9lr7RZ7vJznZe69nrb2erPBj7b3+e++/I0IAzn3ndbsBAJ1B2IEkCDuQBGEHkiDsQBL/0MmdTZ8+PebOndvJXQKpHDx4UMePH/d4tUpht32tpF9ImiTp8YhYW7b+3LlzNTg4WGWXAEoMDAw0rLX8Mt72JEn/Lek6SZdKWm770lafD0B7VXnPvkjSmxHxdkT8TdJvJS2rpy0AdasS9lmSDo15fLhY9hW2V9ketD04MjJSYXcAqmj71fiI2BwRAxEx0NfX1+7dAWigStiPSJoz5vHsYhmAHlQl7C9Jmm97nu1vS/qhpJ31tAWgbi0PvUXE57ZXS/pfjQ69bYmIodo6A1CrSuPsEbFL0q6aegHQRnxcFkiCsANJEHYgCcIOJEHYgSQIO5AEYQeSIOxAEoQdSIKwA0kQdiAJwg4kQdiBJAg7kARhB5Ig7EAShB1IgrADSRB2IAnCDiRB2IEkCDuQBGEHkiDsQBKEHUiCsANJEHYgCcIOJEHYgSQIO5BEpSmbbR+U9KGkLyR9HhEDdTQFoH6Vwl7414g4XsPzAGgjXsYDSVQNe0j6o+2Xba8abwXbq2wP2h4cGRmpuDsAraoa9qsjYqGk6yTdbfv7Z64QEZsjYiAiBvr6+iruDkCrKoU9Io4Ut8OSnpa0qI6mANSv5bDbPt/2lC/vS1oiaX9djQGoV5Wr8f2Snrb95fP8T0Q8V0tXAGrXctgj4m1JV9TYC4A2YugNSIKwA0kQdiAJwg4kQdiBJOr4Igy+wbZt21Za/+STTzrUydl74IEHSuvr1q1rWFuxYkXd7fQ8zuxAEoQdSIKwA0kQdiAJwg4kQdiBJAg7kATj7OeANWvWNKzt2bOndNu9e/eW1j/77LPSen9/f8vbnz59unTbEydOlNabKb5+jQJndiAJwg4kQdiBJAg7kARhB5Ig7EAShB1IgnH2Djh27Fhp/eGHH670/Dt27GhYO3ToUKXnvuOOO0rrt912W2n9o48+aljbtGlT6bZPPfVUaX3+/Pml9SuvvLK0ng1ndiAJwg4kQdiBJAg7kARhB5Ig7EAShB1IgnH2Grz11lul9WXLlpXWh4aGKu1/8uTJDWu33npr6bbr168vrU+bNq20ft555eeLxx9/vGFtcHCwdNvLLrustP7cc+UzhM+ePbu0nk3TM7vtLbaHbe8fs2ya7edtv1HcTm1vmwCqmsjL+F9JuvaMZfdJ2h0R8yXtLh4D6GFNwx4RL0p6/4zFyyRtLe5vlXRjzX0BqFmrF+j6I+Jocf9dSQ1/iMz2KtuDtgdHRkZa3B2AqipfjY+IkBQl9c0RMRARA319fVV3B6BFrYb9mO2ZklTcDtfXEoB2aDXsOyWtLO6vlNT4O5YAekLTcXbb2yVdI2m67cOSfiZpraTf2b5d0juSbmlnk72ubJxbkmbNmlVarzrO/sgjjzSs3XXXXZWeu5n33nuvtL5hw4aGtZMnT5Zue/PNN5fWGUc/O03DHhHLG5QW19wLgDbi47JAEoQdSIKwA0kQdiAJwg4kwVdca9Bs2uKNGzeW1i+55JJK+2/2k8rt9Nhjj5XWDxw40LDW7Ou3N9xwQ0s9YXyc2YEkCDuQBGEHkiDsQBKEHUiCsANJEHYgCcbZO+Ciiy4qra9Zs6a0/uijj9bZzlk5depUab3Z13NnzJjRsHbnnXeWbnvVVVeV1nF2OLMDSRB2IAnCDiRB2IEkCDuQBGEHkiDsQBKMs3fApEmTSuurV68urS9durS0vnDhwrPuaaJOnDhRWn/yySdL60uWLGlYYxy9szizA0kQdiAJwg4kQdiBJAg7kARhB5Ig7EASjLP3gIsvvrhSvZ2eeeaZru0b9Wp6Zre9xfaw7f1jlj
1k+4jtvcXf9e1tE0BVE3kZ/ytJ146z/OcRsaD421VvWwDq1jTsEfGipPc70AuANqpygW617deKl/lTG61ke5XtQduDIyMjFXYHoIpWw75J0nckLZB0VNL6RitGxOaIGIiIgb6+vhZ3B6CqlsIeEcci4ouIOC3pl5IW1dsWgLq1FHbbM8c8/IGk/Y3WBdAbmo6z294u6RpJ020flvQzSdfYXiApJB2U9KM29ogu2rWrfKDlnnvuKa0/+OCDdbaDCpqGPSKWj7P4iTb0AqCN+LgskARhB5Ig7EAShB1IgrADSfAV1+SGh4dL659++mlpffLkyaX1Cy644Kx7QntwZgeSIOxAEoQdSIKwA0kQdiAJwg4kQdiBJBhnT+7yyy8vrX/88cel9XvvvbfOdtBGnNmBJAg7kARhB5Ig7EAShB1IgrADSRB2IAnG2VGq2ffVFy9e3KFOUBVndiAJwg4kQdiBJAg7kARhB5Ig7EAShB1IgnH2c9z27dtL6x988EFpfcaMGXW2gy5qema3Pcf2n2y/bnvI9o+L5dNsP2/7jeJ2avvbBdCqibyM/1zSTyPiUklXSbrb9qWS7pO0OyLmS9pdPAbQo5qGPSKORsQrxf0PJR2QNEvSMklbi9W2SrqxXU0CqO6sLtDZnivpu5L2SOqPiKNF6V1J/Q22WWV70PbgyMhIhVYBVDHhsNueLOn3kn4SEV+5qhMRISnG2y4iNkfEQEQM9PX1VWoWQOsmFHbb39Jo0H8TEX8oFh+zPbOoz5RUPh0ogK5qOvRm25KekHQgIjaMKe2UtFLS2uJ2R1s6RCVDQ0Ol9VOnTpXWN27cWGc76KKJjLN/T9IKSfts7y2W3a/RkP/O9u2S3pF0S3taBFCHpmGPiD9LcoMyv1wAfEPwcVkgCcIOJEHYgSQIO5AEYQeSIOxAEoQdSIKwA0kQdiAJwg4kQdiBJAg7kARhB5Lgp6TPAWvXrm1Y27lzZ+m2CxYsKK1fccUVLfWE3sOZHUiCsANJEHYgCcIOJEHYgSQIO5AEYQeSYJz9HPDCCy80rO3bt6/Sc7/66qul9QsvvLDS86NzOLMDSRB2IAnCDiRB2IEkCDuQBGEHkiDsQBITmZ99jqRfS+qXFJI2R8QvbD8k6d8ljRSr3h8Ru9rVKBqbN29e255727ZtpfWlS5e2bd+o10Q+VPO5pJ9GxCu2p0h62fbzRe3nEbGufe0BqMtE5mc/Kulocf9D2wckzWp3YwDqdVbv2W3PlfRdSXuKRattv2Z7i+2pDbZZZXvQ9uDIyMh4qwDogAmH3fZkSb+X9JOI+EDSJknfkbRAo2f+9eNtFxGbI2IgIgb6+vpqaBlAKyYUdtvf0mjQfxMRf5CkiDgWEV9ExGlJv5S0qH1tAqiqadhtW9ITkg5ExIYxy2eOWe0HkvbX3x6Aukzkavz3JK2QtM/23mLZ/ZKW216g0eG4g5J+1JYO0dS6dY0HRI4cOVK67eLFi0vrN910U0s9ofdM5Gr8nyV5nBJj6sA3CJ+gA5Ig7EAShB1IgrADSRB2IAnCDiTBT0mfA6ZMmdKw9uyzz3awE/QyzuxAEoQdSIKwA0kQdiAJwg4kQdiBJAg7kIQjonM7s0ckvTNm0XRJxzvWwNnp1d56tS+J3lpVZ2//FBHj/v5bR8P+tZ3bgxEx0LUGSvRqb73al0RvrepUb7yMB5Ig7EAS3Q775i7vv0yv9tarfUn01qqO9NbV9+wAOqfbZ3YAHULYgSS6Enbb19r+i+03bd/XjR4asX3Q9j7be20PdrmXLbaHbe8fs2ya7edtv1HcjjvHXpd6e8j2keLY7bV9fZd6m2P7T7Zftz1k+8fF8q4eu5K+OnLcOv6e3fYkSX+V9G+SDkt6SdLyiHi9o400YPugpIGI6PoHMGx/X9JJSb+OiH8plv2XpPcjYm3xH+XUiPiPHuntIUknuz2NdzFb0cyx04xLulHSberisSvp6xZ14Lh148y+SNKbEfF2RPxN0m8lLetCHz0vIl6U9P4Zi5dJ2lrc36rRfywd16C3nhARRyPileL+h5K+nGa8q8eupK+O6EbYZ0k6NObxYfXWfO8h6Y+2X7a9qtvNjKM/Io4W99+V1N/NZsbRdBrvTjpjmvGeOXatTH9eFRfovu7qiFgo6TpJdxcvV3tSjL4H66Wx0wlN490p40wz/nfdPHatTn9eVTfCfkTSnDGPZxfLekJEHCluhyU9rd6bivrYlzPoFrfDXe7n73ppGu/xphlXDxy7bk5/3o2wvyRpvu15tr8t6YeSdnahj6+xfX5x4US2z5e0RL03FfVOSSuL+ysl7ehiL1/RK9N4N5pmXF0+dl2f/jwiOv4n6XqNXpF/S9J/dqOHBn39s6RXi7+hbvcmabtGX9Z9ptFrG7dL+kdJuyW9Ien/JE3rod62Sdon6TWNBmtml3q7WqMv0V+TtLf4u77bx66kr44cNz4uCyTBBTogCcIOJEHYgSQIO5AEYQeSIOxAEoQdSOL/AQe88PwDu2A0AAAAAElFTkSuQmCC\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "401\n" ] } ], "source": [ "batch_xs, batch_ys = mnist.train.next_batch(1)\n", "chosen=0\n", "gen_image(batch_xs[chosen]).show()\n", "data = batch_xs[chosen].reshape((1,784))\n", "features = [\"X\"+str(i+1) for i in range (0,784)]\n", "request = {\"data\":{\"names\":features,\"ndarray\":data.tolist()}}\n", "deploymentName = \"mnist-classifier\"\n", "uri = \"http://istio-ingressgateway.istio-system.svc.cluster.local/seldon/\"+deploymentName+\"/api/v0.1/predictions\"\n", "\n", "response = requests.post(\n", " uri,\n", " json=request)\n", "\n", "print(response.status_code)\n" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Origin authentication failed.\n" ] } ], "source": [ "print(response.text)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.7" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: ch2_seldon_examples/run_example.sh ================================================ #!/bin/bash #tag::buildPipeline[] dsl-compile --py train_pipeline.py --output job.yaml #end::buildPipeline[] #tag::connectToWebUI[] # If you're on minikube and not using a loadbalancer: minikube service --url -n istio-system istio-ingressgateway # If your on GCP https://.endpoints..cloud.goog/ # If you're on vanilla K8s INGRESS_HOST=$(kubectl -n istio-system get service istio-ingressgateway \ -o jsonpath='{.status.loadBalancer.ingress[0].ip}') export INGRESS_HOST INGRESS_PORT=$(kubectl -n istio-system get service istio-ingressgateway \ -o jsonpath='{.spec.ports[?(@.name=="http2")].port}') export INGRESS_PORT SECURE_INGRESS_PORT=$(kubectl -n istio-system get service istio-ingressgateway \ -o jsonpath='{.spec.ports[?(@.name=="https")].port}') export SECURE_INGRESS_PORT kubectl get svc istio-ingressgateway -n istio-system #end::connectToWebUI[] ================================================ FILE: ch2_seldon_examples/setup_example.sh ================================================ #!/bin/bash set -ex echo "Setting up example" unset ch2_example_path ch2_example_path="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" echo "Using path ${ch2_example_path} for our example path" example_path=$(dirname "${ch2_example_path}") #tag::generate_kf_app_p1[] # Pick the correct config file for your platform from # https://github.com/kubeflow/manifests/tree/[version]/kfdef # You can download & edit the configuration at this point if you need to. # For generic k8s with istio: MANIFEST_BRANCH=${MANIFEST_BRANCH:-v1.0-branch} export MANIFEST_BRANCH MANIFEST_VERSION=${MANIFEST_VERSION:-v1.0.1} export MANIFEST_VERSION KF_PROJECT_NAME=${KF_PROJECT_NAME:-hello-kf-${PLATFORM}} export KF_PROJECT_NAME mkdir "${KF_PROJECT_NAME}" pushd "${KF_PROJECT_NAME}" manifest_root=https://raw.githubusercontent.com/kubeflow/manifests/ # On most enviroments this will create a "vanilla" kubeflow install using istio. 
KFDEF=${manifest_root}${MANIFEST_BRANCH}/kfdef/kfctl_k8s_istio.${MANIFEST_VERSION}.yaml #end::generate_kf_app_p1[] # On GCP this will create a cluster with basic authentication if [ "$PLATFORM" == "gcp" ]; then KFDEF=${manifest_root}${MANIFEST_BRANCH}/kfdef/kfctl_gcp_iap.${MANIFEST_VERSION}.yaml # Temp hack cp "${example_path}/kfctl_gcp_iap.v1.0.1.yaml" ./ KFDEF=./kfctl_gcp_iap.v1.0.1.yaml # Set up IAP # TODO(holden) # Set up environment variables for GCP export PROJECT=${PROJECT:-""} gcloud config set project "${PROJECT}" export ZONE=${ZONE:-""} gcloud config set compute/zone "${ZONE}" fi pwd #tag::generate_kf_app_p2[] kfctl apply -f $KFDEF -V echo $? popd #end::generate_kf_app_p2[] # TODO(trevor): what version/tag? #tag::cloneSeldonExample[] # Clone the base seldon example git clone https://github.com/kubeflow/example-seldon #end::cloneSeldonExample[] ================================================ FILE: ch2_seldon_examples/tf_mnist_no_seldon_pipeline.py ================================================ # Copyright 2019 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Kubeflow Pipelines MNIST example Run this script to compile pipeline """ import kfp.dsl as dsl import kfp.gcp as gcp import kfp.onprem as onprem gcs_or_pvc = 'PVC' @dsl.pipeline(name='MNIST', description='A pipeline to train and serve the MNIST example.') def mnist_pipeline(gcs_bucket=None, train_steps='200', learning_rate='0.01', batch_size='100'): """ Pipeline with three stages: 1. train an MNIST classifier 2. deploy a tf-serving instance to the cluster 3. 
deploy a web-ui to interact with it """ vop = None volume = None if gcs_or_pvc == "PVC": vop = dsl.VolumeOp(name="create_pvc", resource_name="nfs-1", modes=dsl.VOLUME_MODE_RWO, size="10G") volume = vop.volume train = dsl.ContainerOp( name='train', image= 'gcr.io/kubeflow-examples/mnist/model:v20190304-v0.2-176-g15d997b', arguments=[ "/opt/model.py", "--tf-export-dir", gcs_bucket or "/mnt", "--tf-train-steps", train_steps, "--tf-batch-size", batch_size, "--tf-learning-rate", learning_rate ]) serve_args = [ '--model-export-path', gcs_bucket or "/mnt", '--server-name', "mnist-service" ] if gcs_or_pvc != 'GCS': serve_args.extend( ['--cluster-name', "mnist-pipeline", '--pvc-name', volume]) serve = dsl.ContainerOp( name='serve', image='gcr.io/ml-pipeline/ml-pipeline-kubeflow-deployer:' '7775692adf28d6f79098e76e839986c9ee55dd61', arguments=serve_args) serve.after(train) webui_args = [ '--image', 'gcr.io/kubeflow-examples/mnist/web-ui:' 'v20190304-v0.2-176-g15d997b-pipelines', '--name', 'web-ui', '--container-port', '5000', '--service-port', '80', '--service-type', "LoadBalancer" ] web_ui = dsl.ContainerOp( name='web-ui', image='gcr.io/kubeflow-examples/mnist/deploy-service:latest', arguments=webui_args) web_ui.after(serve) steps = [train, serve, web_ui] for step in steps: if gcs_or_pvc == 'GCS': step.apply(gcp.use_gcp_secret('user-gcp-sa')) else: step.after(vop) step.add_pvolumes({"/mnt": volume}) if __name__ == '__main__': import kfp.compiler as compiler compiler.Compiler().compile(mnist_pipeline, __file__ + '.tar.gz') ================================================ FILE: ch2_seldon_examples/tiller_rbac.yaml ================================================ apiVersion: v1 kind: ServiceAccount metadata: name: tiller namespace: kube-system --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: name: tiller roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole name: cluster-admin subjects: - kind: ServiceAccount name: tiller namespace: kube-system ================================================ FILE: ch2_seldon_examples/train_pipeline.py ================================================ import kfp.dsl as dsl import kfp.gcp as gcp import kfp.onprem as onprem from string import Template import json @dsl.pipeline(name='Simple sci-kit KF Pipeline', description='A simple end to end sci-kit seldon kf pipeline') def mnist_train_pipeline(docker_org="index.docker.io/seldonio", train_container_version="0.2", serve_container_version="0.1"): vop = dsl.VolumeOp(name="create_pvc", resource_name="nfs-1", modes=dsl.VOLUME_MODE_RWO, size="10G") volume = vop.volume train = dsl.ContainerOp( name='sk-train', image= f"{docker_org}/skmnistclassifier_trainer:{train_container_version}", pvolumes={"/data": volume}) seldon_serving_json_template = Template(""" { "apiVersion": "machinelearning.seldon.io/v1alpha2", "kind": "SeldonDeployment", "metadata": { "labels": { "app": "seldon" }, "name": "mnist-classifier" }, "spec": { "annotations": { "deployment_version": "v1", "project_name": "MNIST Example" }, "name": "mnist-classifier", "predictors": [ { "annotations": { "predictor_version": "v1" }, "componentSpecs": [ { "spec": { "containers": [ { "image": "$dockerreposerving:$dockertagserving", "imagePullPolicy": "Always", "name": "mnist-classifier", "volumeMounts": [ { "mountPath": "/data", "name": "persistent-storage" } ] } ], "terminationGracePeriodSeconds": 1, "volumes": [ { "name": "persistent-storage", "persistentVolumeClaim": { "claimName": "$modelpvc" } } ] } } ], "graph": { "children": 
[], "endpoint": { "type": "REST" }, "name": "mnist-classifier", "type": "MODEL" }, "name": "mnist-classifier", "replicas": 1 } ] } } """) seldon_serving_json = seldon_serving_json_template.substitute({ 'dockerreposerving': f"{docker_org}/skmnistclassifier_runtime", 'dockertagserving': str(serve_container_version), 'modelpvc': vop.outputs["name"] }) seldon_deployment = json.loads(seldon_serving_json) serve = dsl.ResourceOp( name='serve', k8s_resource=seldon_deployment, success_condition='status.state == Available').after(train) # If we're called directly create an expirement and run if __name__ == '__main__': pipeline_func = mnist_train_pipeline pipeline_filename = pipeline_func.__name__ + '.pipeline.zip' import kfp.compiler as compiler compiler.Compiler().compile(pipeline_func, pipeline_filename) expirement_name = "cheese" experiment = client.create_experiment(expirement_name) run_name = pipeline_func.__name__ + ' run' run_result = client.run_pipeline(experiment.id, run_name, pipeline_filename, arguments) print(run_result) ================================================ FILE: ch9/ctscans/DICOM Denoising Pipeline.ipynb ================================================ { "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Defaulting to user installation because normal site-packages is not writeable\n", "Collecting kfp\n", " Downloading kfp-0.5.1.tar.gz (119 kB)\n", "\u001b[K |████████████████████████████████| 119 kB 3.5 MB/s eta 0:00:01\n", "\u001b[?25hRequirement already satisfied: PyYAML in /usr/local/lib/python3.6/dist-packages (from kfp) (5.3)\n", "Requirement already satisfied: google-cloud-storage>=1.13.0 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.25.0)\n", "Requirement already satisfied: kubernetes<12.0.0,>=8.0.0 in /usr/local/lib/python3.6/dist-packages (from kfp) (10.0.1)\n", "Requirement already satisfied: google-auth>=1.6.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.11.0)\n", "Collecting requests_toolbelt>=0.8.0\n", " Downloading requests_toolbelt-0.9.1-py2.py3-none-any.whl (54 kB)\n", "\u001b[K |████████████████████████████████| 54 kB 4.0 MB/s eta 0:00:01\n", "\u001b[?25hRequirement already satisfied: cloudpickle in /usr/local/lib/python3.6/dist-packages (from kfp) (1.2.2)\n", "Collecting kfp-server-api<0.6.0,>=0.2.5\n", " Downloading kfp-server-api-0.5.0.tar.gz (39 kB)\n", "Requirement already satisfied: jsonschema>=3.0.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (3.2.0)\n", "Collecting tabulate\n", " Downloading tabulate-0.8.7-py3-none-any.whl (24 kB)\n", "Collecting click\n", " Downloading click-7.1.2-py2.py3-none-any.whl (82 kB)\n", "\u001b[K |████████████████████████████████| 82 kB 1.5 MB/s eta 0:00:01\n", "\u001b[?25hCollecting Deprecated\n", " Downloading Deprecated-1.2.9-py2.py3-none-any.whl (8.6 kB)\n", "Collecting strip-hints\n", " Downloading strip-hints-0.1.9.tar.gz (30 kB)\n", "Requirement already satisfied: google-cloud-core<2.0dev,>=1.2.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (1.3.0)\n", "Requirement already satisfied: google-resumable-media<0.6dev,>=0.5.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (0.5.0)\n", "Requirement already satisfied: six>=1.9.0 in /usr/lib/python3/dist-packages (from kubernetes<12.0.0,>=8.0.0->kfp) (1.11.0)\n", "Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in 
/usr/local/lib/python3.6/dist-packages (from kubernetes<12.0.0,>=8.0.0->kfp) (0.57.0)\n", "Requirement already satisfied: python-dateutil>=2.5.3 in /usr/local/lib/python3.6/dist-packages (from kubernetes<12.0.0,>=8.0.0->kfp) (2.8.1)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from kubernetes<12.0.0,>=8.0.0->kfp) (2.22.0)\n", "Requirement already satisfied: requests-oauthlib in /usr/local/lib/python3.6/dist-packages (from kubernetes<12.0.0,>=8.0.0->kfp) (1.3.0)\n", "Requirement already satisfied: setuptools>=21.0.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes<12.0.0,>=8.0.0->kfp) (45.1.0)\n", "Requirement already satisfied: urllib3>=1.24.2 in /usr/local/lib/python3.6/dist-packages (from kubernetes<12.0.0,>=8.0.0->kfp) (1.25.8)\n", "Requirement already satisfied: certifi>=14.05.14 in /usr/local/lib/python3.6/dist-packages (from kubernetes<12.0.0,>=8.0.0->kfp) (2019.11.28)\n", "Requirement already satisfied: rsa<4.1,>=3.1.4 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0)\n", "Requirement already satisfied: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0.0)\n", "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (0.2.8)\n", "Requirement already satisfied: attrs>=17.4.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (19.3.0)\n", "Requirement already satisfied: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (1.4.0)\n", "Requirement already satisfied: pyrsistent>=0.14.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (0.15.7)\n", "Requirement already satisfied: wrapt<2,>=1.10 in /usr/local/lib/python3.6/dist-packages (from Deprecated->kfp) (1.11.2)\n", "Requirement already satisfied: wheel in /usr/lib/python3/dist-packages (from strip-hints->kfp) (0.30.0)\n", "Requirement already satisfied: google-api-core<2.0.0dev,>=1.16.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.16.0)\n", "Requirement already satisfied: idna<2.9,>=2.5 in /usr/lib/python3/dist-packages (from requests->kubernetes<12.0.0,>=8.0.0->kfp) (2.6)\n", "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->kubernetes<12.0.0,>=8.0.0->kfp) (3.0.4)\n", "Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib->kubernetes<12.0.0,>=8.0.0->kfp) (3.1.0)\n", "Requirement already satisfied: pyasn1>=0.1.3 in /usr/local/lib/python3.6/dist-packages (from rsa<4.1,>=3.1.4->google-auth>=1.6.1->kfp) (0.4.8)\n", "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata; python_version < \"3.8\"->jsonschema>=3.0.1->kfp) (2.1.0)\n", "Requirement already satisfied: protobuf>=3.4.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (3.11.2)\n", "Requirement already satisfied: pytz in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (2019.3)\n", "Requirement already satisfied: googleapis-common-protos<2.0dev,>=1.6.0 in /usr/local/lib/python3.6/dist-packages (from 
google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.51.0)\n", "Building wheels for collected packages: kfp, kfp-server-api, strip-hints\n", " Building wheel for kfp (setup.py) ... \u001b[?25ldone\n", "\u001b[?25h Created wheel for kfp: filename=kfp-0.5.1-py3-none-any.whl size=163151 sha256=da5b540ae9834d37659146f0576997ffd8f7a7e2b305e1eb7b2a99dd4745930b\n", " Stored in directory: /home/jovyan/.cache/pip/wheels/2f/26/f9/e3836cb6e6cabd63ef912304e18a852ac29cb870a4a0b85f98\n", " Building wheel for kfp-server-api (setup.py) ... \u001b[?25ldone\n", "\u001b[?25h Created wheel for kfp-server-api: filename=kfp_server_api-0.5.0-py3-none-any.whl size=106319 sha256=84f55948cc254c0f836dffdfd51574a828ae8a503a2ca9198acf7a27ca2aaea7\n", " Stored in directory: /home/jovyan/.cache/pip/wheels/73/36/4e/bfe2efeeea4f74f04984ebe1d44136202b72191302f4760951\n", " Building wheel for strip-hints (setup.py) ... \u001b[?25ldone\n", "\u001b[?25h Created wheel for strip-hints: filename=strip_hints-0.1.9-py2.py3-none-any.whl size=24671 sha256=3bcfd573a91f5f4c46d23509ac3fee9a0cf351b414e00ed505a8f71d0e6a1141\n", " Stored in directory: /home/jovyan/.cache/pip/wheels/21/6d/fa/7ed7c0560e1ef39ebabd5cc0241e7fca711660bae1ad752e2b\n", "Successfully built kfp kfp-server-api strip-hints\n", "Installing collected packages: requests-toolbelt, kfp-server-api, tabulate, click, Deprecated, strip-hints, kfp\n", "\u001b[33m WARNING: The script tabulate is installed in '/home/jovyan/.local/bin' which is not on PATH.\n", " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\n", "\u001b[33m WARNING: The script strip-hints is installed in '/home/jovyan/.local/bin' which is not on PATH.\n", " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\n", "\u001b[33m WARNING: The scripts dsl-compile and kfp are installed in '/home/jovyan/.local/bin' which is not on PATH.\n", " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\n", "Successfully installed Deprecated-1.2.9 click-7.1.2 kfp-0.5.1 kfp-server-api-0.5.0 requests-toolbelt-0.9.1 strip-hints-0.1.9 tabulate-0.8.7\n", "\u001b[33mWARNING: You are using pip version 20.0.2; however, version 20.1 is available.\n", "You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n" ] } ], "source": [ "!pip3 install kfp\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import kfp\n", "import kubernetes" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "\n", "container_manifest = {\n", " \"apiVersion\": \"sparkoperator.k8s.io/v1beta2\",\n", " \"kind\": \"SparkApplication\",\n", " \"metadata\": {\n", " \"name\": \"spark-app\",\n", " \"namespace\": \"kubeflow\"\n", " },\n", " \"spec\": {\n", " \"type\": \"Scala\",\n", " \"mode\": \"cluster\",\n", " \"image\": \"docker.io/rawkintrevo/covid-basis-vectors:0.2.0\",\n", " \"imagePullPolicy\": \"Always\",\n", " \"hadoopConf\": {\n", " \"fs.gs.project.id\": \"kubeflow-hacky-hacky\",\n", " \"fs.gs.system.bucket\": \"covid-dicoms\",\n", " \"fs.gs.impl\" : \"com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem\",\n", " \"google.cloud.auth.service.account.enable\": \"true\",\n", " \"google.cloud.auth.service.account.json.keyfile\": \"/mnt/secrets/user-gcp-sa.json\",\n", " 
},\n", " \"mainClass\": \"org.rawkintrevo.covid.App\",\n", " \"mainApplicationFile\": \"local:///covid-0.1-jar-with-dependencies.jar\", # See the Dockerfile\n", " \"arguments\": [\"245\", \"15\", \"1\"],\n", " \"sparkVersion\": \"2.4.5\",\n", " \"restartPolicy\": {\n", " \"type\": \"Never\"\n", " },\n", " \"driver\": {\n", " \"cores\": 1,\n", " \"secrets\": [\n", " {\"name\": \"user-gcp-sa\",\n", " \"path\": \"/mnt/secrets\",\n", " \"secretType\": \"GCPServiceAccount\"\n", " }\n", " ],\n", "\n", " \"coreLimit\": \"1200m\",\n", " \"memory\": \"512m\",\n", " \"labels\": {\n", " \"version\": \"2.4.5\",\n", " },\n", " \"serviceAccount\": \"spark-operatoroperator-sa\", # also try spark-operatoroperator-sa\n", " },\n", " \"executor\": {\n", " \"cores\": 1,\n", " \"secrets\": [\n", " {\"name\": \"user-gcp-sa\",\n", " \"path\": \"/mnt/secrets\",\n", " \"secretType\": \"GCPServiceAccount\"\n", " }\n", " ],\n", " \"instances\": 4,\n", " \"memory\": \"4084m\"\n", " },\n", " \"labels\": {\n", " \"version\": \"2.4.5\"\n", " },\n", "\n", " }\n", "}" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "from kfp.gcp import use_gcp_secret\n", "@kfp.dsl.pipeline(\n", " name=\"Covid DICOM Pipe v2\",\n", " description=\"Create Basis Vectors for Lung Images\"\n", ")\n", "def covid_dicom_pipeline():\n", " vop = kfp.dsl.VolumeOp(\n", " name=\"requisition-PVC\",\n", " resource_name=\"datapvc\",\n", " size=\"20Gi\", #10 Gi blows up...\n", " modes=kfp.dsl.VOLUME_MODE_RWO\n", " )\n", " step1 = kfp.dsl.ContainerOp(\n", " name=\"download-dicom\",\n", " image=\"rawkintrevo/download-dicom:0.0.0.4\",\n", " command=[\"/run.sh\"],\n", " pvolumes={\"/data\": vop.volume}\n", " )\n", " step2 = kfp.dsl.ContainerOp(\n", " name=\"convert-dicoms-to-vectors\",\n", " image=\"rawkintrevo/covid-prep-dicom:0.9.5\",\n", " arguments=[\n", " '--bucket_name', \"covid-dicoms\",\n", " ],\n", " command=[\"python\", \"/program.py\"],\n", " pvolumes={\"/mnt/data\": step1.pvolume}\n", " ).apply(kfp.gcp.use_gcp_secret(secret_name='user-gcp-sa'))\n", " rop = kfp.dsl.ResourceOp(\n", " name=\"calculate-basis-vectors\",\n", " k8s_resource=container_manifest,\n", " action=\"create\",\n", " success_condition=\"status.applicationState.state == COMPLETED\"\n", " ).after(step2)\n", " pyviz = kfp.dsl.ContainerOp(\n", " name=\"visualize-slice-of-dicom\",\n", " image=\"rawkintrevo/visualize-dicom-output:0.0.11\",\n", " command=[\"python\", \"/program.py\"],\n", " arguments=[\n", " '--bucket_name', \"covid-dicoms\",\n", " ],\n", " ).apply(kfp.gcp.use_gcp_secret(secret_name='user-gcp-sa')).after(rop)\n", " \n", "\n", "kfp.compiler.Compiler().compile(covid_dicom_pipeline,\"dicom-pipeline-2.zip\")\n", "client = kfp.Client()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "Experiment link here" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Run link here" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "\n", "my_experiment = client.create_experiment(name='my-experiments')\n", "my_run = client.run_pipeline(my_experiment.id, 'my-run1', 'dicom-pipeline-2.zip')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", 
"mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 4 } ================================================ FILE: ch9/ctscans/calculate-basis-vectors/Dockerfile ================================================ FROM gcr.io/spark-operator/spark:v2.4.5-gcs-prometheus COPY target/covid-0.1-jar-with-dependencies.jar / ## Someday soon we'll live in a world where this hack is unnessecary # https://github.com/GoogleCloudDataproc/hadoop-connectors/issues/323 CMD rm /opt/spark/jars/gcs-connector-latest-hadoop2.jar ADD https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop2-2.0.1.jar $SPARK_HOME/jars ENTRYPOINT ["/opt/entrypoint.sh"] ================================================ FILE: ch9/ctscans/calculate-basis-vectors/build-component.sh ================================================ #!/usr/bin/env bash image_name=rawkintrevo/covid-basis-vectors # Specify the image name here image_tag=0.2.0 full_image_name=${image_name}:${image_tag} cd "$(dirname "$0")" docker build -t "${full_image_name}" . docker push "$full_image_name" ================================================ FILE: ch9/ctscans/calculate-basis-vectors/pom.xml ================================================ 4.0.0 org.rawkintrevo covid 0.1 2020 2.11.12 scala-tools.org Scala-Tools Maven2 Repository http://scala-tools.org/repo-releases scala-tools.org Scala-Tools Maven2 Repository http://scala-tools.org/repo-releases org.scala-lang scala-library ${scala.version} junit junit 4.4 test org.specs specs 1.2.5 test org.apache.mahout mahout-core_2.11 14.1-SNAPSHOT org.apache.mahout mahout-hdfs_2.11 14.1-SNAPSHOT org.apache.mahout mahout-spark_2.11 14.1-SNAPSHOT src/main/scala src/test/scala org.scala-tools maven-scala-plugin compile testCompile ${scala.version} -target:jvm-1.5 org.apache.maven.plugins maven-eclipse-plugin true ch.epfl.lamp.sdt.core.scalabuilder ch.epfl.lamp.sdt.core.scalanature org.eclipse.jdt.launching.JRE_CONTAINER ch.epfl.lamp.sdt.launching.SCALA_CONTAINER maven-assembly-plugin org.rawkintrevo.covid.App jar-with-dependencies make-assembly package single org.scala-tools maven-scala-plugin ${scala.version} ================================================ FILE: ch9/ctscans/calculate-basis-vectors/src/main/scala/org/rawkintrevo/covid/App.scala ================================================ package org.rawkintrevo.covid import org.apache.mahout.math._ import org.apache.mahout.math.scalabindings._ import org.apache.mahout.math.drm._ import org.apache.mahout.math.scalabindings.RLikeOps._ import org.apache.mahout.math.drm.RLikeDrmOps._ import org.apache.mahout.sparkbindings._ import org.apache.mahout.math.decompositions._ import org.apache.mahout.math.scalabindings.MahoutCollections._ import org.apache.spark.SparkContext import org.apache.spark.SparkConf import org.apache.spark.SparkFiles object App { def main(args: Array[String]) { val conf:SparkConf = new SparkConf() .setAppName("Calculate CT Scan Basis Vectors") .set("spark.kryo.referenceTracking", "false") .set("spark.kryo.registrator", "org.apache.mahout.sparkbindings.io.MahoutKryoRegistrator") .set("spark.kryoserializer.buffer", "32") .set("spark.kryoserializer.buffer.max" , "600m") .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") //create spark context object val sc = new SparkContext(conf) implicit val sdc: org.apache.mahout.sparkbindings.SparkDistributedContext = sc2sdc(sc) val pathToMatrix = "gs://covid-dicoms/s.csv" // todo 
make this an arg. val voxelRDD:DrmRdd[Int] = sc.textFile(pathToMatrix) .map(s => dvec( s.split(",") .map(f => f.toDouble))) .zipWithIndex .map(o => (o._2.toInt, o._1)) val voxelDRM = drmWrap(voxelRDD) // k, p, q should all be cli parameters // k is rank of the output e.g. the number of eigenfaces we want out. // p is oversampling parameter, // and q is the number of additional power iterations // Read https://mahout.apache.org/users/dim-reduction/ssvd.html val k = args(0).toInt val p = args(1).toInt val q = args(2).toInt val(drmU, drmV, s) = dssvd(voxelDRM.t, k, p, q) val V = drmV.checkpoint().rdd.saveAsTextFile("gs://covid-dicoms/drmV") val U = drmU.t.checkpoint().rdd.saveAsTextFile("gs://covid-dicoms/drmU") sc.parallelize(s.toArray,1).saveAsTextFile("gs://covid-dicoms/s") println("The job is done!") } } // $SPARK_HOME/bin/spark-submit --driver-memory 4G --executor-memory 4G --class org.rawkintrevo.book.App *jar ================================================ FILE: ch9/ctscans/download-dicom/Dockerfile ================================================ FROM gcr.io/google.com/cloudsdktool/cloud-sdk:latest # ## install gsutil lightly #RUN apt update \ # && apt install -y wget #RUN wget https://storage.googleapis.com/pub/gsutil.tar.gz #RUN tar xfz gsutil.tar.gz -C $HOME #ENV PATH="${PATH}:$HOME/gsutil" COPY ./run.sh /run.sh ================================================ FILE: ch9/ctscans/download-dicom/build-component.sh ================================================ #!/usr/bin/env bash image_name=rawkintrevo/download-dicom # Specify the image name here image_tag=0.0.0.4 full_image_name=${image_name}:${image_tag} cd "$(dirname "$0")" docker build -t "${full_image_name}" . docker push "$full_image_name" ================================================ FILE: ch9/ctscans/download-dicom/run.sh ================================================ #!/usr/bin/env bash set -e # 1st arg- case number (leading zero required if < 10), defaults to case1 if [ -z "${1}" ] then CASE="01" else CASE="${1}" fi echo "Downloading DICOMs" # If not on GCP need to download this gsutil cp gs://covid-dicoms/covid-dicoms.tar.gz /tmp/covid-dicoms.tar.gz tar -xzf /tmp/covid-dicoms.tar.gz -C /tmp mv "/tmp/case0${CASE}/axial" /data/dicom ================================================ FILE: ch9/ctscans/process-dicoms-into-vectors/Dockerfile ================================================ FROM pydicom/dicom:v3.6.5 # From https://github.com/HealthplusAI/python3-gdcm RUN apt update && apt install -y python-vtk6 libvtk6-dev cmake-curses-gui swig python3-dev libpython3.7-dev ## checkinstall missing... RUN ln -s /opt/conda/bin/* /usr/local/bin RUN git clone --branch release git://git.code.sf.net/p/gdcm/gdcm RUN mkdir build RUN cd build && cmake -D CMAKE_BUILD_TYPE=Release -D CMAKE_C_FLAGS=-fPIC -D CMAKE_CXX_FLAGS=-fPIC -D GDCM_BUILD_SHARED_LIBS:BOOL=ON \ -D GDCM_WRAP_PYTHON=ON -D PYTHON_EXECUTABLE=/usr/local/bin/python3.7 \ -D PYTHON_INCLUDE_DIR=/usr/include/python3.7m/ -D GDCM_BUILD_SHARED_LIBS=ON -D GDCM_USE_VTK=ON ../gdcm ## They forgot this line in instuctions, but is important... RUN cd build && make install # checkinstall -D -y --pkgversion --pkgname=python3-gdcm --pkgversion=1 # checkinstall doesn't exist in debian? 
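# Copy the freshly built GDCM Python bindings and shared libraries into the
# conda site-packages (and refresh the linker cache below) so the DICOM
# processing code can `import gdcm` at runtime.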
RUN cp /usr/local/lib/gdcm.py /opt/conda/lib/python3.7/site-packages/ RUN cp /usr/local/lib/gdcmswig.py /opt/conda/lib/python3.7/site-packages/ RUN cp /usr/local/lib/_gdcmswig.so /opt/conda/lib/python3.7/site-packages/ RUN cp /usr/local/lib/libgdcm* /opt/conda/lib/python3.7/site-packages/ RUN ldconfig # RUN mkdir /data already exists in base # todo move these to requirements.txt RUN pip install numpy RUN pip install scipy RUN pip install google-cloud-storage ENV GOOGLE_APPLICATION_CREDENTIALS="/secret/gcp-credentials/user-gcp-sa.json" COPY src/program.py /program.py ================================================ FILE: ch9/ctscans/process-dicoms-into-vectors/build-component.sh ================================================ #!/usr/bin/env bash image_name=rawkintrevo/covid-prep-dicom # Specify the image name here image_tag=0.9.5 full_image_name=${image_name}:${image_tag} cd "$(dirname "$0")" docker build -t "${full_image_name}" . docker push "$full_image_name" ================================================ FILE: ch9/ctscans/process-dicoms-into-vectors/data/s.150.csv ================================================ ================================================ FILE: ch9/ctscans/process-dicoms-into-vectors/process-dicoms-into-vectors.yaml ================================================ name: Process DICOMs into Vectors description: Take a number of COVID DICOMs - output a list of vectors for DS-SVD. inputs: - {name: in, type: String, description='Input file name.'} - {name: out, type: String, description='Output file name.'} implementation: container: image: rawkintrevo/covid-prep-docim command: [ python, /program.py, {inputValue: in}, inputValue: out} ] ================================================ FILE: ch9/ctscans/process-dicoms-into-vectors/src/program.py ================================================ from os import listdir import numpy as np import pydicom import argparse from google.cloud import storage parser = argparse.ArgumentParser( description='Process DICOM Images into Vectors.') parser.add_argument('--input_dir', type=str, default="/mnt/data/dicom", help='Directory containing DICOM Images') parser.add_argument('--bucket_name', type=str, help='name of bucket to write output to.') parser.add_argument('--output_file', type=str, default="s.csv", help='file name of dcm converted to 2d numerical matrix') args = parser.parse_args() def create_3d_matrix(path): dicoms = [pydicom.dcmread(f"{path}/{f}") for f in listdir(path)] slices = [d for d in dicoms if hasattr(d, "SliceLocation")] slices = sorted(slices, key=lambda s: s.SliceLocation) ps = slices[0].PixelSpacing ss = slices[0].SliceThickness ax_aspect = ps[1] / ps[0] sag_aspect = ps[1] / ss cor_aspect = ss / ps[0] # create 3D array img_shape = list(slices[0].pixel_array.shape) img_shape.append(len(slices)) img3d = np.zeros(img_shape) for i, s in enumerate(slices): img2d = s.pixel_array img3d[:, :, i] = img2d return { "img3d": img3d, "img_shape": img_shape, "ax_aspect": ax_aspect, "sag_aspect": sag_aspect, "cor_aspect": cor_aspect } def upload_blob(bucket_name, source_file_name, destination_blob_name): """Uploads a file to the bucket.""" # bucket_name = "your-bucket-name" # source_file_name = "local/path/to/file" # destination_blob_name = "storage-object-name" storage_client = storage.Client() bucket = storage_client.bucket(bucket_name) blob = bucket.blob(destination_blob_name) blob.upload_from_filename(source_file_name) print("File {} uploaded to {}.".format(source_file_name, destination_blob_name)) input_dir = 
args.input_dir output_file = args.output_file m = create_3d_matrix(f"{input_dir}") np.savetxt("/tmp/s.csv", m['img3d'].reshape((-1, m['img_shape'][2])), delimiter=",") upload_blob(args.bucket_name, "/tmp/s.csv", output_file) ================================================ FILE: ch9/ctscans/visualize-basis-vectors/Dockerfile ================================================ FROM python:3-buster RUN pip install numpy RUN pip install matplotlib RUN pip install google-cloud-storage COPY src/program.py /program.py CMD ["python" , "/program.py"] ================================================ FILE: ch9/ctscans/visualize-basis-vectors/build-component.sh ================================================ #!/usr/bin/env bash image_name=rawkintrevo/visualize-dicom-output # Specify the image name here image_tag=0.0.11 full_image_name=${image_name}:${image_tag} cd "$(dirname "$0")" docker build -t "${full_image_name}" . docker push "$full_image_name" ================================================ FILE: ch9/ctscans/visualize-basis-vectors/src/program.py ================================================ from ast import literal_eval from os import listdir import matplotlib.pyplot as plt import numpy as np import argparse from google.cloud import storage parser = argparse.ArgumentParser( description='Convert DRMs into DICOMs and Images') parser.add_argument('--bucket_name', type=str, help='name of bucket to write output to.') args = parser.parse_args() def read_mahout_drm(path): data = {} counter = 0 parts = [p for p in listdir(path) if "part"] for p in parts: with open(f"{path}/{p}", 'r') as f: lines = f.read().split("\n") for l in lines[:-1]: counter += 1 t = literal_eval(l) arr = np.array([t[1][i] for i in range(len(t[1].keys()))]) data[t[0]] = arr print(f"read {counter} lines from {path}") return data def plot_3d_matrix(img3d, img_shape, ax_aspect, sag_aspect, cor_aspect): # plot 3 orthogonal slices a1 = plt.subplot(2, 2, 1) plt.imshow(img3d[:, :, img_shape[2] // 2]) a1.set_aspect(ax_aspect) a2 = plt.subplot(2, 2, 2) plt.imshow(img3d[:, img_shape[1] // 2, :]) a2.set_aspect(sag_aspect) a3 = plt.subplot(2, 2, 3) plt.imshow(img3d[img_shape[0] // 2, :, :].T) a3.set_aspect(cor_aspect) plt.show(cmap=plt.cm.bone) def plot_2_3d_matrices(img1, img2, aspect, slice, cmap): a1 = plt.subplot(1, 2, 1) plt.imshow(img1[:, slice, :], cmap=cmap) a1.set_aspect(aspect) a2 = plt.subplot(1, 2, 2) plt.imshow(img2[:, slice, :], cmap=cmap) a2.set_aspect(aspect) def upload_blob(bucket_name, source_file_name, destination_blob_name): """Uploads a file to the bucket.""" # bucket_name = "your-bucket-name" # source_file_name = "local/path/to/file" # destination_blob_name = "storage-object-name" storage_client = storage.Client() bucket = storage_client.bucket(bucket_name) blob = bucket.blob(destination_blob_name) blob.upload_from_filename(source_file_name) print("File {} uploaded to {}.".format(source_file_name, destination_blob_name)) def download_folder(bucket_name='your-bucket-name', bucket_dir='your-bucket-directory/', dl_dir="local-dir/"): storage_client = storage.Client() bucket = storage_client.get_bucket(bucket_name) blobs = bucket.list_blobs(prefix=bucket_dir) # Get list of files for blob in blobs: filename = blob.name.replace('/', '_') blob.download_to_filename(dl_dir + filename) # Download import os bucket_name = args.bucket_name os.mkdir('/tmp/drmU') os.mkdir('/tmp/drmV') os.mkdir('/tmp/s') download_folder(bucket_name, "drmU/", "/tmp/drmU/") download_folder(bucket_name, "drmV/", "/tmp/drmV/") download_folder(bucket_name, 
"s/", "/tmp/s/") drmU = read_mahout_drm("/tmp/drmU") drmV = read_mahout_drm("/tmp/drmV") print(os.listdir("/tmp")) print(os.listdir("/tmp/s")) drmU_p5 = np.transpose(np.array([drmU[i] for i in range(len(drmU.keys()))])) drmV_p5 = np.array([drmV[i] for i in range(len(drmV.keys()))]) with open(f"/tmp/s/s_part-00000", 'r') as f: diags = [float(d) for d in f.read().split('\n') if d != ''] recon = drmU_p5 @ np.diag(diags) @ drmV_p5.transpose() # plot_3d_matrix(recon.transpose().reshape((512,512,301)), (512,512,301), 1.0, 0.810547, 1.2337347494963278) composite_img = recon.transpose().reshape((512, 512, 301)) diags_orig = diags percs = [0.001, 0.01, 0.05, 0.1, 0.3] for p in range(len(percs)): perc = percs[p] diags = [ diags_orig[i] if i < round(len(diags) - (len(diags) * perc)) else 0 for i in range(len(diags)) ] recon = drmU_p5 @ np.diag(diags) @ drmV_p5.transpose() # plot_3d_matrix(recon.transpose().reshape((512,512,301)), (512,512,301), 1.0, 0.810547, 1.2337347494963278) composite_img = recon.transpose().reshape((512, 512, 301)) a1 = plt.subplot(1, 1, 1) plt.imshow(composite_img[:, :, 150], cmap=plt.cm.bone) plt.title( f"{perc*100}% denoised. (k={len(diags)}, oversample=15, power_iters=2)" ) a1.set_aspect(1.0) plt.axis('off') fname = f"{100-(perc*100)}%-denoised-img.png" plt.savefig(f"/tmp/{fname}") upload_blob(bucket_name, f"/tmp/{fname}", f"/output/{fname}") ================================================ FILE: ci.sh ================================================ #!/bin/bash set -ex # Check all the shell scripts find ./ -iregex '^.+\.sh$' -type f -print0 | \ xargs -0 shellcheck -e SC1091 -e SC2164 -e SC1090 # Check for cases where I use tags rather than tag bad_tags=$(grep -r "tags::" ./ | grep -v "ci.sh:" || true) # Look for long lines long_lines=$(grep --include '*.sh' --exclude '*venv*' -Hnr '.\{90\}' ./ || true) if [[ -n "$bad_tags" ]]; then echo "Found bad tags $bad_tags replace tags with tag" fi if [[ -n "$long_lines" ]]; then print "Found long lines:\n$long_lines" fi if [[ -n "$bad_tags" ]] || [[ -n "$long_lines" ]]; then exit 1 fi ./runthrough.sh ================================================ FILE: convert_notebooks.sh ================================================ #!/bin/bash find . -name "*ipynb" |grep -v venv | xargs -d '\n' ipython3 nbconvert --to script ================================================ FILE: data-extraction/README.md ================================================ ## Data Extraction To successfully construct a machine learning pipeline we need to collect the data we are going to train on. The data extraction is organized here by the different use case. In many introduction to machine learning examples the data is pre-extracted, and sometimes even pre-cleaned. Here we will show some ways to collect the initial data. Once the initial training data has been extracted, we will continue on downstream with data cleaning, and may later do some data augmentation. 
================================================ FILE: data-extraction/github_comments_query.bsql ================================================ SELECT pull_request_url, ANY_VALUE(pull_patch_url) as pull_patch_url, ARRAY_AGG(comment_position) as comments_positions, ARRAY_AGG(diff_hunk) as diff_hunks, ARRAY_AGG(comment_original_position) as comments_original_positions, ARRAY_AGG(comment_commit_id IGNORE NULLS) as comment_commit_ids, ARRAY_AGG(comment_file_path IGNORE NULLS) as comment_file_paths FROM ( SELECT *, JSON_EXTRACT(payload, '$.action') AS action, JSON_EXTRACT(payload, '$.pull_request.url') AS pull_request_url, JSON_EXTRACT(payload, '$.pull_request.patch_url') AS pull_patch_url, IFNULL(JSON_EXTRACT(payload, '$.comment.original_position'), "-1") AS comment_original_position, IFNULL(JSON_EXTRACT(payload, '$.comment.position'), "-1") AS comment_position, JSON_EXTRACT(payload, '$.comment.commit_id') AS comment_commit_id, JSON_EXTRACT(payload, '$.comment.path') AS comment_file_path FROM "githubarchive.day.*" WHERE type = "PullRequestReviewCommentEvent") GROUP BY pull_request_url ================================================ FILE: data-extraction/github_issues_query.bsql ================================================ SELECT repo.name, JSON_EXTRACT(payload, '$.issue.url') AS url FROM ( SELECT *, JSON_EXTRACT(payload, '$.action') AS action FROM "githubarchive.day.*" WHERE type = "IssuesEvent") WHERE type = "IssuesEvent" AND action = "\"opened\"" ================================================ FILE: data-extraction/iot/basic.yaml ================================================ apiVersion: batch/v1 kind: Job metadata: name: iot-data-extraction namespace: kubeflow spec: template: spec: containers: - env: - name: GOOGLE_APPLICATION_CREDENTIALS value: /secret/gcp-credentials/user-gcp-sa.json image: IMAGE_NAME name: gh-data-extract-gh-job volumeMounts: - mountPath: /secret/gcp-credentials name: secret-volume readOnly: true restartPolicy: OnFailure volumes: - name: secret-volume secret: secretName: user-gcp-sa ================================================ FILE: data-extraction/iot/build.sh ================================================ #!/bin/bash CONTAINER_REGISTRY="gcr.io/${PROJECT_NAME}" #tag::buildandpush[] TARGET="${CONTAINER_REGISTRY}/kf-steps/iot-extract:v2" docker build . -t "${TARGET}" docker push "${TARGET}" #end::buildandpush[] #tag::run[] kubectl apply -f iot_extract_job.yaml #end::run[] #tag::verify[] kubectl get jobs |grep gh-data #end::verify[] ================================================ FILE: data-extraction/python-notebook/AddSpamassassinDockerfile ================================================ ARG base FROM $base # Run as root for updates USER root # Install Spamassassin RUN apt-get update && \ apt-get install -yq spamassassin spamc && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* && \ rm -rf /var/cache/apt # Switch back to the expected user USER jovyan ================================================ FILE: data-extraction/python-notebook/MailingListDataPrep.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Here we can install some packages our notebook needs. We can also install them in our container to speed things up & make it more reliable. But for prototyping this works great!" 
] }, { "cell_type": "code", "execution_count": null, "metadata": { "jupyter": { "outputs_hidden": true } }, "outputs": [], "source": [ "!pip3 install --upgrade lxml\n", "!pip3 install --upgrade pandas\n", "!pip3 install --upgrade scikit-learn\n", "!pip3 install --upgrade scipy\n", "!pip3 install --upgrade tables" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We can use Jupyter notebooks just like normal inside of Kubeflow" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from datetime import datetime\n", "from requests import get\n", "from lxml import etree\n", "from time import sleep\n", "\n", "import re\n", "\n", "import pandas as pd\n", "\n", "import os" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "container_registry = \"\" # Wherever you put your containers" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def scrapeMailArchives(mailingList: str, year: int, month: int):\n", " baseUrl = \"http://mail-archives.apache.org/mod_mbox/%s/%s.mbox/ajax/\" % (mailingList, datetime(year,month,1).strftime(\"%Y%m\"))\n", " r = get(baseUrl + \"thread?0\")\n", " utf8_parser = etree.XMLParser(encoding='utf-8')\n", " root = etree.fromstring(r.text.replace('encoding=\"UTF-8\"', \"\"), parser=utf8_parser)\n", " output = []\n", " for message in root.xpath(\"//message\"):\n", " _id = message.get(\"id\")\n", " linked = message.get(\"linked\")\n", " depth = message.get(\"depth\")\n", " fr = message.xpath(\"from\")[0].text\n", " dt = message.xpath(\"date\")[0].text ## todo convert to date\n", " subject = message.xpath(\"subject\")[0].text\n", " r2 = get(baseUrl + _id)\n", " bodyRoot = etree.fromstring(r2.text.replace('encoding=\"UTF-8\"', \"\"), parser=utf8_parser)\n", " body = bodyRoot.xpath(\"//contents\")[0].text\n", " record = {\n", " \"id\" : _id,\n", " \"linked\" : linked,\n", " \"depth\" : depth,\n", " \"from\" : fr,\n", " \"dt\" : dt,\n", " \"subject\" : subject,\n", " \"body\" : body\n", " }\n", " output.append(record)\n", " sleep(0.1)\n", " return output\n", "\n", "\n", "def extract_links(body):\n", " link_regex_str = r'(http(|s)://(.*?))([\\s\\n]|$)'\n", " itr = re.finditer(link_regex_str, body, re.MULTILINE)\n", " return list(map(lambda elem: elem.group(1), itr))\n", "\n", "def extract_domains(links):\n", " from urllib.parse import urlparse\n", " def extract_domain(link):\n", " try:\n", " nloc = urlparse(link).netloc\n", " # We want to drop www and any extra spaces wtf nloc on the spaces.\n", " regex_str = r'^(www\\.|)(.*?)\\s*$'\n", " match = re.search(regex_str, nloc)\n", " return match.group(2)\n", " except:\n", " return None\n", " return list(map(extract_domain, links))\n", "\n", "def contains_python_stack_trace(body):\n", " return \"Traceback (most recent call last)\" in body\n", "\n", "def contains_probably_java_stack_trace(body):\n", " # Look for something based on regex\n", " # Tried https://stackoverflow.com/questions/20609134/regular-expression-optional-multiline-java-stacktrace - more msg looking\n", " # Tried https://stackoverflow.com/questions/3814327/regular-expression-to-parse-a-log-file-and-find-stacktraces\n", " # Yes the compile is per call, but it's cached so w/e\n", " import re\n", " stack_regex_str = r'^\\s*(.+Exception.*):\\n(.*\\n){0,3}?(\\s+at\\s+.*\\(.*\\))+'\n", " match = re.search(stack_regex_str, body, re.MULTILINE)\n", " return match is not None\n", "\n", "def contains_exception_in_task(body):\n", " # Look 
for a line along the lines of ERROR Executor: Exception in task\n", " return \"ERROR Executor: Exception in task\" in body" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "datesToScrape = [(2019, i) for i in range(1,13)]\n", "\n", "records = []\n", "for y,m in datesToScrape:\n", " print(m,\"-\",y)\n", " records += scrapeMailArchives(\"spark-dev\", y, m)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pd.DataFrame(records)\n", "df['links'] = df['body'].apply(extract_links)\n", "df['containsPythonStackTrace'] = df['body'].apply(contains_python_stack_trace)\n", "df['containsJavaStackTrace'] = df['body'].apply(contains_probably_java_stack_trace)\n", "df['containsExceptionInTaskBody'] = df['body'].apply(contains_exception_in_task)\n", "\n", "df['domains'] = df['links'].apply(extract_domains)\n", "df['isThreadStart'] = df['depth'] == '0'" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.feature_extraction.text import TfidfVectorizer\n", "\n", "bodyV = TfidfVectorizer()\n", "# bodyV = TfidfVectorizer(max_features=10000) #if we cared about making this 1:1 w holden's code.\n", "bodyFeatures = bodyV.fit_transform(df['body'])\n", "\n", "domainV = TfidfVectorizer()\n", "# domainV = TfidfVectorizer(max_features=100)\n", "\n", "## A couple of \"None\" domains really screwed the pooch on this one. Also, no lists just space seperated domains.\n", "def makeDomainsAList(d):\n", " return ' '.join([a for a in d if not a is None])\n", "\n", "domainFeatures = domainV.fit_transform(df['domains'].apply(makeDomainsAList))\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "from scipy.sparse import csr_matrix, hstack\n", "\n", "data = hstack([csr_matrix(df[['containsPythonStackTrace', 'containsJavaStackTrace', 'containsExceptionInTaskBody', 'isThreadStart']].to_numpy()),\n", " bodyFeatures,\n", " domainFeatures])\n", "\n", "\n", "from sklearn.cluster import KMeans\n", "from sklearn.model_selection import train_test_split\n", "\n", "train, test = train_test_split(data, test_size=0.1)\n", "\n", "kmeans = KMeans(n_clusters=2, random_state=42).fit(train)\n", "train_pred = kmeans.predict(train)\n", "test_pred = kmeans.predict(test)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Alternatively, by structuring our code correctly we can take advantage of pipelines" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip3 install --upgrade kfp" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import kfp\n", "import kfp.dsl as dsl" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def download_data(year: int) -> str:\n", " \n", " from datetime import datetime\n", " from lxml import etree\n", " from requests import get\n", " from time import sleep\n", " \n", " import json\n", " \n", " def scrapeMailArchives(mailingList: str, year: int, month: int):\n", " baseUrl = \"http://mail-archives.apache.org/mod_mbox/%s/%s.mbox/ajax/\" % (mailingList, datetime(year,month,1).strftime(\"%Y%m\"))\n", " r = get(baseUrl + \"thread?0\")\n", " utf8_parser = etree.XMLParser(encoding='utf-8')\n", " root = etree.fromstring(r.text.replace('encoding=\"UTF-8\"', \"\"), 
parser=utf8_parser)\n", " output = []\n", " for message in root.xpath(\"//message\"):\n", " _id = message.get(\"id\")\n", " linked = message.get(\"linked\")\n", " depth = message.get(\"depth\")\n", " fr = message.xpath(\"from\")[0].text\n", " dt = message.xpath(\"date\")[0].text ## todo convert to date\n", " subject = message.xpath(\"subject\")[0].text\n", " r2 = get(baseUrl + _id)\n", " bodyRoot = etree.fromstring(r2.text.replace('encoding=\"UTF-8\"', \"\"), parser=utf8_parser)\n", " body = bodyRoot.xpath(\"//contents\")[0].text\n", " record = {\n", " \"id\" : _id,\n", " \"linked\" : linked,\n", " \"depth\" : depth,\n", " \"from\" : fr,\n", " \"dt\" : dt,\n", " \"subject\" : subject,\n", " \"body\" : body\n", " }\n", " output.append(record)\n", " sleep(0.1)\n", " \n", " return output\n", "\n", " datesToScrape = [(year, i) for i in range(1,2)]\n", "\n", " records = []\n", " ## todo, go back further\n", " for y,m in datesToScrape:\n", " print(m,\"-\",y)\n", " records += scrapeMailArchives(\"spark-dev\", y, m)\n", " import os\n", " output_path = '/data_processing/data.json'\n", " with open(output_path, 'w') as f:\n", " json.dump(records, f)\n", " \n", " return output_path\n", " " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def download_tld_data() -> str:\n", " from requests import get\n", " import pandas as pd\n", " print(\"importing io....\")\n", " import io\n", "\n", " url = \"https://pkgstore.datahub.io/core/country-list/data_csv/data/d7c9d7cfb42cb69f4422dec222dbbaa8/data_csv.csv\"\n", " print(\"Getting the url\")\n", " s = get(url).content\n", " print(\"Converting content\")\n", " df = pd.read_csv(io.StringIO(s.decode('utf-8')))\n", " print(\"Writing output\")\n", " output_path_hdf = '/tld_info/clean_data.hdf'\n", " df.to_hdf(output_path_hdf, key=\"tld\")\n", " \n", " return output_path_hdf" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now that we have some data, we want to get rid of any \"bad\" records" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#tag::clean_data_fun[]\n", "def clean_data(input_path: str) -> str:\n", " import json\n", " import pandas as pd\n", " \n", " print(\"loading records...\")\n", " with open(input_path, 'r') as f:\n", " records = json.load(f)\n", " print(\"records loaded\")\n", " \n", " df = pd.DataFrame(records)\n", " # Drop records without a subject, body, or sender\n", " cleaned = df.dropna(subset=[\"subject\", \"body\", \"from\"])\n", " \n", " output_path_hdf = '/data_processing/clean_data.hdf'\n", " cleaned.to_hdf(output_path_hdf, key=\"clean\")\n", " \n", " return output_path_hdf\n", "#end::clean_data_fun[]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Preparing the data\n", "\n", "Remember earlier when we did that big (and arguably pointless) classification of emails from the Apache Spark mailing list? OK, now we're going to do it again, as a \"lightweight\" Python function in a Kubeflow Pipeline. I hope the irony of the term \"lightweight\" isn't lost on anyone, because this is pretty blatent abuse of something that was originally presented for conveinience. \n", "\n", "First note, all of the imports and declarations of helper functions MUST be with in the \"ligthweight\" function. 
One could argue (and they would probably be correct) that I have two steps here- feature prep and ML, and as such I should split them. I would say that's fair, but I choose not to do so at this time. Perhaps in some scripts later on?\n", "\n", "As has been pointed out so many times before, we assume the reader either arleady understands what is going on with the KMeans clustering, or better yet, doesn't even care. I won't be digging into that right now. What I will point out- and maybe as a note to the editor, the model that is finally saved really ought to be persisted somewhere. If the model isn't saved, then this basically pointless pipeline, is truly pointless. \n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Now let's make sure we can read that data in the next step (before we write a big complicated model to do whatever torture to it)." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def prepare_features(input_path: str, tld_info_path: str):\n", " \n", " import re\n", " import pandas as pd\n", " \n", " print(\"loading records...\")\n", " df = pd.read_hdf(input_path, key=\"clean\")\n", " print(\"records loaded\")\n", " \n", " print(\"Loading tld info....\")\n", " tld_df = pd.read_hdf(tld_info_path, key=\"tld\")\n", " print(\"Loaded tld info\")\n", " \n", " \n", " ## Note: \"Lightweight\" Python Fns mean helper code must be inside the fn. (Bad Form)\n", " def extract_links(body):\n", " link_regex_str = r'(http(|s)://(.*?))([\\s\\n]|$)'\n", " itr = re.finditer(link_regex_str, body, re.MULTILINE)\n", " return list(map(lambda elem: elem.group(1), itr))\n", "\n", " def extract_domains(links):\n", " from urllib.parse import urlparse\n", " def extract_domain(link):\n", " try:\n", " nloc = urlparse(link).netloc\n", " # We want to drop www and any extra spaces wtf nloc on the spaces.\n", " regex_str = r'^(www\\.|)(.*?)\\s*$'\n", " match = re.search(regex_str, nloc)\n", " return match.group(2)\n", " except:\n", " return None\n", " return list(map(extract_domain, links))\n", "\n", " def contains_python_stack_trace(body):\n", " return \"Traceback (most recent call last)\" in body\n", "\n", " def contains_probably_java_stack_trace(body):\n", " # Look for something based on regex\n", " # Tried https://stackoverflow.com/questions/20609134/regular-expression-optional-multiline-java-stacktrace - more msg looking\n", " # Tried https://stackoverflow.com/questions/3814327/regular-expression-to-parse-a-log-file-and-find-stacktraces\n", " # Yes the compile is per call, but it's cached so w/e\n", " import re\n", " stack_regex_str = r'^\\s*(.+Exception.*):\\n(.*\\n){0,3}?(\\s+at\\s+.*\\(.*\\))+'\n", " match = re.search(stack_regex_str, body, re.MULTILINE)\n", " return match is not None\n", "\n", " def contains_exception_in_task(body):\n", " # Look for a line along the lines of ERROR Executor: Exception in task\n", " return \"ERROR Executor: Exception in task\" in body\n", "\n", " print(df.shape)\n", " df['links'] = df['body'].apply(extract_links)\n", " df['containsPythonStackTrace'] = df['body'].apply(contains_python_stack_trace)\n", " df['containsJavaStackTrace'] = df['body'].apply(contains_probably_java_stack_trace)\n", " df['containsExceptionInTaskBody'] = df['body'].apply(contains_exception_in_task)\n", "\n", " #tag::local_mailing_list_feature_prep_fun[]\n", " df['domains'] = df['links'].apply(extract_domains)\n", " df['isThreadStart'] = df['depth'] == '0'\n", " \n", " # Arguably, you could split building the dataset away from the actual 
witchcraft.\n", " from sklearn.feature_extraction.text import TfidfVectorizer\n", "\n", " bodyV = TfidfVectorizer()\n", " bodyFeatures = bodyV.fit_transform(df['body'])\n", "\n", " domainV = TfidfVectorizer()\n", "\n", " ## A couple of \"None\" domains really screwed the pooch on this one.Also, no lists just space seperated domains.\n", " def makeDomainsAList(d):\n", " return ' '.join([a for a in d if not a is None])\n", "\n", " domainFeatures = domainV.fit_transform(df['domains'].apply(makeDomainsAList))\n", "\n", " from scipy.sparse import csr_matrix, hstack\n", "\n", " data = hstack([csr_matrix(df[['containsPythonStackTrace',\n", " 'containsJavaStackTrace',\n", " 'containsExceptionInTaskBody', \n", " 'isThreadStart']].to_numpy()),\n", " bodyFeatures,\n", " domainFeatures])\n", " #end::local_mailing_list_feature_prep_fun[]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "### The Kubeflow Bit.\n", "\n", "Now we can put these two pieces together into a pipeline. Since the data is relatively small we will use a persistent volume put them together. Later on we can add training to this pipeline as well.\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Make a volume example. We redo it inside of the pipeline definition because we need to be inside\n", "#tag::makeVolume[]\n", "dvop = dsl.VolumeOp(\n", " name=\"create_pvc\",\n", " resource_name=\"my-pvc-2\",\n", " size=\"5Gi\",\n", " modes=dsl.VOLUME_MODE_RWO)\n", "#end::makeVolume[]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!rm local-data-prep-2.zip" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#tag::makePipeline[]\n", "@kfp.dsl.pipeline(\n", " name='Simple1',\n", " description='Simple1'\n", ")\n", "def my_pipeline_mini(year: int):\n", " dvop = dsl.VolumeOp(\n", " name=\"create_pvc\",\n", " resource_name=\"my-pvc-2\",\n", " size=\"5Gi\",\n", " modes=dsl.VOLUME_MODE_RWO)\n", " tldvop = dsl.VolumeOp(\n", " name=\"create_pvc\",\n", " resource_name=\"tld-volume-2\",\n", " size=\"100Mi\",\n", " modes=dsl.VOLUME_MODE_RWO)\n", " download_data_op = kfp.components.func_to_container_op(\n", " download_data,\n", " packages_to_install=['lxml', 'requests'])\n", " download_tld_info_op = kfp.components.func_to_container_op(\n", " download_tld_data,\n", " packages_to_install=['requests', 'pandas>=0.24', 'tables'])\n", " clean_data_op = kfp.components.func_to_container_op(\n", " clean_data,\n", " packages_to_install=['pandas>=0.24', 'tables'])\n", "\n", " step1 = download_data_op(year).add_pvolumes({\"/data_processing\": dvop.volume})\n", " step2 = clean_data_op(input_path=step1.output).add_pvolumes({\"/data_processing\": dvop.volume})\n", " step3 = download_tld_info_op().add_pvolumes({\"/tld_info\": tldvop.volume})\n", "\n", "kfp.compiler.Compiler().compile(my_pipeline_mini, 'local-data-prep-2.zip')\n", "#end::makePipeline[]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!rm *.zip" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#tag::longPipeline[]\n", "@kfp.dsl.pipeline(\n", " name='Simple1',\n", " description='Simple1'\n", ")\n", "def my_pipeline2(year: int):\n", " dvop = dsl.VolumeOp(\n", " name=\"create_pvc\",\n", " resource_name=\"my-pvc-2\",\n", " size=\"5Gi\",\n", " modes=dsl.VOLUME_MODE_RWO)\n", " tldvop = dsl.VolumeOp(\n", " name=\"create_pvc\",\n", " 
resource_name=\"tld-volume-2\",\n", " size=\"100Mi\",\n", " modes=dsl.VOLUME_MODE_RWO)\n", "\n", " download_data_op = kfp.components.func_to_container_op(\n", " download_data,\n", " packages_to_install=['lxml', 'requests'])\n", " download_tld_info_op = kfp.components.func_to_container_op(\n", " download_tld_data,\n", " packages_to_install=['requests', 'pandas>=0.24', 'tables'])\n", " clean_data_op = kfp.components.func_to_container_op(\n", " clean_data,\n", " packages_to_install=['pandas>=0.24', 'tables'])\n", "#tag::add_feature_step[]\n", " prepare_features_op = kfp.components.func_to_container_op(\n", " prepare_features,\n", " packages_to_install=['pandas>=0.24', 'tables', 'scikit-learn'])\n", "#tag::end_feature_step[]\n", "\n", " step1 = download_data_op(year).add_pvolumes({\"/data_processing\": dvop.volume})\n", " step2 = clean_data_op(input_path=step1.output).add_pvolumes({\"/data_processing\": dvop.volume})\n", " step3 = download_tld_info_op().add_pvolumes({\"/tld_info\": tldvop.volume})\n", " step4 = prepare_features_op(input_path=step2.output, tld_info_path=step3.output).add_pvolumes({\n", " \"/data_processing\": dvop.volume,\n", " \"/tld_info\": tldvop.volume})\n", "#end::longPipeline[]\n", "\n", "kfp.compiler.Compiler().compile(my_pipeline2, 'local-data-and-feature-prep-2.zip')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "client = kfp.Client()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "my_experiment = client.create_experiment(name='local-data-prep-test-2')\n", "my_run = client.run_pipeline(my_experiment.id, 'local-data-prep', \n", " 'local-data-and-feature-prep-2.zip', params={'year': '2019'})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "If we were using Spamassasin or some other library installed in a different base container we would:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Clean data with custom container\n", "#tag::cleanDataWithContainer[]\n", "clean_data_op = kfp.components.func_to_container_op(\n", " clean_data,\n", " base_image=\"{0}/kubeflow/spammassisan\".format(container_registry),\n", " packages_to_install=['pandas>=0.24', 'tables'])\n", "#end::cleanDataWithContainer[]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def train_func(input_path: String):\n", " from sklearn.cluster import KMeans\n", " from sklearn.model_selection import train_test_split\n", "\n", " train, test = train_test_split(data, test_size=0.1)\n", "\n", " kmeans = KMeans(n_clusters=2, random_state=42).fit(train)\n", " train_pred = kmeans.predict(train)\n", " test_pred = kmeans.predict(test)\n", " print(test_pred)\n", " # TODO: Dump the model somewhere you can use it later. " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "And just like that, we've done it. We've created a Kubeflow Pipeline.\n", "\n", "So let's take a moment to step back and think, \"what in the crazy-town-heck is going on here?!\". A valid question, and well spotted. Each \"Step\" is going to be creating a container. Maybe I should have noted that earlier when talking about attatching volumes, beacuse if you thougth I was doing that to a function, you'd probably think me quite insane. 
\n", "\n", "But, if you follow this code, and create this pipeline, download it and run it, you will see each \"step\" as a seperate container, downloading data, saving it to a `PVC` then passing some parameters to a next container, which also will load the `PVC`, etc. etc. \n", "\n", "### Using Python to Create Containers, but not like a crazy person\n", "\n", "For completeness, let's last explore how to do all of these things using annotations. \n", "\n", "The trick for the most part is to create a function that returns a `kfp.dsl.ContainerOp`. This will point to an image, note the volumes that need to be mounted, and a number of other things. I've heard told people don't always just like creating absurdly large and fat functions to do everything in real life, so I leave this hear as an aside in case the reader is interested in it. It's alsow worth noting that adding the `@kfp.dsl.component` annotation instructs teh Kubeflow compiler to turn on static typce checking. \n", "\n", "```\n", "@kfp.dsl.component\n", "def my_component(my_param):\n", " ...\n", " return kfp.dsl.ContainerOp(\n", " name='My component name',\n", " image='gcr.io/path/to/container/image'\n", " )\n", "```\n", "\n", "Finally, when it comes to incorporating these components into pipelines, you would do something like this:\n", "\n", "```\n", "@kfp.dsl.pipeline(\n", " name='My pipeline',\n", " description='My machine learning pipeline'\n", ")\n", "def my_pipeline(param_1: PipelineParam, param_2: PipelineParam):\n", " my_step = my_component(my_param='a')\n", "```\n", "\n", "Which should look exceedingly familiar as we did something very similar with our `download_data_fn` and `witchcraft_fn`. " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 4 } ================================================ FILE: data-extraction/python-notebook/MailingListDataPrep.py ================================================ #!/usr/bin/env python # coding: utf-8 # Here we can install some packages our notebook needs. We can also install them in our container to speed things up & make it more reliable. But for prototyping this works great! 
# In[ ]: get_ipython().system('pip3 install --upgrade lxml') get_ipython().system('pip3 install --upgrade pandas') get_ipython().system('pip3 install --upgrade scikit-learn') get_ipython().system('pip3 install --upgrade scipy') get_ipython().system('pip3 install --upgrade tables') # We can use Jupyter notebooks just like normal inside of Kubeflow # In[ ]: from datetime import datetime from requests import get from lxml import etree from time import sleep import re import pandas as pd import os # In[ ]: container_registry = "" # Wherever you put your containers # In[ ]: def scrapeMailArchives(mailingList: str, year: int, month: int): baseUrl = "http://mail-archives.apache.org/mod_mbox/%s/%s.mbox/ajax/" % ( mailingList, datetime(year, month, 1).strftime("%Y%m")) r = get(baseUrl + "thread?0") utf8_parser = etree.XMLParser(encoding='utf-8') root = etree.fromstring(r.text.replace('encoding="UTF-8"', ""), parser=utf8_parser) output = [] for message in root.xpath("//message"): _id = message.get("id") linked = message.get("linked") depth = message.get("depth") fr = message.xpath("from")[0].text dt = message.xpath("date")[0].text # todo convert to date subject = message.xpath("subject")[0].text r2 = get(baseUrl + _id) bodyRoot = etree.fromstring(r2.text.replace('encoding="UTF-8"', ""), parser=utf8_parser) body = bodyRoot.xpath("//contents")[0].text record = { "id": _id, "linked": linked, "depth": depth, "from": fr, "dt": dt, "subject": subject, "body": body } output.append(record) sleep(0.1) return output def extract_links(body): link_regex_str = r'(http(|s)://(.*?))([\s\n]|$)' itr = re.finditer(link_regex_str, body, re.MULTILINE) return list(map(lambda elem: elem.group(1), itr)) def extract_domains(links): from urllib.parse import urlparse def extract_domain(link): try: nloc = urlparse(link).netloc # We want to drop www and any extra spaces wtf nloc on the spaces. 
regex_str = r'^(www\.|)(.*?)\s*$' match = re.search(regex_str, nloc) return match.group(2) except: return None return list(map(extract_domain, links)) def contains_python_stack_trace(body): return "Traceback (most recent call last)" in body def contains_probably_java_stack_trace(body): # Look for something based on regex # Tried https://stackoverflow.com/questions/20609134/regular-expression-optional-multiline-java-stacktrace - more msg looking # Tried https://stackoverflow.com/questions/3814327/regular-expression-to-parse-a-log-file-and-find-stacktraces # Yes the compile is per call, but it's cached so w/e import re stack_regex_str = r'^\s*(.+Exception.*):\n(.*\n){0,3}?(\s+at\s+.*\(.*\))+' match = re.search(stack_regex_str, body, re.MULTILINE) return match is not None def contains_exception_in_task(body): # Look for a line along the lines of ERROR Executor: Exception in task return "ERROR Executor: Exception in task" in body # In[ ]: datesToScrape = [(2019, i) for i in range(1, 13)] records = [] for y, m in datesToScrape: print(m, "-", y) records += scrapeMailArchives("spark-dev", y, m) # In[ ]: df = pd.DataFrame(records) df['links'] = df['body'].apply(extract_links) df['containsPythonStackTrace'] = df['body'].apply(contains_python_stack_trace) df['containsJavaStackTrace'] = df['body'].apply( contains_probably_java_stack_trace) df['containsExceptionInTaskBody'] = df['body'].apply( contains_exception_in_task) df['domains'] = df['links'].apply(extract_domains) df['isThreadStart'] = df['depth'] == '0' # In[ ]: from sklearn.feature_extraction.text import TfidfVectorizer bodyV = TfidfVectorizer() # bodyV = TfidfVectorizer(max_features=10000) #if we cared about making this 1:1 w holden's code. bodyFeatures = bodyV.fit_transform(df['body']) domainV = TfidfVectorizer() # domainV = TfidfVectorizer(max_features=100) ## A couple of "None" domains really screwed the pooch on this one. Also, no lists just space seperated domains. 
def makeDomainsAList(d): return ' '.join([a for a in d if not a is None]) domainFeatures = domainV.fit_transform(df['domains'].apply(makeDomainsAList)) # In[ ]: # In[ ]: from scipy.sparse import csr_matrix, hstack data = hstack([ csr_matrix(df[[ 'containsPythonStackTrace', 'containsJavaStackTrace', 'containsExceptionInTaskBody', 'isThreadStart' ]].to_numpy()), bodyFeatures, domainFeatures ]) from sklearn.cluster import KMeans from sklearn.model_selection import train_test_split train, test = train_test_split(data, test_size=0.1) kmeans = KMeans(n_clusters=2, random_state=42).fit(train) train_pred = kmeans.predict(train) test_pred = kmeans.predict(test) # Alternatively, by structuring our code correctly we can take advantage of pipelines # In[ ]: get_ipython().system('pip3 install --upgrade kfp') # In[ ]: import kfp import kfp.dsl as dsl # In[ ]: def download_data(year: int) -> str: from datetime import datetime from lxml import etree from requests import get from time import sleep import json def scrapeMailArchives(mailingList: str, year: int, month: int): baseUrl = "http://mail-archives.apache.org/mod_mbox/%s/%s.mbox/ajax/" % ( mailingList, datetime(year, month, 1).strftime("%Y%m")) r = get(baseUrl + "thread?0") utf8_parser = etree.XMLParser(encoding='utf-8') root = etree.fromstring(r.text.replace('encoding="UTF-8"', ""), parser=utf8_parser) output = [] for message in root.xpath("//message"): _id = message.get("id") linked = message.get("linked") depth = message.get("depth") fr = message.xpath("from")[0].text dt = message.xpath("date")[0].text # todo convert to date subject = message.xpath("subject")[0].text r2 = get(baseUrl + _id) bodyRoot = etree.fromstring(r2.text.replace( 'encoding="UTF-8"', ""), parser=utf8_parser) body = bodyRoot.xpath("//contents")[0].text record = { "id": _id, "linked": linked, "depth": depth, "from": fr, "dt": dt, "subject": subject, "body": body } output.append(record) sleep(0.1) return output datesToScrape = [(year, i) for i in range(1, 2)] records = [] ## todo, go back further for y, m in datesToScrape: print(m, "-", y) records += scrapeMailArchives("spark-dev", y, m) import os output_path = '/data_processing/data.json' with open(output_path, 'w') as f: json.dump(records, f) return output_path # In[ ]: # In[ ]: def download_tld_data() -> str: from requests import get import pandas as pd print("importing io....") import io url = "https://pkgstore.datahub.io/core/country-list/data_csv/data/d7c9d7cfb42cb69f4422dec222dbbaa8/data_csv.csv" print("Getting the url") s = get(url).content print("Converting content") df = pd.read_csv(io.StringIO(s.decode('utf-8'))) print("Writing output") output_path_hdf = '/tld_info/clean_data.hdf' df.to_hdf(output_path_hdf, key="tld") return output_path_hdf # In[ ]: # Now that we have some data, we want to get rid of any "bad" records # In[ ]: #tag::clean_data_fun[] def clean_data(input_path: str) -> str: import json import pandas as pd print("loading records...") with open(input_path, 'r') as f: records = json.load(f) print("records loaded") df = pd.DataFrame(records) # Drop records without a subject, body, or sender cleaned = df.dropna(subset=["subject", "body", "from"]) output_path_hdf = '/data_processing/clean_data.hdf' cleaned.to_hdf(output_path_hdf, key="clean") return output_path_hdf #end::clean_data_fun[] # ### Preparing the data # # Remember earlier when we did that big (and arguably pointless) classification of emails from the Apache Spark mailing list? 
OK, now we're going to do it again, as a "lightweight" Python function in a Kubeflow Pipeline. I hope the irony of the term "lightweight" isn't lost on anyone, because this is pretty blatent abuse of something that was originally presented for conveinience. # # First note, all of the imports and declarations of helper functions MUST be with in the "ligthweight" function. One could argue (and they would probably be correct) that I have two steps here- feature prep and ML, and as such I should split them. I would say that's fair, but I choose not to do so at this time. Perhaps in some scripts later on? # # As has been pointed out so many times before, we assume the reader either arleady understands what is going on with the KMeans clustering, or better yet, doesn't even care. I won't be digging into that right now. What I will point out- and maybe as a note to the editor, the model that is finally saved really ought to be persisted somewhere. If the model isn't saved, then this basically pointless pipeline, is truly pointless. # # Now let's make sure we can read that data in the next step (before we write a big complicated model to do whatever torture to it). # In[ ]: def prepare_features(input_path: str, tld_info_path: str): import re import pandas as pd print("loading records...") df = pd.read_hdf(input_path, key="clean") print("records loaded") print("Loading tld info....") tld_df = pd.read_hdf(tld_info_path, key="tld") print("Loaded tld info") ## Note: "Lightweight" Python Fns mean helper code must be inside the fn. (Bad Form) def extract_links(body): link_regex_str = r'(http(|s)://(.*?))([\s\n]|$)' itr = re.finditer(link_regex_str, body, re.MULTILINE) return list(map(lambda elem: elem.group(1), itr)) def extract_domains(links): from urllib.parse import urlparse def extract_domain(link): try: nloc = urlparse(link).netloc # We want to drop www and any extra spaces wtf nloc on the spaces. regex_str = r'^(www\.|)(.*?)\s*$' match = re.search(regex_str, nloc) return match.group(2) except: return None return list(map(extract_domain, links)) def contains_python_stack_trace(body): return "Traceback (most recent call last)" in body def contains_probably_java_stack_trace(body): # Look for something based on regex # Tried https://stackoverflow.com/questions/20609134/regular-expression-optional-multiline-java-stacktrace - more msg looking # Tried https://stackoverflow.com/questions/3814327/regular-expression-to-parse-a-log-file-and-find-stacktraces # Yes the compile is per call, but it's cached so w/e import re stack_regex_str = r'^\s*(.+Exception.*):\n(.*\n){0,3}?(\s+at\s+.*\(.*\))+' match = re.search(stack_regex_str, body, re.MULTILINE) return match is not None def contains_exception_in_task(body): # Look for a line along the lines of ERROR Executor: Exception in task return "ERROR Executor: Exception in task" in body print(df.shape) df['links'] = df['body'].apply(extract_links) df['containsPythonStackTrace'] = df['body'].apply( contains_python_stack_trace) df['containsJavaStackTrace'] = df['body'].apply( contains_probably_java_stack_trace) df['containsExceptionInTaskBody'] = df['body'].apply( contains_exception_in_task) #tag::local_mailing_list_feature_prep_fun[] df['domains'] = df['links'].apply(extract_domains) df['isThreadStart'] = df['depth'] == '0' # Arguably, you could split building the dataset away from the actual witchcraft. 
from sklearn.feature_extraction.text import TfidfVectorizer bodyV = TfidfVectorizer() bodyFeatures = bodyV.fit_transform(df['body']) domainV = TfidfVectorizer() ## A couple of "None" domains really screwed the pooch on this one.Also, no lists just space seperated domains. def makeDomainsAList(d): return ' '.join([a for a in d if not a is None]) domainFeatures = domainV.fit_transform( df['domains'].apply(makeDomainsAList)) from scipy.sparse import csr_matrix, hstack data = hstack([ csr_matrix(df[[ 'containsPythonStackTrace', 'containsJavaStackTrace', 'containsExceptionInTaskBody', 'isThreadStart' ]].to_numpy()), bodyFeatures, domainFeatures ]) #end::local_mailing_list_feature_prep_fun[] # # ### The Kubeflow Bit. # # Now we can put these two pieces together into a pipeline. Since the data is relatively small we will use a persistent volume put them together. Later on we can add training to this pipeline as well. # # # In[ ]: # Make a volume example. We redo it inside of the pipeline definition because we need to be inside #tag::makeVolume[] dvop = dsl.VolumeOp(name="create_pvc", resource_name="my-pvc-2", size="5Gi", modes=dsl.VOLUME_MODE_RWO) #end::makeVolume[] # In[ ]: get_ipython().system('rm local-data-prep-2.zip') # In[ ]: #tag::makePipeline[] @kfp.dsl.pipeline(name='Simple1', description='Simple1') def my_pipeline_mini(year: int): dvop = dsl.VolumeOp(name="create_pvc", resource_name="my-pvc-2", size="5Gi", modes=dsl.VOLUME_MODE_RWO) tldvop = dsl.VolumeOp(name="create_pvc", resource_name="tld-volume-2", size="100Mi", modes=dsl.VOLUME_MODE_RWO) download_data_op = kfp.components.func_to_container_op( download_data, packages_to_install=['lxml', 'requests']) download_tld_info_op = kfp.components.func_to_container_op( download_tld_data, packages_to_install=['requests', 'pandas>=0.24', 'tables']) clean_data_op = kfp.components.func_to_container_op( clean_data, packages_to_install=['pandas>=0.24', 'tables']) step1 = download_data_op(year).add_pvolumes( {"/data_processing": dvop.volume}) step2 = clean_data_op(input_path=step1.output).add_pvolumes( {"/data_processing": dvop.volume}) step3 = download_tld_info_op().add_pvolumes({"/tld_info": tldvop.volume}) kfp.compiler.Compiler().compile(my_pipeline_mini, 'local-data-prep-2.zip') #end::makePipeline[] # In[ ]: get_ipython().system('rm *.zip') # In[ ]: #tag::longPipeline[] @kfp.dsl.pipeline(name='Simple1', description='Simple1') def my_pipeline2(year: int): dvop = dsl.VolumeOp(name="create_pvc", resource_name="my-pvc-2", size="5Gi", modes=dsl.VOLUME_MODE_RWO) tldvop = dsl.VolumeOp(name="create_pvc", resource_name="tld-volume-2", size="100Mi", modes=dsl.VOLUME_MODE_RWO) download_data_op = kfp.components.func_to_container_op( download_data, packages_to_install=['lxml', 'requests']) download_tld_info_op = kfp.components.func_to_container_op( download_tld_data, packages_to_install=['requests', 'pandas>=0.24', 'tables']) clean_data_op = kfp.components.func_to_container_op( clean_data, packages_to_install=['pandas>=0.24', 'tables']) #tag::add_feature_step[] prepare_features_op = kfp.components.func_to_container_op( prepare_features, packages_to_install=['pandas>=0.24', 'tables', 'scikit-learn']) #end::add_feature_step[] step1 = download_data_op(year).add_pvolumes( {"/data_processing": dvop.volume}) step2 = clean_data_op(input_path=step1.output).add_pvolumes( {"/data_processing": dvop.volume}) step3 = download_tld_info_op().add_pvolumes({"/tld_info": tldvop.volume}) step4 = prepare_features_op(input_path=step2.output, 
tld_info_path=step3.output).add_pvolumes({ "/data_processing": dvop.volume, "/tld_info": tldvop.volume }) #end::longPipeline[] kfp.compiler.Compiler().compile(my_pipeline2, 'local-data-and-feature-prep-2.zip') # In[ ]: client = kfp.Client() # In[ ]: my_experiment = client.create_experiment(name='local-data-prep-test-2') my_run = client.run_pipeline(my_experiment.id, 'local-data-prep', 'local-data-and-feature-prep-2.zip', params={'year': '2019'}) # If we were using Spamassassin or some other library installed in a different base container we would: # In[ ]: # Clean data with custom container #tag::cleanDataWithContainer[] clean_data_op = kfp.components.func_to_container_op( clean_data, base_image="{0}/kubeflow/spammassisan".format(container_registry), packages_to_install=['pandas>=0.24', 'tables']) #end::cleanDataWithContainer[] # In[ ]: def train_func(input_path: str): # NOTE: `data` is the feature matrix built above; in a real pipeline this step should load it from input_path. from sklearn.cluster import KMeans from sklearn.model_selection import train_test_split train, test = train_test_split(data, test_size=0.1) kmeans = KMeans(n_clusters=2, random_state=42).fit(train) train_pred = kmeans.predict(train) test_pred = kmeans.predict(test) print(test_pred) # TODO: Dump the model somewhere you can use it later. # And just like that, we've done it. We've created a Kubeflow Pipeline. # # So let's take a moment to step back and think, "what in the crazy-town-heck is going on here?!". A valid question, and well spotted. Each "step" is going to be creating a container. Maybe I should have noted that earlier when talking about attaching volumes, because if you thought I was doing that to a function, you'd probably think me quite insane. # # But, if you follow this code, create this pipeline, download it and run it, you will see each "step" as a separate container, downloading data, saving it to a `PVC`, then passing some parameters to the next container, which will also load the `PVC`, and so on. # # ### Using Python to Create Containers, but not like a crazy person # # For completeness, let's finally explore how to do all of these things using annotations. # # The trick for the most part is to create a function that returns a `kfp.dsl.ContainerOp`. This will point to an image, note the volumes that need to be mounted, and a number of other things. I've heard tell that people don't always like creating absurdly large and fat functions to do everything in real life, so I leave this here as an aside in case the reader is interested. It's also worth noting that adding the `@kfp.dsl.component` annotation instructs the Kubeflow compiler to turn on static type checking. # # ``` # @kfp.dsl.component # def my_component(my_param): # ... # return kfp.dsl.ContainerOp( # name='My component name', # image='gcr.io/path/to/container/image' # ) # ``` # # Finally, when it comes to incorporating these components into pipelines, you would do something like this: # # ``` # @kfp.dsl.pipeline( # name='My pipeline', # description='My machine learning pipeline' # ) # def my_pipeline(param_1: PipelineParam, param_2: PipelineParam): # my_step = my_component(my_param='a') # ``` # # Which should look exceedingly familiar, as we did something very similar with our `download_data_fn` and `witchcraft_fn`.
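# A minimal sketch putting those two pieces together with the same volume
# setup used above. The image name, command, and function/pipeline names here
# are placeholders, not a container that ships with this repo:


def download_data_component(year, volume):
    # Container-based component: run an image against the shared PVC instead
    # of a "lightweight" Python function.
    return kfp.dsl.ContainerOp(
        name='download-data',
        # Placeholder image: point this at a container you have built and pushed.
        image='gcr.io/my-project/kf-steps/download-data:v1',
        command=['python', '/program.py'],
        arguments=['--year', year],
    ).add_pvolumes({'/data_processing': volume})


@kfp.dsl.pipeline(name='ContainerComponents',
                  description='Container-based variant of the data prep step (sketch)')
def my_container_pipeline(year: int):
    dvop = dsl.VolumeOp(name="create_pvc",
                        resource_name="my-pvc-3",
                        size="5Gi",
                        modes=dsl.VOLUME_MODE_RWO)
    download_data_component(year, dvop.volume)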
# In[ ]: # In[ ]: ================================================ FILE: data-extraction/python-notebook/RunNBDockerfile ================================================ # Since we used Jupyter notebooks to do the first pass extraction, we can try directly use that notebook with # Kubeflow's pre-baked "tensorflow-notebook-image" (based on the Jupyter image) that automatically # launches the notebooks included in the docker file. If you have multiple notebooks # Give them names like: # 01-mything.ipynb # 02-step2.ipynb # as they will be executed in lexiographical order. #tag::spec[] FROM gcr.io/kubeflow-images-public/tensorflow-1.6.0-notebook-cpu COPY ./ /workdir / #end::spec[] #tag::deps[] RUN pip3 install --upgrade lxml pandas #end::deps[] ================================================ FILE: data-extraction/python-spark/Dockerfile ================================================ # Use the spark operator image as base FROM gcr.io/spark-operator/spark-py:v2.4.5 # Install Python requirements COPY requirements.txt / RUN pip3 install -r /requirements.txt # Now you can reference local:///job/my_file.py RUN mkdir -p /job COPY *.py /job ENTRYPOINT ["/opt/entrypoint.sh"] ================================================ FILE: data-extraction/python-spark/LaunchSparkJobs.ipynb ================================================ { "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip3 install --upgrade --user kfp" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import kfp" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import kfp.dsl as dsl" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Use Kubeflow's built in Spark operator\n", "#tag::launch_operator[]\n", "resource = {\n", " \"apiVersion\": \"sparkoperator.k8s.io/v1beta2\",\n", " \"kind\": \"SparkApplication\",\n", " \"metadata\": {\n", " \"name\": \"boop\",\n", " \"namespace\": \"kubeflow\"\n", " },\n", " \"spec\": {\n", " \"type\": \"Python\",\n", " \"mode\": \"cluster\",\n", " \"image\": \"gcr.io/boos-demo-projects-are-rad/kf-steps/kubeflow/myspark\",\n", " \"imagePullPolicy\": \"Always\",\n", " \"mainApplicationFile\": \"local:///job/job.py\", # See the Dockerfile OR use GCS/S3/...\n", " \"sparkVersion\": \"2.4.5\",\n", " \"restartPolicy\": {\n", " \"type\": \"Never\"\n", " },\n", " \"driver\": {\n", " \"cores\": 1, \n", " \"coreLimit\": \"1200m\", \n", " \"memory\": \"512m\", \n", " \"labels\": {\n", " \"version\": \"2.4.5\", \n", " }, \n", " \"serviceAccount\": \"spark-operatoroperator-sa\", # also try spark-operatoroperator-sa\n", " },\n", " \"executor\": {\n", " \"cores\": 1,\n", " \"instances\": 2,\n", " \"memory\": \"512m\" \n", " }, \n", " \"labels\": {\n", " \"version\": \"2.4.5\"\n", " }, \n", " }\n", "}\n", "\n", "@dsl.pipeline(\n", " name=\"local Pipeline\",\n", " description=\"No need to ask why.\"\n", ")\n", "def local_pipeline():\n", "\n", " rop = dsl.ResourceOp(\n", " name=\"boop\",\n", " k8s_resource=resource,\n", " action=\"create\",\n", " success_condition=\"status.applicationState.state == COMPLETED\"\n", " )\n", "#end::launch_operator[]\n", "\n", "import kfp.compiler as compiler\n", "\n", "compiler.Compiler().compile(local_pipeline,\"boop.zip\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "client = kfp.Client()" ] }, { "cell_type": "code", "execution_count": null, 
"metadata": {}, "outputs": [], "source": [ "my_experiment = client.create_experiment(name='boop-test-2')\n", "my_run = client.run_pipeline(my_experiment.id, 'boop-test', \n", " 'boop.zip')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: data-extraction/python-spark/LaunchSparkJobs.py ================================================ #!/usr/bin/env python # coding: utf-8 # In[ ]: get_ipython().system('pip3 install --upgrade --user kfp') # In[ ]: import kfp # In[ ]: import kfp.dsl as dsl # In[ ]: # Use Kubeflow's built in Spark operator #tag::launch_operator[] resource = { "apiVersion": "sparkoperator.k8s.io/v1beta2", "kind": "SparkApplication", "metadata": { "name": "boop", "namespace": "kubeflow" }, "spec": { "type": "Python", "mode": "cluster", "image": "gcr.io/boos-demo-projects-are-rad/kf-steps/kubeflow/myspark", "imagePullPolicy": "Always", # See the Dockerfile OR use GCS/S3/... "mainApplicationFile": "local:///job/job.py", "sparkVersion": "2.4.5", "restartPolicy": { "type": "Never" }, "driver": { "cores": 1, "coreLimit": "1200m", "memory": "512m", "labels": { "version": "2.4.5", }, # also try spark-operatoroperator-sa "serviceAccount": "spark-operatoroperator-sa", }, "executor": { "cores": 1, "instances": 2, "memory": "512m" }, "labels": { "version": "2.4.5" }, } } @dsl.pipeline(name="local Pipeline", description="No need to ask why.") def local_pipeline(): rop = dsl.ResourceOp( name="boop", k8s_resource=resource, action="create", success_condition="status.applicationState.state == COMPLETED") #end::launch_operator[] import kfp.compiler as compiler compiler.Compiler().compile(local_pipeline, "boop.zip") # In[ ]: client = kfp.Client() # In[ ]: my_experiment = client.create_experiment(name='boop-test-2') my_run = client.run_pipeline(my_experiment.id, 'boop-test', 'boop.zip') # In[ ]: ================================================ FILE: data-extraction/python-spark/fake_job.py ================================================ # Yes we need both these imports #tag::imports[] from pyspark.sql import SparkSession from pyspark.sql.functions import col, to_date from pyspark.sql.types import * #end::imports[] from pyspark.sql.catalog import UserDefinedFunction import os #tag::basic_session[] session = SparkSession.builder.getOrCreate() #end::basic_session[] ================================================ FILE: data-extraction/python-spark/requirements.txt ================================================ pandas ================================================ FILE: data-extraction/python-spark-notebook/AddGCSDockerfile ================================================ ARG base FROM $base # Set an enviroment variable for where we are going to put spark ENV SPARK_HOME /opt/spark # Run as root for updates USER root # Add access to GCS RUN rm $SPARK_HOME/jars/guava-1*.jar ADD http://maven-central.storage.googleapis.com/maven2/com/google/guava/guava/23.0/guava-23.0.jar $SPARK_HOME/jars # Add the connector jar needed to access Google Cloud Storage using the Hadoop FileSystem API. 
ADD https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-latest-hadoop3.jar $SPARK_HOME/jars # Add the S3A connector ADD https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.2.0/hadoop-aws-3.2.0.jar $SPARK_HOME/jars ADD https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.732/aws-java-sdk-bundle-1.11.732.jar $SPARK_HOME/jars RUN chmod -R 777 $SPARK_HOME/jars USER 185 ================================================ FILE: data-extraction/python-spark-notebook/AddPython3.6Dockerfile ================================================ ARG base FROM $base USER root # Install libraries we need to build Python 3.6 RUN apt-get update && \ DEBIAN_FRONTEND=noninteractive apt-get install -y -q \ make build-essential libssl-dev zlib1g-dev libbz2-dev \ libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev \ libncursesw5-dev xz-utils tk-dev libffi-dev liblzma-dev && \ rm -rf /var/cache/apt # Install python3.6 to match the notebook RUN cd /tmp && \ wget https://www.python.org/ftp/python/3.6.10/Python-3.6.10.tgz && \ tar -xvf Python-3.6.10.tgz && \ cd Python-3.6.10 && \ ./configure && \ make -j 8 && \ make altinstall RUN python3.6 -m pip install pandas pyarrow==0.11.0 spacy # We depend on Spark being on the PYTHONPATH so no pip install USER 185 ================================================ FILE: data-extraction/python-spark-notebook/Dockerfile ================================================ #tag::include[] # See https://www.kubeflow.org/docs/notebooks/custom-notebook/ ARG base FROM $base ARG sparkversion ARG sparkrelease ARG sparkserver https://www-us.apache.org/dist/spark # We need to run as root for updates USER root # Set an enviroment variable for where we are going to put spark ENV SPARK_HOME /opt/spark # Install java because Spark needs it RUN apt-get update && \ apt-get install -yq openjdk-8-jre openjdk-8-jre-headless && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* # Install Spark RUN set -ex && \ rm /bin/sh && \ ln -sv /bin/bash /bin/sh RUN echo "Setting up $sparkversion" RUN cd /tmp && \ (wget ${sparkserver}/spark-${sparkversion}/${sparkrelease}.tgz) && \ cd /opt && tar -xvf /tmp/${sparkrelease}.tgz && \ rm /tmp/${sparkrelease}.tgz && mv ${sparkrelease} spark && \ cd spark/python && pip install -e . #end::include[] # Add access to GCS RUN rm $SPARK_HOME/jars/guava-1*.jar ADD https://maven-central.storage.googleapis.com/maven2/com/google/guava/guava/23.0/guava-23.0.jar $SPARK_HOME/jars # Add the connector jar needed to access Google Cloud Storage using the Hadoop FileSystem API. ADD https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-latest-hadoop3.jar $SPARK_HOME/jars # Add the S3A connector ADD https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.2.0/hadoop-aws-3.2.0.jar $SPARK_HOME/jars ADD https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.732/aws-java-sdk-bundle-1.11.732.jar $SPARK_HOME/jars #tag::include[] # Fix permissions WORKDIR /opt/spark/work-dir RUN chmod -R 777 /opt/spark/ # Switch the user back, using jovyan as a user is bad but the base image # depends on it. 
USER jovyan # Install some common tools pip install pandas numpy scipy pyarrow #end::include[] ================================================ FILE: data-extraction/python-spark-notebook/SparkMailingListForKF.ipynb ================================================ { "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Yes we need both these imports\n", "from pyspark.sql import SparkSession\n", "from pyspark.sql.functions import col, to_date\n", "from pyspark.sql.types import *\n", "from pyspark.sql.types import StructField, StructType\n", "from pyspark.sql.catalog import UserDefinedFunction\n", "import os" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fs_prefix = \"s3a://kf-book-examples/mailing-lists\" # Create with mc as in ch1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# See https://medium.com/@szinck/setting-up-pyspark-jupyter-and-minio-on-kubeflow-kubernetes-aab98874794f\n", "#tag::configurePythonVersion[]\n", "os.environ[\"PYSPARK_PYTHON\"] = \"python3.6\"\n", "#end::configurePythonVersion[]\n", "session = (\n", " SparkSession.builder\n", " .appName(\"fetchMailingListData\")\n", " .config(\"spark.executor.instances\", \"8\")\n", " .config(\"spark.driver.memoryOverhead\", \"0.25\")\n", " .config(\"spark.executor.memory\", \"6g\")\n", " .config(\"spark.dynamicAllocation.enabled\", \"false\")\n", " .config(\"spark.ui.enabled\", \"true\")\n", " .config(\"spark.kubernetes.container.image\",\n", " \"gcr.io/boos-demo-projects-are-rad/kubeflow/spark-worker/spark-py-36:v3.0.0-preview2-23\")\n", " #tag::notebookSession[]\n", " .config(\"spark.driver.bindAddress\", \"0.0.0.0\")\n", " .config(\"spark.kubernetes.namespace\", \"kubeflow-programmerboo\")\n", " .config(\"spark.master\", \"k8s://https://kubernetes.default\")\n", " .config(\"spark.driver.host\", \n", " \"spark-driver.kubeflow-programmerboo.svc.cluster.local\")\n", " .config(\"spark.kubernetes.executor.annotation.sidecar.istio.io/inject\",\n", " \"false\")\n", " .config(\"spark.driver.port\", \"39235\")\n", " .config(\"spark.blockManager.port\", \"39236\")\n", " #end::notebookSession[]\n", " # If using minio - see https://github.com/minio/cookbook/blob/master/docs/apache-spark-with-minio.md\n", " #tag::minio[]\n", " .config(\"spark.hadoop.fs.s3a.endpoint\",\n", " \"minio-service.kubeflow.svc.cluster.local:9000\")\n", " .config(\"fs.s3a.connection.ssl.enabled\", \"false\")\n", " .config(\"fs.s3a.path.style.access\", \"true\")\n", " # You can also add an account using the minio command as described in chapter 1\n", " .config(\"spark.hadoop.fs.s3a.access.key\", \"minio\")\n", " .config(\"spark.hadoop.fs.s3a.secret.key\", \"minio123\")\n", " #end::minio[]\n", " ).getOrCreate()\n", "sc = session.sparkContext" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Data fetch pipeline: Download mailing list data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "list_name=\"spark-user\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "mailing_list_template=\"http://mail-archives.apache.org/mod_mbox/{list_name}/{date}.mbox\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Generate the possible dates" ] }, { 
"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "start_year=2019 # Change to 2002 once you've verified\n", "end_year=2021\n", "dates = [\"{:d}{:02d}\".format(year, month) for year in range(start_year, end_year) for month in range (1,12)]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def download_emails(date):\n", " import subprocess\n", " from mailbox import mbox\n", " import os\n", " mbox_filename = \"{date}.mbox\".format(date=date)\n", " url=mailing_list_template.format(list_name=list_name,date=date)\n", " subprocess.call([\"wget\", url])\n", " # Skip years that don't exist\n", " if not os.path.exists(mbox_filename):\n", " return []\n", " mail = mbox(mbox_filename.format(date=date), create=False)\n", " # LC the keys since the casing is non-consistent\n", " def get_body(message):\n", " content_type = message.get_content_type()\n", " # Multi-part messages\n", " if message.is_multipart():\n", " return \"\".join(map(get_body, message.get_payload()))\n", " elif \"text\" in content_type or \"html\" in content_type:\n", " return message.get_payload()\n", " else:\n", " return \"\"\n", " def message_to_dict(message):\n", " ret = dict((k.lower(), v) for k, v in message.items())\n", " ret[\"multipart\"] = message.is_multipart()\n", " ret[\"body\"] = get_body(message)\n", " return ret\n", " emails = list(map(message_to_dict, mail.itervalues()))\n", " os.remove(mbox_filename)\n", " return emails" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Optional: test that it works locally\n", "# download_emails(\"202001\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "emails_rdd = sc.parallelize(dates).flatMap(download_emails).cache()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "emails_rdd.count()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "mailing_list_posts_mbox_df = emails_rdd.toDF(sampleRatio=1.0)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cached = mailing_list_posts_mbox_df.cache()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "mailing_list_posts_mbox_df.select(\"list-id\", \"In-Reply-To\").take(5)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "spark_mailing_list_data = mailing_list_posts_mbox_df.filter(\n", " mailing_list_posts_mbox_df[\"list-id\"].contains(\"spark\")).repartition(60).cache()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "spark_mailing_list_data.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "spark_mailing_list_data.printSchema()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def extract_date_from_email_datefield(datefield):\n", " if datefield is None:\n", " return None\n", " from datetime import datetime\n", " import time\n", " import email.utils\n", " parsed_date = email.utils.parsedate(datefield)\n", " return datetime.fromtimestamp(time.mktime((parsed_date)))\n", "\n", "\n", "extract_date_from_email_datefield_udf = UserDefinedFunction(\n", " extract_date_from_email_datefield, StringType(), \"extract_date_from_email_datefield\")\n", "\n", 
"session.catalog._jsparkSession.udf().registerPython(\n", " \"extract_date_from_email_datefield\",\n", " extract_date_from_email_datefield_udf._judf)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "spark_mailing_list_data_with_date = spark_mailing_list_data.select(\n", " \"*\",\n", " extract_date_from_email_datefield_udf(spark_mailing_list_data[\"Date\"]).alias(\"email_date\"))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Manually verify that our date parser is looking ok\n", "spark_mailing_list_data.select(spark_mailing_list_data[\"Date\"],\n", " extract_date_from_email_datefield_udf(spark_mailing_list_data[\"Date\"]).alias(\"email_date\")\n", " ).take(2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#tag::filter_junk[]\n", "def is_ok(post):\n", " # Your special business logic goes here\n", " return True\n", "spark_mailing_list_data_cleaned = spark_mailing_list_data_with_date.filter(is_ok)\n", "#end::filter_junk[]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "mailing_list_posts_in_reply_to = spark_mailing_list_data_cleaned.filter(\n", " spark_mailing_list_data[\"In-Reply-To\"].isNotNull()).alias(\"mailing_list_posts_in_reply_to\")\n", "initial_posts = spark_mailing_list_data_cleaned.filter(\n", " spark_mailing_list_data[\"In-Reply-To\"].isNull()).alias(\"initial_posts\").cache()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# See how many start-of-thread posts we have\n", "initial_posts.count()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ids_in_reply = mailing_list_posts_in_reply_to.select(\"In-Reply-To\", \"message-id\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ids_in_reply.schema" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Ok now it's time to save these\n", "#tag::write_big_data[]\n", "initial_posts.write.format(\"parquet\").mode('overwrite').save(fs_prefix + \"/initial_posts\")\n", "ids_in_reply.write.format(\"parquet\").mode('overwrite').save(fs_prefix + \"/ids_in_reply\")\n", "#end::write_big_data[]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#tag::small_data[]\n", "initial_posts.toPandas()\n", "#end::small_data[]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "session.stop()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: data-extraction/python-spark-notebook/SparkMailingListForKF.py ================================================ #!/usr/bin/env python # coding: utf-8 # In[ ]: # Yes we need both these imports from pyspark.sql import SparkSession from pyspark.sql.functions import col, to_date from pyspark.sql.types import * from pyspark.sql.types import StructField, StructType from 
pyspark.sql.catalog import UserDefinedFunction import os # In[ ]: # In[ ]: fs_prefix = "s3a://kf-book-examples/mailing-lists" # Create with mc as in ch1 # In[ ]: # See https://medium.com/@szinck/setting-up-pyspark-jupyter-and-minio-on-kubeflow-kubernetes-aab98874794f #tag::configurePythonVersion[] os.environ["PYSPARK_PYTHON"] = "python3.6" #end::configurePythonVersion[] session = ( SparkSession.builder.appName("fetchMailingListData").config( "spark.executor.instances", "8").config("spark.driver.memoryOverhead", "0.25").config("spark.executor.memory", "6g").config( "spark.dynamicAllocation.enabled", "false"). config("spark.ui.enabled", "true").config( "spark.kubernetes.container.image", "gcr.io/boos-demo-projects-are-rad/kubeflow/spark-worker/spark-py-36:v3.0.0-preview2-23" ) #tag::notebookSession[] .config("spark.driver.bindAddress", "0.0.0.0").config("spark.kubernetes.namespace", "kubeflow-programmerboo"). config("spark.master", "k8s://https://kubernetes.default").config( "spark.driver.host", "spark-driver.kubeflow-programmerboo.svc.cluster.local").config( "spark.kubernetes.executor.annotation.sidecar.istio.io/inject", "false").config("spark.driver.port", "39235").config("spark.blockManager.port", "39236") #end::notebookSession[] # If using minio - see https://github.com/minio/cookbook/blob/master/docs/apache-spark-with-minio.md #tag::minio[] .config("spark.hadoop.fs.s3a.endpoint", "minio-service.kubeflow.svc.cluster.local:9000").config( "fs.s3a.connection.ssl.enabled", "false").config("fs.s3a.path.style.access", "true") # You can also add an account using the minio command as described in chapter 1 .config("spark.hadoop.fs.s3a.access.key", "minio").config("spark.hadoop.fs.s3a.secret.key", "minio123") #end::minio[] ).getOrCreate() sc = session.sparkContext # In[ ]: # Data fetch pipeline: Download mailing list data # In[ ]: list_name = "spark-user" # In[ ]: mailing_list_template = "http://mail-archives.apache.org/mod_mbox/{list_name}/{date}.mbox" # In[ ]: # Generate the possible dates # In[ ]: start_year = 2019 # Change to 2002 once you've verified end_year = 2021 dates = [ "{:d}{:02d}".format(year, month) for year in range(start_year, end_year) for month in range(1, 12) ] # In[ ]: def download_emails(date): import subprocess from mailbox import mbox import os mbox_filename = "{date}.mbox".format(date=date) url = mailing_list_template.format(list_name=list_name, date=date) subprocess.call(["wget", url]) # Skip years that don't exist if not os.path.exists(mbox_filename): return [] mail = mbox(mbox_filename.format(date=date), create=False) # LC the keys since the casing is non-consistent def get_body(message): content_type = message.get_content_type() # Multi-part messages if message.is_multipart(): return "".join(map(get_body, message.get_payload())) elif "text" in content_type or "html" in content_type: return message.get_payload() else: return "" def message_to_dict(message): ret = dict((k.lower(), v) for k, v in message.items()) ret["multipart"] = message.is_multipart() ret["body"] = get_body(message) return ret emails = list(map(message_to_dict, mail.itervalues())) os.remove(mbox_filename) return emails # In[ ]: # Optional: test that it works locally # download_emails("202001") # In[ ]: emails_rdd = sc.parallelize(dates).flatMap(download_emails).cache() # In[ ]: emails_rdd.count() # In[ ]: mailing_list_posts_mbox_df = emails_rdd.toDF(sampleRatio=1.0) # In[ ]: cached = mailing_list_posts_mbox_df.cache() # In[ ]: mailing_list_posts_mbox_df.select("list-id", "In-Reply-To").take(5) # In[ ]: 
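# Keep only the messages whose "list-id" contains "spark", and repartition to
# 60 partitions so the downstream stages have reasonable parallelism.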
spark_mailing_list_data = mailing_list_posts_mbox_df.filter( mailing_list_posts_mbox_df["list-id"].contains("spark")).repartition( 60).cache() # In[ ]: spark_mailing_list_data.show() # In[ ]: spark_mailing_list_data.printSchema() # In[ ]: def extract_date_from_email_datefield(datefield): if datefield is None: return None from datetime import datetime import time import email.utils parsed_date = email.utils.parsedate(datefield) return datetime.fromtimestamp(time.mktime((parsed_date))) extract_date_from_email_datefield_udf = UserDefinedFunction( extract_date_from_email_datefield, StringType(), "extract_date_from_email_datefield") session.catalog._jsparkSession.udf().registerPython( "extract_date_from_email_datefield", extract_date_from_email_datefield_udf._judf) # In[ ]: spark_mailing_list_data_with_date = spark_mailing_list_data.select( "*", extract_date_from_email_datefield_udf( spark_mailing_list_data["Date"]).alias("email_date")) # In[ ]: # Manually verify that our date parser is looking ok spark_mailing_list_data.select( spark_mailing_list_data["Date"], extract_date_from_email_datefield_udf( spark_mailing_list_data["Date"]).alias("email_date")).take(2) # In[ ]: #tag::filter_junk[] def is_ok(post): # Your special business logic goes here return True spark_mailing_list_data_cleaned = spark_mailing_list_data_with_date.filter( is_ok) #end::filter_junk[] # In[ ]: mailing_list_posts_in_reply_to = spark_mailing_list_data_cleaned.filter( spark_mailing_list_data["In-Reply-To"].isNotNull()).alias( "mailing_list_posts_in_reply_to") initial_posts = spark_mailing_list_data_cleaned.filter( spark_mailing_list_data["In-Reply-To"].isNull()).alias( "initial_posts").cache() # In[ ]: # See how many start-of-thread posts we have initial_posts.count() # In[ ]: ids_in_reply = mailing_list_posts_in_reply_to.select("In-Reply-To", "message-id") # In[ ]: ids_in_reply.schema # In[ ]: # Ok now it's time to save these #tag::write_big_data[] initial_posts.write.format("parquet").mode('overwrite').save(fs_prefix + "/initial_posts") ids_in_reply.write.format("parquet").mode('overwrite').save(fs_prefix + "/ids_in_reply") #end::write_big_data[] # In[ ]: #tag::small_data[] initial_posts.toPandas() #end::small_data[] # In[ ]: session.stop() # In[ ]: ================================================ FILE: data-extraction/python-spark-notebook/build.sh ================================================ #!/bin/bash # Build a notebook with Spark 3 # Note when Spark 3 is fully released we can use gcr.io/spark-operator/spark-py:v3.0.0 set -ex V=${V:-"23"} REPO=${REPO:-"gcr.io/$PROJECT"} TARGET=${TARGET:-"$REPO/kubeflow/spark-notebook:v$V"} KF_BASE=${KF_BASE:-"gcr.io/kubeflow-images-public"} BASE=${BASE:-"$KF_BASE/tensorflow-1.15.2-notebook-cpu:1.0.0"} SPARK_VERSION="3.0.0-preview2" SPARK_RELEASE="spark-3.0.0-preview2-bin-hadoop3.2" SPARK_ARTIFACT="${SPARK_RELEASE}.tgz" docker build . -t "${TARGET}" --build-arg sparkversion="${SPARK_VERSION}" \ --build-arg sparkrelease="${SPARK_RELEASE}" --build-arg base="${BASE}" docker push "${TARGET}" # Build Spark worker image SPARK_TARGET=${SPARK_TARGET:-"$REPO/kubeflow/spark-worker"} if [ ! 
-f /tmp/${SPARK_ARTIFACT} ]; then pushd /tmp/ wget "https://www-us.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_ARTIFACT}" popd fi tmp_dir=$(mktemp -d -t spark-build-XXXXXXXXXX) pushd "${tmp_dir}" tar -xvf "/tmp/${SPARK_ARTIFACT}" pushd "${SPARK_RELEASE}" ./bin/docker-image-tool.sh -r "${SPARK_TARGET}" -t "v${SPARK_VERSION}-${V}" build ./bin/docker-image-tool.sh -r "${SPARK_TARGET}" -t "v${SPARK_VERSION}-${V}" \ -p kubernetes/dockerfiles/spark/bindings/python/Dockerfile \ build ./bin/docker-image-tool.sh -r "${SPARK_TARGET}" -t "v${SPARK_VERSION}-${V}" push popd popd # Add GCS to Spark images docker build --build-arg base="${SPARK_TARGET}/spark:v${SPARK_VERSION}-${V}" \ -t "${SPARK_TARGET}/spark-with-gcs:v${SPARK_VERSION}-$V" -f AddGCSDockerfile . PYSPARK_WITH_GCS="${SPARK_TARGET}/spark-py-with-gcs:v${SPARK_VERSION}-$V" docker build --build-arg base="${SPARK_TARGET}/spark-py:v${SPARK_VERSION}-${V}" \ -t "${PYSPARK_WITH_GCS}" -f AddGCSDockerfile . # Add Python 3.6 to PySpark images for notebook compat SPARK_PY36_WORKER="${SPARK_TARGET}/spark-py-36:v${SPARK_VERSION}-$V" docker build --build-arg base="${PYSPARK_WITH_GCS}" \ -t "${SPARK_PY36_WORKER}" -f AddPython3.6Dockerfile . docker push "${SPARK_TARGET}/spark-with-gcs:v${SPARK_VERSION}-$V" docker push "${SPARK_TARGET}/spark-py-with-gcs:v${SPARK_VERSION}-$V" docker push "${SPARK_PY36_WORKER}" rm -rf "${tmp_dir}" echo "Spark notebook pushed to ${TARGET}" echo "Spark py worker pushed to ${SPARK_PY36_WORKER}" ================================================ FILE: data-extraction/python-spark-notebook/dr.yaml ================================================ apiVersion: networking.istio.io/v1alpha3 kind: DestinationRule metadata: name: default namespace: kubeflow-programmerboo spec: host: '*.svc.cluster.local' trafficPolicy: tls: mode: DISABLE ================================================ FILE: data-extraction/python-spark-notebook/no-saprk-tls.yaml ================================================ apiVersion: "authentication.istio.io/v1alpha1" kind: "Policy" metadata: name: spark-no-tls spec: targets: - name: spark-notebook-0 ================================================ FILE: data-extraction/python-spark-notebook/spark-driver-service.yaml ================================================ apiVersion: v1 kind: Service metadata: name: spark-driver namespace: kubeflow-programmerboo spec: selector: notebook-name: spark-test-2 ports: - port: 39235 targetPort: 39235 name: spark-driver-port - port: 39236 targetPort: 39236 name: spark-block-port ================================================ FILE: data-extraction/python-spark-notebook/virt_service.yaml ================================================ apiVersion: networking.istio.io/v1alpha3 kind: VirtualService metadata: creationTimestamp: "2019-10-14T20:09:50Z" generation: 1 name: notebook-programmerboo-spark-notebook namespace: programmerboo ownerReferences: - apiVersion: kubeflow.org/v1beta1 blockOwnerDeletion: true controller: true kind: Notebook name: spark-notebook uid: 93fb0c0e-eebe-11e9-a454-42010a8e0119 resourceVersion: "3616573" selfLink: /apis/networking.istio.io/v1alpha3/namespaces/programmerboo/virtualservices/notebook-programmerboo-spark-notebook uid: 9404145c-eebe-11e9-a454-42010a8e0119 spec: gateways: - kubeflow/kubeflow-gateway hosts: - '*' http: - match: - uri: prefix: /notebook/programmerboo/spark-notebook rewrite: uri: /notebook/programmerboo/spark-notebook route: - destination: host: spark-notebook.programmerboo.svc.cluster.local port: number: 80 timeout: 300s
================================================ FILE: data-extraction/spark-hello-world/Dockerfile ================================================ ================================================ FILE: data-extraction/spark-hello-world/README.md ================================================ This directory will walk you through running a Spark hello world example with Kubeflow. It (currently) uses the master branch of Kubeflow, unlike the rest of the examples, since Spark support is not yet in a released version. ================================================ FILE: data-extraction/spark-hello-world/hello_world_pipeline.py ================================================ import kfp.dsl as dsl import kfp.gcp as gcp import kfp.onprem as onprem from string import Template import json @dsl.pipeline(name='Simple spark pipeline demo', description='Shows how to use Spark operator inside KF') def spark_hello_world_pipeline(jar_location="gcs://....", tf_job_image="..."): spark_json_template = Template(""" { "apiVersion": "sparkoperator.k8s.io/v1beta2", "kind": "SparkApplication", "metadata": { "name": "spark-frank", "namespace": "kubeflow"}, "spec": { "type": "Scala", "mode": "cluster", "mainApplicationFile": "$jar_location" } }""") spark_json = spark_json_template.substitute({'jar_location': jar_location}) spark_job = json.loads(spark_json) spark_resource = dsl.ResourceOp( name='spark-job', k8s_resource=spark_job, success_condition='status.state == Succeeded') train = dsl.ContainerOp( name='train', image=tf_job_image, ).after(spark_resource) ================================================ FILE: data-extraction/spark-hello-world/lr_demo/.gitignore ================================================ *.class *.log build.sbt_back # sbt specific dist/* target/ lib_managed/ src_managed/ project/boot/ project/plugins/project/ sbt/*.jar mini-complete-example/sbt/*.jar # Scala-IDE specific .scala_dependencies #Emacs *~ #ignore the metastore metastore_db/* # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] # C extensions *.so # Distribution / packaging .env .Python env/ bin/ build/*.jar develop-eggs/ dist/ eggs/ lib/ lib64/ parts/ sdist/ var/ *.egg-info/ .installed.cfg *.egg # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .cache nosetests.xml coverage.xml # Translations *.mo # Mr Developer .mr.developer.cfg .project .pydevproject # Rope .ropeproject # Django stuff: *.log *.pot # Sphinx documentation docs/_build/ # PyCharm files *.idea # emacs stuff # Autoenv .env *~ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] # C extensions *.so # Distribution / packaging .env .Python env/ bin/ build/ develop-eggs/ dist/ eggs/ lib/ lib64/ parts/ sdist/ var/ *.egg-info/ .installed.cfg *.egg # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .cache nosetests.xml coverage.xml # Translations *.mo # Mr Developer .mr.developer.cfg .project .pydevproject # Rope .ropeproject # Django stuff: *.log *.pot # Sphinx documentation docs/_build/ # PyCharm files *.idea # emacs stuff \#*\# \.\#* # Autoenv .env *~ ================================================ FILE: data-extraction/spark-hello-world/lr_demo/.travis.yml ================================================ language: scala # These directories are cached to S3 at the end of the build cache: directories: - $HOME/.ivy2/cache - $HOME/.sbt/boot/ - $HOME/.sbt/launchers - $HOME/build jdk: - oraclejdk8 scala: - 2.11.8
after_success: - bash <(curl -s https://codecov.io/bash) sudo: false ================================================ FILE: data-extraction/spark-hello-world/lr_demo/README.md ================================================ A simple, bad, LR example with Spark. ================================================ FILE: data-extraction/spark-hello-world/lr_demo/build.sbt ================================================ val sparkVersion = "2.3.1" lazy val root = (project in file(".")). settings( inThisBuild(List( organization := "com.introtomlwithkubeflow.spark.demo", scalaVersion := "2.11.12" )), name := "basic.lr", version := "0.0.1", javacOptions ++= Seq("-source", "1.8", "-target", "1.8"), javaOptions ++= Seq("-Xms512M", "-Xmx2048M", "-XX:MaxPermSize=2048M", "-XX:+CMSClassUnloadingEnabled"), scalacOptions ++= Seq("-deprecation", "-unchecked"), parallelExecution in Test := false, fork := true, coverageHighlighting := true, libraryDependencies ++= Seq( "org.apache.spark" %% "spark-streaming" % sparkVersion % "provided", "org.apache.spark" %% "spark-sql" % sparkVersion % "provided", "org.apache.spark" %% "spark-mllib" % sparkVersion % "provided", "ml.combust.mleap" %% "mleap-spark" % "0.13.0", "org.scalatest" %% "scalatest" % "3.0.1" % "test", "org.scalacheck" %% "scalacheck" % "1.13.4" % "test", "com.holdenkarau" %% "spark-testing-base" % "2.3.1_0.11.0" % "test" ), // uses compile classpath for the run task, including "provided" jar (cf http://stackoverflow.com/a/21803413/3827) run in Compile := Defaults.runTask(fullClasspath in Compile, mainClass in (Compile, run), runner in (Compile, run)).evaluated, scalacOptions ++= Seq("-deprecation", "-unchecked"), pomIncludeRepository := { x => false }, resolvers ++= Seq( "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/", "Typesafe repository" at "http://repo.typesafe.com/typesafe/releases/", "Second Typesafe repo" at "http://repo.typesafe.com/typesafe/maven-releases/", Resolver.sonatypeRepo("public") ), pomIncludeRepository := { x => false }, mergeStrategy in assembly := { case m if m.toLowerCase.endsWith("manifest.mf") => MergeStrategy.discard case m if m.toLowerCase.endsWith("io.netty.versions.properties") => MergeStrategy.concat case m if m.toLowerCase.endsWith("services") => MergeStrategy.filterDistinctLines case m if m.toLowerCase.endsWith("git.properties") => MergeStrategy.discard case m if m.toLowerCase.endsWith("reference.conf") => MergeStrategy.filterDistinctLines // Travis is giving a weird error on netty I don't see locally :( case PathList("META-INF", "io.netty.versions.properties") => MergeStrategy.first case PathList("META-INF", "native", xs @ _*) => MergeStrategy.deduplicate case PathList("META-INF", "services", xs @ _ *) => MergeStrategy.filterDistinctLines case PathList("META-INF", xs @ _ *) => MergeStrategy.discard case PathList("javax", "servlet", xs @ _*) => MergeStrategy.last case PathList("org", "apache", xs @ _*) => MergeStrategy.last case PathList("org", "jboss", xs @ _*) => MergeStrategy.last // Start http://queirozf.com/entries/creating-scala-fat-jars-for-spark-on-sbt-with-sbt-assembly-plugin case PathList("org","aopalliance", xs @ _*) => MergeStrategy.last case PathList("javax", "inject", xs @ _*) => MergeStrategy.last case PathList("javax", "servlet", xs @ _*) => MergeStrategy.last case PathList("javax", "activation", xs @ _*) => MergeStrategy.last case PathList("org", "apache", xs @ _*) => MergeStrategy.last case PathList("com", "google", xs @ _*) => MergeStrategy.last case PathList("com", 
"esotericsoftware", xs @ _*) => MergeStrategy.last case PathList("com", "codahale", xs @ _*) => MergeStrategy.last case PathList("com", "yammer", xs @ _*) => MergeStrategy.last // End http://queirozf.com/entries/creating-scala-fat-jars-for-spark-on-sbt-with-sbt-assembly-plugin case PathList("com", "sun", "activation", "registries", xs @ _*) => MergeStrategy.last case PathList("com", "sun", "activation", "viewers", xs @ _*) => MergeStrategy.last case "about.html" => MergeStrategy.rename case "reference.conf" => MergeStrategy.concat case m => val oldStrategy = (assemblyMergeStrategy in assembly).value oldStrategy(m) }, assemblyShadeRules in assembly := Seq( ShadeRule.rename("com.google.protobuf.**" -> "shadeproto.@1").inAll ), // publish settings publishTo := { val nexus = "https://oss.sonatype.org/" if (isSnapshot.value) Some("snapshots" at nexus + "content/repositories/snapshots") else Some("releases" at nexus + "service/local/staging/deploy/maven2") } ) ================================================ FILE: data-extraction/spark-hello-world/lr_demo/project/build.properties ================================================ sbt.version=1.2.8 ================================================ FILE: data-extraction/spark-hello-world/lr_demo/project/plugins.sbt ================================================ addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0") resolvers += "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/" resolvers += "Spark Package Main Repo" at "https://dl.bintray.com/spark-packages/maven" addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.5.1") addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.5") ================================================ FILE: data-extraction/spark-hello-world/lr_demo/sample.csv ================================================ e1,e2,label 1.0, 0.0, 1.0 2.0, 2.1, 2.0 ================================================ FILE: data-extraction/spark-hello-world/lr_demo/sbt/sbt ================================================ #!/bin/bash # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # This script launches sbt for this project. If present it uses the system # version of sbt. If there is no system version of sbt it attempts to download # sbt locally. SBT_VERSION=0.13.15 URL1=http://typesafe.artifactoryonline.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar URL2=http://repo.typesafe.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar JAR=sbt/sbt-launch-${SBT_VERSION}.jar # Download sbt launch jar if it hasn't been downloaded yet if [ ! 
-f ${JAR} ]; then # Download printf "Attempting to fetch sbt\n" set -x JAR_DL=${JAR}.part if hash wget 2>/dev/null; then (wget --progress=bar ${URL1} -O ${JAR_DL} || wget --progress=bar ${URL2} -O ${JAR_DL}) && mv ${JAR_DL} ${JAR} elif hash axel 2>/dev/null; then (axel ${URL1} -o ${JAR_DL} || axel ${URL2} -o ${JAR_DL}) && mv ${JAR_DL} ${JAR} else printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n" exit -1 fi fi if [ ! -f ${JAR} ]; then # We failed to download printf "Our attempt to download sbt locally to ${JAR} failed. Please install sbt manually from http://www.scala-sbt.org/\n" exit -1 fi printf "Launching sbt from ${JAR}\n" java \ -Xmx1200m -XX:MaxPermSize=350m -XX:ReservedCodeCacheSize=256m \ -jar ${JAR} \ "$@" ================================================ FILE: data-extraction/spark-hello-world/lr_demo/src/main/scala/com/introtomlwithkubeflow/spark/demo/lr/TrainingApp.scala ================================================ package com.introtomlwithkubeflow.spark.demo.lr import org.apache.spark.{SparkConf, SparkContext} /** * Use this when submitting the app to a cluster with spark-submit * */ object TrainingApp extends App{ val (inputFile, outputFile) = (args(0), args(1)) // spark-submit command should supply all necessary config elements Runner.run(new SparkConf(), inputFile, outputFile) } object Runner { def run(conf: SparkConf, inputFile: String, outputFile: String): Unit = { val sc = new SparkContext(conf) val trainer = new TrainingPipeline(sc) trainer.train(inputFile, outputFile) } } ================================================ FILE: data-extraction/spark-hello-world/lr_demo/src/main/scala/com/introtomlwithkubeflow/spark/demo/lr/TrainingPipeline.scala ================================================ package com.introtomlwithkubeflow.spark.demo.lr import java.nio.file.{Files, Paths} import ml.combust.bundle.BundleFile import ml.combust.mleap.spark.SparkSupport._ import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.ml.bundle.SparkBundleContext // Actually an mleap import import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.catalyst.ScalaReflection import org.apache.spark.ml.Transformer import org.apache.spark.ml.{Pipeline, PipelineModel} import org.apache.spark.ml.feature._ import org.apache.spark.ml.regression._ import resource._ class TrainingPipeline(sc: SparkContext) { val session = SparkSession.builder().getOrCreate() import session.implicits._ def train(input: String, outputFile: String) = { val trainingData = session.read.format("csv") .option("inferSchema", "true").option("header", "true").load(input) val vectorizer = new VectorAssembler().setInputCols(Array("e1", "e2")).setOutputCol("features") val lr = new GeneralizedLinearRegression() .setFamily("gaussian") .setLink("identity") .setMaxIter(10) .setRegParam(0.3) val pipeline = new Pipeline().setStages(Array( vectorizer, lr)) val fit = pipeline.fit(trainingData) // Serialize the fit pipeline val resultData = fit.transform(trainingData) val localFile = "/tmp/mleap.zip" val localOutput = s"jar:file:${localFile}" val sbc = SparkBundleContext().withDataset(resultData) for(bf <- managed(BundleFile(localOutput))) { fit.writeBundle.save(bf)(sbc).get } // We only have one file so its k val modelBinary = Files.readAllBytes(Paths.get(localFile)) val fs = FileSystem.get(sc.hadoopConfiguration) val out = fs.create(new Path(outputFile)) out.write(modelBinary); 
out.close(); } } ================================================ FILE: data-extraction/spark-hello-world/lr_demo/src/test/scala/com/introtomlwithkubeflow/spark/demo/lr/TrainingPipelineTest.scala ================================================ package com.introtomlwithkubeflow.spark.demo.lr /** * A simple test for the training pipeline */ import com.holdenkarau.spark.testing.{SharedSparkContext, Utils} import org.apache.spark.sql._ import org.scalatest.FunSuite import java.io.File case class MyData(e1: Double, e2: Double, label: Double) class TrainingPipelineTest extends FunSuite with SharedSparkContext { test("smok test"){ val session = SparkSession.builder().getOrCreate() import session.implicits._ val tempDir = Utils.createTempDir() val sampleDataRDD = sc.parallelize(Seq( MyData(1.0, 0.0, 1.0), MyData(2.0, 2.1, 2.0))) val sampleDataDS = session.createDataset(sampleDataRDD) val inputDataLocation = tempDir + "/input" val outputFile = tempDir + "/output.zip" sampleDataDS.write.format("csv").option("header", "true").save(inputDataLocation) val trainingPipeline = new TrainingPipeline(sc) trainingPipeline.train(inputDataLocation, outputFile) assert(new File(outputFile).exists()) } } ================================================ FILE: data-extraction/spark-hello-world/setup.sh ================================================ #!/bin/bash set -ex SPARK_DEMO_DIR=${SPARK_DEMO_DIR:=~/spark_demo_3} SPARK_DEMO_GCS=${SPARK_DEMO_GCS:=gs://boo-spark-kf-demo} # Set up kubeflow mkdir "$SPARK_DEMO_DIR" pushd "$SPARK_DEMO_DIR" pwd wget https://raw.githubusercontent.com/kubeflow/kubeflow/master/scripts/download.sh chmod a+x download.sh KUBEFLOW_VERSION=0.5.0 export KUBEFLOW_VERSION ./download.sh PATH="$(pwd)/scripts":$PATH kfctl.sh init mydemoapp --platform none pushd mydemoapp source env.sh #kfctl.sh generate platform #kfctl.sh apply platform kfctl.sh generate k8s kfctl.sh apply k8s pushd ks_app # Set up the Spark operator ks pkg install kubeflow/spark ks generate spark-operator spark-operator --name=spark-operator ks apply default -c spark-operator # Create a Spark job with the operator (Pi) local_jar_path="local:///opt/spark/examples/jars/spark-examples_2.11-2.3.1.jar" ks generate spark-job spark-pi --name=spark-operator \ --applicationResource="$local_jar_path" \ --mainClass=org.apache.spark.examples.SparkPi ks apply default -c spark-pi # Create a Spark job with the operator to train an LR model pushd "$SPARK_MNIST_DIR/lr_demo" sbt assembly gsutil cp target/scala-2.11/basic.lr-assembly-0.0.1.jar "$SPARK_DEMO_GCS/jars" gsutil cp sample.csv "$SPARK_DEMO_GCS/input/part0.csv" popd ks generate spark-job spark-lr --name=spark-operator \ --applicationResource="$SPARK_DEMO_GCS/jars/basic.lr-assembly-0.0.1.jar" \ --mainClass=com.introtomlwithkubeflow.spark.demo.lr.TrainingApp "$SPARK_DEMO_GCS/input" "$SPARK_DEMO_GCS/output" ks apply default -c spark-lr # Create a Spark job with the operator for data prep on the GitHub data popd ================================================ FILE: data-extraction/spark-hello-world/spark-pi-min.yaml ================================================ apiVersion: "sparkoperator.k8s.io/v1beta2" kind: SparkApplication metadata: name: spark-pi namespace: kubeflow spec: type: Scala mode: cluster image: "gcr.io/spark-operator/spark:v2.4.4" imagePullPolicy: Always mainClass: org.apache.spark.examples.SparkPi mainApplicationFile: "local:///opt/spark/examples/jars/spark-examples_2.11-2.4.4.jar" sparkVersion: "2.4.4" restartPolicy: type: Never volumes: - name: "test-volume" hostPath: 
path: "/tmp" type: Directory driver: cores: 1 coreLimit: "1200m" memory: "512m" labels: version: 2.4.4 volumeMounts: - name: "test-volume" mountPath: "/tmp" executor: cores: 1 instances: 1 memory: "512m" labels: version: 2.4.4 volumeMounts: - name: "test-volume" mountPath: "/tmp" ================================================ FILE: data-extraction/spark-hello-world/spark-pi.yaml ================================================ apiVersion: "sparkoperator.k8s.io/v1beta2" kind: SparkApplication metadata: name: spark-pi namespace: kubeflow spec: type: Scala mode: cluster image: "gcr.io/spark-operator/spark:v2.4.4" imagePullPolicy: Always mainClass: org.apache.spark.examples.SparkPi mainApplicationFile: "local:///opt/spark/examples/jars/spark-examples_2.11-2.4.4.jar" sparkVersion: "2.4.4" restartPolicy: type: Never volumes: - name: "test-volume" hostPath: path: "/tmp" type: Directory driver: cores: 1 coreLimit: "1200m" memory: "512m" labels: version: 2.4.4 serviceAccount: spark-operatoroperator-sa volumeMounts: - name: "test-volume" mountPath: "/tmp" executor: cores: 1 instances: 1 memory: "512m" labels: version: 2.4.4 volumeMounts: - name: "test-volume" mountPath: "/tmp" ================================================ FILE: data-extraction/stack_overflow_questions.bsql ================================================ SELECT ================================================ FILE: data-extraction/tfx/TFDV.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "We start by downloading a specific release of the components because running from master is not a good way to buid \"repetable\" systems" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!wget https://github.com/kubeflow/pipelines/archive/0.2.5.tar.gz" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!tar -xvf 0.2.5.tar.gz" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import kfp" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#tag::loadGCSDLComponent[]\n", "gcs_download_component = kfp.components.load_component_from_file(\n", " \"pipelines-0.2.5/components/google-cloud/storage/download/component.yaml\")\n", "#end::loadGCSDLComponent[]\n", "#tag::loadTFDVAndFriendsComponents[]\n", "tfx_csv_gen = kfp.components.load_component_from_file(\n", " \"pipelines-0.2.5/components/tfx/ExampleGen/CsvExampleGen/component.yaml\")\n", "tfx_statistic_gen = kfp.components.load_component_from_file(\n", " \"pipelines-0.2.5/components/tfx/StatisticsGen/component.yaml\")\n", "tfx_schema_gen = kfp.components.load_component_from_file(\n", " \"pipelines-0.2.5/components/tfx/SchemaGen/component.yaml\")\n", "tfx_example_validator = kfp.components.load_component_from_file(\n", " \"pipelines-0.2.5/components/tfx/ExampleValidator/component.yaml\")\n", "#end::loadTFDVAndFriendsComponents[]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "@kfp.dsl.pipeline(\n", " name='DL',\n", " description='Sample DL pipeline'\n", ")\n", "def pipeline_with_dl():\n", " #tag::dlOp[]\n", " dl_op = gcs_download_component(\n", " gcs_path=\"gs://ml-pipeline-playground/tensorflow-tfx-repo/tfx/components/testdata/external/csv\") # Your path goes here\n", " #end::dlOp[]" ] }, { "cell_type": 
"code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "kfp.compiler.Compiler().compile(pipeline_with_dl, 'dl_pipeline.zip')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "client = kfp.Client()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "my_experiment = client.create_experiment(name='dl')\n", "my_run = client.run_pipeline(my_experiment.id, 'dl', \n", " 'dl_pipeline.zip')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#tag::standaloneTFDVPipeline[]\n", "@kfp.dsl.pipeline(\n", " name='TFDV',\n", " description='TF DV Pipeline'\n", ")\n", "def tfdv_pipeline():\n", " # DL with wget, can use gcs instead as well\n", " data_url = \"https://raw.githubusercontent.com/moorissa/medium/master/items-recommender/data/trx_data.csv\"\n", " #tag::wget[]\n", " fetch = kfp.dsl.ContainerOp(\n", " name='download',\n", " image='busybox',\n", " command=['sh', '-c'],\n", " arguments=[\n", " 'sleep 1;'\n", " 'mkdir -p /tmp/data;'\n", " 'wget '+ data_url +' -O /tmp/data/results.csv'],\n", " file_outputs={'downloaded': '/tmp/data'})\n", " # This expects a directory of inputs not just a single file\n", " #end::wget[]\n", " #tag::csv[]\n", " records_example = tfx_csv_gen(input_base=fetch.output)\n", " #end::csv[]\n", " #tag::stats[]\n", " stats = tfx_statistic_gen(input_data=records_example.output)\n", " #end::stats[]\n", " #tag::schema[]\n", " schema_op = tfx_schema_gen(stats.output)\n", " #end::schema[]\n", " #tag::validate[]\n", " tfx_example_validator(stats=stats.outputs['output'], schema=schema_op.outputs['output'])\n", " #end::validate[]\n", "#end::standaloneTFDVPipeline[]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "kfp.compiler.Compiler().compile(tfdv_pipeline, 'tfdv_pipeline.zip')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "my_experiment = client.create_experiment(name='tfdv_pipeline')\n", "my_run = client.run_pipeline(my_experiment.id, 'tfdv', \n", " 'tfdv_pipeline.zip')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "!pip3 install tfx tensorflow-data-validation" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#tag::importTFDV[]\n", "import tensorflow_data_validation as tfdv\n", "#end::importTFDV[]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "You can download your schema by looking at the inputs/outputs in your pipeline run for the schema gen stage" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#tag::displaySchema{}\n", "schema = tfdv.load_schema_text(\"schema_info_2\")\n", "tfdv.display_schema(schema)\n", "#end::displaySchema[]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#tag::loadTFT[]\n", "tfx_transform = kfp.components.load_component_from_file(\n", " \"pipelines-0.2.5/components/tfx/Transform/component.yaml\")\n", "#end::loadTFT[]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "module_file=\"gcs://\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "@kfp.dsl.pipeline(\n", " name='TFX',\n", " description='TFX pipeline'\n", ")\n", "def tfx_pipeline():\n", " # DL with wget, can use gcs instead as well\n", " fetch = 
kfp.dsl.ContainerOp(\n", " name='download',\n", " image='busybox',\n", " command=['sh', '-c'],\n", " arguments=[\n", " 'sleep 1;'\n", " 'mkdir -p /tmp/data;'\n", " 'wget https://raw.githubusercontent.com/moorissa/medium/master/items-recommender/data/trx_data.csv -O /tmp/data/results.csv'],\n", " file_outputs={'downloaded': '/tmp/data'})\n", " records_example = tfx_csv_gen(input_base=fetch.output)\n", " stats = tfx_statistic_gen(input_data=records_example.output)\n", " schema_op = tfx_schema_gen(stats.output)\n", " tfx_example_validator(stats=stats.outputs['output'], schema=schema_op.outputs['output'])\n", " #tag::tft[]\n", " transformed_output = tfx_transform(\n", " input_data=records_example.output,\n", " schema=schema_op.outputs['output'],\n", " module_file=module_file) # Path to your TFT code on GCS/S3\n", " #end::tft[]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "kfp.compiler.Compiler().compile(tfx_pipeline, 'tfx_pipeline.zip')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "my_experiment = client.create_experiment(name='tfx_pipeline')\n", "my_run = client.run_pipeline(my_experiment.id, 'tfx', \n", " 'tfx_pipeline.zip')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 4 } ================================================ FILE: data-extraction/tfx/TFDV.py ================================================ #!/usr/bin/env python # coding: utf-8 # We start by downloading a specific release of the components because running from master is not a good way to buid "repetable" systems # In[ ]: get_ipython().system( 'wget https://github.com/kubeflow/pipelines/archive/0.2.5.tar.gz') # In[ ]: get_ipython().system('tar -xvf 0.2.5.tar.gz') # In[ ]: import kfp # In[ ]: # In[ ]: #tag::loadGCSDLComponent[] gcs_download_component = kfp.components.load_component_from_file( "pipelines-0.2.5/components/google-cloud/storage/download/component.yaml") #end::loadGCSDLComponent[] #tag::loadTFDVAndFriendsComponents[] tfx_csv_gen = kfp.components.load_component_from_file( "pipelines-0.2.5/components/tfx/ExampleGen/CsvExampleGen/component.yaml") tfx_statistic_gen = kfp.components.load_component_from_file( "pipelines-0.2.5/components/tfx/StatisticsGen/component.yaml") tfx_schema_gen = kfp.components.load_component_from_file( "pipelines-0.2.5/components/tfx/SchemaGen/component.yaml") tfx_example_validator = kfp.components.load_component_from_file( "pipelines-0.2.5/components/tfx/ExampleValidator/component.yaml") #end::loadTFDVAndFriendsComponents[] # In[ ]: @kfp.dsl.pipeline(name='DL', description='Sample DL pipeline') def pipeline_with_dl(): #tag::dlOp[] dl_op = gcs_download_component( gcs_path= "gs://ml-pipeline-playground/tensorflow-tfx-repo/tfx/components/testdata/external/csv" ) # Your path goes here #end::dlOp[] # In[ ]: kfp.compiler.Compiler().compile(pipeline_with_dl, 'dl_pipeline.zip') # In[ ]: client = kfp.Client() # In[ ]: my_experiment = client.create_experiment(name='dl') my_run = client.run_pipeline(my_experiment.id, 'dl', 'dl_pipeline.zip') # In[ ]: #tag::standaloneTFDVPipeline[] 
@kfp.dsl.pipeline(name='TFDV', description='TF DV Pipeline') def tfdv_pipeline(): # DL with wget, can use gcs instead as well data_url = "https://raw.githubusercontent.com/moorissa/medium/master/items-recommender/data/trx_data.csv" #tag::wget[] fetch = kfp.dsl.ContainerOp(name='download', image='busybox', command=['sh', '-c'], arguments=[ 'sleep 1;' 'mkdir -p /tmp/data;' 'wget ' + data_url + ' -O /tmp/data/results.csv' ], file_outputs={'downloaded': '/tmp/data'}) # This expects a directory of inputs not just a single file #end::wget[] #tag::csv[] records_example = tfx_csv_gen(input_base=fetch.output) #end::csv[] #tag::stats[] stats = tfx_statistic_gen(input_data=records_example.output) #end::stats[] #tag::schema[] schema_op = tfx_schema_gen(stats.output) #end::schema[] #tag::validate[] tfx_example_validator(stats=stats.outputs['output'], schema=schema_op.outputs['output']) #end::validate[] #end::standaloneTFDVPipeline[] # In[ ]: kfp.compiler.Compiler().compile(tfdv_pipeline, 'tfdv_pipeline.zip') # In[ ]: my_experiment = client.create_experiment(name='tfdv_pipeline') my_run = client.run_pipeline(my_experiment.id, 'tfdv', 'tfdv_pipeline.zip') # In[ ]: get_ipython().system('pip3 install tfx tensorflow-data-validation') # In[ ]: #tag::importTFDV[] import tensorflow_data_validation as tfdv #end::importTFDV[] # You can download your schema by looking at the inputs/outputs in your pipeline run for the schema gen stage # In[ ]: #tag::displaySchema{} schema = tfdv.load_schema_text("schema_info_2") tfdv.display_schema(schema) #end::displaySchema[] # In[ ]: #tag::loadTFT[] tfx_transform = kfp.components.load_component_from_file( "pipelines-0.2.5/components/tfx/Transform/component.yaml") #end::loadTFT[] # In[ ]: module_file = "gcs://" # In[ ]: @kfp.dsl.pipeline(name='TFX', description='TFX pipeline') def tfx_pipeline(): # DL with wget, can use gcs instead as well fetch = kfp.dsl.ContainerOp( name='download', image='busybox', command=['sh', '-c'], arguments=[ 'sleep 1;' 'mkdir -p /tmp/data;' 'wget https://raw.githubusercontent.com/moorissa/medium/master/items-recommender/data/trx_data.csv -O /tmp/data/results.csv' ], file_outputs={'downloaded': '/tmp/data'}) records_example = tfx_csv_gen(input_base=fetch.output) stats = tfx_statistic_gen(input_data=records_example.output) schema_op = tfx_schema_gen(stats.output) tfx_example_validator(stats=stats.outputs['output'], schema=schema_op.outputs['output']) #tag::tft[] transformed_output = tfx_transform( input_data=records_example.output, schema=schema_op.outputs['output'], module_file=module_file) # Path to your TFT code on GCS/S3 #end::tft[] # In[ ]: kfp.compiler.Compiler().compile(tfx_pipeline, 'tfx_pipeline.zip') # In[ ]: my_experiment = client.create_experiment(name='tfx_pipeline') my_run = client.run_pipeline(my_experiment.id, 'tfx', 'tfx_pipeline.zip') # In[ ]: ================================================ FILE: data-extraction/tfx/install_tfx.sh ================================================ #!/bin/bash #tag::install[] pip3 install tfx tensorflow-data-validation #end::install[] ================================================ FILE: data-extraction/tfx/requirements.txt ================================================ tfx ================================================ FILE: data-extraction/tfx/run_on_dataflow_ex.py ================================================ #tag::example[] generated_output_uri = root_output_uri + kfp.dsl.EXECUTION_ID_PLACEHOLDER beam_pipeline_args = [ '--runner=DataflowRunner', '--project=' + project_id, '--temp_location=' + 
root_output_uri + '/tmp', '--region=' + gcp_region, '--disk_size_gb=50', # Adjust as needed ] records_example = tfx_csv_gen( input_uri=fetch.output, # Must be on distributed storage beam_pipeline_args=beam_pipeline_args, output_examples_uri=generated_output_uri) #end::example[] ================================================ FILE: dev-setup/install-argo.sh ================================================ #!/bin/bash # Download the binary curl -sLO https://github.com/argoproj/argo/releases/download/v2.8.1/argo-linux-amd64 # Make binary executable chmod +x argo-linux-amd64 # Move binary to path mv ./argo-linux-amd64 ~/bin/argo ================================================ FILE: dev-setup/install-kf-pipeline-sdk.sh ================================================ #!/bin/bash # Put us inside a venv pushd /tmp #tag::venv[] virtualenv kfvenv --python python3 source kfvenv/bin/activate #end::venv[] popd #tag::install[] URL=https://storage.googleapis.com/ml-pipeline/release/latest/kfp.tar.gz pip install "${URL}" --upgrade #end::install[] mkdir -p ~/repos pushd ~/repos if [[ ! -d pipelines ]]; then #tag::checkout_sdk[] git clone --single-branch --branch 0.3.0 https://github.com/kubeflow/pipelines.git #end::checkout_sdk[] fi popd ================================================ FILE: dev-setup/install-kf.sh ================================================ #!/bin/bash set -ex #tag::install[] PLATFORM=$(uname) # Either Linux or Darwin export PLATFORM mkdir -p ~/bin # Configuration export KUBEFLOW_TAG=1.0.1 # ^ You can also point this to a different version if you want to try KUBEFLOW_BASE="https://api.github.com/repos/kubeflow/kfctl/releases" # Or just go to https://github.com/kubeflow/kfctl/releases KFCTL_URL=$(curl -s ${KUBEFLOW_BASE} |\ grep http |\ grep "${KUBEFLOW_TAG}" |\ grep -i "${PLATFORM}" |\ cut -d : -f 2,3 |\ tr -d '\" ' ) wget "${KFCTL_URL}" KFCTL_FILE=${KFCTL_URL##*/} tar -xvf "${KFCTL_FILE}" mv ./kfctl ~/bin/ rm "${KFCTL_FILE}" # Recommended: add the scripts directory to your path export PATH=$PATH:~/bin #end::install[] ================================================ FILE: dev-setup/install-kubectl.sh ================================================ #!/bin/bash #tag::ubuntu-kubectl[] sudo snap install kubectl --classic #end::ubuntu-kubectl[] #tag::debian-kubectl[] sudo apt-get update && sudo apt-get install -y apt-transport-https curl -s https://packages.cloud.google.com/apt/doc/apt-key.gpg |\ sudo apt-key add - echo "deb https://apt.kubernetes.io/ kubernetes-xenial main" |\ sudo tee -a /etc/apt/sources.list.d/kubernetes.list sudo apt-get update sudo apt-get install -y kubectl #end::debian-kubectl[] #tag::redhat-kubectl[] cat <<EOF > /etc/yum.repos.d/kubernetes.repo [kubernetes] name=Kubernetes baseurl=https://packages.cloud.google.com/yum/repos/kubernetes-el7-x86_64 enabled=1 gpgcheck=1 repo_gpgcheck=0 gpgkey=https://packages.cloud.google.com/yum/doc/rpm-package-key.gpg EOF yum install -y kubectl #end::redhat-kubectl[] #tag::osx-kubectl[] brew install kubernetes-cli #end::osx-kubectl[] #tag::no-pkg-manager-kubectl[] kubectl_release_base="https://storage.googleapis.com/kubernetes-release" stable_url="$kubectl_release_base/release/stable.txt" KUBECTL_VERSION=$(curl -s "$stable_url") export KUBECTL_VERSION curl -LO "$kubectl_release_base/release/$KUBECTL_VERSION/bin/$PLATFORM/amd64/kubectl" # Now either move kubectl to /usr/bin or add it to your PATH #end::no-pkg-manager-kubectl[] ================================================ FILE: dev-setup/install-kustomize.sh
================================================ #!/bin/bash #tag::kustomize[] PLATFORM=$(uname) # Either Linux or Darwin export PLATFORM mkdir -p ~/bin KUSTOMIZE_URL=$(curl -s \ https://api.github.com/repos/kubernetes-sigs/kustomize/releases/latest |\ grep browser_download |\ grep -i "${PLATFORM}" |\ cut -d '"' -f 4) wget "${KUSTOMIZE_URL}" KUSTOMIZE_FILE=${KUSTOMIZE_URL##*/} tar -xvf "${KUSTOMIZE_FILE}" rm "${KUSTOMIZE_FILE}" mv kustomize ~/bin/kustomize chmod u+x ~/bin/kustomize # Add this + platform/version exports to your bashrc or move the kustomize bin into /usr/bin export PATH=$PATH:"~/bin" #end::kustomize[] ================================================ FILE: dev-setup/install-microk8s.sh ================================================ #!/bin/bash #tag::installmicrok8s[] sudo snap install microk8s --classic #end::installmicrok8s[] #tag::setupmicrok8s[] # Alias the microk8s versions of kubectl and docker so kubeflow uses them # You will want to add this to your bashrc if you intend to use microk8s # generally. alias kubectl="microk8s.kubectl" alias docker="microk8s.docker" ### Fake a Docker registry; skip this if you have a production Docker registry microk8s.enable registry export DOCKER_HOST="unix:///var/snap/microk8s/current/docker.sock" sudo ln -s /var/snap/microk8s/current/docker.sock /var/run/docker.sock sudo ln -s /var/snap/microk8s/common/var/lib/docker /var/lib/docker #end::setupmicrok8s[] #tag::bootstrapwithcanonicallabs[] git clone https://github.com/canonical-labs/kubeflow-tools pushd kubeflow-tools KUBEFLOW_VERSION=0.4.1 ./install-kubeflow.sh #end::bootstrapwithcanonicallabs[] #tag::unaliasmicrok8s[] unalias kubectl unalias docker #end::unaliasmicrok8s[] ================================================ FILE: dev-setup/jsonnet.sh ================================================ #!/bin/bash set -e set -x #tag::snap[] sudo snap install jsonnet #end::snap[] #tag::manual[] export JSONNET_VERSION=0.12.1 wget https://github.com/google/jsonnet/archive/v$JSONNET_VERSION.tar.gz # You will need to add this to your path if it is not already tar -xvf v$JSONNET_VERSION.tar.gz cd jsonnet-$JSONNET_VERSION make # Or otherwise add to your path sudo cp jsonnet /usr/bin/ #end::manual[] ================================================ FILE: feature-prep/README.md ================================================ Feature preparation is the task of converting the data into features suitable for our machine learning algorithms. What makes a "feature" suitable depends on the algorithm used. In the `tft` directory we show feature prep using Tensorflow Transform; a minimal sketch of what that looks like is shown below. At the time of writing, Tensorflow Transform only supports Python 2 and has limited support on non-GCP platforms, but it is rapidly improving in both areas.
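For readers who have not used Tensorflow Transform before, the core of a TFT job is a `preprocessing_fn` that maps raw feature tensors to transformed features; any full-pass operations (vocabularies, means, variances) become an analyze step over the whole dataset. The snippet below is only a minimal sketch with made-up column names (`numeric_feature`, `category`); the actual code for the mailing list example lives in `tft/transform.py`.

```python
# Minimal Tensorflow Transform sketch. The column names are hypothetical and
# are not the ones used in tft/transform.py.
import tensorflow_transform as tft


def preprocessing_fn(inputs):
    """Map raw feature tensors to transformed feature tensors."""
    outputs = {}
    # Full-pass analyze step: compute mean/stddev over the dataset,
    # then scale the feature to z-scores.
    outputs["numeric_feature_scaled"] = tft.scale_to_z_score(
        inputs["numeric_feature"])
    # Full-pass analyze step: build a vocabulary over the dataset and
    # map each string to its integer id.
    outputs["category_id"] = tft.compute_and_apply_vocabulary(
        inputs["category"])
    return outputs
```

A module defining a `preprocessing_fn` like this is what the `module_file` passed to the TFX Transform component (as in the TFDV notebook) points at.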
================================================ FILE: feature-prep/spark/SparkMailingListFeaturePrep.ipynb ================================================ { "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Yes we need both these imports\n", "from pyspark.sql import SparkSession\n", "from pyspark.sql.functions import col, to_date, lit, isnull\n", "from pyspark.sql.types import *\n", "from pyspark.sql.types import StructField, StructType\n", "from pyspark.sql.catalog import UserDefinedFunction\n", "from pyspark.ml.feature import *\n", "from pyspark.ml.pipeline import Pipeline\n", "import os" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fs_prefix = \"s3a://kf-book-examples/mailing-lists\" # Create with mc as in ch1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "os.environ[\"PYSPARK_PYTHON\"] = \"python3.6\"\n", "# See https://medium.com/@szinck/setting-up-pyspark-jupyter-and-minio-on-kubeflow-kubernetes-aab98874794f\n", "session = (SparkSession.builder\n", " .appName(\"processMailingListData\")\n", " .config(\"spark.executor.instances\", \"8\")\n", " .config(\"spark.driver.memoryOverhead\", \"0.25\")\n", " .config(\"spark.executor.memory\", \"10g\")\n", " .config(\"spark.dynamicAllocation.enabled\", \"false\")\n", " .config(\"spark.ui.enabled\", \"true\")\n", " .config(\"spark.kubernetes.container.image\",\n", " \"gcr.io/boos-demo-projects-are-rad/kubeflow/spark-worker/spark-py-36:v3.0.0-preview2-23\")\n", " .config(\"spark.driver.bindAddress\", \"0.0.0.0\")\n", " .config(\"spark.kubernetes.namespace\", \"kubeflow-programmerboo\")\n", " .config(\"spark.master\", \"k8s://https://kubernetes.default\")\n", " .config(\"spark.driver.host\", \"spark-driver.kubeflow-programmerboo.svc.cluster.local\")\n", " .config(\"spark.kubernetes.executor.annotation.sidecar.istio.io/inject\", \"false\")\n", " .config(\"spark.driver.port\", \"39235\")\n", " .config(\"spark.blockManager.port\", \"39236\")\n", " # If using minio - see https://github.com/minio/cookbook/blob/master/docs/apache-spark-with-minio.md\n", " .config(\"spark.hadoop.fs.s3a.endpoint\", \"minio-service.kubeflow.svc.cluster.local:9000\")\n", " .config(\"fs.s3a.connection.ssl.enabled\", \"false\")\n", " .config(\"fs.s3a.path.style.access\", \"true\")\n", " # You can also add an account using the minio command as described in chapter 1\n", " .config(\"spark.hadoop.fs.s3a.access.key\", \"minio\")\n", " .config(\"spark.hadoop.fs.s3a.secret.key\", \"minio123\")\n", " ).getOrCreate()\n", "sc = session.sparkContext" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Load data from the previous stage\n", "#tag::load_data[]\n", "initial_posts = session.read.format(\"parquet\").load(fs_prefix + \"/initial_posts\")\n", "ids_in_reply = session.read.format(\"parquet\").load(fs_prefix + \"/ids_in_reply\")\n", "#end::load_data[]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Load data from the previous stage while checking the schema\n", "#tag::load_with_schema[]\n", "ids_schema = StructType([\n", " StructField(\"In-Reply-To\", StringType(), nullable=True),\n", " StructField(\"message-id\", StringType(),nullable=True)])\n", "ids_in_reply = session.read.format(\"parquet\").schema(ids_schema).load(fs_prefix + 
\"/ids_in_reply\")\n", "#end::load_with_schema[]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Cache the data\n", "initial_posts = initial_posts.alias(\"initial_posts\").cache()\n", "ids_in_reply = ids_in_reply.alias(\"ids_in_reply\").cache()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# We can write random SQL -- although we need to wait for preview 3 cause it was taken out in preview1\n", "#tag::direct_sql[]\n", "#ids_in_reply.registerTempTable(\"cheese\")\n", "#no_text = session.sql(\"select * from cheese where body = '' AND subject = ''\")\n", "#end::direct_sql[]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Drop bad data\n", "#tag::drop_bad_fields[]\n", "initial_posts_count = initial_posts.count()\n", "initial_posts_cleaned = initial_posts.na.drop(how='any', subset=['body', 'from'])\n", "initial_posts_cleaned_count = initial_posts_cleaned.count()\n", "#end::drop_bad_fields[]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "initial_posts.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Start with computing the labels\n", "# Find the initial posts where no one replied\n", "posts_with_replies = (initial_posts.join(\n", " ids_in_reply,\n", " col(\"ids_in_reply.In-Reply-To\") == col(\"initial_posts.Message-Id\"),\n", " \"left_outer\")\n", " .filter(col(\"ids_in_reply.In-Reply-To\").isNotNull())).cache()\n", "posts_with_replies.count()\n", "post_ids_with_replies = (posts_with_replies\n", " .select(col(\"initial_posts.Message-Id\").alias(\"id\"))\n", " .withColumn(\"has_reply\", lit(1.0))).alias(\"post_with_replies\")\n", "\n", "joined_posts = initial_posts.join(\n", " post_ids_with_replies,\n", " col(\"initial_posts.Message-Id\") == col(\"post_with_replies.id\"))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "joined_posts.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "posts_with_labels = joined_posts.na.fill({\"has_reply\": 0.0}).cache()\n", "posts_with_labels.count()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def extract_links(body):\n", " import re\n", " link_regex_str = r'(http(|s)://(.*?))([\\s\\n]|$)'\n", " itr = re.finditer(link_regex_str, body, re.MULTILINE)\n", " return list(map(lambda elem: elem.group(1), itr))\n", "\n", "def extract_domains(links):\n", " from urllib.parse import urlparse\n", " def extract_domain(link):\n", " try:\n", " nloc = urlparse(link).netloc\n", " # We want to drop www and any extra spaces wtf nloc on the spaces.\n", " regex_str = r'^(www\\.|)(.*?)\\s*$'\n", " match = re.search(regex_str, nloc)\n", " return match.group(2)\n", " except:\n", " return None\n", " return list(map(extract_domain, links))\n", "\n", "def contains_python_stack_trace(body):\n", " return \"Traceback (most recent call last)\" in body\n", "\n", "\n", "\n", "def contains_probably_java_stack_trace(body):\n", " # Look for something based on regex\n", " # Tried https://stackoverflow.com/questions/20609134/regular-expression-optional-multiline-java-stacktrace - more msg looking\n", " # Tried https://stackoverflow.com/questions/3814327/regular-expression-to-parse-a-log-file-and-find-stacktraces\n", " # Yes the compile is per call, but it's cached so w/e\n", " import 
re\n", " stack_regex_str = r'^\\s*(.+Exception.*):\\n(.*\\n){0,3}?(\\s+at\\s+.*\\(.*\\))+'\n", " match = re.search(stack_regex_str, body, re.MULTILINE)\n", " return match is not None\n", "\n", "\n", "def contains_exception_in_task(body):\n", " # Look for a line along the lines of ERROR Executor: Exception in task \n", " return \"ERROR Executor: Exception in task\" in body\n", " " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "extract_links_udf = UserDefinedFunction(\n", " extract_links, ArrayType(StringType()), \"extract_links\")\n", "\n", "session.catalog._jsparkSession.udf().registerPython(\n", " \"extract_links\",\n", " extract_links_udf._judf)\n", "\n", "\n", "extract_domains_udf = UserDefinedFunction(\n", " extract_domains, ArrayType(StringType()), \"extract_domains\")\n", "\n", "session.catalog._jsparkSession.udf().registerPython(\n", " \"extract_domains\",\n", " extract_domains_udf._judf)\n", "\n", "\n", "contains_python_stack_trace_udf = UserDefinedFunction(\n", " contains_python_stack_trace, BooleanType(), \"contains_python_stack_trace\")\n", "\n", "session.catalog._jsparkSession.udf().registerPython(\n", " \"contains_python_stack_trace\",\n", " contains_python_stack_trace_udf._judf)\n", "\n", "\n", "contains_probably_java_stack_trace_udf = UserDefinedFunction(\n", " contains_probably_java_stack_trace, BooleanType(), \"contains_probably_java_stack_trace\")\n", "\n", "session.catalog._jsparkSession.udf().registerPython(\n", " \"contains_probably_java_stack_trace\",\n", " contains_probably_java_stack_trace_udf._judf)\n", "\n", "\n", "contains_exception_in_task_udf = UserDefinedFunction(\n", " contains_exception_in_task, BooleanType(), \"contains_exception_in_task\")\n", "\n", "session.catalog._jsparkSession.udf().registerPython(\n", " \"contains_exception_in_task\",\n", " contains_exception_in_task_udf._judf)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We could make this a transformer stage, but I'm lazy so we'll just use a UDF directly." 
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "annotated_spark_mailing_list_data = posts_with_labels.select(\n", " \"*\",\n", " extract_links_udf(posts_with_labels[\"body\"]).alias(\"links_in_email\"),\n", " contains_python_stack_trace_udf(posts_with_labels.body).alias(\"contains_python_stack_trace\").cast(\"double\"),\n", " contains_probably_java_stack_trace_udf(posts_with_labels.body).alias(\"contains_java_stack_trace\").cast(\"double\"),\n", " contains_exception_in_task_udf(posts_with_labels.body).alias(\"contains_exception_in_task\").cast(\"double\"))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "annotated_spark_mailing_list_data.cache()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "annotated_spark_mailing_list_data.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "further_annotated = annotated_spark_mailing_list_data.withColumn(\n", " \"domain_links\",\n", " extract_domains_udf(annotated_spark_mailing_list_data.links_in_email))\n", "# Long story, allow mixed UDF types\n", "further_annotated.cache()\n", "further_annotated.count()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#tag::make_features[]\n", "tokenizer = Tokenizer(inputCol=\"body\", outputCol=\"body_tokens\")\n", "body_hashing = HashingTF(\n", " inputCol=\"body_tokens\", outputCol=\"raw_body_features\",\n", " numFeatures=10000)\n", "body_idf = IDF(\n", " inputCol=\"raw_body_features\", outputCol=\"body_features\")\n", "body_word2Vec = Word2Vec(\n", " vectorSize=5, minCount=0, numPartitions=10,\n", " inputCol=\"body_tokens\", outputCol=\"body_vecs\")\n", "assembler = VectorAssembler(\n", " inputCols=[\n", " \"body_features\", \"body_vecs\", \"contains_python_stack_trace\",\n", " \"contains_java_stack_trace\", \"contains_exception_in_task\"],\n", " outputCol=\"features\")\n", "#end::make_features[]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "featureprep_pipeline = Pipeline(\n", " stages=[tokenizer, body_hashing, body_idf, body_word2Vec, assembler])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "featureprep_pipeline_transformer = featureprep_pipeline.fit(further_annotated)\n", "preped_data = featureprep_pipeline_transformer.transform(further_annotated)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "featureprep_pipeline_transformer.write().save(fs_prefix+\"/feature_prep-2\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "preped_data.write.format(\"parquet\").mode(\"overwrite\").save(fs_prefix+\"/prepared_data\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: feature-prep/spark/SparkMailingListFeaturePrep.py ================================================ #!/usr/bin/env python # coding: utf-8 # In[ ]: # Yes we need both these imports from pyspark.sql import SparkSession from 
pyspark.sql.functions import col, to_date, lit, isnull from pyspark.sql.types import * from pyspark.sql.types import StructField, StructType from pyspark.sql.catalog import UserDefinedFunction from pyspark.ml.feature import * from pyspark.ml.pipeline import Pipeline import os # In[ ]: # In[ ]: fs_prefix = "s3a://kf-book-examples/mailing-lists" # Create with mc as in ch1 # In[ ]: os.environ["PYSPARK_PYTHON"] = "python3.6" # See https://medium.com/@szinck/setting-up-pyspark-jupyter-and-minio-on-kubeflow-kubernetes-aab98874794f session = ( SparkSession.builder.appName("processMailingListData").config( "spark.executor.instances", "8").config("spark.driver.memoryOverhead", "0.25").config("spark.executor.memory", "10g").config( "spark.dynamicAllocation.enabled", "false"). config("spark.ui.enabled", "true").config( "spark.kubernetes.container.image", "gcr.io/boos-demo-projects-are-rad/kubeflow/spark-worker/spark-py-36:v3.0.0-preview2-23" ).config("spark.driver.bindAddress", "0.0.0.0").config("spark.kubernetes.namespace", "kubeflow-programmerboo"). config("spark.master", "k8s://https://kubernetes.default").config( "spark.driver.host", "spark-driver.kubeflow-programmerboo.svc.cluster.local").config( "spark.kubernetes.executor.annotation.sidecar.istio.io/inject", "false").config("spark.driver.port", "39235").config("spark.blockManager.port", "39236") # If using minio - see https://github.com/minio/cookbook/blob/master/docs/apache-spark-with-minio.md .config("spark.hadoop.fs.s3a.endpoint", "minio-service.kubeflow.svc.cluster.local:9000").config( "fs.s3a.connection.ssl.enabled", "false").config("fs.s3a.path.style.access", "true") # You can also add an account using the minio command as described in chapter 1 .config("spark.hadoop.fs.s3a.access.key", "minio").config("spark.hadoop.fs.s3a.secret.key", "minio123")).getOrCreate() sc = session.sparkContext # In[ ]: #Load data from the previous stage #tag::load_data[] initial_posts = session.read.format("parquet").load(fs_prefix + "/initial_posts") ids_in_reply = session.read.format("parquet").load(fs_prefix + "/ids_in_reply") #end::load_data[] # In[ ]: # Load data from the previous stage while checking the schema #tag::load_with_schema[] ids_schema = StructType([ StructField("In-Reply-To", StringType(), nullable=True), StructField("message-id", StringType(), nullable=True) ]) ids_in_reply = session.read.format("parquet").schema(ids_schema).load( fs_prefix + "/ids_in_reply") #end::load_with_schema[] # In[ ]: # Cache the data initial_posts = initial_posts.alias("initial_posts").cache() ids_in_reply = ids_in_reply.alias("ids_in_reply").cache() # In[ ]: # We can write random SQL -- although we need to wait for preview 3 cause it was taken out in preview1 #tag::direct_sql[] #ids_in_reply.registerTempTable("cheese") #no_text = session.sql("select * from cheese where body = '' AND subject = ''") #end::direct_sql[] # In[ ]: # Drop bad data #tag::drop_bad_fields[] initial_posts_count = initial_posts.count() initial_posts_cleaned = initial_posts.na.drop(how='any', subset=['body', 'from']) initial_posts_cleaned_count = initial_posts_cleaned.count() #end::drop_bad_fields[] # In[ ]: initial_posts.show() # In[ ]: # Start with computing the labels # Find the initial posts where no one replied posts_with_replies = (initial_posts.join( ids_in_reply, col("ids_in_reply.In-Reply-To") == col("initial_posts.Message-Id"), "left_outer").filter(col("ids_in_reply.In-Reply-To").isNotNull())).cache() posts_with_replies.count() post_ids_with_replies = (posts_with_replies.select( 
col("initial_posts.Message-Id").alias("id")).withColumn( "has_reply", lit(1.0))).alias("post_with_replies") joined_posts = initial_posts.join( post_ids_with_replies, col("initial_posts.Message-Id") == col("post_with_replies.id")) # In[ ]: joined_posts.show() # In[ ]: posts_with_labels = joined_posts.na.fill({"has_reply": 0.0}).cache() posts_with_labels.count() # In[ ]: def extract_links(body): import re link_regex_str = r'(http(|s)://(.*?))([\s\n]|$)' itr = re.finditer(link_regex_str, body, re.MULTILINE) return list(map(lambda elem: elem.group(1), itr)) def extract_domains(links): from urllib.parse import urlparse def extract_domain(link): try: nloc = urlparse(link).netloc # We want to drop www and any extra spaces wtf nloc on the spaces. regex_str = r'^(www\.|)(.*?)\s*$' match = re.search(regex_str, nloc) return match.group(2) except: return None return list(map(extract_domain, links)) def contains_python_stack_trace(body): return "Traceback (most recent call last)" in body def contains_probably_java_stack_trace(body): # Look for something based on regex # Tried https://stackoverflow.com/questions/20609134/regular-expression-optional-multiline-java-stacktrace - more msg looking # Tried https://stackoverflow.com/questions/3814327/regular-expression-to-parse-a-log-file-and-find-stacktraces # Yes the compile is per call, but it's cached so w/e import re stack_regex_str = r'^\s*(.+Exception.*):\n(.*\n){0,3}?(\s+at\s+.*\(.*\))+' match = re.search(stack_regex_str, body, re.MULTILINE) return match is not None def contains_exception_in_task(body): # Look for a line along the lines of ERROR Executor: Exception in task return "ERROR Executor: Exception in task" in body # In[ ]: extract_links_udf = UserDefinedFunction(extract_links, ArrayType(StringType()), "extract_links") session.catalog._jsparkSession.udf().registerPython("extract_links", extract_links_udf._judf) extract_domains_udf = UserDefinedFunction(extract_domains, ArrayType(StringType()), "extract_domains") session.catalog._jsparkSession.udf().registerPython("extract_domains", extract_domains_udf._judf) contains_python_stack_trace_udf = UserDefinedFunction( contains_python_stack_trace, BooleanType(), "contains_python_stack_trace") session.catalog._jsparkSession.udf().registerPython( "contains_python_stack_trace", contains_python_stack_trace_udf._judf) contains_probably_java_stack_trace_udf = UserDefinedFunction( contains_probably_java_stack_trace, BooleanType(), "contains_probably_java_stack_trace") session.catalog._jsparkSession.udf().registerPython( "contains_probably_java_stack_trace", contains_probably_java_stack_trace_udf._judf) contains_exception_in_task_udf = UserDefinedFunction( contains_exception_in_task, BooleanType(), "contains_exception_in_task") session.catalog._jsparkSession.udf().registerPython( "contains_exception_in_task", contains_exception_in_task_udf._judf) # We could make this a transformer stage, but I'm lazy so we'll just use a UDF directly. 
# In[ ]: annotated_spark_mailing_list_data = posts_with_labels.select( "*", extract_links_udf(posts_with_labels["body"]).alias("links_in_email"), contains_python_stack_trace_udf(posts_with_labels.body).alias( "contains_python_stack_trace").cast("double"), contains_probably_java_stack_trace_udf(posts_with_labels.body).alias( "contains_java_stack_trace").cast("double"), contains_exception_in_task_udf(posts_with_labels.body).alias( "contains_exception_in_task").cast("double")) # In[ ]: annotated_spark_mailing_list_data.cache() # In[ ]: annotated_spark_mailing_list_data.show() # In[ ]: further_annotated = annotated_spark_mailing_list_data.withColumn( "domain_links", extract_domains_udf(annotated_spark_mailing_list_data.links_in_email)) # Long story, allow mixed UDF types further_annotated.cache() further_annotated.count() # In[ ]: #tag::make_features[] tokenizer = Tokenizer(inputCol="body", outputCol="body_tokens") body_hashing = HashingTF(inputCol="body_tokens", outputCol="raw_body_features", numFeatures=10000) body_idf = IDF(inputCol="raw_body_features", outputCol="body_features") body_word2Vec = Word2Vec(vectorSize=5, minCount=0, numPartitions=10, inputCol="body_tokens", outputCol="body_vecs") assembler = VectorAssembler(inputCols=[ "body_features", "body_vecs", "contains_python_stack_trace", "contains_java_stack_trace", "contains_exception_in_task" ], outputCol="features") #end::make_features[] # In[ ]: featureprep_pipeline = Pipeline( stages=[tokenizer, body_hashing, body_idf, body_word2Vec, assembler]) # In[ ]: featureprep_pipeline_transformer = featureprep_pipeline.fit(further_annotated) preped_data = featureprep_pipeline_transformer.transform(further_annotated) # In[ ]: featureprep_pipeline_transformer.write().save(fs_prefix + "/feature_prep-2") # In[ ]: preped_data.write.format("parquet").mode("overwrite").save(fs_prefix + "/prepared_data") ================================================ FILE: feature-prep/tft/requirements.txt ================================================ tfx tensorflow apache-beam ================================================ FILE: feature-prep/tft/transform.py ================================================ #tag::imports[] import tensorflow as tf import tensorflow_transform as tft from tensorflow_transform.tf_metadata import schema_utils #end::imports[] #tag::entry_point[] def preprocessing_fn(inputs): #end::entry_point[] #tag::logic[] outputs = {} # TFT business logic goes here outputs["body_stuff"] = tft.compute_and_apply_vocabulary(inputs["body"], top_k=1000) return outputs #end::logic[] ================================================ FILE: gcp-setup/cloudshell_scrip.sh ================================================ #!/bin/bash # Note: this only works inside of cloudshell! 
#tag::cloudshell_script[] G_SOURCES="https://source.developers.google.com/p" cloudshell_open \ --repo_url "$G_SOURCES/$PROJECTID/r/$PROJECTID-$DEPLOYMENTNAME-config"\ --dir"v$KUBEFLOWVERSION/kubeflow/kf_util" \ --page "editor" \ --tutorial "conn.md" #end::cloudshell_script[] ================================================ FILE: gcp-setup/setup-gcp.sh ================================================ #!/bin/bash #tag::ubuntu[] apt-get install google-cloud-sdk #end::ubuntu[] apt-get remove google-cloud-sdk #tag::general[] curl https://sdk.cloud.google.com | bash #end::general[] #tag::enable_container_apis[] gcloud services enable container.googleapis.com #end::enable_container_apis[] PROJECT_ID="boos-demo-projects-are-rad" #tag::configure_cloud_sdk[] gcloud auth login # Launches a web browser to login with gcloud config set project "$PROJECT_ID" #Project ID is your Google project ID #end::configure_cloud_sdk[] ZONE="us-central1-a" # For TPU access CLUSTER_NAME="ci-cluster" #tag::launch_cluster[] gcloud beta container clusters create $CLUSTER_NAME \ --zone $ZONE \ --machine-type "n1-standard-4" \ --disk-type "pd-standard" \ --disk-size "100" \ --scopes "https://www.googleapis.com/auth/cloud-platform" \ --addons HorizontalPodAutoscaling,HttpLoadBalancing \ --enable-autoupgrade \ --enable-autorepair \ --enable-autoscaling --min-nodes 1 --max-nodes 10 --num-nodes 2 #end::launch_cluster[] #tag::delete_cluster[] gcloud beta container clusters delete $CLUSTER_NAME --zone $ZONE #end::delete_cluster[] ================================================ FILE: kfctl_gcp_iap.v1.0.1.yaml ================================================ apiVersion: kfdef.apps.kubeflow.org/v1 kind: KfDef metadata: namespace: kubeflow spec: applications: - kustomizeConfig: parameters: - name: namespace value: istio-system repoRef: name: manifests path: istio/istio-crds name: istio-crds - kustomizeConfig: parameters: - name: namespace value: istio-system repoRef: name: manifests path: istio/istio-install name: istio-install - kustomizeConfig: parameters: - name: namespace value: istio-system repoRef: name: manifests path: istio/cluster-local-gateway name: cluster-local-gateway - kustomizeConfig: parameters: - name: namespace value: istio-system repoRef: name: manifests path: istio/kfserving-gateway name: kfserving-gateway - kustomizeConfig: parameters: - name: clusterRbacConfig value: 'ON' repoRef: name: manifests path: istio/istio name: istio - kustomizeConfig: repoRef: name: manifests path: application/application-crds name: application-crds - kustomizeConfig: overlays: - application repoRef: name: manifests path: application/application name: application - kustomizeConfig: parameters: - name: namespace value: cert-manager repoRef: name: manifests path: cert-manager/cert-manager-crds name: cert-manager-crds - kustomizeConfig: parameters: - name: namespace value: kube-system repoRef: name: manifests path: cert-manager/cert-manager-kube-system-resources name: cert-manager-kube-system-resources - kustomizeConfig: overlays: - self-signed - application parameters: - name: namespace value: cert-manager repoRef: name: manifests path: cert-manager/cert-manager name: cert-manager - kustomizeConfig: repoRef: name: manifests path: kubeflow-roles name: kubeflow-roles - kustomizeConfig: repoRef: name: manifests path: metacontroller name: metacontroller - kustomizeConfig: overlays: - istio - application repoRef: name: manifests path: argo name: argo - kustomizeConfig: overlays: - istio - application parameters: - name: userid-header value: 
X-Goog-Authenticated-User-Email - name: userid-prefix value: 'accounts.google.com:' repoRef: name: manifests path: common/centraldashboard name: centraldashboard - kustomizeConfig: overlays: - application repoRef: name: manifests path: admission-webhook/webhook name: webhook - kustomizeConfig: overlays: - application parameters: - name: webhookNamePrefix value: admission-webhook- repoRef: name: manifests path: admission-webhook/bootstrap name: bootstrap - kustomizeConfig: overlays: - istio - application parameters: - name: userid-header value: X-Goog-Authenticated-User-Email - name: userid-prefix value: 'accounts.google.com:' repoRef: name: manifests path: jupyter/jupyter-web-app name: jupyter-web-app - kustomizeConfig: overlays: - application repoRef: name: manifests path: spark/spark-operator name: spark-operator - kustomizeConfig: overlays: - istio - application - db repoRef: name: manifests path: metadata name: metadata - kustomizeConfig: overlays: - istio - application parameters: - name: injectGcpCredentials value: 'true' repoRef: name: manifests path: jupyter/notebook-controller name: notebook-controller - kustomizeConfig: overlays: - application repoRef: name: manifests path: pytorch-job/pytorch-job-crds name: pytorch-job-crds - kustomizeConfig: overlays: - application repoRef: name: manifests path: pytorch-job/pytorch-operator name: pytorch-operator - kustomizeConfig: overlays: - application parameters: - name: namespace value: knative-serving repoRef: name: manifests path: knative/knative-serving-crds name: knative-crds - kustomizeConfig: overlays: - application parameters: - name: namespace value: knative-serving repoRef: name: manifests path: knative/knative-serving-install name: knative-install - kustomizeConfig: overlays: - application repoRef: name: manifests path: kfserving/kfserving-crds name: kfserving-crds - kustomizeConfig: overlays: - application repoRef: name: manifests path: kfserving/kfserving-install name: kfserving-install - kustomizeConfig: overlays: - application parameters: - name: usageId value: '7439583937720421527' - name: reportUsage value: 'true' repoRef: name: manifests path: common/spartakus name: spartakus - kustomizeConfig: overlays: - istio repoRef: name: manifests path: tensorboard name: tensorboard - kustomizeConfig: overlays: - application repoRef: name: manifests path: tf-training/tf-job-crds name: tf-job-crds - kustomizeConfig: overlays: - application repoRef: name: manifests path: tf-training/tf-job-operator name: tf-job-operator - kustomizeConfig: overlays: - application repoRef: name: manifests path: katib/katib-crds name: katib-crds - kustomizeConfig: overlays: - application - istio repoRef: name: manifests path: katib/katib-controller name: katib-controller - kustomizeConfig: overlays: - application repoRef: name: manifests path: pipeline/api-service name: api-service - kustomizeConfig: overlays: - minioPd - application parameters: - name: minioPd value: test1-storage-artifact-store - name: minioPvName value: minio-pv - name: minioPvcName value: minio-pv-claim repoRef: name: manifests path: pipeline/minio name: minio - kustomizeConfig: overlays: - mysqlPd - application parameters: - name: mysqlPd value: test1-storage-metadata-store - name: mysqlPvName value: mysql-pv - name: mysqlPvcName value: mysql-pv-claim repoRef: name: manifests path: pipeline/mysql name: mysql - kustomizeConfig: overlays: - application repoRef: name: manifests path: pipeline/persistent-agent name: persistent-agent - kustomizeConfig: overlays: - application repoRef: 
name: manifests path: pipeline/pipelines-runner name: pipelines-runner - kustomizeConfig: overlays: - gcp - istio - application repoRef: name: manifests path: pipeline/pipelines-ui name: pipelines-ui - kustomizeConfig: overlays: - application repoRef: name: manifests path: pipeline/pipelines-viewer name: pipelines-viewer - kustomizeConfig: overlays: - application repoRef: name: manifests path: pipeline/scheduledworkflow name: scheduledworkflow - kustomizeConfig: overlays: - application repoRef: name: manifests path: pipeline/pipeline-visualization-service name: pipeline-visualization-service - kustomizeConfig: overlays: - application repoRef: name: manifests path: gcp/cloud-endpoints name: cloud-endpoints - kustomizeConfig: overlays: - application - istio parameters: - name: admin - name: userid-header value: X-Goog-Authenticated-User-Email - name: userid-prefix value: 'accounts.google.com:' repoRef: name: manifests path: profiles name: profiles - kustomizeConfig: overlays: - application repoRef: name: manifests path: gcp/gpu-driver name: gpu-driver - kustomizeConfig: overlays: - managed-cert - application parameters: - name: namespace value: istio-system - name: ipName value: test1-ip - name: hostname repoRef: name: manifests path: gcp/iap-ingress name: iap-ingress - kustomizeConfig: overlays: - application repoRef: name: manifests path: seldon/seldon-core-operator name: seldon-core-operator - kustomizeConfig: parameters: - name: user - name: profile-name value: anonymous repoRef: name: manifests path: default-install name: default-install plugins: - kind: KfGcpPlugin metadata: creationTimestamp: null name: gcp spec: createPipelinePersistentStorage: true deploymentManagerConfig: repoRef: name: manifests path: gcp/deployment_manager_configs enableWorkloadIdentity: true skipInitProject: true useBasicAuth: false repos: - name: manifests uri: https://github.com/holdenk/manifests/archive/fix-spark-crd.tar.gz version: v1.0.1 ================================================ FILE: pipelines/ControlStructures.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Simple Control structure\n", "\n", "Shows how to use conditional execution" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already up-to-date: kfp in ./.local/lib/python3.6/site-packages (0.2.2.1)\n", "Requirement already satisfied, skipping upgrade: jsonschema>=3.0.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (3.2.0)\n", "Requirement already satisfied, skipping upgrade: google-cloud-storage>=1.13.0 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.25.0)\n", "Requirement already satisfied, skipping upgrade: python-dateutil in /usr/local/lib/python3.6/dist-packages (from kfp) (2.8.1)\n", "Requirement already satisfied, skipping upgrade: google-auth>=1.6.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.11.0)\n", "Requirement already satisfied, skipping upgrade: argo-models==2.2.1a in ./.local/lib/python3.6/site-packages (from kfp) (2.2.1a0)\n", "Requirement already satisfied, skipping upgrade: urllib3<1.25,>=1.15 in ./.local/lib/python3.6/site-packages (from kfp) (1.24.3)\n", "Requirement already satisfied, skipping upgrade: cloudpickle==1.1.1 in ./.local/lib/python3.6/site-packages (from kfp) (1.1.1)\n", "Requirement already satisfied, skipping upgrade: kubernetes<=10.0.0,>=8.0.0 in ./.local/lib/python3.6/site-packages (from kfp) (10.0.0)\n", 
"Requirement already satisfied, skipping upgrade: click==7.0 in ./.local/lib/python3.6/site-packages (from kfp) (7.0)\n", "Requirement already satisfied, skipping upgrade: certifi in /usr/local/lib/python3.6/dist-packages (from kfp) (2019.11.28)\n", "Requirement already satisfied, skipping upgrade: PyYAML in /usr/local/lib/python3.6/dist-packages (from kfp) (5.3)\n", "Requirement already satisfied, skipping upgrade: kfp-server-api<=0.1.40,>=0.1.18 in ./.local/lib/python3.6/site-packages (from kfp) (0.1.40)\n", "Requirement already satisfied, skipping upgrade: six>=1.10 in /usr/lib/python3/dist-packages (from kfp) (1.11.0)\n", "Requirement already satisfied, skipping upgrade: PyJWT>=1.6.4 in ./.local/lib/python3.6/site-packages (from kfp) (1.7.1)\n", "Requirement already satisfied, skipping upgrade: Deprecated in ./.local/lib/python3.6/site-packages (from kfp) (1.2.7)\n", "Requirement already satisfied, skipping upgrade: requests-toolbelt>=0.8.0 in ./.local/lib/python3.6/site-packages (from kfp) (0.9.1)\n", "Requirement already satisfied, skipping upgrade: cryptography>=2.4.2 in ./.local/lib/python3.6/site-packages (from kfp) (2.8)\n", "Requirement already satisfied, skipping upgrade: tabulate==0.8.3 in ./.local/lib/python3.6/site-packages (from kfp) (0.8.3)\n", "Requirement already satisfied, skipping upgrade: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (1.4.0)\n", "Requirement already satisfied, skipping upgrade: attrs>=17.4.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (19.3.0)\n", "Requirement already satisfied, skipping upgrade: pyrsistent>=0.14.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (0.15.7)\n", "Requirement already satisfied, skipping upgrade: setuptools in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (45.1.0)\n", "Requirement already satisfied, skipping upgrade: google-resumable-media<0.6dev,>=0.5.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (0.5.0)\n", "Requirement already satisfied, skipping upgrade: google-cloud-core<2.0dev,>=1.2.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (1.3.0)\n", "Requirement already satisfied, skipping upgrade: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0.0)\n", "Requirement already satisfied, skipping upgrade: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (0.2.8)\n", "Requirement already satisfied, skipping upgrade: rsa<4.1,>=3.1.4 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0)\n", "Requirement already satisfied, skipping upgrade: requests-oauthlib in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (1.3.0)\n", "Requirement already satisfied, skipping upgrade: requests in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (2.22.0)\n", "Requirement already satisfied, skipping upgrade: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (0.57.0)\n", "Requirement already satisfied, skipping upgrade: wrapt<2,>=1.10 in /usr/local/lib/python3.6/dist-packages (from Deprecated->kfp) (1.11.2)\n", "Requirement already satisfied, skipping upgrade: cffi!=1.11.3,>=1.8 in ./.local/lib/python3.6/site-packages (from cryptography>=2.4.2->kfp) (1.14.0)\n", "Requirement 
already satisfied, skipping upgrade: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata; python_version < \"3.8\"->jsonschema>=3.0.1->kfp) (2.1.0)\n", "Requirement already satisfied, skipping upgrade: google-api-core<2.0.0dev,>=1.16.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.16.0)\n", "Requirement already satisfied, skipping upgrade: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.6/dist-packages (from pyasn1-modules>=0.2.1->google-auth>=1.6.1->kfp) (0.4.8)\n", "Requirement already satisfied, skipping upgrade: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib->kubernetes<=10.0.0,>=8.0.0->kfp) (3.1.0)\n", "Requirement already satisfied, skipping upgrade: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->kubernetes<=10.0.0,>=8.0.0->kfp) (3.0.4)\n", "Requirement already satisfied, skipping upgrade: idna<2.9,>=2.5 in /usr/lib/python3/dist-packages (from requests->kubernetes<=10.0.0,>=8.0.0->kfp) (2.6)\n", "Requirement already satisfied, skipping upgrade: pycparser in ./.local/lib/python3.6/site-packages (from cffi!=1.11.3,>=1.8->cryptography>=2.4.2->kfp) (2.19)\n", "Requirement already satisfied, skipping upgrade: protobuf>=3.4.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (3.11.2)\n", "Requirement already satisfied, skipping upgrade: pytz in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (2019.3)\n", "Requirement already satisfied, skipping upgrade: googleapis-common-protos<2.0dev,>=1.6.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.51.0)\n" ] } ], "source": [ "!pip install kfp --upgrade --user\n", "\n", "import kfp\n", "from kfp import dsl\n", "from kfp.components import func_to_container_op, InputPath, OutputPath" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Functions" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "@func_to_container_op\n", "def get_random_int_op(minimum: int, maximum: int) -> int:\n", " \"\"\"Generate a random number between minimum and maximum (inclusive).\"\"\"\n", " import random\n", " result = random.randint(minimum, maximum)\n", " print(result)\n", " return result\n", "\n", "@func_to_container_op\n", "def process_small_op(data: int):\n", " \"\"\"Process small numbers.\"\"\"\n", " print(\"Processing small result\", data)\n", " return\n", "\n", "@func_to_container_op\n", "def process_medium_op(data: int):\n", " \"\"\"Process medium numbers.\"\"\"\n", " print(\"Processing medium result\", data)\n", " return\n", "\n", "@func_to_container_op\n", "def process_large_op(data: int):\n", " \"\"\"Process large numbers.\"\"\"\n", " print(\"Processing large result\", data)\n", " return" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Conditional pipeline" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "@dsl.pipeline(\n", " name='Conditional execution pipeline',\n", " description='Shows how to use dsl.Condition().'\n", ")\n", "def conditional_pipeline():\n", " number = get_random_int_op(0, 100).output\n", " with dsl.Condition(number < 10):\n", " process_small_op(number)\n", " with 
dsl.Condition(number > 10 and number < 50):\n", " process_medium_op(number)\n", " with dsl.Condition(number > 50):\n", " process_large_op(number)\n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Submit the pipeline for execution:" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "Experiment link here" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Run link here" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "RunPipelineResult(run_id=293a92c5-50b2-4a96-bbd4-ebc85106f337)" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "kfp.Client().create_run_from_pipeline_func(conditional_pipeline, arguments={})" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: pipelines/Lightweight Pipeline.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Setup" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already up-to-date: kfp in ./.local/lib/python3.6/site-packages (0.2.2.1)\n", "Requirement already satisfied, skipping upgrade: PyYAML in /usr/local/lib/python3.6/dist-packages (from kfp) (5.3)\n", "Requirement already satisfied, skipping upgrade: requests-toolbelt>=0.8.0 in ./.local/lib/python3.6/site-packages (from kfp) (0.9.1)\n", "Requirement already satisfied, skipping upgrade: click==7.0 in ./.local/lib/python3.6/site-packages (from kfp) (7.0)\n", "Requirement already satisfied, skipping upgrade: Deprecated in ./.local/lib/python3.6/site-packages (from kfp) (1.2.7)\n", "Requirement already satisfied, skipping upgrade: kubernetes<=10.0.0,>=8.0.0 in ./.local/lib/python3.6/site-packages (from kfp) (10.0.0)\n", "Requirement already satisfied, skipping upgrade: six>=1.10 in /usr/lib/python3/dist-packages (from kfp) (1.11.0)\n", "Requirement already satisfied, skipping upgrade: certifi in /usr/local/lib/python3.6/dist-packages (from kfp) (2019.11.28)\n", "Requirement already satisfied, skipping upgrade: google-cloud-storage>=1.13.0 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.25.0)\n", "Requirement already satisfied, skipping upgrade: PyJWT>=1.6.4 in ./.local/lib/python3.6/site-packages (from kfp) (1.7.1)\n", "Requirement already satisfied, skipping upgrade: cryptography>=2.4.2 in ./.local/lib/python3.6/site-packages (from kfp) (2.8)\n", "Requirement already satisfied, skipping upgrade: google-auth>=1.6.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.11.0)\n", "Requirement already satisfied, skipping upgrade: cloudpickle==1.1.1 in ./.local/lib/python3.6/site-packages (from kfp) (1.1.1)\n", "Requirement already satisfied, skipping upgrade: kfp-server-api<=0.1.40,>=0.1.18 in ./.local/lib/python3.6/site-packages (from kfp) (0.1.40)\n", "Requirement already satisfied, skipping upgrade: argo-models==2.2.1a in 
./.local/lib/python3.6/site-packages (from kfp) (2.2.1a0)\n", "Requirement already satisfied, skipping upgrade: tabulate==0.8.3 in ./.local/lib/python3.6/site-packages (from kfp) (0.8.3)\n", "Requirement already satisfied, skipping upgrade: python-dateutil in /usr/local/lib/python3.6/dist-packages (from kfp) (2.8.1)\n", "Requirement already satisfied, skipping upgrade: urllib3<1.25,>=1.15 in ./.local/lib/python3.6/site-packages (from kfp) (1.24.3)\n", "Requirement already satisfied, skipping upgrade: jsonschema>=3.0.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (3.2.0)\n", "Requirement already satisfied, skipping upgrade: requests<3.0.0,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from requests-toolbelt>=0.8.0->kfp) (2.22.0)\n", "Requirement already satisfied, skipping upgrade: wrapt<2,>=1.10 in /usr/local/lib/python3.6/dist-packages (from Deprecated->kfp) (1.11.2)\n", "Requirement already satisfied, skipping upgrade: setuptools>=21.0.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (45.1.0)\n", "Requirement already satisfied, skipping upgrade: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (0.57.0)\n", "Requirement already satisfied, skipping upgrade: requests-oauthlib in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (1.3.0)\n", "Requirement already satisfied, skipping upgrade: google-cloud-core<2.0dev,>=1.2.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (1.3.0)\n", "Requirement already satisfied, skipping upgrade: google-resumable-media<0.6dev,>=0.5.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (0.5.0)\n", "Requirement already satisfied, skipping upgrade: cffi!=1.11.3,>=1.8 in ./.local/lib/python3.6/site-packages (from cryptography>=2.4.2->kfp) (1.14.0)\n", "Requirement already satisfied, skipping upgrade: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0.0)\n", "Requirement already satisfied, skipping upgrade: rsa<4.1,>=3.1.4 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0)\n", "Requirement already satisfied, skipping upgrade: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (0.2.8)\n", "Requirement already satisfied, skipping upgrade: attrs>=17.4.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (19.3.0)\n", "Requirement already satisfied, skipping upgrade: pyrsistent>=0.14.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (0.15.7)\n", "Requirement already satisfied, skipping upgrade: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (1.4.0)\n", "Requirement already satisfied, skipping upgrade: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.0.1->requests-toolbelt>=0.8.0->kfp) (3.0.4)\n", "Requirement already satisfied, skipping upgrade: idna<2.9,>=2.5 in /usr/lib/python3/dist-packages (from requests<3.0.0,>=2.0.1->requests-toolbelt>=0.8.0->kfp) (2.6)\n", "Requirement already satisfied, skipping upgrade: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib->kubernetes<=10.0.0,>=8.0.0->kfp) (3.1.0)\n", "Requirement already satisfied, skipping upgrade: google-api-core<2.0.0dev,>=1.16.0 in /usr/local/lib/python3.6/dist-packages (from 
google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.16.0)\n", "Requirement already satisfied, skipping upgrade: pycparser in ./.local/lib/python3.6/site-packages (from cffi!=1.11.3,>=1.8->cryptography>=2.4.2->kfp) (2.19)\n", "Requirement already satisfied, skipping upgrade: pyasn1>=0.1.3 in /usr/local/lib/python3.6/dist-packages (from rsa<4.1,>=3.1.4->google-auth>=1.6.1->kfp) (0.4.8)\n", "Requirement already satisfied, skipping upgrade: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata; python_version < \"3.8\"->jsonschema>=3.0.1->kfp) (2.1.0)\n", "Requirement already satisfied, skipping upgrade: googleapis-common-protos<2.0dev,>=1.6.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.51.0)\n", "Requirement already satisfied, skipping upgrade: pytz in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (2019.3)\n", "Requirement already satisfied, skipping upgrade: protobuf>=3.4.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (3.11.2)\n" ] } ], "source": [ "!pip install kfp --upgrade --user\n", "\n", "import kfp \n", "from kfp import compiler\n", "import kfp.dsl as dsl\n", "import kfp.notebook\n", "import kfp.components as comp\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Simple function that just add two numbers:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "#Define a Python function\n", "def add(a: float, b: float) -> float:\n", " '''Calculates sum of two arguments'''\n", " return a + b" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Convert the function to a pipeline operation" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "add_op = comp.func_to_container_op(add)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "A bit more advanced function which demonstrates how to use imports, helper functions and produce multiple outputs." 
] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from typing import NamedTuple\n", "def my_divmod(dividend: float, divisor:float) -> NamedTuple('MyDivmodOutput', [('quotient', float), ('remainder', float)]):\n", " '''Divides two numbers and calculate the quotient and remainder'''\n", " #Imports inside a component function:\n", " import numpy as np\n", "\n", " #This function demonstrates how to use nested functions inside a component function:\n", " def divmod_helper(dividend, divisor):\n", " return np.divmod(dividend, divisor)\n", "\n", " (quotient, remainder) = divmod_helper(dividend, divisor)\n", "\n", " from collections import namedtuple\n", " divmod_output = namedtuple('MyDivmodOutput', ['quotient', 'remainder'])\n", " return divmod_output(quotient, remainder)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Test running the python function directly" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "MyDivmodOutput(quotient=14, remainder=2)" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "my_divmod(100, 7)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Convert the function to a pipeline operation" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "divmod_op = comp.func_to_container_op(my_divmod, base_image='tensorflow/tensorflow:1.14.0-py3')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Define the pipeline\n", "Pipeline function has to be decorated with the @dsl.pipeline decorator" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "@dsl.pipeline(\n", " name='Calculation pipeline',\n", " description='A toy pipeline that performs arithmetic calculations.'\n", ")\n", "def calc_pipeline(\n", " a='a',\n", " b='7',\n", " c='17',\n", "):\n", " #Passing pipeline parameter and a constant value as operation arguments\n", " add_task = add_op(a, 4) #Returns a dsl.ContainerOp class instance. 
\n", " \n", " #Passing a task output reference as operation arguments\n", " #For an operation with a single return value, the output reference can be accessed using `task.output` or `task.outputs['output_name']` syntax\n", " divmod_task = divmod_op(add_task.output, b)\n", "\n", " #For an operation with a multiple return values, the output references can be accessed using `task.outputs['output_name']` syntax\n", " result_task = add_op(divmod_task.outputs['quotient'], c)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Submit the pipeline for execution" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "Experiment link here" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "Run link here" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "RunPipelineResult(run_id=87276776-0c3a-4d4e-99d0-4563b7f42fa5)" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "client = kfp.Client()\n", "\n", "#Specify pipeline argument values\n", "arguments = {'a': '7', 'b': '8'}\n", "\n", "#Submit a pipeline run\n", "client.create_run_from_pipeline_func(calc_pipeline, arguments=arguments)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: pipelines/RecommenderPipeline.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Kubeflow pipeline\n", "This is a fairly simple pipeline, containing sequential steps:\n", "\n", "1. Update data - implemented by lightbend/recommender-data-update-publisher:0.2 image\n", "2. Run model training. Ideally we would run TFJob, but due to the current limitations for pipelines, we will directly use an image implementing training lightbend/ml-tf-recommender:0.1\n", "3. 
Update serving model - implemented by lightbend/recommender-model-publisher:0.2" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Setup" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already up-to-date: kubernetes in ./.local/lib/python3.6/site-packages (10.0.1)\n", "Requirement already satisfied, skipping upgrade: pyyaml>=3.12 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (5.3)\n", "Requirement already satisfied, skipping upgrade: six>=1.9.0 in /usr/lib/python3/dist-packages (from kubernetes) (1.11.0)\n", "Requirement already satisfied, skipping upgrade: urllib3>=1.24.2 in ./.local/lib/python3.6/site-packages (from kubernetes) (1.24.3)\n", "Requirement already satisfied, skipping upgrade: requests-oauthlib in /usr/local/lib/python3.6/dist-packages (from kubernetes) (1.3.0)\n", "Requirement already satisfied, skipping upgrade: certifi>=14.05.14 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (2019.11.28)\n", "Requirement already satisfied, skipping upgrade: python-dateutil>=2.5.3 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (2.8.1)\n", "Requirement already satisfied, skipping upgrade: setuptools>=21.0.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (45.1.0)\n", "Requirement already satisfied, skipping upgrade: requests in /usr/local/lib/python3.6/dist-packages (from kubernetes) (2.22.0)\n", "Requirement already satisfied, skipping upgrade: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (0.57.0)\n", "Requirement already satisfied, skipping upgrade: google-auth>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (1.11.0)\n", "Requirement already satisfied, skipping upgrade: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib->kubernetes) (3.1.0)\n", "Requirement already satisfied, skipping upgrade: idna<2.9,>=2.5 in /usr/lib/python3/dist-packages (from requests->kubernetes) (2.6)\n", "Requirement already satisfied, skipping upgrade: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->kubernetes) (3.0.4)\n", "Requirement already satisfied, skipping upgrade: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.0.1->kubernetes) (0.2.8)\n", "Requirement already satisfied, skipping upgrade: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.0.1->kubernetes) (4.0.0)\n", "Requirement already satisfied, skipping upgrade: rsa<4.1,>=3.1.4 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.0.1->kubernetes) (4.0)\n", "Requirement already satisfied, skipping upgrade: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.6/dist-packages (from pyasn1-modules>=0.2.1->google-auth>=1.0.1->kubernetes) (0.4.8)\n", "Requirement already up-to-date: kfp in ./.local/lib/python3.6/site-packages (0.2.2.1)\n", "Requirement already satisfied, skipping upgrade: PyJWT>=1.6.4 in ./.local/lib/python3.6/site-packages (from kfp) (1.7.1)\n", "Requirement already satisfied, skipping upgrade: requests-toolbelt>=0.8.0 in ./.local/lib/python3.6/site-packages (from kfp) (0.9.1)\n", "Requirement already satisfied, skipping upgrade: python-dateutil in /usr/local/lib/python3.6/dist-packages (from kfp) (2.8.1)\n", "Requirement already satisfied, skipping upgrade: PyYAML in /usr/local/lib/python3.6/dist-packages (from kfp) (5.3)\n", "Requirement already 
satisfied, skipping upgrade: kfp-server-api<=0.1.40,>=0.1.18 in ./.local/lib/python3.6/site-packages (from kfp) (0.1.40)\n", "Requirement already satisfied, skipping upgrade: google-cloud-storage>=1.13.0 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.25.0)\n", "Requirement already satisfied, skipping upgrade: jsonschema>=3.0.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (3.2.0)\n", "Requirement already satisfied, skipping upgrade: Deprecated in ./.local/lib/python3.6/site-packages (from kfp) (1.2.7)\n", "Requirement already satisfied, skipping upgrade: google-auth>=1.6.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.11.0)\n", "Collecting kubernetes<=10.0.0,>=8.0.0\n", " Using cached kubernetes-10.0.0-py2.py3-none-any.whl (1.5 MB)\n", "Requirement already satisfied, skipping upgrade: argo-models==2.2.1a in ./.local/lib/python3.6/site-packages (from kfp) (2.2.1a0)\n", "Requirement already satisfied, skipping upgrade: urllib3<1.25,>=1.15 in ./.local/lib/python3.6/site-packages (from kfp) (1.24.3)\n", "Requirement already satisfied, skipping upgrade: certifi in /usr/local/lib/python3.6/dist-packages (from kfp) (2019.11.28)\n", "Requirement already satisfied, skipping upgrade: tabulate==0.8.3 in ./.local/lib/python3.6/site-packages (from kfp) (0.8.3)\n", "Requirement already satisfied, skipping upgrade: click==7.0 in ./.local/lib/python3.6/site-packages (from kfp) (7.0)\n", "Requirement already satisfied, skipping upgrade: cloudpickle==1.1.1 in ./.local/lib/python3.6/site-packages (from kfp) (1.1.1)\n", "Requirement already satisfied, skipping upgrade: six>=1.10 in /usr/lib/python3/dist-packages (from kfp) (1.11.0)\n", "Requirement already satisfied, skipping upgrade: cryptography>=2.4.2 in ./.local/lib/python3.6/site-packages (from kfp) (2.8)\n", "Requirement already satisfied, skipping upgrade: requests<3.0.0,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from requests-toolbelt>=0.8.0->kfp) (2.22.0)\n", "Requirement already satisfied, skipping upgrade: google-resumable-media<0.6dev,>=0.5.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (0.5.0)\n", "Requirement already satisfied, skipping upgrade: google-cloud-core<2.0dev,>=1.2.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (1.3.0)\n", "Requirement already satisfied, skipping upgrade: setuptools in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (45.1.0)\n", "Requirement already satisfied, skipping upgrade: attrs>=17.4.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (19.3.0)\n", "Requirement already satisfied, skipping upgrade: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (1.4.0)\n", "Requirement already satisfied, skipping upgrade: pyrsistent>=0.14.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (0.15.7)\n", "Requirement already satisfied, skipping upgrade: wrapt<2,>=1.10 in /usr/local/lib/python3.6/dist-packages (from Deprecated->kfp) (1.11.2)\n", "Requirement already satisfied, skipping upgrade: rsa<4.1,>=3.1.4 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0)\n", "Requirement already satisfied, skipping upgrade: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0.0)\n", "Requirement already satisfied, skipping upgrade: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) 
(0.2.8)\n", "Requirement already satisfied, skipping upgrade: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (0.57.0)\n", "Requirement already satisfied, skipping upgrade: requests-oauthlib in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (1.3.0)\n", "Requirement already satisfied, skipping upgrade: cffi!=1.11.3,>=1.8 in ./.local/lib/python3.6/site-packages (from cryptography>=2.4.2->kfp) (1.14.0)\n", "Requirement already satisfied, skipping upgrade: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.0.1->requests-toolbelt>=0.8.0->kfp) (3.0.4)\n", "Requirement already satisfied, skipping upgrade: idna<2.9,>=2.5 in /usr/lib/python3/dist-packages (from requests<3.0.0,>=2.0.1->requests-toolbelt>=0.8.0->kfp) (2.6)\n", "Requirement already satisfied, skipping upgrade: google-api-core<2.0.0dev,>=1.16.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.16.0)\n", "Requirement already satisfied, skipping upgrade: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata; python_version < \"3.8\"->jsonschema>=3.0.1->kfp) (2.1.0)\n", "Requirement already satisfied, skipping upgrade: pyasn1>=0.1.3 in /usr/local/lib/python3.6/dist-packages (from rsa<4.1,>=3.1.4->google-auth>=1.6.1->kfp) (0.4.8)\n", "Requirement already satisfied, skipping upgrade: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib->kubernetes<=10.0.0,>=8.0.0->kfp) (3.1.0)\n", "Requirement already satisfied, skipping upgrade: pycparser in ./.local/lib/python3.6/site-packages (from cffi!=1.11.3,>=1.8->cryptography>=2.4.2->kfp) (2.19)\n", "Requirement already satisfied, skipping upgrade: googleapis-common-protos<2.0dev,>=1.6.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.51.0)\n", "Requirement already satisfied, skipping upgrade: protobuf>=3.4.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (3.11.2)\n", "Requirement already satisfied, skipping upgrade: pytz in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (2019.3)\n", "Installing collected packages: kubernetes\n", " Attempting uninstall: kubernetes\n", " Found existing installation: kubernetes 10.0.1\n", " Uninstalling kubernetes-10.0.1:\n", " Successfully uninstalled kubernetes-10.0.1\n", "Successfully installed kubernetes-10.0.0\n" ] } ], "source": [ "!pip install kubernetes --upgrade --user\n", "!pip install kfp --upgrade --user\n", "\n", "\n", "import kfp # the Pipelines SDK. This library is included with the notebook image.\n", "from kfp import compiler\n", "import kfp.dsl as dsl\n", "import kfp.notebook\n", "from kubernetes import client as k8s_client" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Create/Get an Experiment in the Kubeflow Pipeline System\n", "The Kubeflow Pipeline system requires an \"Experiment\" to group pipeline runs. You can create a new experiment, or call client.list_experiments() to get existing ones." 
] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "client = kfp.Client()\n", "client.list_experiments()\n", "#exp = client.create_experiment(name='mdupdate')\n", "exp = client.get_experiment(experiment_name ='mdupdate')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Define a Pipeline\n", "Authoring a pipeline is like authoring a normal Python function. The pipeline function describes the topology of the pipeline.\n", "\n", "Each step in the pipeline is typically a ContainerOp --- a simple class or function describing how to interact with a docker container image. In the pipeline, all the container images referenced in the pipeline are already built." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "@dsl.pipeline(\n", " name='Recommender model update',\n", " description='Demonstrate usage of pipelines for multi-step model update'\n", ")\n", "def recommender_pipeline():\n", " # Load new data\n", " data = dsl.ContainerOp(\n", " name='updatedata',\n", " image='lightbend/recommender-data-update-publisher:0.2') \\\n", " .add_env_variable(k8s_client.V1EnvVar(name='MINIO_URL',value='http://minio-service.kubeflow.svc.cluster.local:9000')) \\\n", " .add_env_variable(k8s_client.V1EnvVar(name='MINIO_KEY', value='minio')) \\\n", " .add_env_variable(k8s_client.V1EnvVar(name='MINIO_SECRET', value='minio123'))\n", " # Train the model\n", " train = dsl.ContainerOp(\n", " name='trainmodel',\n", " image='lightbend/ml-tf-recommender:0.1') \\\n", " .add_env_variable(k8s_client.V1EnvVar(name='MINIO_URL',value='minio-service.kubeflow.svc.cluster.local:9000')) \\\n", " .add_env_variable(k8s_client.V1EnvVar(name='MINIO_KEY', value='minio')) \\\n", " .add_env_variable(k8s_client.V1EnvVar(name='MINIO_SECRET', value='minio123'))\n", " train.after(data)\n", " # Publish new model model\n", " publish = dsl.ContainerOp(\n", " name='publishmodel',\n", " image='lightbend/recommender-model-publisher:0.2') \\\n", " .add_env_variable(k8s_client.V1EnvVar(name='MINIO_URL',value='http://minio-service.kubeflow.svc.cluster.local:9000')) \\\n", " .add_env_variable(k8s_client.V1EnvVar(name='MINIO_KEY', value='minio')) \\\n", " .add_env_variable(k8s_client.V1EnvVar(name='MINIO_SECRET', value='minio123')) \\\n", " .add_env_variable(k8s_client.V1EnvVar(name='KAFKA_BROKERS', value='cloudflow-kafka-brokers.cloudflow.svc.cluster.local:9092')) \\\n", " .add_env_variable(k8s_client.V1EnvVar(name='DEFAULT_RECOMMENDER_URL', value='http://recommendermodelserver.kubeflow.svc.cluster.local:8501')) \\\n", " .add_env_variable(k8s_client.V1EnvVar(name='ALTERNATIVE_RECOMMENDER_URL', value='http://recommendermodelserver1.kubeflow.svc.cluster.local:8501'))\n", " publish.after(train)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Compile pipeline" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "compiler.Compiler().compile(recommender_pipeline, 'pipeline.tar.gz')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Submit an experiment run" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "Run link here" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "run = client.run_pipeline(exp.id, 'pipeline1', 'pipeline.tar.gz')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", 
"language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: pipelines/download_components.sh ================================================ #!/bin/bash #tag::dlPipelineRelease[] wget https://github.com/kubeflow/pipelines/archive/0.2.5.tar.gz tar -xvf 0.2.5.tar.gz #end::dlPipelineRelease[] ================================================ FILE: recommender/Dockerfile ================================================ FROM tensorflow/tensorflow:1.12.0-devel-py3 RUN pip3 install --upgrade pip RUN pip3 install pandas --upgrade RUN pip3 install keras --upgrade RUN pip3 install minio --upgrade RUN mkdir -p /opt/kubeflow COPY Recommender_Kubeflow.py /opt/kubeflow/ ENTRYPOINT ["python3", "/opt/kubeflow/Recommender_Kubeflow.py"] ================================================ FILE: recommender/Recommender_Kubeflow.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# This is implementation of the Recommender training\n", "\n", "This implementation takes a list of users and their purchasing history to calculate prediction\n", "on the probability that they would by a certain product.\n", "The implementation is structured in 2 parts:\n", "1. Build rating matrix based on the purchasing history. The implementation is based on this blog post\n", "https://medium.com/datadriveninvestor/how-to-build-a-recommendation-system-for-purchase-data-step-by-step-d6d7a78800b6\n", "2. Build collabarative filtering model based on the rating matrix. The implementation is based on this project https://github.com/Piyushdharkar/Collaborative-Filtering-Using-Keras \n", "\n", "Implementation is leveraging Minio for storing both source data and result models\n", "\n", "It also uses Python kubernetes client for re starting model server pod\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 1. 
Install libraries" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting pandas\n", " Downloading pandas-1.0.1-cp36-cp36m-manylinux1_x86_64.whl (10.1 MB)\n", "\u001b[K |████████████████████████████████| 10.1 MB 3.2 MB/s eta 0:00:01\n", "\u001b[?25hRequirement already satisfied, skipping upgrade: numpy>=1.13.3 in /usr/local/lib/python3.6/dist-packages (from pandas) (1.18.1)\n", "Requirement already satisfied, skipping upgrade: python-dateutil>=2.6.1 in /usr/local/lib/python3.6/dist-packages (from pandas) (2.8.1)\n", "Requirement already satisfied, skipping upgrade: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas) (2019.3)\n", "Requirement already satisfied, skipping upgrade: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.6.1->pandas) (1.11.0)\n", "Installing collected packages: pandas\n", "Successfully installed pandas-1.0.1\n", "Collecting keras\n", " Downloading Keras-2.3.1-py2.py3-none-any.whl (377 kB)\n", "\u001b[K |████████████████████████████████| 377 kB 3.2 MB/s eta 0:00:01\n", "\u001b[?25hRequirement already satisfied, skipping upgrade: h5py in /usr/local/lib/python3.6/dist-packages (from keras) (2.10.0)\n", "Requirement already satisfied, skipping upgrade: numpy>=1.9.1 in /usr/local/lib/python3.6/dist-packages (from keras) (1.18.1)\n", "Requirement already satisfied, skipping upgrade: keras-applications>=1.0.6 in /usr/local/lib/python3.6/dist-packages (from keras) (1.0.8)\n", "Requirement already satisfied, skipping upgrade: scipy>=0.14 in /usr/local/lib/python3.6/dist-packages (from keras) (1.4.1)\n", "Requirement already satisfied, skipping upgrade: keras-preprocessing>=1.0.5 in /usr/local/lib/python3.6/dist-packages (from keras) (1.1.0)\n", "Requirement already satisfied, skipping upgrade: six>=1.9.0 in /usr/lib/python3/dist-packages (from keras) (1.11.0)\n", "Requirement already satisfied, skipping upgrade: pyyaml in /usr/local/lib/python3.6/dist-packages (from keras) (5.3)\n", "Installing collected packages: keras\n", "Successfully installed keras-2.3.1\n", "Collecting minio\n", " Downloading minio-5.0.7-py2.py3-none-any.whl (71 kB)\n", "\u001b[K |████████████████████████████████| 71 kB 1.9 MB/s eta 0:00:011\n", "\u001b[?25hRequirement already satisfied, skipping upgrade: python-dateutil in /usr/local/lib/python3.6/dist-packages (from minio) (2.8.1)\n", "Requirement already satisfied, skipping upgrade: urllib3 in ./.local/lib/python3.6/site-packages (from minio) (1.24.3)\n", "Requirement already satisfied, skipping upgrade: pytz in /usr/local/lib/python3.6/dist-packages (from minio) (2019.3)\n", "Requirement already satisfied, skipping upgrade: certifi in /usr/local/lib/python3.6/dist-packages (from minio) (2019.11.28)\n", "Collecting configparser\n", " Downloading configparser-4.0.2-py2.py3-none-any.whl (22 kB)\n", "Requirement already satisfied, skipping upgrade: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil->minio) (1.11.0)\n", "Installing collected packages: configparser, minio\n", "Successfully installed configparser-4.0.2 minio-5.0.7\n", "Collecting kubernetes\n", " Downloading kubernetes-10.0.1-py2.py3-none-any.whl (1.5 MB)\n", "\u001b[K |████████████████████████████████| 1.5 MB 3.4 MB/s eta 0:00:01\n", "\u001b[?25hRequirement already satisfied, skipping upgrade: certifi>=14.05.14 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (2019.11.28)\n", "Requirement already satisfied, skipping upgrade: 
python-dateutil>=2.5.3 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (2.8.1)\n", "Requirement already satisfied, skipping upgrade: pyyaml>=3.12 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (5.3)\n", "Requirement already satisfied, skipping upgrade: six>=1.9.0 in /usr/lib/python3/dist-packages (from kubernetes) (1.11.0)\n", "Requirement already satisfied, skipping upgrade: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (0.57.0)\n", "Requirement already satisfied, skipping upgrade: urllib3>=1.24.2 in ./.local/lib/python3.6/site-packages (from kubernetes) (1.24.3)\n", "Requirement already satisfied, skipping upgrade: setuptools>=21.0.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (45.1.0)\n", "Requirement already satisfied, skipping upgrade: requests in /usr/local/lib/python3.6/dist-packages (from kubernetes) (2.22.0)\n", "Requirement already satisfied, skipping upgrade: google-auth>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (1.11.0)\n", "Requirement already satisfied, skipping upgrade: requests-oauthlib in /usr/local/lib/python3.6/dist-packages (from kubernetes) (1.3.0)\n", "Requirement already satisfied, skipping upgrade: idna<2.9,>=2.5 in /usr/lib/python3/dist-packages (from requests->kubernetes) (2.6)\n", "Requirement already satisfied, skipping upgrade: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->kubernetes) (3.0.4)\n", "Requirement already satisfied, skipping upgrade: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.0.1->kubernetes) (0.2.8)\n", "Requirement already satisfied, skipping upgrade: rsa<4.1,>=3.1.4 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.0.1->kubernetes) (4.0)\n", "Requirement already satisfied, skipping upgrade: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.0.1->kubernetes) (4.0.0)\n", "Requirement already satisfied, skipping upgrade: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib->kubernetes) (3.1.0)\n", "Requirement already satisfied, skipping upgrade: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.6/dist-packages (from pyasn1-modules>=0.2.1->google-auth>=1.0.1->kubernetes) (0.4.8)\n", "\u001b[31mERROR: kfp 0.2.2.1 has requirement kubernetes<=10.0.0,>=8.0.0, but you'll have kubernetes 10.0.1 which is incompatible.\u001b[0m\n", "Installing collected packages: kubernetes\n", " Attempting uninstall: kubernetes\n", " Found existing installation: kubernetes 10.0.0\n", " Uninstalling kubernetes-10.0.0:\n", " Successfully uninstalled kubernetes-10.0.0\n", "Successfully installed kubernetes-10.0.1\n", "Collecting kfmd\n", " Downloading kfmd-0.1.8.tar.gz (29 kB)\n", "Building wheels for collected packages: kfmd\n", " Building wheel for kfmd (setup.py) ... 
\u001b[?25ldone\n", "\u001b[?25h Created wheel for kfmd: filename=kfmd-0.1.8-py3-none-any.whl size=65919 sha256=c65ab8ff649134dbe6c8391743d5361546e5b29e6df9c0ff13915c99b67be1e7\n", " Stored in directory: /home/jovyan/.cache/pip/wheels/54/6b/5c/f063f501d5c632c93566ed967f2f0c36bad3b384d68c83aa65\n", "Successfully built kfmd\n", "Installing collected packages: kfmd\n", "Successfully installed kfmd-0.1.8\n" ] } ], "source": [ "!pip install pandas --upgrade --user\n", "!pip install keras --upgrade --user\n", "!pip install minio --upgrade --user\n", "!pip install kubernetes --upgrade --user\n", "!pip install kfmd --upgrade --user" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## imports" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using TensorFlow backend.\n" ] } ], "source": [ "import pandas as pd\n", "import numpy as np\n", "import time\n", "from minio import Minio\n", "from keras.models import Model\n", "from keras.layers import *\n", "from keras.losses import *\n", "import tensorflow as tf\n", "import os\n", "from kfmd import metadata\n", "from datetime import datetime\n", "from keras import backend as K\n", "from kubernetes import client as k8s_client, config as k8s_config\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Create a workspace, run and execution" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "execTime = datetime.utcnow().isoformat(\"T\")\n", "ws = metadata.Workspace(\n", " # Connect to metadata-service in namespace kubeflow in k8s cluster.\n", " backend_url_prefix=\"metadata-service.kubeflow.svc.cluster.local:8080\",\n", " name=\"recommender\",\n", " description=\"a workspace for saving recommender experiments\")\n", "r = metadata.Run(\n", " workspace=ws,\n", " name=\"run-\" + execTime ,\n", " description=\"recommender run\",\n", ")\n", "exec = metadata.Execution(\n", " name = \"execution\" + execTime ,\n", " workspace=ws,\n", " run=r,\n", " description=\"recommender ML execution\",\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 2. Read data\n", "\n", "For reading data we are using two different approaches:\n", "1. We use TensorFlow's built-in support to write the resulting model to Minio\n", "2. We use the Minio APIs to read the source data using Pandas. We could have used the Boto APIs here instead."
] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Minio parameters : URL minio-service.kubeflow.svc.cluster.local:9000 key minio secret minio123\n" ] } ], "source": [ "minio_endpoint = os.environ.get('MINIO_URL', 'minio-service.kubeflow.svc.cluster.local:9000')\n", "minio_key = os.environ.get('MINIO_KEY', 'minio')\n", "minio_secret = os.environ.get('MINIO_SECRET', 'minio123')\n", "\n", "print('Minio parameters : URL ', minio_endpoint, ' key ', minio_key, ' secret ', minio_secret)\n", "\n", "os.environ['AWS_ACCESS_KEY_ID'] = minio_key\n", "os.environ['AWS_SECRET_ACCESS_KEY'] = minio_secret\n", "os.environ['AWS_REGION'] = 'us-west-1'\n", "os.environ['S3_REGION'] = 'us-west-1'\n", "os.environ['S3_ENDPOINT'] = minio_endpoint\n", "os.environ['S3_USE_HTTPS'] = '0'\n", "os.environ['S3_VERIFY_SSL'] = '0'\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "minioClient = Minio(minio_endpoint,\n", " access_key=minio_key,\n", " secret_key=minio_secret,\n", " secure=False)\n", "\n", "minioClient.fget_object('data', 'recommender/users.csv', '/tmp/users.csv')\n", "customers = pd.read_csv('/tmp/users.csv')\n", "minioClient.fget_object('data', 'recommender/transactions.csv', '/tmp/transactions.csv')\n", "transactions = pd.read_csv('/tmp/transactions.csv')\n", "\n", "#Log experiment data set\n", "data_set = exec.log_input(\n", " metadata.DataSet(\n", " description=\"recommender current transactions and customers\",\n", " name=\"Current transactions and customers\",\n", " version=execTime,\n", " uri=\"minio:/tmp/transactions.csv; minio:/tmp/users.csv\"))" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(1000, 1)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
customerId
01553
120400
219750
36334
427773
\n", "
" ], "text/plain": [ " customerId\n", "0 1553\n", "1 20400\n", "2 19750\n", "3 6334\n", "4 27773" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(customers.shape)\n", "customers.head()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(62483, 2)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
customerIdproducts
0020
112|2|23|68|68|111|29|86|107|152
22111|107|29|11|11|11|33|23
33164|227
452|2
\n", "
" ], "text/plain": [ " customerId products\n", "0 0 20\n", "1 1 2|2|23|68|68|111|29|86|107|152\n", "2 2 111|107|29|11|11|11|33|23\n", "3 3 164|227\n", "4 5 2|2" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(transactions.shape)\n", "transactions.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 3 Data preparation\n", "\n", "Our goal here is to break down each list of items in the products column into rows \n", "and count the number of products bought by a user" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
customerId0123456789
0020.0NaNNaNNaNNaNNaNNaNNaNNaNNaN
112.02.023.068.068.0111.029.086.0107.0152.0
\n", "
" ], "text/plain": [ " customerId 0 1 2 3 4 5 6 7 8 9\n", "0 0 20.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN\n", "1 1 2.0 2.0 23.0 68.0 68.0 111.0 29.0 86.0 107.0 152.0" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 1: split product items\n", "transactions['products'] = transactions['products'].apply(lambda x: [int(i) for i in x.split('|')])\n", "transactions.head(2).set_index('customerId')['products'].apply(pd.Series).reset_index()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
customerIdproductIdpurchase_count
0020.01
112.02
2123.01
3129.01
4168.02
5186.01
61107.01
71111.01
81152.01
\n", "
" ], "text/plain": [ " customerId productId purchase_count\n", "0 0 20.0 1\n", "1 1 2.0 2\n", "2 1 23.0 1\n", "3 1 29.0 1\n", "4 1 68.0 2\n", "5 1 86.0 1\n", "6 1 107.0 1\n", "7 1 111.0 1\n", "8 1 152.0 1" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 2: organize a given table into a dataframe with customerId, single productId, and purchase count\n", "pd.melt(transactions.head(2).set_index('customerId')['products'].apply(pd.Series).reset_index(), \n", " id_vars=['customerId'],\n", " value_name='products') \\\n", " .dropna().drop(['variable'], axis=1) \\\n", " .groupby(['customerId', 'products']) \\\n", " .agg({'products': 'count'}) \\\n", " .rename(columns={'products': 'purchase_count'}) \\\n", " .reset_index() \\\n", " .rename(columns={'products': 'productId'})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3.1 Create data with user, item, and target field" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(133585, 3)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
customerIdproductIdpurchase_count
0012
10131
20193
30201
40312
\n", "
" ], "text/plain": [ " customerId productId purchase_count\n", "0 0 1 2\n", "1 0 13 1\n", "2 0 19 3\n", "3 0 20 1\n", "4 0 31 2" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data = pd.melt(transactions.set_index('customerId')['products'].apply(pd.Series).reset_index(), \n", " id_vars=['customerId'],\n", " value_name='products') \\\n", " .dropna().drop(['variable'], axis=1) \\\n", " .groupby(['customerId', 'products']) \\\n", " .agg({'products': 'count'}) \\\n", " .rename(columns={'products': 'purchase_count'}) \\\n", " .reset_index() \\\n", " .rename(columns={'products': 'productId'})\n", "data['productId'] = data['productId'].astype(np.int64)\n", "\n", "print(data.shape)\n", "data.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 3.2 Normalize item values across users" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
productId0123456789...290291292293294295296297298299
customerId
0NaN2.0NaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1NaNNaN6.0NaNNaNNaNNaNNaNNaNNaN...NaNNaNNaN1.0NaNNaN1.0NaNNaNNaN
2NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
3NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", "

5 rows × 300 columns

\n", "
" ], "text/plain": [ "productId 0 1 2 3 4 5 6 7 8 9 ... 290 291 \\\n", "customerId ... \n", "0 NaN 2.0 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN \n", "1 NaN NaN 6.0 NaN NaN NaN NaN NaN NaN NaN ... NaN NaN \n", "2 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN \n", "3 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN \n", "4 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN \n", "\n", "productId 292 293 294 295 296 297 298 299 \n", "customerId \n", "0 NaN NaN NaN NaN NaN NaN NaN NaN \n", "1 NaN 1.0 NaN NaN 1.0 NaN NaN NaN \n", "2 NaN NaN NaN NaN NaN NaN NaN NaN \n", "3 NaN NaN NaN NaN NaN NaN NaN NaN \n", "4 NaN NaN NaN NaN NaN NaN NaN NaN \n", "\n", "[5 rows x 300 columns]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_matrix = pd.pivot_table(data, values='purchase_count', index='customerId', columns='productId')\n", "df_matrix.head()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(24429, 300)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
productId0123456789...290291292293294295296297298299
customerId
0NaN0.1NaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1NaNNaN0.166667NaNNaNNaNNaNNaNNaNNaN...NaNNaNNaN0.0NaNNaN0.0NaNNaNNaN
2NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
3NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", "

5 rows × 300 columns

\n", "
" ], "text/plain": [ "productId 0 1 2 3 4 5 6 7 8 9 ... 290 \\\n", "customerId ... \n", "0 NaN 0.1 NaN NaN NaN NaN NaN NaN NaN NaN ... NaN \n", "1 NaN NaN 0.166667 NaN NaN NaN NaN NaN NaN NaN ... NaN \n", "2 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN \n", "3 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN \n", "4 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN \n", "\n", "productId 291 292 293 294 295 296 297 298 299 \n", "customerId \n", "0 NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", "1 NaN NaN 0.0 NaN NaN 0.0 NaN NaN NaN \n", "2 NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", "3 NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", "4 NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", "\n", "[5 rows x 300 columns]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_matrix_norm = (df_matrix-df_matrix.min())/(df_matrix.max()-df_matrix.min())\n", "print(df_matrix_norm.shape)\n", "df_matrix_norm.head()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "(133585, 3)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
customerIdproductIdscaled_purchase_freq
9900.133333
252500.133333
323300.133333
353600.133333
434400.133333
\n", "
" ], "text/plain": [ " customerId productId scaled_purchase_freq\n", "9 9 0 0.133333\n", "25 25 0 0.133333\n", "32 33 0 0.133333\n", "35 36 0 0.133333\n", "43 44 0 0.133333" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# create a table for input to the modeling\n", "\n", "d = df_matrix_norm.reset_index()\n", "d.index.names = ['scaled_purchase_freq']\n", "data_norm = pd.melt(d, id_vars=['customerId'], value_name='scaled_purchase_freq').dropna()\n", "print(data_norm.shape)\n", "data_norm.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 4 Preparing data for learning" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "28606\n", "300\n", "[ 9 25 33 ... 26873 26998 28066]\n", "[ 0 0 0 ... 299 299 299]\n", "[0.13333333 0.13333333 0.13333333 ... 0. 0. 0. ]\n" ] } ], "source": [ "customer_idxs = np.array(data_norm.customerId, dtype = np.int)\n", "product_idxs = np.array(data_norm.productId, dtype = np.int)\n", "\n", "ratings = np.array(data_norm.scaled_purchase_freq)\n", "\n", "n_customers = int(data_norm['customerId'].drop_duplicates().max()) + 1\n", "n_products = int(data_norm['productId'].drop_duplicates().max()) + 1\n", "n_factors = 50\n", "\n", "input_shape = (1,)\n", "\n", "print(n_customers)\n", "print(n_products)\n", "print(customer_idxs)\n", "print(product_idxs)\n", "print(ratings)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4.1 Tensorflow Session" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# create TF session and set it in Keras\n", "sess = tf.Session()\n", "K.set_session(sess)\n", "K.set_learning_phase(1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4.2 Model Class" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "class DeepCollaborativeFiltering(Model):\n", " def __init__(self, n_customers, n_products, n_factors, p_dropout = 0.2):\n", " x1 = Input(shape = (1,), name=\"user\")\n", "\n", " P = Embedding(n_customers, n_factors, input_length = 1)(x1)\n", " P = Reshape((n_factors,))(P)\n", "\n", " x2 = Input(shape = (1,), name=\"product\")\n", "\n", " Q = Embedding(n_products, n_factors, input_length = 1)(x2)\n", " Q = Reshape((n_factors,))(Q)\n", "\n", " x = concatenate([P, Q], axis=1)\n", " x = Dropout(p_dropout)(x)\n", "\n", " x = Dense(n_factors)(x)\n", " x = Activation('relu')(x)\n", " x = Dropout(p_dropout)(x)\n", "\n", " output = Dense(1)(x) \n", " \n", " super(DeepCollaborativeFiltering, self).__init__([x1, x2], output)\n", " \n", " def rate(self, customer_idxs, product_idxs):\n", " if (type(customer_idxs) == int and type(product_idxs) == int):\n", " return self.predict([np.array(customer_idxs).reshape((1,)), np.array(product_idxs).reshape((1,))])\n", " \n", " if (type(customer_idxs) == str and type(product_idxs) == str):\n", " return self.predict([np.array(customerMapping[customer_idxs]).reshape((1,)), np.array(productMapping[product_idxs]).reshape((1,))])\n", " \n", " return self.predict([\n", " np.array([customerMapping[customer_idx] for customer_idx in customer_idxs]), \n", " np.array([productMapping[product_idx] for product_idx in product_idxs])\n", " ])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 4.3 Hyperparameters" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "bs = 64\n", "val_per = 0.25\n", "epochs = 3" ] }, { 
"cell_type": "markdown", "metadata": {}, "source": [ "## 4.4 Model Definition" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/resource_variable_ops.py:1630: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "If using Keras pass *_constraint arguments to layers.\n", "Model: \"deepcollaborativefiltering_1\"\n", "__________________________________________________________________________________________________\n", "Layer (type) Output Shape Param # Connected to \n", "==================================================================================================\n", "user (InputLayer) (None, 1) 0 \n", "__________________________________________________________________________________________________\n", "product (InputLayer) (None, 1) 0 \n", "__________________________________________________________________________________________________\n", "embedding_1 (Embedding) (None, 1, 50) 1430300 user[0][0] \n", "__________________________________________________________________________________________________\n", "embedding_2 (Embedding) (None, 1, 50) 15000 product[0][0] \n", "__________________________________________________________________________________________________\n", "reshape_1 (Reshape) (None, 50) 0 embedding_1[0][0] \n", "__________________________________________________________________________________________________\n", "reshape_2 (Reshape) (None, 50) 0 embedding_2[0][0] \n", "__________________________________________________________________________________________________\n", "concatenate_1 (Concatenate) (None, 100) 0 reshape_1[0][0] \n", " reshape_2[0][0] \n", "__________________________________________________________________________________________________\n", "dropout_1 (Dropout) (None, 100) 0 concatenate_1[0][0] \n", "__________________________________________________________________________________________________\n", "dense_1 (Dense) (None, 50) 5050 dropout_1[0][0] \n", "__________________________________________________________________________________________________\n", "activation_1 (Activation) (None, 50) 0 dense_1[0][0] \n", "__________________________________________________________________________________________________\n", "dropout_2 (Dropout) (None, 50) 0 activation_1[0][0] \n", "__________________________________________________________________________________________________\n", "dense_2 (Dense) (None, 1) 51 dropout_2[0][0] \n", "==================================================================================================\n", "Total params: 1,450,401\n", "Trainable params: 1,450,401\n", "Non-trainable params: 0\n", "__________________________________________________________________________________________________\n" ] } ], "source": [ "model = DeepCollaborativeFiltering(n_customers, n_products, n_factors)\n", "model.summary()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 5 Training" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/math_grad.py:1424: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future 
version.\n", "Instructions for updating:\n", "Use tf.where in 2.0, which has the same broadcast rule as np.where\n", "WARNING:tensorflow:From /home/jovyan/.local/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:422: The name tf.global_variables is deprecated. Please use tf.compat.v1.global_variables instead.\n", "\n", "Train on 100188 samples, validate on 33397 samples\n", "Epoch 1/3\n", "100188/100188 [==============================] - 14s 142us/step - loss: 0.0105 - val_loss: 0.0184\n", "Epoch 2/3\n", "100188/100188 [==============================] - 14s 137us/step - loss: 0.0091 - val_loss: 0.0187\n", "Epoch 3/3\n", "100188/100188 [==============================] - 14s 139us/step - loss: 0.0078 - val_loss: 0.0193\n", "Done training!\n" ] } ], "source": [ "model.compile(optimizer = 'adam', loss = mean_squared_logarithmic_error)\n", "model.fit(x = [customer_idxs, product_idxs], y = ratings, batch_size = bs, epochs = epochs, validation_split = val_per)\n", "print('Done training!')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 5.1 Log model and metrics" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "logmodel = exec.log_output(\n", " metadata.Model(\n", " name=\"DeepCollaborativeFiltering\",\n", " description=\"Model for product recommender\",\n", " uri=\"\",\n", " model_type=\"neural network\",\n", " version=execTime,\n", " training_framework={\n", " \"name\": \"tensorflow\",\n", " \"version\": \"v1.14\"\n", " },\n", " hyperparameters={\n", " \"batch_size\" : 64,\n", " \"validation_split\" : 0.25,\n", " \"layers\": [n_customers, n_products, n_factors],\n", " \"epochs\" : 3\n", " }))\n", "metrics = exec.log_output(\n", " metadata.Metrics(\n", " name=\"Model for product recommender evaluation\",\n", " description=\"Validating of the recommender model\",\n", " uri=\"\",\n", " version=execTime,\n", " data_set_id=data_set.id,\n", " model_id=logmodel.id))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 6 Get current output directory for model" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Exporting trained model to s3://models/recommender/1/\n" ] } ], "source": [ "directorystream = minioClient.get_object('data', 'recommender/directory.txt')\n", "directory = \"\"\n", "for d in directorystream.stream(32*1024):\n", " directory += d.decode('utf-8')\n", "arg_version = \"1\" \n", "export_path = 's3://models/' + directory + '/' + arg_version + '/'\n", "print ('Exporting trained model to', export_path)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 6.1 Export models" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "WARNING:tensorflow:From :2: build_tensor_info (from tensorflow.python.saved_model.utils_impl) is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.utils.build_tensor_info or tf.compat.v1.saved_model.build_tensor_info.\n", "tensor_info_users user:0\n", "tensor_info_products product:0\n", "tensor_info_pred dense_2/BiasAdd:0\n" ] } ], "source": [ "# inputs/outputs\n", "tensor_info_users = tf.saved_model.utils.build_tensor_info(model.input[0])\n", "tensor_info_products = tf.saved_model.utils.build_tensor_info(model.input[1])\n", "tensor_info_pred = 
tf.saved_model.utils.build_tensor_info(model.output)\n", "\n", "print (\"tensor_info_users\", tensor_info_users.name)\n", "print (\"tensor_info_products\", tensor_info_products.name)\n", "print (\"tensor_info_pred\", tensor_info_pred.name)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "WARNING:tensorflow:From :14: calling SavedModelBuilder.add_meta_graph_and_variables (from tensorflow.python.saved_model.builder_impl) with legacy_init_op is deprecated and will be removed in a future version.\n", "Instructions for updating:\n", "Pass your op to the equivalent parameter main_op instead.\n", "INFO:tensorflow:No assets to save.\n", "INFO:tensorflow:No assets to write.\n", "INFO:tensorflow:SavedModel written to: s3://models/recommender/1/saved_model.pb\n" ] }, { "data": { "text/plain": [ "b's3://models/recommender/1/saved_model.pb'" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# signature\n", "prediction_signature = (tf.saved_model.signature_def_utils.build_signature_def(\n", " inputs={\"users\": tensor_info_users, \"products\": tensor_info_products},\n", " outputs={\"predictions\": tensor_info_pred},\n", " method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME))\n", "# export\n", "legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op')\n", "builder = tf.saved_model.builder.SavedModelBuilder(export_path)\n", "builder.add_meta_graph_and_variables(\n", " sess, [tf.saved_model.tag_constants.SERVING],\n", " signature_def_map={\n", " tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: prediction_signature,\n", " },\n", " legacy_init_op=legacy_init_op)\n", "builder.save()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 7 Restarting the model serving server\n", "\n", "In order for the new model to take effect, it is also necessary to restart the model server.\n", "The issue here is that we are not changing the model version and, as a result,\n", "the model will not be reloaded. To ensure the update takes effect, we restart the server by\n", "simply killing the running instance; because the server is installed as a deployment, the instance\n", "will be recreated. Additionally, for pod operations to work correctly from the notebook,\n", "it is necessary to create permissions allowing access to pods in another namespace.\n", "Look at the podaccessroles.yaml for details."
] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "pod prefix recommendermodelserver-\n", "pod namespace kubeflow\n" ] } ], "source": [ "recommender = \"recommendermodelserver-\"\n", "if directory == \"recommender1\":\n", " recommender = \"recommendermodelserver1-\"\n", "print(\"pod prefix \", recommender) \n", "\n", "namespace = \"kubeflow\"\n", "print(\"pod namespace \", namespace) " ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Current pod name recommendermodelserver-6d5d5c654-snl99\n" ] } ], "source": [ "# Get full pod name for the current model\n", "\n", "k8s_config.load_incluster_config()\n", "\n", "v1 = k8s_client.CoreV1Api()\n", "\n", "pod_list = v1.list_namespaced_pod(namespace)\n", "pod = [item.metadata.name for item in pod_list.items if recommender in item.metadata.name][0]\n", "print(\"Current pod name \", pod)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Done deleting\n" ] } ], "source": [ "# Delete pod, so that it gets recreated\n", "v1.delete_namespaced_pod(pod, namespace, grace_period_seconds=0)\n", "\n", "print(\"Done deleting\")" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "New pod name recommendermodelserver-6d5d5c654-xvxf7\n" ] } ], "source": [ "# Verify that the new instance was created\n", "time.sleep(20)\n", "pod_list = v1.list_namespaced_pod(namespace)\n", "pod = [item.metadata.name for item in pod_list.items if recommender in item.metadata.name][0]\n", "print(\"New pod name \", pod)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 2 } ================================================ FILE: recommender/Recommender_Kubeflow.py ================================================ #!/usr/bin/env python # coding: utf-8 # # This is implementation of the Recommender training # # This implementation takes a list of users and their purchasing history to calculate prediction # on the probability that they would by a certain product. # The implementation is structured in 2 parts: # 1. Build rating matrix based on the purchasing history. The implementation is based on this blog post # https://medium.com/datadriveninvestor/how-to-build-a-recommendation-system-for-purchase-data-step-by-step-d6d7a78800b6 # 2. Build collabarative filtering model based on the rating matrix. The implementation is based on this project https://github.com/Piyushdharkar/Collaborative-Filtering-Using-Keras # # Implementation is leveraging Minio for storing both source data and result models # # It also uses Python kubernetes client for re starting model server pod # # # 1. 
Install libraries # In[1]: get_ipython().system('pip install pandas --upgrade --user') get_ipython().system('pip install keras --upgrade --user') get_ipython().system('pip install minio --upgrade --user') get_ipython().system('pip install kubernetes --upgrade --user') get_ipython().system('pip install kfmd --upgrade --user') # ## imports # In[2]: import pandas as pd import numpy as np import time from minio import Minio from keras.models import Model from keras.layers import * from keras.losses import * import tensorflow as tf import os from kfmd import metadata from datetime import datetime from keras import backend as K from kubernetes import client as k8s_client, config as k8s_config # Create a workspace, run and execution # In[3]: execTime = datetime.utcnow().isoformat("T") ws = metadata.Workspace( # Connect to metadata-service in namesapce kubeflow in k8s cluster. backend_url_prefix="metadata-service.kubeflow.svc.cluster.local:8080", name="recommender", description="a workspace for saving recommender experiments") r = metadata.Run( workspace=ws, name="run-" + execTime, description="recommender run", ) exec = metadata.Execution( name="execution" + execTime, workspace=ws, run=r, description="recommender ML execution", ) # # 2. Read data # # For reading data we are using two diffierent approaches: # 1. We use Tensorflow build in support to write resulting model to Minio # 2. We use Minio APIs to read source data using Pandas. We could of use Boto APIs here instead. # In[4]: minio_endpoint = os.environ.get( 'MINIO_URL', 'minio-service.kubeflow.svc.cluster.local:9000') minio_key = os.environ.get('MINIO_KEY', 'minio') minio_secret = os.environ.get('MINIO_SECRET', 'minio123') print('Minio parameters : URL ', minio_endpoint, ' key ', minio_key, ' secret ', minio_secret) os.environ['AWS_ACCESS_KEY_ID'] = minio_key os.environ['AWS_SECRET_ACCESS_KEY'] = minio_secret os.environ['AWS_REGION'] = 'us-west-1' os.environ['S3_REGION'] = 'us-west-1' os.environ['S3_ENDPOINT'] = minio_endpoint os.environ['S3_USE_HTTPS'] = '0' os.environ['S3_VERIFY_SSL'] = '0' # In[5]: minioClient = Minio(minio_endpoint, access_key=minio_key, secret_key=minio_secret, secure=False) minioClient.fget_object('data', 'recommender/users.csv', '/tmp/users.csv') customers = pd.read_csv('/tmp/users.csv') minioClient.fget_object('data', 'recommender/transactions.csv', '/tmp/transactions.csv') transactions = pd.read_csv('/tmp/transactions.csv') #Log experiment data set data_set = exec.log_input( metadata.DataSet( description="recommender current transactions and customers", name="Current transactions and customers", version=execTime, uri="minio:/tmp/transactions.csv; minio:/tmp/users.csv")) # In[6]: print(customers.shape) customers.head() # In[7]: print(transactions.shape) transactions.head() # # 3 Data preparation # # Our goal here is to break down each list of items in the products column into rows # and count the number of products bought by a user # In[8]: # 1: split product items transactions['products'] = transactions['products'].apply( lambda x: [int(i) for i in x.split('|')]) transactions.head(2).set_index('customerId')['products'].apply( pd.Series).reset_index() # In[9]: # 2: organize a given table into a dataframe with customerId, single productId, and purchase count pd.melt(transactions.head(2).set_index('customerId')['products'].apply(pd.Series).reset_index(), id_vars=['customerId'], value_name='products') \ .dropna().drop(['variable'], axis=1) \ .groupby(['customerId', 'products']) \ .agg({'products': 'count'}) \ 
.rename(columns={'products': 'purchase_count'}) \ .reset_index() \ .rename(columns={'products': 'productId'}) # ## 3.1 Create data with user, item, and target field # In[10]: data = pd.melt(transactions.set_index('customerId')['products'].apply(pd.Series).reset_index(), id_vars=['customerId'], value_name='products') \ .dropna().drop(['variable'], axis=1) \ .groupby(['customerId', 'products']) \ .agg({'products': 'count'}) \ .rename(columns={'products': 'purchase_count'}) \ .reset_index() \ .rename(columns={'products': 'productId'}) data['productId'] = data['productId'].astype(np.int64) print(data.shape) data.head() # ## 3.2 Normalize item values across users # In[11]: df_matrix = pd.pivot_table(data, values='purchase_count', index='customerId', columns='productId') df_matrix.head() # In[12]: df_matrix_norm = (df_matrix - df_matrix.min()) / \ (df_matrix.max() - df_matrix.min()) print(df_matrix_norm.shape) df_matrix_norm.head() # In[13]: # create a table for input to the modeling d = df_matrix_norm.reset_index() d.index.names = ['scaled_purchase_freq'] data_norm = pd.melt(d, id_vars=['customerId'], value_name='scaled_purchase_freq').dropna() print(data_norm.shape) data_norm.head() # # 4 Preparing data for learning # In[14]: customer_idxs = np.array(data_norm.customerId, dtype=np.int) product_idxs = np.array(data_norm.productId, dtype=np.int) ratings = np.array(data_norm.scaled_purchase_freq) n_customers = int(data_norm['customerId'].drop_duplicates().max()) + 1 n_products = int(data_norm['productId'].drop_duplicates().max()) + 1 n_factors = 50 input_shape = (1, ) print(n_customers) print(n_products) print(customer_idxs) print(product_idxs) print(ratings) # ## 4.1 Tensorflow Session # In[15]: # create TF session and set it in Keras sess = tf.Session() K.set_session(sess) K.set_learning_phase(1) # ## 4.2 Model Class # In[16]: class DeepCollaborativeFiltering(Model): def __init__(self, n_customers, n_products, n_factors, p_dropout=0.2): x1 = Input(shape=(1, ), name="user") P = Embedding(n_customers, n_factors, input_length=1)(x1) P = Reshape((n_factors, ))(P) x2 = Input(shape=(1, ), name="product") Q = Embedding(n_products, n_factors, input_length=1)(x2) Q = Reshape((n_factors, ))(Q) x = concatenate([P, Q], axis=1) x = Dropout(p_dropout)(x) x = Dense(n_factors)(x) x = Activation('relu')(x) x = Dropout(p_dropout)(x) output = Dense(1)(x) super(DeepCollaborativeFiltering, self).__init__([x1, x2], output) def rate(self, customer_idxs, product_idxs): if (type(customer_idxs) == int and type(product_idxs) == int): return self.predict([ np.array(customer_idxs).reshape((1, )), np.array(product_idxs).reshape((1, )) ]) if (type(customer_idxs) == str and type(product_idxs) == str): return self.predict([ np.array(customerMapping[customer_idxs]).reshape((1, )), np.array(productMapping[product_idxs]).reshape((1, )) ]) return self.predict([ np.array([ customerMapping[customer_idx] for customer_idx in customer_idxs ]), np.array( [productMapping[product_idx] for product_idx in product_idxs]) ]) # ## 4.3 Hyperparameters # In[17]: bs = 64 val_per = 0.25 epochs = 3 # ## 4.4 Model Definition # In[18]: model = DeepCollaborativeFiltering(n_customers, n_products, n_factors) model.summary() # # 5 Training # In[19]: model.compile(optimizer='adam', loss=mean_squared_logarithmic_error) model.fit(x=[customer_idxs, product_idxs], y=ratings, batch_size=bs, epochs=epochs, validation_split=val_per) print('Done training!') # ## 5.1 Log model and metrics # In[20]: logmodel = exec.log_output( 
metadata.Model(name="DeepCollaborativeFiltering", description="Model for product recommender", uri="", model_type="neural network", version=execTime, training_framework={ "name": "tensorflow", "version": "v1.14" }, hyperparameters={ "batch_size": 64, "validation_split": 0.25, "layers": [n_customers, n_products, n_factors], "epochs": 3 })) metrics = exec.log_output( metadata.Metrics(name="Model for product recommender evaluation", description="Validation of the recommender model", uri="", version=execTime, data_set_id=data_set.id, model_id=logmodel.id)) # # 6 Get current output directory for model # In[21]: directorystream = minioClient.get_object('data', 'recommender/directory.txt') directory = "" for d in directorystream.stream(32 * 1024): directory += d.decode('utf-8') arg_version = "1" export_path = 's3://models/' + directory + '/' + arg_version + '/' print('Exporting trained model to', export_path) # ## 6.1 Export models # In[22]: # inputs/outputs tensor_info_users = tf.saved_model.utils.build_tensor_info(model.input[0]) tensor_info_products = tf.saved_model.utils.build_tensor_info(model.input[1]) tensor_info_pred = tf.saved_model.utils.build_tensor_info(model.output) print("tensor_info_users", tensor_info_users.name) print("tensor_info_products", tensor_info_products.name) print("tensor_info_pred", tensor_info_pred.name) # In[23]: # signature prediction_signature = (tf.saved_model.signature_def_utils.build_signature_def( inputs={ "users": tensor_info_users, "products": tensor_info_products }, outputs={"predictions": tensor_info_pred}, method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME)) # export legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op') builder = tf.saved_model.builder.SavedModelBuilder(export_path) builder.add_meta_graph_and_variables( sess, [tf.saved_model.tag_constants.SERVING], signature_def_map={ tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: prediction_signature, }, legacy_init_op=legacy_init_op) builder.save() # # 7 Restarting the model serving server # # For a new model to take effect, the model server must also be restarted. # Because we are not changing the model version, TensorFlow Serving will not pick up # the new model on its own. To force an update we restart the server by simply deleting the # running pod; since the server is managed by a Deployment, the instance will be recreated # automatically. Additionally, for pod operations to work correctly from the notebook, # permissions must be created that allow access to pods in another namespace. # Look at podaccessroles.yaml for details (an illustrative sketch follows below).
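# podaccessroles.yaml is referenced above but is not included in this repository dump, so the
# helper below is only an illustrative, hedged sketch of the kind of RBAC objects such a
# manifest would define: a Role that lets the notebook get, list, and delete pods in the
# kubeflow namespace, and a RoleBinding attaching that Role to the notebook's service account.
# The names "recommender-pod-access" and "default-editor" are hypothetical placeholders, not
# taken from the actual manifest. The function is defined but not called, and it assumes
# k8s_config.load_incluster_config() (or load_kube_config()) has already been invoked.
def ensure_pod_access_rbac(notebook_namespace="kubeflow",
                           service_account="default-editor"):
    role = {
        "apiVersion": "rbac.authorization.k8s.io/v1",
        "kind": "Role",
        "metadata": {"name": "recommender-pod-access", "namespace": "kubeflow"},
        # get/list to find the serving pod, delete to force its re-creation
        "rules": [{"apiGroups": [""],
                   "resources": ["pods"],
                   "verbs": ["get", "list", "delete"]}],
    }
    binding = {
        "apiVersion": "rbac.authorization.k8s.io/v1",
        "kind": "RoleBinding",
        "metadata": {"name": "recommender-pod-access", "namespace": "kubeflow"},
        "roleRef": {"apiGroup": "rbac.authorization.k8s.io",
                    "kind": "Role",
                    "name": "recommender-pod-access"},
        "subjects": [{"kind": "ServiceAccount",
                      "name": service_account,
                      "namespace": notebook_namespace}],
    }
    rbac_api = k8s_client.RbacAuthorizationV1Api()
    rbac_api.create_namespaced_role("kubeflow", role)
    rbac_api.create_namespaced_role_binding("kubeflow", binding)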
# In[24]: recommender = "recommendermodelserver-" if directory == "recommender1": recommender = "recommendermodelserver1-" print("pod prefix ", recommender) namespace = "kubeflow" print("pod namespace ", namespace) # In[26]: # Get full pod name for the current model k8s_config.load_incluster_config() v1 = k8s_client.CoreV1Api() pod_list = v1.list_namespaced_pod(namespace) pod = [ item.metadata.name for item in pod_list.items if recommender in item.metadata.name ][0] print("Current pod name ", pod) # In[27]: # Delete pod, so that it gets recreated v1.delete_namespaced_pod(pod, namespace, grace_period_seconds=0) print("Done deleting") # In[28]: # Verify that the new instance was created time.sleep(20) pod_list = v1.list_namespaced_pod(namespace) pod = [ item.metadata.name for item in pod_list.items if recommender in item.metadata.name ][0] print("New pod name ", pod) # In[ ]: ================================================ FILE: recommender/docker/Dockerfile ================================================ FROM tensorflow/tensorflow:1.15.0-py3 RUN pip3 install --upgrade pip RUN pip3 install pandas --upgrade RUN pip3 install keras --upgrade RUN pip3 install minio --upgrade RUN pip3 install kubernetes --upgrade RUN pip3 install kfmd --upgrade RUN mkdir -p /opt/kubeflow COPY Recommender_Kubeflow.py /opt/kubeflow/ ENTRYPOINT ["python3", "/opt/kubeflow/Recommender_Kubeflow.py"] ================================================ FILE: recommender/docker/build.sh ================================================ #!/bin/bash img='lightbend/ml-tf-recommender' tag='0.1' docker build -t $img:$tag . ================================================ FILE: recommender/tfservingchart/.helmignore ================================================ # Patterns to ignore when building packages. # This supports shell glob matching, relative path matching, and # negation (prefixed with !). Only one pattern per line. .DS_Store # Common VCS dirs .git/ .gitignore .bzr/ .bzrignore .hg/ .hgignore .svn/ # Common backup files *.swp *.bak *.tmp *~ # Various IDEs .project .idea/ *.tmproj ================================================ FILE: recommender/tfservingchart/Chart.yaml ================================================ apiVersion: v1 appVersion: 1.14.0 description: TF Serving maintainers: - name: Boris Lublinsky name: TF Serving Recommender model server version: 1.0.0 ================================================ FILE: recommender/tfservingchart/templates/NOTES.txt ================================================ Kubeflow Model serving components : tfserving is installed ================================================ FILE: recommender/tfservingchart/templates/_helpers.tpl ================================================ {{/* vim: set filetype=mustache: */}} {{/* Expand the name of the chart. */}} {{- define "modelserverchart.name" -}} {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} {{- end -}} {{/* Create a default fully qualified app name. We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 
*/}} {{- define "modelserverchart.fullname" -}} {{- $name := default .Chart.Name .Values.nameOverride -}} {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} {{- end -}} ================================================ FILE: recommender/tfservingchart/templates/minioaccess.yaml ================================================ apiVersion: v1 kind: Secret metadata: name: minioaccess namespace: kubeflow data: AWS_ACCESS_KEY_ID: bWluaW8= AWS_SECRET_ACCESS_KEY: bWluaW8xMjM= ================================================ FILE: recommender/tfservingchart/templates/tfserving.yaml ================================================ apiVersion: apps/v1 kind: Deployment metadata: namespace: kubeflow name: recommendermodelserver labels: app: recommendermodelserver spec: replicas: 1 selector: matchLabels: app: recommendermodelserver strategy: type: RollingUpdate template: metadata: labels: app: recommendermodelserver spec: containers: - name: serving image: "{{ .Values.image.server }}:{{ .Values.image.version }}" imagePullPolicy: "{{ .Values.image.pullPolicy }}" ports: - containerPort: 8500 name: grpc protocol: TCP - containerPort: 8501 name: http protocol: TCP readinessProbe: tcpSocket: port: http initialDelaySeconds: 15 timeoutSeconds: 1 livenessProbe: initialDelaySeconds: 30 periodSeconds: 30 tcpSocket: port: htttp resources: limits: cpu: "2" memory: 2Gi requests: cpu: "1" memory: 1Gi env: - name: "AWS_REGION" value: "us-west-1" - name: "S3_REGION" value: "us-west-1" - name: "S3_ENDPOINT" value: "minio-service.kubeflow.svc.cluster.local:9000" - name: "S3_USE_HTTPS" value: "0" - name: "S3_VERIFY_SSL" value: "0" - name: "AWS_ACCESS_KEY_ID" valueFrom: { secretKeyRef: { name: "minioaccess", key: "AWS_ACCESS_KEY_ID" } } - name: "AWS_SECRET_ACCESS_KEY" valueFrom: { secretKeyRef: { name: "minioaccess", key: "AWS_SECRET_ACCESS_KEY" } } - name: "MODEL_BASE_PATH" value: "s3://models" - name: "MODEL_NAME" value: "recommender" volumes: - name: secret-volume secret: secretName: minioaccess --- apiVersion: v1 kind: Service metadata: namespace: kubeflow name: recommendermodelserver spec: selector: app: recommendermodelserver ports: - name: grpc protocol: TCP port: 8500 targetPort: 8500 - name: http protocol: TCP port: 8501 targetPort: 8501 ================================================ FILE: recommender/tfservingchart/templates/tfserving1.yaml ================================================ apiVersion: apps/v1 kind: Deployment metadata: namespace: kubeflow name: recommendermodelserver1 labels: app: recommendermodelserver1 spec: replicas: 1 selector: matchLabels: app: recommendermodelserver1 strategy: type: RollingUpdate template: metadata: labels: app: recommendermodelserver1 spec: containers: - name: serving image: "{{ .Values.image.server }}:{{ .Values.image.version }}" imagePullPolicy: "{{ .Values.image.pullPolicy }}" ports: - containerPort: 8500 name: grpc protocol: TCP - containerPort: 8501 name: http protocol: TCP readinessProbe: tcpSocket: port: http initialDelaySeconds: 15 timeoutSeconds: 1 livenessProbe: initialDelaySeconds: 30 periodSeconds: 30 tcpSocket: port: htttp resources: limits: cpu: "2" memory: 2Gi requests: cpu: "1" memory: 1Gi env: - name: "AWS_REGION" value: "us-west-1" - name: "S3_REGION" value: "us-west-1" - name: "S3_ENDPOINT" value: "minio-service.kubeflow.svc.cluster.local:9000" - name: "S3_USE_HTTPS" value: "0" - name: "S3_VERIFY_SSL" value: "0" - name: "AWS_ACCESS_KEY_ID" valueFrom: { secretKeyRef: { name: "minioaccess", key: "AWS_ACCESS_KEY_ID" } } - name: 
"AWS_SECRET_ACCESS_KEY" valueFrom: { secretKeyRef: { name: "minioaccess", key: "AWS_SECRET_ACCESS_KEY" } } - name: "MODEL_BASE_PATH" value: "s3://models" - name: "MODEL_NAME" value: "recommender1" volumes: - name: secret-volume secret: secretName: minioaccess --- apiVersion: v1 kind: Service metadata: namespace: kubeflow name: recommendermodelserver1 spec: selector: app: recommendermodelserver1 ports: - name: grpc protocol: TCP port: 8500 targetPort: 8500 - name: http protocol: TCP port: 8501 targetPort: 8501 ================================================ FILE: recommender/tfservingchart/values.yaml ================================================ # application name is a namespace # docker images image: server: tensorflow/serving pullPolicy: Always version: 1.15.0 ================================================ FILE: runthrough.sh ================================================ #!/bin/bash set -ex example_repo_home="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" KF_PLATFORM=${KF_PLATFORM:-minikube} export KF_PLATFORM if [ "$PLATFORM" == "gcp" ]; then # In GCP we also need a default zone gcloud config set compute/zone us-west1-b fi pushd dev-setup command -v kfctl >/dev/null 2>&1 || source install-kf.sh command -v kustomize >/dev/null 2>&1 || source install-kustomize.sh command -v argo >/dev/null 2>&1 || source install-argo.sh source install-kf-pipeline-sdk.sh popd mkdir -p /tmp/abc pushd /tmp/abc source "${example_repo_home}/ch2_seldon_examples/setup_example.sh" popd # rm -rf /tmp/abc ================================================ FILE: scikitLearn/python/IncomePrediction.ipynb ================================================ { "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Income prediction\n", "based on Seldon's implementation\n", "https://github.com/SeldonIO/alibi/blob/master/examples/anchor_tabular_adult.ipynb and\n", "https://github.com/SeldonIO/alibi/blob/5aec3ab4ce651ca2249bf849ecb434371c9278e4/alibi/datasets.py#L183" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already up-to-date: pandas in ./.local/lib/python3.6/site-packages (1.0.3)\n", "Requirement already satisfied, skipping upgrade: numpy>=1.13.3 in /usr/local/lib/python3.6/dist-packages (from pandas) (1.18.1)\n", "Requirement already satisfied, skipping upgrade: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas) (2019.3)\n", "Requirement already satisfied, skipping upgrade: python-dateutil>=2.6.1 in /usr/local/lib/python3.6/dist-packages (from pandas) (2.8.1)\n", "Requirement already satisfied, skipping upgrade: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.6.1->pandas) (1.11.0)\n", "Requirement already up-to-date: scikit-learn in ./.local/lib/python3.6/site-packages (0.22.2.post1)\n", "Requirement already satisfied, skipping upgrade: joblib>=0.11 in ./.local/lib/python3.6/site-packages (from scikit-learn) (0.14.1)\n", "Requirement already satisfied, skipping upgrade: numpy>=1.11.0 in /usr/local/lib/python3.6/dist-packages (from scikit-learn) (1.18.1)\n", "Requirement already satisfied, skipping upgrade: scipy>=0.17.0 in /usr/local/lib/python3.6/dist-packages (from scikit-learn) (1.4.1)\n", "Requirement already up-to-date: alibi in ./.local/lib/python3.6/site-packages (0.4.0)\n", "Requirement already satisfied, skipping upgrade: scikit-learn in ./.local/lib/python3.6/site-packages (from alibi) (0.22.2.post1)\n", "Requirement already 
satisfied, skipping upgrade: attrs in /usr/local/lib/python3.6/dist-packages (from alibi) (19.3.0)\n", "Requirement already satisfied, skipping upgrade: beautifulsoup4 in ./.local/lib/python3.6/site-packages (from alibi) (4.8.2)\n", "Requirement already satisfied, skipping upgrade: spacy in ./.local/lib/python3.6/site-packages (from alibi) (2.2.4)\n", "Requirement already satisfied, skipping upgrade: shap in ./.local/lib/python3.6/site-packages (from alibi) (0.35.0)\n", "Requirement already satisfied, skipping upgrade: scipy in /usr/local/lib/python3.6/dist-packages (from alibi) (1.4.1)\n", "Requirement already satisfied, skipping upgrade: requests in /usr/local/lib/python3.6/dist-packages (from alibi) (2.22.0)\n", "Requirement already satisfied, skipping upgrade: numpy in /usr/local/lib/python3.6/dist-packages (from alibi) (1.18.1)\n", "Requirement already satisfied, skipping upgrade: Pillow in ./.local/lib/python3.6/site-packages (from alibi) (7.0.0)\n", "Requirement already satisfied, skipping upgrade: tensorflow<2.0 in /usr/local/lib/python3.6/dist-packages (from alibi) (1.15.2)\n", "Requirement already satisfied, skipping upgrade: pandas in ./.local/lib/python3.6/site-packages (from alibi) (1.0.3)\n", "Requirement already satisfied, skipping upgrade: prettyprinter in ./.local/lib/python3.6/site-packages (from alibi) (0.18.0)\n", "Requirement already satisfied, skipping upgrade: scikit-image in ./.local/lib/python3.6/site-packages (from alibi) (0.16.2)\n", "Requirement already satisfied, skipping upgrade: joblib>=0.11 in ./.local/lib/python3.6/site-packages (from scikit-learn->alibi) (0.14.1)\n", "Requirement already satisfied, skipping upgrade: soupsieve>=1.2 in ./.local/lib/python3.6/site-packages (from beautifulsoup4->alibi) (2.0)\n", "Requirement already satisfied, skipping upgrade: srsly<1.1.0,>=1.0.2 in ./.local/lib/python3.6/site-packages (from spacy->alibi) (1.0.2)\n", "Requirement already satisfied, skipping upgrade: preshed<3.1.0,>=3.0.2 in ./.local/lib/python3.6/site-packages (from spacy->alibi) (3.0.2)\n", "Requirement already satisfied, skipping upgrade: plac<1.2.0,>=0.9.6 in ./.local/lib/python3.6/site-packages (from spacy->alibi) (1.1.3)\n", "Requirement already satisfied, skipping upgrade: blis<0.5.0,>=0.4.0 in ./.local/lib/python3.6/site-packages (from spacy->alibi) (0.4.1)\n", "Requirement already satisfied, skipping upgrade: cymem<2.1.0,>=2.0.2 in ./.local/lib/python3.6/site-packages (from spacy->alibi) (2.0.3)\n", "Requirement already satisfied, skipping upgrade: tqdm<5.0.0,>=4.38.0 in ./.local/lib/python3.6/site-packages (from spacy->alibi) (4.43.0)\n", "Requirement already satisfied, skipping upgrade: catalogue<1.1.0,>=0.0.7 in ./.local/lib/python3.6/site-packages (from spacy->alibi) (1.0.0)\n", "Requirement already satisfied, skipping upgrade: thinc==7.4.0 in ./.local/lib/python3.6/site-packages (from spacy->alibi) (7.4.0)\n", "Requirement already satisfied, skipping upgrade: murmurhash<1.1.0,>=0.28.0 in ./.local/lib/python3.6/site-packages (from spacy->alibi) (1.0.2)\n", "Requirement already satisfied, skipping upgrade: setuptools in /usr/local/lib/python3.6/dist-packages (from spacy->alibi) (45.1.0)\n", "Requirement already satisfied, skipping upgrade: wasabi<1.1.0,>=0.4.0 in ./.local/lib/python3.6/site-packages (from spacy->alibi) (0.6.0)\n", "Requirement already satisfied, skipping upgrade: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->alibi) (2019.11.28)\n", "Requirement already satisfied, skipping upgrade: idna<2.9,>=2.5 in 
/usr/lib/python3/dist-packages (from requests->alibi) (2.6)\n", "Requirement already satisfied, skipping upgrade: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in ./.local/lib/python3.6/site-packages (from requests->alibi) (1.24.3)\n", "Requirement already satisfied, skipping upgrade: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->alibi) (3.0.4)\n", "Requirement already satisfied, skipping upgrade: keras-preprocessing>=1.0.5 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2.0->alibi) (1.1.0)\n", "Requirement already satisfied, skipping upgrade: gast==0.2.2 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2.0->alibi) (0.2.2)\n", "Requirement already satisfied, skipping upgrade: wheel>=0.26; python_version >= \"3\" in /usr/lib/python3/dist-packages (from tensorflow<2.0->alibi) (0.30.0)\n", "Requirement already satisfied, skipping upgrade: six>=1.10.0 in /usr/lib/python3/dist-packages (from tensorflow<2.0->alibi) (1.11.0)\n", "Requirement already satisfied, skipping upgrade: termcolor>=1.1.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2.0->alibi) (1.1.0)\n", "Requirement already satisfied, skipping upgrade: astor>=0.6.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2.0->alibi) (0.8.1)\n", "Requirement already satisfied, skipping upgrade: keras-applications>=1.0.8 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2.0->alibi) (1.0.8)\n", "Requirement already satisfied, skipping upgrade: opt-einsum>=2.3.2 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2.0->alibi) (3.1.0)\n", "Requirement already satisfied, skipping upgrade: protobuf>=3.6.1 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2.0->alibi) (3.11.2)\n", "Requirement already satisfied, skipping upgrade: tensorflow-estimator==1.15.1 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2.0->alibi) (1.15.1)\n", "Requirement already satisfied, skipping upgrade: google-pasta>=0.1.6 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2.0->alibi) (0.1.8)\n", "Requirement already satisfied, skipping upgrade: wrapt>=1.11.1 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2.0->alibi) (1.11.2)\n", "Requirement already satisfied, skipping upgrade: grpcio>=1.8.6 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2.0->alibi) (1.26.0)\n", "Requirement already satisfied, skipping upgrade: tensorboard<1.16.0,>=1.15.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2.0->alibi) (1.15.0)\n", "Requirement already satisfied, skipping upgrade: absl-py>=0.7.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2.0->alibi) (0.9.0)\n", "Requirement already satisfied, skipping upgrade: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas->alibi) (2019.3)\n", "Requirement already satisfied, skipping upgrade: python-dateutil>=2.6.1 in /usr/local/lib/python3.6/dist-packages (from pandas->alibi) (2.8.1)\n", "Requirement already satisfied, skipping upgrade: Pygments>=2.2.0 in /usr/local/lib/python3.6/dist-packages (from prettyprinter->alibi) (2.5.2)\n", "Requirement already satisfied, skipping upgrade: colorful>=0.4.0 in ./.local/lib/python3.6/site-packages (from prettyprinter->alibi) (0.5.4)\n", "Requirement already satisfied, skipping upgrade: networkx>=2.0 in ./.local/lib/python3.6/site-packages (from scikit-image->alibi) (2.4)\n", "Requirement already satisfied, skipping upgrade: imageio>=2.3.0 in ./.local/lib/python3.6/site-packages (from scikit-image->alibi) (2.8.0)\n", "Requirement already 
satisfied, skipping upgrade: matplotlib!=3.0.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from scikit-image->alibi) (3.1.2)\n", "Requirement already satisfied, skipping upgrade: PyWavelets>=0.4.0 in ./.local/lib/python3.6/site-packages (from scikit-image->alibi) (1.1.1)\n", "Requirement already satisfied, skipping upgrade: importlib-metadata>=0.20; python_version < \"3.8\" in /usr/local/lib/python3.6/dist-packages (from catalogue<1.1.0,>=0.0.7->spacy->alibi) (1.4.0)\n", "Requirement already satisfied, skipping upgrade: h5py in /usr/local/lib/python3.6/dist-packages (from keras-applications>=1.0.8->tensorflow<2.0->alibi) (2.10.0)\n", "Requirement already satisfied, skipping upgrade: werkzeug>=0.11.15 in /usr/local/lib/python3.6/dist-packages (from tensorboard<1.16.0,>=1.15.0->tensorflow<2.0->alibi) (0.16.1)\n", "Requirement already satisfied, skipping upgrade: markdown>=2.6.8 in /usr/local/lib/python3.6/dist-packages (from tensorboard<1.16.0,>=1.15.0->tensorflow<2.0->alibi) (3.1.1)\n", "Requirement already satisfied, skipping upgrade: decorator>=4.3.0 in /usr/local/lib/python3.6/dist-packages (from networkx>=2.0->scikit-image->alibi) (4.4.1)\n", "Requirement already satisfied, skipping upgrade: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib!=3.0.0,>=2.0.0->scikit-image->alibi) (1.1.0)\n", "Requirement already satisfied, skipping upgrade: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib!=3.0.0,>=2.0.0->scikit-image->alibi) (0.10.0)\n", "Requirement already satisfied, skipping upgrade: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib!=3.0.0,>=2.0.0->scikit-image->alibi) (2.4.6)\n", "Requirement already satisfied, skipping upgrade: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata>=0.20; python_version < \"3.8\"->catalogue<1.1.0,>=0.0.7->spacy->alibi) (2.1.0)\r\n" ] } ], "source": [ "!pip install pandas --upgrade --user\n", "!pip install scikit-learn --upgrade --user\n", "!pip install alibi --upgrade --user" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.compose import ColumnTransformer\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.impute import SimpleImputer\n", "from sklearn.metrics import accuracy_score\n", "from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder\n", "from alibi.explainers import AnchorTabular\n", "from alibi.datasets import fetch_adult\n", "from alibi.utils.data import Bunch, gen_category_map\n", "from typing import Tuple, Union\n", "import requests\n", "from requests import RequestException\n", "from io import BytesIO, StringIO" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Fetching and preprocessing data" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def fetch_adult(features_drop: list = None, return_X_y: bool = False, url_id: int = 0) -> Union[Bunch, Tuple[np.ndarray, np.ndarray]]:\n", " \"\"\"\n", " Downloads and pre-processes 'adult' dataset.\n", " More info: http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/\n", " Parameters\n", " ----------\n", " features_drop\n", " List of features to be dropped from dataset, by default drops [\"fnlwgt\", \"Education-Num\"]\n", " return_X_y\n", " If true, return features X and labels y as numpy arrays, if False return a 
Bunch object\n", " url_id\n", " Index specifying which URL to use for downloading\n", " Returns\n", " -------\n", " Bunch\n", " Dataset, labels, a list of features and a dictionary containing a list with the potential categories\n", " for each categorical feature where the key refers to the feature column.\n", " (data, target)\n", " Tuple if ``return_X_y`` is true\n", " \"\"\"\n", " ADULT_URLS = ['https://storage.googleapis.com/seldon-datasets/adult/adult.data',\n", " 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',\n", " 'http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.data']\n", " if features_drop is None:\n", " features_drop = [\"fnlwgt\", \"Education-Num\"]\n", "\n", " # download data\n", " dataset_url = ADULT_URLS[url_id]\n", " raw_features = ['Age', 'Workclass', 'fnlwgt', 'Education', 'Education-Num', 'Marital Status',\n", " 'Occupation', 'Relationship', 'Race', 'Sex', 'Capital Gain', 'Capital Loss',\n", " 'Hours per week', 'Country', 'Target']\n", " try:\n", " resp = requests.get(dataset_url)\n", " resp.raise_for_status()\n", " except RequestException:\n", " logger.exception(\"Could not connect, URL may be out of service\")\n", " raise\n", "\n", " raw_data = pd.read_csv(StringIO(resp.text), names=raw_features, delimiter=', ', engine='python').fillna('?')\n", "\n", " # get labels, features and drop unnecessary features\n", " labels = (raw_data['Target'] == '>50K').astype(int).values\n", " features_drop += ['Target']\n", " data = raw_data.drop(features_drop, axis=1)\n", " features = list(data.columns)\n", "\n", " # map categorical features\n", " education_map = {\n", " '10th': 'Dropout', '11th': 'Dropout', '12th': 'Dropout', '1st-4th':\n", " 'Dropout', '5th-6th': 'Dropout', '7th-8th': 'Dropout', '9th':\n", " 'Dropout', 'Preschool': 'Dropout', 'HS-grad': 'High School grad',\n", " 'Some-college': 'High School grad', 'Masters': 'Masters',\n", " 'Prof-school': 'Prof-School', 'Assoc-acdm': 'Associates',\n", " 'Assoc-voc': 'Associates'\n", " }\n", " occupation_map = {\n", " \"Adm-clerical\": \"Admin\", \"Armed-Forces\": \"Military\",\n", " \"Craft-repair\": \"Blue-Collar\", \"Exec-managerial\": \"White-Collar\",\n", " \"Farming-fishing\": \"Blue-Collar\", \"Handlers-cleaners\":\n", " \"Blue-Collar\", \"Machine-op-inspct\": \"Blue-Collar\", \"Other-service\":\n", " \"Service\", \"Priv-house-serv\": \"Service\", \"Prof-specialty\":\n", " \"Professional\", \"Protective-serv\": \"Other\", \"Sales\":\n", " \"Sales\", \"Tech-support\": \"Other\", \"Transport-moving\":\n", " \"Blue-Collar\"\n", " }\n", " country_map = {\n", " 'Cambodia': 'SE-Asia', 'Canada': 'British-Commonwealth', 'China':\n", " 'China', 'Columbia': 'South-America', 'Cuba': 'Other',\n", " 'Dominican-Republic': 'Latin-America', 'Ecuador': 'South-America',\n", " 'El-Salvador': 'South-America', 'England': 'British-Commonwealth',\n", " 'France': 'Euro_1', 'Germany': 'Euro_1', 'Greece': 'Euro_2',\n", " 'Guatemala': 'Latin-America', 'Haiti': 'Latin-America',\n", " 'Holand-Netherlands': 'Euro_1', 'Honduras': 'Latin-America',\n", " 'Hong': 'China', 'Hungary': 'Euro_2', 'India':\n", " 'British-Commonwealth', 'Iran': 'Other', 'Ireland':\n", " 'British-Commonwealth', 'Italy': 'Euro_1', 'Jamaica':\n", " 'Latin-America', 'Japan': 'Other', 'Laos': 'SE-Asia', 'Mexico':\n", " 'Latin-America', 'Nicaragua': 'Latin-America',\n", " 'Outlying-US(Guam-USVI-etc)': 'Latin-America', 'Peru':\n", " 'South-America', 'Philippines': 'SE-Asia', 'Poland': 'Euro_2',\n", " 'Portugal': 'Euro_2', 'Puerto-Rico': 
'Latin-America', 'Scotland':\n", " 'British-Commonwealth', 'South': 'Euro_2', 'Taiwan': 'China',\n", " 'Thailand': 'SE-Asia', 'Trinadad&Tobago': 'Latin-America',\n", " 'United-States': 'United-States', 'Vietnam': 'SE-Asia'\n", " }\n", " married_map = {\n", " 'Never-married': 'Never-Married', 'Married-AF-spouse': 'Married',\n", " 'Married-civ-spouse': 'Married', 'Married-spouse-absent':\n", " 'Separated', 'Separated': 'Separated', 'Divorced':\n", " 'Separated', 'Widowed': 'Widowed'\n", " }\n", " mapping = {'Education': education_map, 'Occupation': occupation_map, 'Country': country_map,\n", " 'Marital Status': married_map}\n", "\n", " data_copy = data.copy()\n", " for f, f_map in mapping.items():\n", " data_tmp = data_copy[f].values\n", " for key, value in f_map.items():\n", " data_tmp[data_tmp == key] = value\n", " data[f] = data_tmp\n", "\n", " # get categorical features and apply labelencoding\n", " categorical_features = [f for f in features if data[f].dtype == 'O']\n", " category_map = {}\n", " for f in categorical_features:\n", " le = LabelEncoder()\n", " data_tmp = le.fit_transform(data[f].values)\n", " data[f] = data_tmp\n", " category_map[features.index(f)] = list(le.classes_)\n", "\n", " # only return data values\n", " data = data.values\n", " target_names = ['<=50K', '>50K']\n", "\n", " if return_X_y:\n", " return data, labels\n", "\n", " return Bunch(data=data, target=labels, feature_names=features, target_names=target_names, category_map=category_map)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Load adult dataset\n", "The fetch_adult function returns a Bunch object containing the features, the targets, the feature names and a mapping of categorical variables to numbers which are required for formatting the output of the Anchor explainer." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "dict_keys(['data', 'target', 'feature_names', 'target_names', 'category_map'])" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "adult = fetch_adult()\n", "adult.keys()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "data = adult.data\n", "target = adult.target\n", "feature_names = adult.feature_names\n", "category_map = adult.category_map" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Define shuffled training and test set" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "np.random.seed(0)\n", "data_perm = np.random.permutation(np.c_[data, target])\n", "data = data_perm[:,:-1]\n", "target = data_perm[:,-1]\n", "idx = 30000\n", "X_train,Y_train = data[:idx,:], target[:idx]\n", "X_test, Y_test = data[idx+1:,:], target[idx+1:]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Create feature transformation pipeline\n", "Create feature pre-processor. Needs to have 'fit' and 'transform' methods. Different types of pre-processing can be applied to all or part of the features. 
In the example below we will standardize ordinal features and apply one-hot-encoding to categorical features.\n", "\n", "Ordinal features:" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "ordinal_features = [x for x in range(len(feature_names)) if x not in list(category_map.keys())]\n", "ordinal_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),\n", " ('scaler', StandardScaler())])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Categorical features:" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "categorical_features = list(category_map.keys())\n", "categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),\n", " ('onehot', OneHotEncoder(handle_unknown='ignore'))])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Combine and fit:" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,\n", " transformer_weights=None,\n", " transformers=[('num',\n", " Pipeline(memory=None,\n", " steps=[('imputer',\n", " SimpleImputer(add_indicator=False,\n", " copy=True,\n", " fill_value=None,\n", " missing_values=nan,\n", " strategy='median',\n", " verbose=0)),\n", " ('scaler',\n", " StandardScaler(copy=True,\n", " with_mean=True,\n", " with_std=True))],\n", " verbose=False),\n", " [0, 8, 9, 10]),\n", " ('cat',\n", " Pipeline(memory=None,\n", " steps=[('imputer',\n", " SimpleImputer(add_indicator=False,\n", " copy=True,\n", " fill_value=None,\n", " missing_values=nan,\n", " strategy='median',\n", " verbose=0)),\n", " ('onehot',\n", " OneHotEncoder(categories='auto',\n", " drop=None,\n", " dtype=,\n", " handle_unknown='ignore',\n", " sparse=True))],\n", " verbose=False),\n", " [1, 2, 3, 4, 5, 6, 7, 11])],\n", " verbose=False)" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "preprocessor = ColumnTransformer(transformers=[('num', ordinal_transformer, ordinal_features),\n", " ('cat', categorical_transformer, categorical_features)])\n", "preprocessor.fit(X_train)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Train Random Forest model\n", "Fit on pre-processed (imputing, OHE, standardizing) data." 
] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n", " criterion='gini', max_depth=None, max_features='auto',\n", " max_leaf_nodes=None, max_samples=None,\n", " min_impurity_decrease=0.0, min_impurity_split=None,\n", " min_samples_leaf=1, min_samples_split=2,\n", " min_weight_fraction_leaf=0.0, n_estimators=50,\n", " n_jobs=None, oob_score=False, random_state=None,\n", " verbose=0, warm_start=False)" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.random.seed(0)\n", "clf = RandomForestClassifier(n_estimators=50)\n", "clf.fit(preprocessor.transform(X_train), Y_train)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Define predict function" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Train accuracy: 0.9655333333333334\n", "Test accuracy: 0.855859375\n" ] } ], "source": [ "predict_fn = lambda x: clf.predict(preprocessor.transform(x))\n", "print('Train accuracy: ', accuracy_score(Y_train, predict_fn(X_train)))\n", "print('Test accuracy: ', accuracy_score(Y_test, predict_fn(X_test)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Initialize and fit anchor explainer for tabular data" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "explainer = AnchorTabular(predict_fn, feature_names, categorical_names=category_map, seed=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Discretize the ordinal features into quartiles" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "AnchorTabular(meta={\n", " 'name': 'AnchorTabular',\n", " 'type': ['blackbox'],\n", " 'explanations': ['local'],\n", " 'params': {'seed': 1, 'disc_perc': [25, 50, 75]}\n", "})" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "explainer.fit(X_train, disc_perc=[25, 50, 75])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Getting an anchor\n", "Below, we get an anchor for the prediction of the first observation in the test set. An anchor is a sufficient condition - that is, when the anchor holds, the prediction should be the same as the prediction for this instance." ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Prediction: <=50K\n" ] } ], "source": [ "idx = 0\n", "class_names = adult.target_names\n", "print('Prediction: ', class_names[explainer.predictor(X_test[idx].reshape(1, -1))[0]])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "We set the precision threshold to 0.95. This means that predictions on observations where the anchor holds will be the same as the prediction on the explained instance at least 95% of the time." 
] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Anchor: Marital Status = Separated AND Sex = Female\n", "Precision: 0.95\n", "Coverage: 0.18\n" ] } ], "source": [ "explanation = explainer.explain(X_test[idx], threshold=0.95)\n", "print('Anchor: %s' % (' AND '.join(explanation.anchor)))\n", "print('Precision: %.2f' % explanation.precision)\n", "print('Coverage: %.2f' % explanation.coverage)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# ...or not?\n", "Let's try getting an anchor for a different observation in the test set - one for the which the prediction is >50K." ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Prediction: >50K\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Could not find an result satisfying the 0.95 precision constraint. Now returning the best non-eligible result.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Anchor: Capital Loss > 0.00 AND Relationship = Husband AND Marital Status = Married AND Age > 37.00 AND Race = White AND Country = United-States AND Sex = Male\n", "Precision: 0.71\n", "Coverage: 0.05\n" ] } ], "source": [ "idx = 6\n", "class_names = adult.target_names\n", "print('Prediction: ', class_names[explainer.predictor(X_test[idx].reshape(1, -1))[0]])\n", "\n", "explanation = explainer.explain(X_test[idx], threshold=0.95)\n", "print('Anchor: %s' % (' AND '.join(explanation.anchor)))\n", "print('Precision: %.2f' % explanation.precision)\n", "print('Coverage: %.2f' % explanation.coverage)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Notice how no anchor is found!\n", "\n", "This is due to the imbalanced dataset (roughly 25:75 high:low earner proportion), so during the sampling stage feature ranges corresponding to low-earners will be oversampled. This is a feature because it can point out an imbalanced dataset, but it can also be fixed by producing balanced datasets to enable anchors to be found for either class." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" } }, "nbformat": 4, "nbformat_minor": 4 }
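The last markdown cell suggests balancing the dataset so that anchors can also be found for the >50K class. The snippet below is a minimal, untested sketch of that idea, not part of the original notebook; it assumes the notebook's variables (X_train, Y_train, X_test, predict_fn, feature_names, category_map) are still in scope. It undersamples the majority class before fitting a second explainer, so the anchor search draws samples from both classes roughly equally.

import numpy as np

np.random.seed(0)
pos_idx = np.where(Y_train == 1)[0]                   # >50K earners (minority class)
neg_idx = np.where(Y_train == 0)[0]                   # <=50K earners (majority class)
neg_sample = np.random.choice(neg_idx, size=len(pos_idx), replace=False)
balanced_idx = np.concatenate([pos_idx, neg_sample])  # roughly 50:50 class mix

explainer_balanced = AnchorTabular(predict_fn, feature_names,
                                   categorical_names=category_map, seed=1)
explainer_balanced.fit(X_train[balanced_idx], disc_perc=[25, 50, 75])

explanation = explainer_balanced.explain(X_test[6], threshold=0.95)
print('Anchor: %s' % (' AND '.join(explanation.anchor)))
print('Precision: %.2f' % explanation.precision)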