Repository: intro-to-ml-with-kubeflow/intro-to-ml-with-kubeflow-examples
Branch: master
Commit: b00b44a88011
Files: 147
Total size: 501.5 KB

Directory structure:
gitextract_v2ir0_h2/

├── .circleci/
│   └── config.yml
├── .gitignore
├── .travis.yaml
├── LICENSE
├── README.md
├── autopep_stuff.sh
├── ch03/
│   ├── example_secret.yaml
│   ├── linux_install.sh
│   ├── mac_install.sh
│   └── minio.sh
├── ch04/
│   ├── code/
│   │   ├── ControlStructures.ipynb
│   │   ├── ControlStructures.py
│   │   ├── Lightweight Pipeline.ipynb
│   │   ├── Lightweight Pipeline.py
│   │   ├── RecommenderPipeline.ipynb
│   │   ├── RecommenderPipeline.py
│   │   └── download_components.sh
│   └── install/
│       ├── deployment.yaml
│       └── virtualservice.yaml
├── ch06/
│   ├── MLflow.ipynb
│   ├── MLflow.py
│   ├── Metadata.ipynb
│   ├── Metadata.py
│   ├── docker/
│   │   ├── Dockerfile
│   │   ├── build.sh
│   │   └── run.sh
│   └── install/
│       └── mlflowchart/
│           ├── .helmignore
│           ├── Chart.yaml
│           ├── templates/
│           │   ├── NOTES.txt
│           │   ├── _helpers.tpl
│           │   └── mlflow.yaml
│           └── values.yaml
├── ch10/
│   ├── experiment.yaml
│   ├── hptuning.py
│   └── random.yaml
├── ch2/
│   ├── Dockerfile
│   ├── build-and-push.sh
│   └── query-endpoint.py
├── ch2_seldon_examples/
│   ├── pipeline_role.yaml
│   ├── pipeline_rolebinding.yaml
│   ├── pv-claim.yaml
│   ├── pv-volume.yaml
│   ├── request_example.ipynb
│   ├── run_example.sh
│   ├── setup_example.sh
│   ├── tf_mnist_no_seldon_pipeline.py
│   ├── tiller_rbac.yaml
│   └── train_pipeline.py
├── ch9/
│   └── ctscans/
│       ├── DICOM Denoising Pipeline.ipynb
│       ├── calculate-basis-vectors/
│       │   ├── Dockerfile
│       │   ├── build-component.sh
│       │   ├── pom.xml
│       │   └── src/
│       │       └── main/
│       │           └── scala/
│       │               └── org/
│       │                   └── rawkintrevo/
│       │                       └── covid/
│       │                           └── App.scala
│       ├── download-dicom/
│       │   ├── Dockerfile
│       │   ├── build-component.sh
│       │   └── run.sh
│       ├── process-dicoms-into-vectors/
│       │   ├── Dockerfile
│       │   ├── build-component.sh
│       │   ├── data/
│       │   │   └── s.150.csv
│       │   ├── process-dicoms-into-vectors.yaml
│       │   └── src/
│       │       └── program.py
│       └── visualize-basis-vectors/
│           ├── Dockerfile
│           ├── build-component.sh
│           └── src/
│               └── program.py
├── ci.sh
├── convert_notebooks.sh
├── data-extraction/
│   ├── README.md
│   ├── github_comments_query.bsql
│   ├── github_issues_query.bsql
│   ├── iot/
│   │   ├── basic.yaml
│   │   └── build.sh
│   ├── python-notebook/
│   │   ├── AddSpamassassinDockerfile
│   │   ├── MailingListDataPrep.ipynb
│   │   ├── MailingListDataPrep.py
│   │   └── RunNBDockerfile
│   ├── python-spark/
│   │   ├── Dockerfile
│   │   ├── LaunchSparkJobs.ipynb
│   │   ├── LaunchSparkJobs.py
│   │   ├── fake_job.py
│   │   └── requirements.txt
│   ├── python-spark-notebook/
│   │   ├── AddGCSDockerfile
│   │   ├── AddPython3.6Dockerfile
│   │   ├── Dockerfile
│   │   ├── SparkMailingListForKF.ipynb
│   │   ├── SparkMailingListForKF.py
│   │   ├── build.sh
│   │   ├── dr.yaml
│   │   ├── no-saprk-tls.yaml
│   │   ├── spark-driver-service.yaml
│   │   └── virt_service.yaml
│   ├── spark-hello-world/
│   │   ├── Dockerfile
│   │   ├── README.md
│   │   ├── hello_world_pipeline.py
│   │   ├── lr_demo/
│   │   │   ├── .gitignore
│   │   │   ├── .travis.yml
│   │   │   ├── README.md
│   │   │   ├── build.sbt
│   │   │   ├── project/
│   │   │   │   ├── build.properties
│   │   │   │   └── plugins.sbt
│   │   │   ├── sample.csv
│   │   │   ├── sbt/
│   │   │   │   └── sbt
│   │   │   └── src/
│   │   │       ├── main/
│   │   │       │   └── scala/
│   │   │       │       └── com/
│   │   │       │           └── introtomlwithkubeflow/
│   │   │       │               └── spark/
│   │   │       │                   └── demo/
│   │   │       │                       └── lr/
│   │   │       │                           ├── TrainingApp.scala
│   │   │       │                           └── TrainingPipeline.scala
│   │   │       └── test/
│   │   │           └── scala/
│   │   │               └── com/
│   │   │                   └── introtomlwithkubeflow/
│   │   │                       └── spark/
│   │   │                           └── demo/
│   │   │                               └── lr/
│   │   │                                   └── TrainingPipelineTest.scala
│   │   ├── setup.sh
│   │   ├── spark-pi-min.yaml
│   │   └── spark-pi.yaml
│   ├── stack_overflow_questions.bsql
│   └── tfx/
│       ├── TFDV.ipynb
│       ├── TFDV.py
│       ├── install_tfx.sh
│       ├── requirements.txt
│       └── run_on_dataflow_ex.py
├── dev-setup/
│   ├── install-argo.sh
│   ├── install-kf-pipeline-sdk.sh
│   ├── install-kf.sh
│   ├── install-kubectl.sh
│   ├── install-kustomize.sh
│   ├── install-microk8s.sh
│   └── jsonnet.sh
├── feature-prep/
│   ├── README.md
│   ├── spark/
│   │   ├── SparkMailingListFeaturePrep.ipynb
│   │   └── SparkMailingListFeaturePrep.py
│   └── tft/
│       ├── requirements.txt
│       └── transform.py
├── gcp-setup/
│   ├── cloudshell_scrip.sh
│   └── setup-gcp.sh
├── kfctl_gcp_iap.v1.0.1.yaml
├── pipelines/
│   ├── ControlStructures.ipynb
│   ├── Lightweight Pipeline.ipynb
│   ├── RecommenderPipeline.ipynb
│   └── download_components.sh
├── recommender/
│   ├── Dockerfile
│   ├── Recommender_Kubeflow.ipynb
│   ├── Recommender_Kubeflow.py
│   ├── docker/
│   │   ├── Dockerfile
│   │   └── build.sh
│   └── tfservingchart/
│       ├── .helmignore
│       ├── Chart.yaml
│       ├── templates/
│       │   ├── NOTES.txt
│       │   ├── _helpers.tpl
│       │   ├── minioaccess.yaml
│       │   ├── tfserving.yaml
│       │   └── tfserving1.yaml
│       └── values.yaml
├── runthrough.sh
└── scikitLearn/
    └── python/
        └── IncomePrediction.ipynb

================================================
FILE CONTENTS
================================================

================================================
FILE: .circleci/config.yml
================================================
version: 2

apt-run:  &apt-install
  name: Install apt packages
  command: |
    sudo apt-get -qq update
    sudo apt-get install -y \
      shellcheck

jobs:
  build:
    working_directory: ~/mermaid-starter
    docker:
      - image: circleci/python:3.6-jessie-node-browsers-legacy
    steps:
      - checkout
      - run: *apt-install
      - run:
        name: Run our basic shell CI
        command: ./ci.sh

================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.idea
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec


# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

# Emacs
*~

# Ignore kfctl's downloaded
kfctl*.t*z

================================================
FILE: .travis.yaml
================================================
language: generic
sudo: true
addons:
  apt:
    packages:
     - shellcheck
script:
  - ./ci.sh

================================================
FILE: LICENSE
================================================
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: README.md
================================================
# intro-to-ml-with-kubeflow-examples
Examples for the Intro to ML with Kubeflow book


================================================
FILE: autopep_stuff.sh
================================================
#!/bin/bash
# autopep8 a bunch of things that we can
autopep8 -i -r ./ \
	 --select E101,E202,E201,E203,E211,E221,E222,E223,E224,E225,E226,E227,\
	 E228,E231,E241,E242,E251,E252,E262,E271,E272,E273,E274,E301,E302,E303,\
	 E304,E305,E306,E501,E502,E711,E712,E713,E714,E721,E722,E731,W291,W293,\
	 W391,W601,W602,W603,W604,W690\
	 -j 0 --exclude "*venv*"
# Then we use YAPF because it does a better job on long-lines
yapf -i -r ./ --exclude "*venv*"


================================================
FILE: ch03/example_secret.yaml
================================================
apiVersion: v1
kind: Secret
metadata:
  name: minioaccess
  namespace: mynamespace
data:
  AWS_ACCESS_KEY_ID: xxxxxxxxxx
  AWS_SECRET_ACCESS_KEY: xxxxxxxxxxxxxxxxxxxxx


================================================
FILE: ch03/linux_install.sh
================================================
#!/bin/bash
#tag::installMCLinux[]
pushd ~/bin
wget https://dl.min.io/client/mc/release/linux-amd64/mc
chmod a+x mc
#end::installMCLinux[]


================================================
FILE: ch03/mac_install.sh
================================================
#!/bin/bash
#tag::installMCMac[]
brew install minio/stable/minio
#end::installMCMac[]


================================================
FILE: ch03/minio.sh
================================================
#!/bin/bash
set -ex

# Minio runs on port 9000 (both UI and service) so expose locally to use cli or UI
#tag::fwdMinio[]
kubectl port-forward -n kubeflow svc/minio-service 9000:9000 &
#end::fwdMinio[]

# Give it a spell to settle
sleep 10

# Kubeflow creates a minio user with password minio123 at install
#tag::configMC[]
mc config host add minio http://localhost:9000 minio minio123
#end::configMC[]

#tag::listMC[]
mc ls minio
#end::listMC[]
# Output [2018-12-13 18:23:41 CST]     0B mlpipeline/

# Make a new bucket for our work
#tag::makeBucket[]
mc mb minio/kf-book-examples
#end::makeBucket[]


================================================
FILE: ch04/code/ControlStructures.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Simple Control structure\n",
    "\n",
    "Shows how to use conditional execution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already up-to-date: kfp in ./.local/lib/python3.6/site-packages (0.2.2.1)\n",
      "Requirement already satisfied, skipping upgrade: jsonschema>=3.0.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (3.2.0)\n",
      "Requirement already satisfied, skipping upgrade: google-cloud-storage>=1.13.0 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.25.0)\n",
      "Requirement already satisfied, skipping upgrade: python-dateutil in /usr/local/lib/python3.6/dist-packages (from kfp) (2.8.1)\n",
      "Requirement already satisfied, skipping upgrade: google-auth>=1.6.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.11.0)\n",
      "Requirement already satisfied, skipping upgrade: argo-models==2.2.1a in ./.local/lib/python3.6/site-packages (from kfp) (2.2.1a0)\n",
      "Requirement already satisfied, skipping upgrade: urllib3<1.25,>=1.15 in ./.local/lib/python3.6/site-packages (from kfp) (1.24.3)\n",
      "Requirement already satisfied, skipping upgrade: cloudpickle==1.1.1 in ./.local/lib/python3.6/site-packages (from kfp) (1.1.1)\n",
      "Requirement already satisfied, skipping upgrade: kubernetes<=10.0.0,>=8.0.0 in ./.local/lib/python3.6/site-packages (from kfp) (10.0.0)\n",
      "Requirement already satisfied, skipping upgrade: click==7.0 in ./.local/lib/python3.6/site-packages (from kfp) (7.0)\n",
      "Requirement already satisfied, skipping upgrade: certifi in /usr/local/lib/python3.6/dist-packages (from kfp) (2019.11.28)\n",
      "Requirement already satisfied, skipping upgrade: PyYAML in /usr/local/lib/python3.6/dist-packages (from kfp) (5.3)\n",
      "Requirement already satisfied, skipping upgrade: kfp-server-api<=0.1.40,>=0.1.18 in ./.local/lib/python3.6/site-packages (from kfp) (0.1.40)\n",
      "Requirement already satisfied, skipping upgrade: six>=1.10 in /usr/lib/python3/dist-packages (from kfp) (1.11.0)\n",
      "Requirement already satisfied, skipping upgrade: PyJWT>=1.6.4 in ./.local/lib/python3.6/site-packages (from kfp) (1.7.1)\n",
      "Requirement already satisfied, skipping upgrade: Deprecated in ./.local/lib/python3.6/site-packages (from kfp) (1.2.7)\n",
      "Requirement already satisfied, skipping upgrade: requests-toolbelt>=0.8.0 in ./.local/lib/python3.6/site-packages (from kfp) (0.9.1)\n",
      "Requirement already satisfied, skipping upgrade: cryptography>=2.4.2 in ./.local/lib/python3.6/site-packages (from kfp) (2.8)\n",
      "Requirement already satisfied, skipping upgrade: tabulate==0.8.3 in ./.local/lib/python3.6/site-packages (from kfp) (0.8.3)\n",
      "Requirement already satisfied, skipping upgrade: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (1.4.0)\n",
      "Requirement already satisfied, skipping upgrade: attrs>=17.4.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (19.3.0)\n",
      "Requirement already satisfied, skipping upgrade: pyrsistent>=0.14.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (0.15.7)\n",
      "Requirement already satisfied, skipping upgrade: setuptools in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (45.1.0)\n",
      "Requirement already satisfied, skipping upgrade: google-resumable-media<0.6dev,>=0.5.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (0.5.0)\n",
      "Requirement already satisfied, skipping upgrade: google-cloud-core<2.0dev,>=1.2.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (1.3.0)\n",
      "Requirement already satisfied, skipping upgrade: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0.0)\n",
      "Requirement already satisfied, skipping upgrade: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (0.2.8)\n",
      "Requirement already satisfied, skipping upgrade: rsa<4.1,>=3.1.4 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0)\n",
      "Requirement already satisfied, skipping upgrade: requests-oauthlib in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (1.3.0)\n",
      "Requirement already satisfied, skipping upgrade: requests in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (2.22.0)\n",
      "Requirement already satisfied, skipping upgrade: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (0.57.0)\n",
      "Requirement already satisfied, skipping upgrade: wrapt<2,>=1.10 in /usr/local/lib/python3.6/dist-packages (from Deprecated->kfp) (1.11.2)\n",
      "Requirement already satisfied, skipping upgrade: cffi!=1.11.3,>=1.8 in ./.local/lib/python3.6/site-packages (from cryptography>=2.4.2->kfp) (1.14.0)\n",
      "Requirement already satisfied, skipping upgrade: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata; python_version < \"3.8\"->jsonschema>=3.0.1->kfp) (2.1.0)\n",
      "Requirement already satisfied, skipping upgrade: google-api-core<2.0.0dev,>=1.16.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.16.0)\n",
      "Requirement already satisfied, skipping upgrade: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.6/dist-packages (from pyasn1-modules>=0.2.1->google-auth>=1.6.1->kfp) (0.4.8)\n",
      "Requirement already satisfied, skipping upgrade: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib->kubernetes<=10.0.0,>=8.0.0->kfp) (3.1.0)\n",
      "Requirement already satisfied, skipping upgrade: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->kubernetes<=10.0.0,>=8.0.0->kfp) (3.0.4)\n",
      "Requirement already satisfied, skipping upgrade: idna<2.9,>=2.5 in /usr/lib/python3/dist-packages (from requests->kubernetes<=10.0.0,>=8.0.0->kfp) (2.6)\n",
      "Requirement already satisfied, skipping upgrade: pycparser in ./.local/lib/python3.6/site-packages (from cffi!=1.11.3,>=1.8->cryptography>=2.4.2->kfp) (2.19)\n",
      "Requirement already satisfied, skipping upgrade: protobuf>=3.4.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (3.11.2)\n",
      "Requirement already satisfied, skipping upgrade: pytz in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (2019.3)\n",
      "Requirement already satisfied, skipping upgrade: googleapis-common-protos<2.0dev,>=1.6.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.51.0)\n"
     ]
    }
   ],
   "source": [
    "!pip install kfp --upgrade --user\n",
    "\n",
    "import kfp\n",
    "from kfp import dsl\n",
    "from kfp.components import func_to_container_op, InputPath, OutputPath"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "@func_to_container_op\n",
    "def get_random_int_op(minimum: int, maximum: int) -> int:\n",
    "    \"\"\"Generate a random number between minimum and maximum (inclusive).\"\"\"\n",
    "    import random\n",
    "    result = random.randint(minimum, maximum)\n",
    "    print(result)\n",
    "    return result\n",
    "\n",
    "@func_to_container_op\n",
    "def process_small_op(data: int):\n",
    "    \"\"\"Process small numbers.\"\"\"\n",
    "    print(\"Processing small result\", data)\n",
    "    return\n",
    "\n",
    "@func_to_container_op\n",
    "def process_medium_op(data: int):\n",
    "    \"\"\"Process medium numbers.\"\"\"\n",
    "    print(\"Processing medium result\", data)\n",
    "    return\n",
    "\n",
    "@func_to_container_op\n",
    "def process_large_op(data: int):\n",
    "    \"\"\"Process large numbers.\"\"\"\n",
    "    print(\"Processing large result\", data)\n",
    "    return"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Conditional pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "@dsl.pipeline(\n",
    "    name='Conditional execution pipeline',\n",
    "    description='Shows how to use dsl.Condition().'\n",
    ")\n",
    "def conditional_pipeline():\n",
    "    number = get_random_int_op(0, 100).output\n",
    "    with dsl.Condition(number < 10):\n",
    "        process_small_op(number)\n",
    "    with dsl.Condition(number > 10 and number < 50):\n",
    "        process_medium_op(number)\n",
    "    with dsl.Condition(number > 50):\n",
    "        process_large_op(number)\n",
    "        "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Submit the pipeline for execution:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "Experiment link <a href=\"/pipeline/#/experiments/details/2abe16d1-fa2e-4f49-a3a5-acad8d36790d\" target=\"_blank\" >here</a>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "Run link <a href=\"/pipeline/#/runs/details/293a92c5-50b2-4a96-bbd4-ebc85106f337\" target=\"_blank\" >here</a>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "RunPipelineResult(run_id=293a92c5-50b2-4a96-bbd4-ebc85106f337)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "kfp.Client().create_run_from_pipeline_func(conditional_pipeline, arguments={})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}


================================================
FILE: ch04/code/ControlStructures.py
================================================
#!/usr/bin/env python
# coding: utf-8

# # Simple Control structure
#
# Shows how to use conditional execution

# In[1]:

get_ipython().system('pip install kfp --upgrade --user')

import kfp
from kfp import dsl
from kfp.components import func_to_container_op, InputPath, OutputPath

# # Functions

# In[2]:


@func_to_container_op
def get_random_int_op(minimum: int, maximum: int) -> int:
    """Generate a random number between minimum and maximum (inclusive)."""
    import random
    result = random.randint(minimum, maximum)
    print(result)
    return result


@func_to_container_op
def process_small_op(data: int):
    """Process small numbers."""
    print("Processing small result", data)
    return


@func_to_container_op
def process_medium_op(data: int):
    """Process medium numbers."""
    print("Processing medium result", data)
    return


@func_to_container_op
def process_large_op(data: int):
    """Process large numbers."""
    print("Processing large result", data)
    return


# # Conditional pipeline

# In[3]:


@dsl.pipeline(name='Conditional execution pipeline',
              description='Shows how to use dsl.Condition().')
def conditional_pipeline():
    number = get_random_int_op(0, 100).output
    with dsl.Condition(number < 10):
        process_small_op(number)
    with dsl.Condition(number > 10 and number < 50):
        process_medium_op(number)
    with dsl.Condition(number > 50):
        process_large_op(number)


# # Submit the pipeline for execution:

# In[4]:

kfp.Client().create_run_from_pipeline_func(conditional_pipeline, arguments={})

# In[ ]:


================================================
FILE: ch04/code/Lightweight Pipeline.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already up-to-date: kfp in ./.local/lib/python3.6/site-packages (0.2.2.1)\n",
      "Requirement already satisfied, skipping upgrade: PyYAML in /usr/local/lib/python3.6/dist-packages (from kfp) (5.3)\n",
      "Requirement already satisfied, skipping upgrade: requests-toolbelt>=0.8.0 in ./.local/lib/python3.6/site-packages (from kfp) (0.9.1)\n",
      "Requirement already satisfied, skipping upgrade: click==7.0 in ./.local/lib/python3.6/site-packages (from kfp) (7.0)\n",
      "Requirement already satisfied, skipping upgrade: Deprecated in ./.local/lib/python3.6/site-packages (from kfp) (1.2.7)\n",
      "Requirement already satisfied, skipping upgrade: kubernetes<=10.0.0,>=8.0.0 in ./.local/lib/python3.6/site-packages (from kfp) (10.0.0)\n",
      "Requirement already satisfied, skipping upgrade: six>=1.10 in /usr/lib/python3/dist-packages (from kfp) (1.11.0)\n",
      "Requirement already satisfied, skipping upgrade: certifi in /usr/local/lib/python3.6/dist-packages (from kfp) (2019.11.28)\n",
      "Requirement already satisfied, skipping upgrade: google-cloud-storage>=1.13.0 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.25.0)\n",
      "Requirement already satisfied, skipping upgrade: PyJWT>=1.6.4 in ./.local/lib/python3.6/site-packages (from kfp) (1.7.1)\n",
      "Requirement already satisfied, skipping upgrade: cryptography>=2.4.2 in ./.local/lib/python3.6/site-packages (from kfp) (2.8)\n",
      "Requirement already satisfied, skipping upgrade: google-auth>=1.6.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.11.0)\n",
      "Requirement already satisfied, skipping upgrade: cloudpickle==1.1.1 in ./.local/lib/python3.6/site-packages (from kfp) (1.1.1)\n",
      "Requirement already satisfied, skipping upgrade: kfp-server-api<=0.1.40,>=0.1.18 in ./.local/lib/python3.6/site-packages (from kfp) (0.1.40)\n",
      "Requirement already satisfied, skipping upgrade: argo-models==2.2.1a in ./.local/lib/python3.6/site-packages (from kfp) (2.2.1a0)\n",
      "Requirement already satisfied, skipping upgrade: tabulate==0.8.3 in ./.local/lib/python3.6/site-packages (from kfp) (0.8.3)\n",
      "Requirement already satisfied, skipping upgrade: python-dateutil in /usr/local/lib/python3.6/dist-packages (from kfp) (2.8.1)\n",
      "Requirement already satisfied, skipping upgrade: urllib3<1.25,>=1.15 in ./.local/lib/python3.6/site-packages (from kfp) (1.24.3)\n",
      "Requirement already satisfied, skipping upgrade: jsonschema>=3.0.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (3.2.0)\n",
      "Requirement already satisfied, skipping upgrade: requests<3.0.0,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from requests-toolbelt>=0.8.0->kfp) (2.22.0)\n",
      "Requirement already satisfied, skipping upgrade: wrapt<2,>=1.10 in /usr/local/lib/python3.6/dist-packages (from Deprecated->kfp) (1.11.2)\n",
      "Requirement already satisfied, skipping upgrade: setuptools>=21.0.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (45.1.0)\n",
      "Requirement already satisfied, skipping upgrade: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (0.57.0)\n",
      "Requirement already satisfied, skipping upgrade: requests-oauthlib in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (1.3.0)\n",
      "Requirement already satisfied, skipping upgrade: google-cloud-core<2.0dev,>=1.2.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (1.3.0)\n",
      "Requirement already satisfied, skipping upgrade: google-resumable-media<0.6dev,>=0.5.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (0.5.0)\n",
      "Requirement already satisfied, skipping upgrade: cffi!=1.11.3,>=1.8 in ./.local/lib/python3.6/site-packages (from cryptography>=2.4.2->kfp) (1.14.0)\n",
      "Requirement already satisfied, skipping upgrade: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0.0)\n",
      "Requirement already satisfied, skipping upgrade: rsa<4.1,>=3.1.4 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0)\n",
      "Requirement already satisfied, skipping upgrade: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (0.2.8)\n",
      "Requirement already satisfied, skipping upgrade: attrs>=17.4.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (19.3.0)\n",
      "Requirement already satisfied, skipping upgrade: pyrsistent>=0.14.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (0.15.7)\n",
      "Requirement already satisfied, skipping upgrade: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (1.4.0)\n",
      "Requirement already satisfied, skipping upgrade: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.0.1->requests-toolbelt>=0.8.0->kfp) (3.0.4)\n",
      "Requirement already satisfied, skipping upgrade: idna<2.9,>=2.5 in /usr/lib/python3/dist-packages (from requests<3.0.0,>=2.0.1->requests-toolbelt>=0.8.0->kfp) (2.6)\n",
      "Requirement already satisfied, skipping upgrade: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib->kubernetes<=10.0.0,>=8.0.0->kfp) (3.1.0)\n",
      "Requirement already satisfied, skipping upgrade: google-api-core<2.0.0dev,>=1.16.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.16.0)\n",
      "Requirement already satisfied, skipping upgrade: pycparser in ./.local/lib/python3.6/site-packages (from cffi!=1.11.3,>=1.8->cryptography>=2.4.2->kfp) (2.19)\n",
      "Requirement already satisfied, skipping upgrade: pyasn1>=0.1.3 in /usr/local/lib/python3.6/dist-packages (from rsa<4.1,>=3.1.4->google-auth>=1.6.1->kfp) (0.4.8)\n",
      "Requirement already satisfied, skipping upgrade: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata; python_version < \"3.8\"->jsonschema>=3.0.1->kfp) (2.1.0)\n",
      "Requirement already satisfied, skipping upgrade: googleapis-common-protos<2.0dev,>=1.6.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.51.0)\n",
      "Requirement already satisfied, skipping upgrade: pytz in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (2019.3)\n",
      "Requirement already satisfied, skipping upgrade: protobuf>=3.4.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (3.11.2)\n"
     ]
    }
   ],
   "source": [
    "!pip install kfp --upgrade --user\n",
    "\n",
    "import kfp \n",
    "from kfp import compiler\n",
    "import kfp.dsl as dsl\n",
    "import kfp.notebook\n",
    "import kfp.components as comp\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Simple function that just add two numbers:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Define a Python function\n",
    "def add(a: float, b: float) -> float:\n",
    "   '''Calculates sum of two arguments'''\n",
    "   return a + b"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Convert the function to a pipeline operation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "add_op = comp.func_to_container_op(add)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A bit more advanced function which demonstrates how to use imports, helper functions and produce multiple outputs."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from typing import NamedTuple\n",
    "def my_divmod(dividend: float, divisor:float) -> NamedTuple('MyDivmodOutput', [('quotient', float), ('remainder', float)]):\n",
    "    '''Divides two numbers and calculate  the quotient and remainder'''\n",
    "    #Imports inside a component function:\n",
    "    import numpy as np\n",
    "\n",
    "    #This function demonstrates how to use nested functions inside a component function:\n",
    "    def divmod_helper(dividend, divisor):\n",
    "        return np.divmod(dividend, divisor)\n",
    "\n",
    "    (quotient, remainder) = divmod_helper(dividend, divisor)\n",
    "\n",
    "    from collections import namedtuple\n",
    "    divmod_output = namedtuple('MyDivmodOutput', ['quotient', 'remainder'])\n",
    "    return divmod_output(quotient, remainder)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Test running the python function directly"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "MyDivmodOutput(quotient=14, remainder=2)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "my_divmod(100, 7)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Convert the function to a pipeline operation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "divmod_op = comp.func_to_container_op(my_divmod, base_image='tensorflow/tensorflow:1.14.0-py3')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Define the pipeline\n",
    "Pipeline function has to be decorated with the @dsl.pipeline decorator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "@dsl.pipeline(\n",
    "   name='Calculation pipeline',\n",
    "   description='A toy pipeline that performs arithmetic calculations.'\n",
    ")\n",
    "def calc_pipeline(\n",
    "   a='a',\n",
    "   b='7',\n",
    "   c='17',\n",
    "):\n",
    "    #Passing pipeline parameter and a constant value as operation arguments\n",
    "    add_task = add_op(a, 4) #Returns a dsl.ContainerOp class instance. \n",
    "    \n",
    "    #Passing a task output reference as operation arguments\n",
    "    #For an operation with a single return value, the output reference can be accessed using `task.output` or `task.outputs['output_name']` syntax\n",
    "    divmod_task = divmod_op(add_task.output, b)\n",
    "\n",
    "    #For an operation with a multiple return values, the output references can be accessed using `task.outputs['output_name']` syntax\n",
    "    result_task = add_op(divmod_task.outputs['quotient'], c)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Submit the pipeline for execution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "Experiment link <a href=\"/pipeline/#/experiments/details/2abe16d1-fa2e-4f49-a3a5-acad8d36790d\" target=\"_blank\" >here</a>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "Run link <a href=\"/pipeline/#/runs/details/87276776-0c3a-4d4e-99d0-4563b7f42fa5\" target=\"_blank\" >here</a>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "RunPipelineResult(run_id=87276776-0c3a-4d4e-99d0-4563b7f42fa5)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "client = kfp.Client()\n",
    "\n",
    "#Specify pipeline argument values\n",
    "arguments = {'a': '7', 'b': '8'}\n",
    "\n",
    "#Submit a pipeline run\n",
    "client.create_run_from_pipeline_func(calc_pipeline, arguments=arguments)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}


================================================
FILE: ch04/code/Lightweight Pipeline.py
================================================
#!/usr/bin/env python
# coding: utf-8

# # Setup

# In[1]:

get_ipython().system('pip install kfp --upgrade --user')

import kfp
from kfp import compiler
import kfp.dsl as dsl
import kfp.notebook
import kfp.components as comp

# Simple function that just add two numbers:

# In[2]:


#Define a Python function
def add(a: float, b: float) -> float:
    '''Calculates sum of two arguments'''
    return a + b


# Convert the function to a pipeline operation

# In[3]:

add_op = comp.func_to_container_op(add)

# A bit more advanced function which demonstrates how to use imports, helper functions and produce multiple outputs.

# In[4]:

from typing import NamedTuple


def my_divmod(
    dividend: float, divisor: float
) -> NamedTuple('MyDivmodOutput', [('quotient', float), ('remainder', float)]):
    '''Divides two numbers and calculate  the quotient and remainder'''
    #Imports inside a component function:
    import numpy as np

    #This function demonstrates how to use nested functions inside a component function:
    def divmod_helper(dividend, divisor):
        return np.divmod(dividend, divisor)

    (quotient, remainder) = divmod_helper(dividend, divisor)

    from collections import namedtuple
    divmod_output = namedtuple('MyDivmodOutput', ['quotient', 'remainder'])
    return divmod_output(quotient, remainder)


# Test running the python function directly

# In[5]:

my_divmod(100, 7)

# Convert the function to a pipeline operation

# In[6]:

divmod_op = comp.func_to_container_op(
    my_divmod, base_image='tensorflow/tensorflow:1.14.0-py3')

# Define the pipeline
# Pipeline function has to be decorated with the @dsl.pipeline decorator

# In[7]:


@dsl.pipeline(
    name='Calculation pipeline',
    description='A toy pipeline that performs arithmetic calculations.')
def calc_pipeline(
    a='a',
    b='7',
    c='17',
):
    #Passing pipeline parameter and a constant value as operation arguments
    add_task = add_op(a, 4)  # Returns a dsl.ContainerOp class instance.

    #Passing a task output reference as operation arguments
    #For an operation with a single return value, the output reference can be accessed using `task.output` or `task.outputs['output_name']` syntax
    divmod_task = divmod_op(add_task.output, b)

    #For an operation with a multiple return values, the output references can be accessed using `task.outputs['output_name']` syntax
    result_task = add_op(divmod_task.outputs['quotient'], c)


# Submit the pipeline for execution

# In[8]:

client = kfp.Client()

#Specify pipeline argument values
arguments = {'a': '7', 'b': '8'}

#Submit a pipeline run
client.create_run_from_pipeline_func(calc_pipeline, arguments=arguments)

# In[ ]:


================================================
FILE: ch04/code/RecommenderPipeline.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Kubeflow pipeline\n",
    "This is a fairly simple pipeline, containing sequential steps:\n",
    "\n",
    "1. Update data - implemented by lightbend/recommender-data-update-publisher:0.2 image\n",
    "2. Run model training. Ideally we would run TFJob, but due to the current limitations for pipelines, we will directly use an image implementing training lightbend/ml-tf-recommender:0.1\n",
    "3. Update serving model - implemented by lightbend/recommender-model-publisher:0.2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already up-to-date: kubernetes in ./.local/lib/python3.6/site-packages (10.0.1)\n",
      "Requirement already satisfied, skipping upgrade: pyyaml>=3.12 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (5.3)\n",
      "Requirement already satisfied, skipping upgrade: six>=1.9.0 in /usr/lib/python3/dist-packages (from kubernetes) (1.11.0)\n",
      "Requirement already satisfied, skipping upgrade: urllib3>=1.24.2 in ./.local/lib/python3.6/site-packages (from kubernetes) (1.24.3)\n",
      "Requirement already satisfied, skipping upgrade: requests-oauthlib in /usr/local/lib/python3.6/dist-packages (from kubernetes) (1.3.0)\n",
      "Requirement already satisfied, skipping upgrade: certifi>=14.05.14 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (2019.11.28)\n",
      "Requirement already satisfied, skipping upgrade: python-dateutil>=2.5.3 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (2.8.1)\n",
      "Requirement already satisfied, skipping upgrade: setuptools>=21.0.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (45.1.0)\n",
      "Requirement already satisfied, skipping upgrade: requests in /usr/local/lib/python3.6/dist-packages (from kubernetes) (2.22.0)\n",
      "Requirement already satisfied, skipping upgrade: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (0.57.0)\n",
      "Requirement already satisfied, skipping upgrade: google-auth>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (1.11.0)\n",
      "Requirement already satisfied, skipping upgrade: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib->kubernetes) (3.1.0)\n",
      "Requirement already satisfied, skipping upgrade: idna<2.9,>=2.5 in /usr/lib/python3/dist-packages (from requests->kubernetes) (2.6)\n",
      "Requirement already satisfied, skipping upgrade: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->kubernetes) (3.0.4)\n",
      "Requirement already satisfied, skipping upgrade: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.0.1->kubernetes) (0.2.8)\n",
      "Requirement already satisfied, skipping upgrade: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.0.1->kubernetes) (4.0.0)\n",
      "Requirement already satisfied, skipping upgrade: rsa<4.1,>=3.1.4 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.0.1->kubernetes) (4.0)\n",
      "Requirement already satisfied, skipping upgrade: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.6/dist-packages (from pyasn1-modules>=0.2.1->google-auth>=1.0.1->kubernetes) (0.4.8)\n",
      "Requirement already up-to-date: kfp in ./.local/lib/python3.6/site-packages (0.2.2.1)\n",
      "Requirement already satisfied, skipping upgrade: PyJWT>=1.6.4 in ./.local/lib/python3.6/site-packages (from kfp) (1.7.1)\n",
      "Requirement already satisfied, skipping upgrade: requests-toolbelt>=0.8.0 in ./.local/lib/python3.6/site-packages (from kfp) (0.9.1)\n",
      "Requirement already satisfied, skipping upgrade: python-dateutil in /usr/local/lib/python3.6/dist-packages (from kfp) (2.8.1)\n",
      "Requirement already satisfied, skipping upgrade: PyYAML in /usr/local/lib/python3.6/dist-packages (from kfp) (5.3)\n",
      "Requirement already satisfied, skipping upgrade: kfp-server-api<=0.1.40,>=0.1.18 in ./.local/lib/python3.6/site-packages (from kfp) (0.1.40)\n",
      "Requirement already satisfied, skipping upgrade: google-cloud-storage>=1.13.0 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.25.0)\n",
      "Requirement already satisfied, skipping upgrade: jsonschema>=3.0.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (3.2.0)\n",
      "Requirement already satisfied, skipping upgrade: Deprecated in ./.local/lib/python3.6/site-packages (from kfp) (1.2.7)\n",
      "Requirement already satisfied, skipping upgrade: google-auth>=1.6.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.11.0)\n",
      "Collecting kubernetes<=10.0.0,>=8.0.0\n",
      "  Using cached kubernetes-10.0.0-py2.py3-none-any.whl (1.5 MB)\n",
      "Requirement already satisfied, skipping upgrade: argo-models==2.2.1a in ./.local/lib/python3.6/site-packages (from kfp) (2.2.1a0)\n",
      "Requirement already satisfied, skipping upgrade: urllib3<1.25,>=1.15 in ./.local/lib/python3.6/site-packages (from kfp) (1.24.3)\n",
      "Requirement already satisfied, skipping upgrade: certifi in /usr/local/lib/python3.6/dist-packages (from kfp) (2019.11.28)\n",
      "Requirement already satisfied, skipping upgrade: tabulate==0.8.3 in ./.local/lib/python3.6/site-packages (from kfp) (0.8.3)\n",
      "Requirement already satisfied, skipping upgrade: click==7.0 in ./.local/lib/python3.6/site-packages (from kfp) (7.0)\n",
      "Requirement already satisfied, skipping upgrade: cloudpickle==1.1.1 in ./.local/lib/python3.6/site-packages (from kfp) (1.1.1)\n",
      "Requirement already satisfied, skipping upgrade: six>=1.10 in /usr/lib/python3/dist-packages (from kfp) (1.11.0)\n",
      "Requirement already satisfied, skipping upgrade: cryptography>=2.4.2 in ./.local/lib/python3.6/site-packages (from kfp) (2.8)\n",
      "Requirement already satisfied, skipping upgrade: requests<3.0.0,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from requests-toolbelt>=0.8.0->kfp) (2.22.0)\n",
      "Requirement already satisfied, skipping upgrade: google-resumable-media<0.6dev,>=0.5.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (0.5.0)\n",
      "Requirement already satisfied, skipping upgrade: google-cloud-core<2.0dev,>=1.2.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (1.3.0)\n",
      "Requirement already satisfied, skipping upgrade: setuptools in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (45.1.0)\n",
      "Requirement already satisfied, skipping upgrade: attrs>=17.4.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (19.3.0)\n",
      "Requirement already satisfied, skipping upgrade: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (1.4.0)\n",
      "Requirement already satisfied, skipping upgrade: pyrsistent>=0.14.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (0.15.7)\n",
      "Requirement already satisfied, skipping upgrade: wrapt<2,>=1.10 in /usr/local/lib/python3.6/dist-packages (from Deprecated->kfp) (1.11.2)\n",
      "Requirement already satisfied, skipping upgrade: rsa<4.1,>=3.1.4 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0)\n",
      "Requirement already satisfied, skipping upgrade: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0.0)\n",
      "Requirement already satisfied, skipping upgrade: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (0.2.8)\n",
      "Requirement already satisfied, skipping upgrade: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (0.57.0)\n",
      "Requirement already satisfied, skipping upgrade: requests-oauthlib in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (1.3.0)\n",
      "Requirement already satisfied, skipping upgrade: cffi!=1.11.3,>=1.8 in ./.local/lib/python3.6/site-packages (from cryptography>=2.4.2->kfp) (1.14.0)\n",
      "Requirement already satisfied, skipping upgrade: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.0.1->requests-toolbelt>=0.8.0->kfp) (3.0.4)\n",
      "Requirement already satisfied, skipping upgrade: idna<2.9,>=2.5 in /usr/lib/python3/dist-packages (from requests<3.0.0,>=2.0.1->requests-toolbelt>=0.8.0->kfp) (2.6)\n",
      "Requirement already satisfied, skipping upgrade: google-api-core<2.0.0dev,>=1.16.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.16.0)\n",
      "Requirement already satisfied, skipping upgrade: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata; python_version < \"3.8\"->jsonschema>=3.0.1->kfp) (2.1.0)\n",
      "Requirement already satisfied, skipping upgrade: pyasn1>=0.1.3 in /usr/local/lib/python3.6/dist-packages (from rsa<4.1,>=3.1.4->google-auth>=1.6.1->kfp) (0.4.8)\n",
      "Requirement already satisfied, skipping upgrade: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib->kubernetes<=10.0.0,>=8.0.0->kfp) (3.1.0)\n",
      "Requirement already satisfied, skipping upgrade: pycparser in ./.local/lib/python3.6/site-packages (from cffi!=1.11.3,>=1.8->cryptography>=2.4.2->kfp) (2.19)\n",
      "Requirement already satisfied, skipping upgrade: googleapis-common-protos<2.0dev,>=1.6.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.51.0)\n",
      "Requirement already satisfied, skipping upgrade: protobuf>=3.4.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (3.11.2)\n",
      "Requirement already satisfied, skipping upgrade: pytz in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (2019.3)\n",
      "Installing collected packages: kubernetes\n",
      "  Attempting uninstall: kubernetes\n",
      "    Found existing installation: kubernetes 10.0.1\n",
      "    Uninstalling kubernetes-10.0.1:\n",
      "      Successfully uninstalled kubernetes-10.0.1\n",
      "Successfully installed kubernetes-10.0.0\n"
     ]
    }
   ],
   "source": [
    "!pip install kubernetes --upgrade --user\n",
    "!pip install kfp --upgrade --user\n",
    "\n",
    "\n",
    "import kfp  # the Pipelines SDK.  This library is included with the notebook image.\n",
    "from kfp import compiler\n",
    "import kfp.dsl as dsl\n",
    "import kfp.notebook\n",
    "from kubernetes import client as k8s_client"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Create/Get an Experiment in the Kubeflow Pipeline System\n",
    "The Kubeflow Pipeline system requires an \"Experiment\" to group pipeline runs. You can create a new experiment, or call client.list_experiments() to get existing ones."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "client = kfp.Client()\n",
    "client.list_experiments()\n",
    "#exp = client.create_experiment(name='mdupdate')\n",
    "exp = client.get_experiment(experiment_name ='mdupdate')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Define a Pipeline\n",
    "Authoring a pipeline is like authoring a normal Python function. The pipeline function describes the topology of the pipeline.\n",
    "\n",
    "Each step in the pipeline is typically a ContainerOp --- a simple class or function describing how to interact with a docker container image. In the pipeline, all the container images referenced in the pipeline are already built."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "@dsl.pipeline(\n",
    "  name='Recommender model update',\n",
    "  description='Demonstrate usage of pipelines for multi-step model update'\n",
    ")\n",
    "def recommender_pipeline():\n",
    "    # Load new data\n",
    "  data = dsl.ContainerOp(\n",
    "      name='updatedata',\n",
    "      image='lightbend/recommender-data-update-publisher:0.2') \\\n",
    "    .add_env_variable(k8s_client.V1EnvVar(name='MINIO_URL',value='http://minio-service.kubeflow.svc.cluster.local:9000')) \\\n",
    "    .add_env_variable(k8s_client.V1EnvVar(name='MINIO_KEY', value='minio')) \\\n",
    "    .add_env_variable(k8s_client.V1EnvVar(name='MINIO_SECRET', value='minio123'))\n",
    "    # Train the model\n",
    "  train = dsl.ContainerOp(\n",
    "      name='trainmodel',\n",
    "      image='lightbend/ml-tf-recommender:0.1') \\\n",
    "    .add_env_variable(k8s_client.V1EnvVar(name='MINIO_URL',value='minio-service.kubeflow.svc.cluster.local:9000')) \\\n",
    "    .add_env_variable(k8s_client.V1EnvVar(name='MINIO_KEY', value='minio')) \\\n",
    "    .add_env_variable(k8s_client.V1EnvVar(name='MINIO_SECRET', value='minio123'))\n",
    "  train.after(data)\n",
    "    # Publish new model model\n",
    "  publish = dsl.ContainerOp(\n",
    "      name='publishmodel',\n",
    "      image='lightbend/recommender-model-publisher:0.2') \\\n",
    "    .add_env_variable(k8s_client.V1EnvVar(name='MINIO_URL',value='http://minio-service.kubeflow.svc.cluster.local:9000')) \\\n",
    "    .add_env_variable(k8s_client.V1EnvVar(name='MINIO_KEY', value='minio')) \\\n",
    "    .add_env_variable(k8s_client.V1EnvVar(name='MINIO_SECRET', value='minio123')) \\\n",
    "    .add_env_variable(k8s_client.V1EnvVar(name='KAFKA_BROKERS', value='cloudflow-kafka-brokers.cloudflow.svc.cluster.local:9092')) \\\n",
    "    .add_env_variable(k8s_client.V1EnvVar(name='DEFAULT_RECOMMENDER_URL', value='http://recommendermodelserver.kubeflow.svc.cluster.local:8501')) \\\n",
    "    .add_env_variable(k8s_client.V1EnvVar(name='ALTERNATIVE_RECOMMENDER_URL', value='http://recommendermodelserver1.kubeflow.svc.cluster.local:8501'))\n",
    "  publish.after(train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Compile pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "compiler.Compiler().compile(recommender_pipeline, 'pipeline.tar.gz')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Submit an experiment run"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "Run link <a href=\"/pipeline/#/runs/details/df24284c-c7a1-480e-91b6-398bd352f164\" target=\"_blank\" >here</a>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "run = client.run_pipeline(exp.id, 'pipeline1', 'pipeline.tar.gz')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}


================================================
FILE: ch04/code/RecommenderPipeline.py
================================================
#!/usr/bin/env python
# coding: utf-8

# # Kubeflow pipeline
# This is a fairly simple pipeline, containing sequential steps:
#
# 1. Update data - implemented by lightbend/recommender-data-update-publisher:0.2 image
# 2. Run model training. Ideally we would run TFJob, but due to the current limitations for pipelines, we will directly use an image implementing training lightbend/ml-tf-recommender:0.1
# 3. Update serving model - implemented by lightbend/recommender-model-publisher:0.2

# # Setup

# In[1]:

get_ipython().system('pip install kubernetes --upgrade --user')
get_ipython().system('pip install kfp --upgrade --user')

# the Pipelines SDK.  This library is included with the notebook image.
import kfp
from kfp import compiler
import kfp.dsl as dsl
import kfp.notebook
from kubernetes import client as k8s_client

# # Create/Get an Experiment in the Kubeflow Pipeline System
# The Kubeflow Pipeline system requires an "Experiment" to group pipeline runs. You can create a new experiment, or call client.list_experiments() to get existing ones.

# In[3]:

client = kfp.Client()
client.list_experiments()
#exp = client.create_experiment(name='mdupdate')
exp = client.get_experiment(experiment_name='mdupdate')

# # Define a Pipeline
# Authoring a pipeline is like authoring a normal Python function. The pipeline function describes the topology of the pipeline.
#
# Each step in the pipeline is typically a ContainerOp --- a simple class or function describing how to interact with a docker container image. In the pipeline, all the container images referenced in the pipeline are already built.

# In[4]:


@dsl.pipeline(
    name='Recommender model update',
    description='Demonstrate usage of pipelines for multi-step model update')
def recommender_pipeline():
    # Load new data
    data = dsl.ContainerOp(
        name='updatedata',
        image='lightbend/recommender-data-update-publisher:0.2') \
      .add_env_variable(k8s_client.V1EnvVar(name='MINIO_URL', value='http://minio-service.kubeflow.svc.cluster.local:9000')) \
      .add_env_variable(k8s_client.V1EnvVar(name='MINIO_KEY', value='minio')) \
      .add_env_variable(k8s_client.V1EnvVar(name='MINIO_SECRET', value='minio123'))
    # Train the model
    train = dsl.ContainerOp(
        name='trainmodel',
        image='lightbend/ml-tf-recommender:0.1') \
      .add_env_variable(k8s_client.V1EnvVar(name='MINIO_URL', value='minio-service.kubeflow.svc.cluster.local:9000')) \
      .add_env_variable(k8s_client.V1EnvVar(name='MINIO_KEY', value='minio')) \
      .add_env_variable(k8s_client.V1EnvVar(name='MINIO_SECRET', value='minio123'))
    train.after(data)
    # Publish new model model
    publish = dsl.ContainerOp(
        name='publishmodel',
        image='lightbend/recommender-model-publisher:0.2') \
      .add_env_variable(k8s_client.V1EnvVar(name='MINIO_URL', value='http://minio-service.kubeflow.svc.cluster.local:9000')) \
      .add_env_variable(k8s_client.V1EnvVar(name='MINIO_KEY', value='minio')) \
      .add_env_variable(k8s_client.V1EnvVar(name='MINIO_SECRET', value='minio123')) \
      .add_env_variable(k8s_client.V1EnvVar(name='KAFKA_BROKERS', value='cloudflow-kafka-brokers.cloudflow.svc.cluster.local:9092')) \
      .add_env_variable(k8s_client.V1EnvVar(name='DEFAULT_RECOMMENDER_URL', value='http://recommendermodelserver.kubeflow.svc.cluster.local:8501')) \
      .add_env_variable(k8s_client.V1EnvVar(name='ALTERNATIVE_RECOMMENDER_URL', value='http://recommendermodelserver1.kubeflow.svc.cluster.local:8501'))
    publish.after(train)


# # Compile pipeline

# In[5]:

compiler.Compiler().compile(recommender_pipeline, 'pipeline.tar.gz')

# # Submit an experiment run

# In[6]:

run = client.run_pipeline(exp.id, 'pipeline1', 'pipeline.tar.gz')

# In[ ]:


================================================
FILE: ch04/code/download_components.sh
================================================
#!/bin/bash
#tag::dlPipelineRelease[]
wget https://github.com/kubeflow/pipelines/archive/0.2.5.tar.gz
tar -xvf 0.2.5.tar.gz
#end::dlPipelineRelease[]


================================================
FILE: ch04/install/deployment.yaml
================================================
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  labels:
    app: argo-ui
    app.kubernetes.io/component: argo
    app.kubernetes.io/instance: argo-v2.3.0
    app.kubernetes.io/managed-by: kfctl
    app.kubernetes.io/name: argo
    app.kubernetes.io/part-of: kubeflow
    app.kubernetes.io/version: v2.3.0
    kustomize.component: argo
  name: argo-ui
  namespace: kubeflow
spec:
  progressDeadlineSeconds: 600
  replicas: 1
  revisionHistoryLimit: 10
  selector:
    matchLabels:
      app: argo-ui
      app.kubernetes.io/component: argo
      app.kubernetes.io/instance: argo-v2.3.0
      app.kubernetes.io/managed-by: kfctl
      app.kubernetes.io/name: argo
      app.kubernetes.io/part-of: kubeflow
      app.kubernetes.io/version: v2.3.0
      kustomize.component: argo
  strategy:
    rollingUpdate:
      maxSurge: 25%
      maxUnavailable: 25%
    type: RollingUpdate
  template:
    metadata:
      annotations:
        sidecar.istio.io/inject: "false"
      creationTimestamp: null
      labels:
        app: argo-ui
        app.kubernetes.io/component: argo
        app.kubernetes.io/instance: argo-v2.3.0
        app.kubernetes.io/managed-by: kfctl
        app.kubernetes.io/name: argo
        app.kubernetes.io/part-of: kubeflow
        app.kubernetes.io/version: v2.3.0
        kustomize.component: argo
    spec:
      containers:
        - env:
            - name: ARGO_NAMESPACE
              valueFrom:
                fieldRef:
                  apiVersion: v1
                  fieldPath: metadata.namespace
            - name: IN_CLUSTER
              value: "true"
            - name: ENABLE_WEB_CONSOLE
              value: "true"
            - name: BASE_HREF
              value: /
          image: argoproj/argoui:v2.3.0
          imagePullPolicy: IfNotPresent
          name: argo-ui
          ports:
            - containerPort: 8001
              name: ui
              protocol: TCP
          readinessProbe:
            failureThreshold: 3
            httpGet:
              path: /
              port: 8001
              scheme: HTTP
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 1
          resources: {}
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: File
      dnsPolicy: ClusterFirst
      restartPolicy: Always
      schedulerName: default-scheduler
      securityContext: {}
      serviceAccount: argo-ui
      serviceAccountName: argo-ui
      terminationGracePeriodSeconds: 30

================================================
FILE: ch04/install/virtualservice.yaml
================================================
apiVersion: networking.istio.io/v1alpha3
kind: VirtualService
metadata:
  name: argo-ui
  namespace: kubeflow
spec:
  gateways:
    - kubeflow-gateway
  hosts:
    - '*'
  http:
    - match:
        - uri:
            prefix: /argo/
      rewrite:
        uri: /
      route:
        - destination:
            host: argo-ui.kubeflow.svc.cluster.local
            port:
              number: 80

================================================
FILE: ch06/MLflow.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# mlflow-energyforecast\n",
    "\n",
    "This is a showcase for ML Flow capabilities, based on the article\n",
    "http://the-odd-dataguy.com/be-more-efficient-to-produce-ml-models-with-mlflow\n",
    "and a github https://github.com/jeanmidevacc/mlflow-energyforecast\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting pandas\n",
      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/12/d1/a6502c2f5c15b50f5dd579fc1c52b47edf6f2e9f682aed917dd7565b3e60/pandas-1.0.0-cp36-cp36m-manylinux1_x86_64.whl (10.1MB)\n",
      "\u001b[K     |████████████████████████████████| 10.1MB 3.2MB/s eta 0:00:01\n",
      "\u001b[?25hRequirement already satisfied, skipping upgrade: numpy>=1.13.3 in ./.local/lib/python3.6/site-packages (from pandas) (1.18.1)\n",
      "Requirement already satisfied, skipping upgrade: python-dateutil>=2.6.1 in /usr/local/lib/python3.6/dist-packages (from pandas) (2.8.0)\n",
      "Requirement already satisfied, skipping upgrade: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas) (2019.2)\n",
      "Requirement already satisfied, skipping upgrade: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.6.1->pandas) (1.11.0)\n",
      "Installing collected packages: pandas\n",
      "  Found existing installation: pandas 0.25.3\n",
      "    Uninstalling pandas-0.25.3:\n",
      "      Successfully uninstalled pandas-0.25.3\n",
      "Successfully installed pandas-1.0.0\n",
      "\u001b[33mWARNING: You are using pip version 19.1.1, however version 20.0.2 is available.\n",
      "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n",
      "Collecting mlflow\n",
      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/65/33/5fe1559f7eb95e1fa2077df747ada7fd225045bad4e76bcdb53605e4b937/mlflow-1.6.0.tar.gz (15.9MB)\n",
      "\u001b[K     |████████████████████████████████| 15.9MB 3.0MB/s eta 0:00:01\n",
      "\u001b[?25hRequirement already satisfied, skipping upgrade: alembic in ./.local/lib/python3.6/site-packages (from mlflow) (1.3.2)\n",
      "Requirement already satisfied, skipping upgrade: click>=7.0 in /usr/local/lib/python3.6/dist-packages (from mlflow) (7.0)\n",
      "Requirement already satisfied, skipping upgrade: cloudpickle in ./.local/lib/python3.6/site-packages (from mlflow) (1.1.1)\n",
      "Requirement already satisfied, skipping upgrade: databricks-cli>=0.8.7 in ./.local/lib/python3.6/site-packages (from mlflow) (0.9.1)\n",
      "Requirement already satisfied, skipping upgrade: requests>=2.17.3 in /usr/local/lib/python3.6/dist-packages (from mlflow) (2.22.0)\n",
      "Requirement already satisfied, skipping upgrade: six>=1.10.0 in /usr/lib/python3/dist-packages (from mlflow) (1.11.0)\n",
      "Requirement already satisfied, skipping upgrade: Flask in ./.local/lib/python3.6/site-packages (from mlflow) (1.1.1)\n",
      "Requirement already satisfied, skipping upgrade: numpy in ./.local/lib/python3.6/site-packages (from mlflow) (1.18.1)\n",
      "Requirement already satisfied, skipping upgrade: pandas in ./.local/lib/python3.6/site-packages (from mlflow) (1.0.0)\n",
      "Requirement already satisfied, skipping upgrade: python-dateutil in /usr/local/lib/python3.6/dist-packages (from mlflow) (2.8.0)\n",
      "Requirement already satisfied, skipping upgrade: protobuf>=3.6.0 in /usr/local/lib/python3.6/dist-packages (from mlflow) (3.8.0)\n",
      "Requirement already satisfied, skipping upgrade: gitpython>=2.1.0 in ./.local/lib/python3.6/site-packages (from mlflow) (3.0.5)\n",
      "Requirement already satisfied, skipping upgrade: pyyaml in /usr/local/lib/python3.6/dist-packages (from mlflow) (5.1.2)\n",
      "Requirement already satisfied, skipping upgrade: querystring_parser in ./.local/lib/python3.6/site-packages (from mlflow) (1.2.4)\n",
      "Requirement already satisfied, skipping upgrade: simplejson in ./.local/lib/python3.6/site-packages (from mlflow) (3.17.0)\n",
      "Requirement already satisfied, skipping upgrade: docker>=4.0.0 in /usr/local/lib/python3.6/dist-packages (from mlflow) (4.0.2)\n",
      "Requirement already satisfied, skipping upgrade: entrypoints in /usr/local/lib/python3.6/dist-packages (from mlflow) (0.3)\n",
      "Requirement already satisfied, skipping upgrade: sqlparse in ./.local/lib/python3.6/site-packages (from mlflow) (0.3.0)\n",
      "Requirement already satisfied, skipping upgrade: sqlalchemy in ./.local/lib/python3.6/site-packages (from mlflow) (1.3.12)\n",
      "Requirement already satisfied, skipping upgrade: gorilla in ./.local/lib/python3.6/site-packages (from mlflow) (0.3.0)\n",
      "Requirement already satisfied, skipping upgrade: prometheus-flask-exporter in ./.local/lib/python3.6/site-packages (from mlflow) (0.12.1)\n",
      "Requirement already satisfied, skipping upgrade: gunicorn in ./.local/lib/python3.6/site-packages (from mlflow) (20.0.4)\n",
      "Requirement already satisfied, skipping upgrade: Mako in ./.local/lib/python3.6/site-packages (from alembic->mlflow) (1.1.0)\n",
      "Requirement already satisfied, skipping upgrade: python-editor>=0.3 in ./.local/lib/python3.6/site-packages (from alembic->mlflow) (1.0.4)\n",
      "Requirement already satisfied, skipping upgrade: configparser>=0.3.5 in ./.local/lib/python3.6/site-packages (from databricks-cli>=0.8.7->mlflow) (4.0.2)\n",
      "Requirement already satisfied, skipping upgrade: tabulate>=0.7.7 in /usr/local/lib/python3.6/dist-packages (from databricks-cli>=0.8.7->mlflow) (0.8.3)\n",
      "Requirement already satisfied, skipping upgrade: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests>=2.17.3->mlflow) (3.0.4)\n",
      "Requirement already satisfied, skipping upgrade: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests>=2.17.3->mlflow) (2019.9.11)\n",
      "Requirement already satisfied, skipping upgrade: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests>=2.17.3->mlflow) (1.24.3)\n",
      "Requirement already satisfied, skipping upgrade: idna<2.9,>=2.5 in /usr/lib/python3/dist-packages (from requests>=2.17.3->mlflow) (2.6)\n",
      "Requirement already satisfied, skipping upgrade: Jinja2>=2.10.1 in /usr/local/lib/python3.6/dist-packages (from Flask->mlflow) (2.10.1)\n",
      "Requirement already satisfied, skipping upgrade: itsdangerous>=0.24 in ./.local/lib/python3.6/site-packages (from Flask->mlflow) (1.1.0)\n",
      "Requirement already satisfied, skipping upgrade: Werkzeug>=0.15 in /usr/local/lib/python3.6/dist-packages (from Flask->mlflow) (0.15.4)\n",
      "Requirement already satisfied, skipping upgrade: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas->mlflow) (2019.2)\n",
      "Requirement already satisfied, skipping upgrade: setuptools in /usr/local/lib/python3.6/dist-packages (from protobuf>=3.6.0->mlflow) (41.0.1)\n",
      "Requirement already satisfied, skipping upgrade: gitdb2>=2.0.0 in ./.local/lib/python3.6/site-packages (from gitpython>=2.1.0->mlflow) (2.0.6)\n",
      "Requirement already satisfied, skipping upgrade: websocket-client>=0.32.0 in /usr/local/lib/python3.6/dist-packages (from docker>=4.0.0->mlflow) (0.56.0)\n",
      "Requirement already satisfied, skipping upgrade: prometheus-client in /usr/local/lib/python3.6/dist-packages (from prometheus-flask-exporter->mlflow) (0.7.1)\n",
      "Requirement already satisfied, skipping upgrade: MarkupSafe>=0.9.2 in /usr/local/lib/python3.6/dist-packages (from Mako->alembic->mlflow) (1.1.1)\n",
      "Requirement already satisfied, skipping upgrade: smmap2>=2.0.0 in ./.local/lib/python3.6/site-packages (from gitdb2>=2.0.0->gitpython>=2.1.0->mlflow) (2.0.5)\n",
      "Building wheels for collected packages: mlflow\n",
      "  Building wheel for mlflow (setup.py) ... \u001b[?25ldone\n",
      "\u001b[?25h  Stored in directory: /home/jovyan/.cache/pip/wheels/46/4e/83/e58b14b6d2d494783e31690de9572c5777882f675f480374b6\n",
      "Successfully built mlflow\n",
      "Installing collected packages: mlflow\n",
      "  Found existing installation: mlflow 1.5.0\n",
      "    Uninstalling mlflow-1.5.0:\n",
      "      Successfully uninstalled mlflow-1.5.0\n",
      "\u001b[33m  WARNING: The script mlflow is installed in '/home/jovyan/.local/bin' which is not on PATH.\n",
      "  Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\n",
      "Successfully installed mlflow-1.6.0\n",
      "\u001b[33mWARNING: You are using pip version 19.1.1, however version 20.0.2 is available.\n",
      "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n",
      "Requirement already up-to-date: joblib in ./.local/lib/python3.6/site-packages (0.14.1)\n",
      "\u001b[33mWARNING: You are using pip version 19.1.1, however version 20.0.2 is available.\n",
      "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n",
      "Requirement already up-to-date: numpy in ./.local/lib/python3.6/site-packages (1.18.1)\n",
      "\u001b[33mWARNING: You are using pip version 19.1.1, however version 20.0.2 is available.\n",
      "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n",
      "Requirement already up-to-date: scipy in ./.local/lib/python3.6/site-packages (1.4.1)\n",
      "Requirement already satisfied, skipping upgrade: numpy>=1.13.3 in ./.local/lib/python3.6/site-packages (from scipy) (1.18.1)\n",
      "\u001b[33mWARNING: You are using pip version 19.1.1, however version 20.0.2 is available.\n",
      "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n",
      "Requirement already up-to-date: scikit-learn in ./.local/lib/python3.6/site-packages (0.22.1)\n",
      "Requirement already satisfied, skipping upgrade: numpy>=1.11.0 in ./.local/lib/python3.6/site-packages (from scikit-learn) (1.18.1)\n",
      "Requirement already satisfied, skipping upgrade: scipy>=0.17.0 in ./.local/lib/python3.6/site-packages (from scikit-learn) (1.4.1)\n",
      "Requirement already satisfied, skipping upgrade: joblib>=0.11 in ./.local/lib/python3.6/site-packages (from scikit-learn) (0.14.1)\n",
      "\u001b[33mWARNING: You are using pip version 19.1.1, however version 20.0.2 is available.\n",
      "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n",
      "Collecting boto3\n",
      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/d5/57/e9675a5a8d0ee586594ff19cb9a601334fbf24fa2fb29052d2a900ee5d23/boto3-1.11.9-py2.py3-none-any.whl (128kB)\n",
      "\u001b[K     |████████████████████████████████| 133kB 3.5MB/s eta 0:00:01\n",
      "\u001b[?25hCollecting botocore<1.15.0,>=1.14.9 (from boto3)\n",
      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/64/4c/b0b0d3b6f84a05f9135051b56d3eb8708012a289c4b82ee21c8c766f47b5/botocore-1.14.9-py2.py3-none-any.whl (5.9MB)\n",
      "\u001b[K     |████████████████████████████████| 5.9MB 11.6MB/s eta 0:00:01\n",
      "\u001b[?25hRequirement already satisfied, skipping upgrade: jmespath<1.0.0,>=0.7.1 in ./.local/lib/python3.6/site-packages (from boto3) (0.9.4)\n",
      "Requirement already satisfied, skipping upgrade: s3transfer<0.4.0,>=0.3.0 in ./.local/lib/python3.6/site-packages (from boto3) (0.3.0)\n",
      "Requirement already satisfied, skipping upgrade: python-dateutil<3.0.0,>=2.1 in /usr/local/lib/python3.6/dist-packages (from botocore<1.15.0,>=1.14.9->boto3) (2.8.0)\n",
      "Requirement already satisfied, skipping upgrade: docutils<0.16,>=0.10 in ./.local/lib/python3.6/site-packages (from botocore<1.15.0,>=1.14.9->boto3) (0.15.2)\n",
      "Requirement already satisfied, skipping upgrade: urllib3<1.26,>=1.20 in /usr/local/lib/python3.6/dist-packages (from botocore<1.15.0,>=1.14.9->boto3) (1.24.3)\n",
      "Requirement already satisfied, skipping upgrade: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.15.0,>=1.14.9->boto3) (1.11.0)\n",
      "Installing collected packages: botocore, boto3\n",
      "  Found existing installation: botocore 1.14.4\n",
      "    Uninstalling botocore-1.14.4:\n",
      "      Successfully uninstalled botocore-1.14.4\n",
      "  Found existing installation: boto3 1.11.4\n",
      "    Uninstalling boto3-1.11.4:\n",
      "      Successfully uninstalled boto3-1.11.4\n",
      "Successfully installed boto3-1.11.9 botocore-1.14.9\n",
      "\u001b[33mWARNING: You are using pip version 19.1.1, however version 20.0.2 is available.\n",
      "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "!pip install pandas --upgrade --user\n",
    "!pip install mlflow --upgrade --user\n",
    "!pip install joblib --upgrade --user\n",
    "!pip install numpy --upgrade --user \n",
    "!pip install scipy --upgrade --user \n",
    "!pip install scikit-learn --upgrade --user\n",
    "!pip install boto3 --upgrade --user"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "import json\n",
    "import os\n",
    "from joblib import Parallel, delayed\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import scipy\n",
    "\n",
    "from sklearn.model_selection import train_test_split, KFold\n",
    "from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score\n",
    "from sklearn.exceptions import ConvergenceWarning\n",
    "\n",
    "import mlflow\n",
    "import mlflow.sklearn\n",
    "from  mlflow.tracking import MlflowClient\n",
    "\n",
    "from warnings import simplefilter\n",
    "simplefilter(action='ignore', category = FutureWarning)\n",
    "simplefilter(action='ignore', category = ConvergenceWarning)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ensure Minio access\n",
    "os.environ['MLFLOW_S3_ENDPOINT_URL'] = 'http://minio-service.kubeflow.svc.cluster.local:9000'\n",
    "os.environ['AWS_ACCESS_KEY_ID'] = 'minio'\n",
    "os.environ['AWS_SECRET_ACCESS_KEY'] = 'minio123'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data preparation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect the data \n",
    "df_nationalconsumption_electricity_daily = pd.read_csv(\"https://raw.githubusercontent.com/jeanmidevacc/mlflow-energyforecast/master/data/rtu_data.csv\")\n",
    "df_nationalconsumption_electricity_daily.set_index([\"day\"], inplace = True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Size of the training set :  1081\n",
      "Size of the testing set :  233\n"
     ]
    }
   ],
   "source": [
    "# Prepare the training set and the testing set\n",
    "df_trainvalidate_energyconsumption = df_nationalconsumption_electricity_daily[df_nationalconsumption_electricity_daily[\"datastatus\"] == \"Définitif\"]\n",
    "del df_trainvalidate_energyconsumption[\"datastatus\"]\n",
    "\n",
    "df_test_energyconsumption = df_nationalconsumption_electricity_daily[df_nationalconsumption_electricity_daily[\"datastatus\"] == \"Consolidé\"]\n",
    "del df_test_energyconsumption[\"datastatus\"]\n",
    "\n",
    "print(\"Size of the training set : \",len(df_trainvalidate_energyconsumption))\n",
    "print(\"Size of the testing set : \",len(df_test_energyconsumption))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Output to predict :  dailyconsumption\n",
      "Inputs for the prediction :  ['weekday', 'week', 'month', 'year', 'avg_min_temperature', 'avg_max_temperature', 'avg_mean_temperature', 'wavg_min_temperature', 'wavg_max_temperature', 'wavg_mean_temperature', 'is_holiday']\n"
     ]
    }
   ],
   "source": [
    "# Define the inputs and the output\n",
    "output = \"dailyconsumption\"\n",
    "allinputs = list(df_trainvalidate_energyconsumption.columns)\n",
    "allinputs.remove(output)\n",
    "\n",
    "print(\"Output to predict : \", output)\n",
    "print(\"Inputs for the prediction : \", allinputs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build different set of featurws for the model\n",
    "possible_inputs = {\n",
    "    \"all\" : allinputs,\n",
    "    \"only_allday_inputs\" : [\"weekday\", \"month\", \"is_holiday\", \"week\"],\n",
    "    \"only_allweatheravg_inputs\" : [\"avg_min_temperature\", \"avg_max_temperature\", \"avg_mean_temperature\",\"wavg_min_temperature\", \"wavg_max_temperature\", \"wavg_mean_temperature\"],\n",
    "    \"only_meanweather_inputs_avg\" : [\"avg_mean_temperature\"],\n",
    "    \"only_meanweather_inputs_wavg\" : [\"wavg_mean_temperature\"],\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prepare the output of the model\n",
    "array_output_train = np.array(df_trainvalidate_energyconsumption[output])\n",
    "array_output_test = np.array(df_test_energyconsumption[output])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# connect to remote server\n",
    "remote_server_uri = \"http://mlflowserver.kubeflow.svc.cluster.local:5000\"\n",
    "mlflow.set_tracking_uri(remote_server_uri)\n",
    "# Launch the experiment on mlflow\n",
    "experiment_name = \"electricityconsumption-forecast\"\n",
    "mlflow.set_experiment(experiment_name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define the evaluation function that will do the computation of the different metrics of accuracy (RMSE,MAE,R2)\n",
    "def evaluation_model(y_test, y_pred):\n",
    "\n",
    "    rmse = np.sqrt(mean_squared_error(y_test, y_pred))\n",
    "    mae = mean_absolute_error(y_test, y_pred)\n",
    "    r2 = r2_score(y_test, y_pred)\n",
    "\n",
    "    metrics = {\n",
    "        \"rmse\" : rmse,\n",
    "        \"r2\" : r2,\n",
    "        \"mae\" : mae,\n",
    "    }\n",
    "    \n",
    "    return metrics"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# KNN regressor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.neighbors import KNeighborsRegressor\n",
    "\n",
    "def train_knnmodel(parameters, inputs, tags, log = False):\n",
    "    with mlflow.start_run(nested = True):\n",
    "        \n",
    "        # Prepare the data\n",
    "        array_inputs_train = np.array(df_trainvalidate_energyconsumption[inputs])\n",
    "        array_inputs_test = np.array(df_test_energyconsumption[inputs])\n",
    "        \n",
    "        \n",
    "        # Build the model\n",
    "        tic = time.time()\n",
    "        model = KNeighborsRegressor(parameters[\"nbr_neighbors\"], weights = parameters[\"weight_method\"])\n",
    "        model.fit(array_inputs_train, array_output_train)\n",
    "        duration_training = time.time() - tic\n",
    "\n",
    "        # Make the prediction\n",
    "        tic1 = time.time()\n",
    "        prediction = model.predict(array_inputs_test)\n",
    "        duration_prediction = time.time() - tic1\n",
    "\n",
    "        # Evaluate the model prediction\n",
    "        metrics = evaluation_model(array_output_test, prediction)\n",
    "\n",
    "        # Log in the console\n",
    "        if log:\n",
    "            print(f\"KNN regressor:\")\n",
    "            print(parameters)\n",
    "            print(metrics)\n",
    "\n",
    "        # Log in mlflow (parameter)\n",
    "        mlflow.log_params(parameters)\n",
    "\n",
    "        # Log in mlflow (metrics)\n",
    "        metrics[\"duration_training\"] = duration_training\n",
    "        metrics[\"duration_prediction\"] = duration_prediction\n",
    "        mlflow.log_metrics(metrics)\n",
    "\n",
    "        # log in mlflow (model)\n",
    "        mlflow.sklearn.log_model(model, f\"model\")\n",
    "                \n",
    "        # Tag the model\n",
    "        mlflow.set_tags(tags)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Test the different combinations\n",
    "configurations = []\n",
    "for nbr_neighbors in [1,2,5,10]:\n",
    "    for weight_method in ['uniform','distance']:\n",
    "        for field in possible_inputs:\n",
    "            parameters = {\n",
    "                \"nbr_neighbors\" : nbr_neighbors,\n",
    "                \"weight_method\" : weight_method\n",
    "            }\n",
    "\n",
    "            tags = {\n",
    "                \"model\" : \"knn\",\n",
    "                \"inputs\" : field\n",
    "            }\n",
    "            \n",
    "            configurations.append([parameters, tags])\n",
    "\n",
    "            train_knnmodel(parameters, possible_inputs[field], tags)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# MLP regressor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.neural_network import MLPRegressor\n",
    "\n",
    "def train_mlpmodel(parameters, inputs, tags, log = False):\n",
    "    with mlflow.start_run(nested = True):\n",
    "        \n",
    "        # Prepare the data\n",
    "        array_inputs_train = np.array(df_trainvalidate_energyconsumption[inputs])\n",
    "        array_inputs_test = np.array(df_test_energyconsumption[inputs])\n",
    "        \n",
    "        # Build the model\n",
    "        tic = time.time()\n",
    "\n",
    "        model = MLPRegressor(\n",
    "            hidden_layer_sizes = parameters[\"hidden_layers\"],\n",
    "            activation = parameters[\"activation\"],\n",
    "            solver = parameters[\"solver\"],\n",
    "            max_iter = parameters[\"nbr_iteration\"],\n",
    "            random_state = 0)\n",
    "        \n",
    "        model.fit(array_inputs_train, array_output_train)\n",
    "        duration_training = time.time() - tic\n",
    "\n",
    "        # Make the prediction\n",
    "        tic1 = time.time()\n",
    "        prediction = model.predict(array_inputs_test)\n",
    "        duration_prediction = time.time() - tic1\n",
    "\n",
    "        # Evaluate the model prediction\n",
    "        metrics = evaluation_model(array_output_test, prediction)\n",
    "\n",
    "        # Log in the console\n",
    "        if log:\n",
    "            print(f\"Random forest regressor:\")\n",
    "            print(parameters)\n",
    "            print(metrics)\n",
    "    \n",
    "        # Log in mlflow (parameter)\n",
    "        mlflow.log_params(parameters)\n",
    "\n",
    "        # Log in mlflow (metrics)\n",
    "        metrics[\"duration_training\"] = duration_training\n",
    "        metrics[\"duration_prediction\"] = duration_prediction\n",
    "        mlflow.log_metrics(metrics)\n",
    "\n",
    "        # log in mlflow (model)\n",
    "        mlflow.sklearn.log_model(model, f\"model\")\n",
    "        \n",
    "        # Tag the model\n",
    "        mlflow.set_tags(tags)        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "for hiddenlayers in [4,8,16]:\n",
    "    for activation in [\"identity\",\"logistic\",]:\n",
    "        for solver in [\"lbfgs\"]:\n",
    "            for nbriteration in [10,100,1000]:\n",
    "                for field in possible_inputs:\n",
    "                    parameters = {\n",
    "                        \"hidden_layers\" : hiddenlayers,\n",
    "                        \"activation\" : activation,\n",
    "                        \"solver\" : solver,\n",
    "                        \"nbr_iteration\" : nbriteration\n",
    "                    }\n",
    "\n",
    "                    tags = {\n",
    "                        \"model\" : \"mlp\",\n",
    "                        \"inputs\" : field\n",
    "                    }\n",
    "\n",
    "                    train_mlpmodel(parameters, possible_inputs[field], tags)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Use a handmade model (scipy approach)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "class PTG:\n",
    "    def __init__(self, thresholds_x0, thresholds_a, thresholds_b):\n",
    "        self.thresholds_x0 = thresholds_x0\n",
    "        self.thresholds_a = thresholds_a\n",
    "        self.thresholds_b = thresholds_b\n",
    "        \n",
    "    def get_ptgmodel(self, x, a, b, x0):\n",
    "        return np.piecewise(x, [x < x0, x >= x0], [lambda x: a*x + b , lambda x : a*x0 + b])\n",
    "        \n",
    "    def fit(self, dfx, y):\n",
    "        x = np.array(dfx)\n",
    "        \n",
    "        # Define the bounds\n",
    "        bounds_min = [thresholds_a[0], thresholds_b[0], thresholds_x0[0]]\n",
    "        bounds_max = [thresholds_a[1], thresholds_b[1], thresholds_x0[1]]\n",
    "        bounds = (bounds_min, bounds_max)\n",
    "\n",
    "        # Fit a model\n",
    "        popt, pcov = scipy.optimize.curve_fit(self.get_ptgmodel, x, y, bounds = bounds)\n",
    "\n",
    "        # Get the parameter of the model\n",
    "        a = popt[0]\n",
    "        b = popt[1]\n",
    "        x0 = popt[2]\n",
    "        \n",
    "        self.coefficients = [a, b, x0]\n",
    "        \n",
    "    def predict(self,dfx):\n",
    "        x = np.array(dfx)\n",
    "        predictions = []\n",
    "        for elt in x:\n",
    "            forecast = self.get_ptgmodel(elt, self.coefficients[0], self.coefficients[1], self.coefficients[2])\n",
    "            predictions.append(forecast)\n",
    "        return np.array(predictions)\n",
    "        \n",
    "def train_ptgmodel(parameters, inputs, tags, log = False):\n",
    "    with mlflow.start_run(nested = True):\n",
    "        \n",
    "        # Prepare the data\n",
    "        df_inputs_train = df_trainvalidate_energyconsumption[inputs[0]]\n",
    "        df_inputs_test = df_test_energyconsumption[inputs[0]]\n",
    "        \n",
    "        \n",
    "        # Build the model\n",
    "        tic = time.time()\n",
    "        \n",
    "        model = PTG(parameters[\"thresholds_x0\"], parameters[\"thresholds_a\"], parameters[\"thresholds_b\"])\n",
    "        \n",
    "        model.fit(df_inputs_train, array_output_train)\n",
    "        duration_training = time.time() - tic\n",
    "\n",
    "        # Make the prediction\n",
    "        tic1 = time.time()\n",
    "        prediction = model.predict(df_inputs_test)\n",
    "        duration_prediction = time.time() - tic1\n",
    "\n",
    "        # Evaluate the model prediction\n",
    "        metrics = evaluation_model(array_output_test, prediction)\n",
    "\n",
    "        # Log in the console\n",
    "        if log:\n",
    "            print(f\"PTG:\")\n",
    "            print(parameters)\n",
    "            print(metrics)\n",
    "    \n",
    "        # Log in mlflow (parameter)\n",
    "        mlflow.log_params(parameters)  \n",
    "\n",
    "        # Log in mlflow (metrics)\n",
    "        metrics[\"duration_training\"] = duration_training\n",
    "        metrics[\"duration_prediction\"] = duration_prediction\n",
    "        mlflow.log_metrics(metrics)\n",
    "\n",
    "        # log in mlflow (model)\n",
    "        mlflow.sklearn.log_model(model, f\"model\")\n",
    "        \n",
    "        # Tag the model\n",
    "        mlflow.set_tags(tags)           "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Define the parameters of the model\n",
    "thresholds_x0 = [0, 20]\n",
    "thresholds_a = [-200000, -50000]\n",
    "thresholds_b = [1000000, 3000000]\n",
    "\n",
    "parameters = {\n",
    "    \"thresholds_x0\" : thresholds_x0,\n",
    "    \"thresholds_a\" : thresholds_a,\n",
    "    \"thresholds_b\" : thresholds_b\n",
    "}\n",
    "\n",
    "for field in [\"only_meanweather_inputs_avg\", \"only_meanweather_inputs_wavg\"]:\n",
    "    \n",
    "    tags = {\n",
    "        \"model\" : \"ptg\",\n",
    "        \"inputs\" : field\n",
    "    }\n",
    "    \n",
    "    train_ptgmodel(parameters, possible_inputs[field], tags, log = False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Evaluate mlflow results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of runs done :  272\n"
     ]
    }
   ],
   "source": [
    "# Select the run of the experiment\n",
    "df_runs = mlflow.search_runs(experiment_ids=\"0\")\n",
    "print(\"Number of runs done : \", len(df_runs))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>run_id</th>\n",
       "      <th>experiment_id</th>\n",
       "      <th>status</th>\n",
       "      <th>artifact_uri</th>\n",
       "      <th>start_time</th>\n",
       "      <th>end_time</th>\n",
       "      <th>metrics.r2</th>\n",
       "      <th>metrics.mae</th>\n",
       "      <th>metrics.duration_prediction</th>\n",
       "      <th>metrics.rmse</th>\n",
       "      <th>...</th>\n",
       "      <th>params.activation</th>\n",
       "      <th>params.nbr_iteration</th>\n",
       "      <th>params.hidden_layers</th>\n",
       "      <th>params.nbr_neighbors</th>\n",
       "      <th>params.weight_method</th>\n",
       "      <th>tags.model</th>\n",
       "      <th>tags.mlflow.source.type</th>\n",
       "      <th>tags.inputs</th>\n",
       "      <th>tags.mlflow.user</th>\n",
       "      <th>tags.mlflow.source.name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>238</th>\n",
       "      <td>50ee6409ad3a4778bb9d8cb59034df5d</td>\n",
       "      <td>0</td>\n",
       "      <td>FINISHED</td>\n",
       "      <td>s3://mlflow/mlflow/artifacts/0/50ee6409ad3a477...</td>\n",
       "      <td>2020-01-17 18:17:47.448000+00:00</td>\n",
       "      <td>2020-01-17 18:17:47.929000+00:00</td>\n",
       "      <td>0.935956</td>\n",
       "      <td>104040.339809</td>\n",
       "      <td>0.003205</td>\n",
       "      <td>134649.399348</td>\n",
       "      <td>...</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>5</td>\n",
       "      <td>distance</td>\n",
       "      <td>knn</td>\n",
       "      <td>LOCAL</td>\n",
       "      <td>all</td>\n",
       "      <td>jovyan</td>\n",
       "      <td>/usr/local/lib/python3.6/dist-packages/ipykern...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>106</th>\n",
       "      <td>614bcf7042ca465c8d86296f12ac9c09</td>\n",
       "      <td>0</td>\n",
       "      <td>FINISHED</td>\n",
       "      <td>s3://mlflow/mlflow/artifacts/0/614bcf7042ca465...</td>\n",
       "      <td>2020-01-31 15:21:29.978000+00:00</td>\n",
       "      <td>2020-01-31 15:21:30.503000+00:00</td>\n",
       "      <td>0.935956</td>\n",
       "      <td>104040.339809</td>\n",
       "      <td>0.003404</td>\n",
       "      <td>134649.399348</td>\n",
       "      <td>...</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>5</td>\n",
       "      <td>distance</td>\n",
       "      <td>knn</td>\n",
       "      <td>LOCAL</td>\n",
       "      <td>all</td>\n",
       "      <td>jovyan</td>\n",
       "      <td>/usr/local/lib/python3.6/dist-packages/ipykern...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>96</th>\n",
       "      <td>b05667486f7d45779d23519eb0dbe24f</td>\n",
       "      <td>0</td>\n",
       "      <td>FINISHED</td>\n",
       "      <td>s3://mlflow/mlflow/artifacts/0/b05667486f7d457...</td>\n",
       "      <td>2020-01-31 15:21:35.424000+00:00</td>\n",
       "      <td>2020-01-31 15:21:35.922000+00:00</td>\n",
       "      <td>0.935111</td>\n",
       "      <td>105833.358681</td>\n",
       "      <td>0.002732</td>\n",
       "      <td>135534.759873</td>\n",
       "      <td>...</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>10</td>\n",
       "      <td>distance</td>\n",
       "      <td>knn</td>\n",
       "      <td>LOCAL</td>\n",
       "      <td>all</td>\n",
       "      <td>jovyan</td>\n",
       "      <td>/usr/local/lib/python3.6/dist-packages/ipykern...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>228</th>\n",
       "      <td>d279d728946e4b74811203a842d79df3</td>\n",
       "      <td>0</td>\n",
       "      <td>FINISHED</td>\n",
       "      <td>s3://mlflow/mlflow/artifacts/0/d279d728946e4b7...</td>\n",
       "      <td>2020-01-17 18:17:52.555000+00:00</td>\n",
       "      <td>2020-01-17 18:17:53.029000+00:00</td>\n",
       "      <td>0.935111</td>\n",
       "      <td>105833.358681</td>\n",
       "      <td>0.002863</td>\n",
       "      <td>135534.759873</td>\n",
       "      <td>...</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>10</td>\n",
       "      <td>distance</td>\n",
       "      <td>knn</td>\n",
       "      <td>LOCAL</td>\n",
       "      <td>all</td>\n",
       "      <td>jovyan</td>\n",
       "      <td>/usr/local/lib/python3.6/dist-packages/ipykern...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>111</th>\n",
       "      <td>88af21719e0a408b91448f7ddd27e84c</td>\n",
       "      <td>0</td>\n",
       "      <td>FINISHED</td>\n",
       "      <td>s3://mlflow/mlflow/artifacts/0/88af21719e0a408...</td>\n",
       "      <td>2020-01-31 15:21:27.338000+00:00</td>\n",
       "      <td>2020-01-31 15:21:27.947000+00:00</td>\n",
       "      <td>0.934465</td>\n",
       "      <td>105793.727897</td>\n",
       "      <td>0.002668</td>\n",
       "      <td>136207.422483</td>\n",
       "      <td>...</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>None</td>\n",
       "      <td>5</td>\n",
       "      <td>uniform</td>\n",
       "      <td>knn</td>\n",
       "      <td>LOCAL</td>\n",
       "      <td>all</td>\n",
       "      <td>jovyan</td>\n",
       "      <td>/usr/local/lib/python3.6/dist-packages/ipykern...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 25 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                               run_id experiment_id    status  \\\n",
       "238  50ee6409ad3a4778bb9d8cb59034df5d             0  FINISHED   \n",
       "106  614bcf7042ca465c8d86296f12ac9c09             0  FINISHED   \n",
       "96   b05667486f7d45779d23519eb0dbe24f             0  FINISHED   \n",
       "228  d279d728946e4b74811203a842d79df3             0  FINISHED   \n",
       "111  88af21719e0a408b91448f7ddd27e84c             0  FINISHED   \n",
       "\n",
       "                                          artifact_uri  \\\n",
       "238  s3://mlflow/mlflow/artifacts/0/50ee6409ad3a477...   \n",
       "106  s3://mlflow/mlflow/artifacts/0/614bcf7042ca465...   \n",
       "96   s3://mlflow/mlflow/artifacts/0/b05667486f7d457...   \n",
       "228  s3://mlflow/mlflow/artifacts/0/d279d728946e4b7...   \n",
       "111  s3://mlflow/mlflow/artifacts/0/88af21719e0a408...   \n",
       "\n",
       "                          start_time                         end_time  \\\n",
       "238 2020-01-17 18:17:47.448000+00:00 2020-01-17 18:17:47.929000+00:00   \n",
       "106 2020-01-31 15:21:29.978000+00:00 2020-01-31 15:21:30.503000+00:00   \n",
       "96  2020-01-31 15:21:35.424000+00:00 2020-01-31 15:21:35.922000+00:00   \n",
       "228 2020-01-17 18:17:52.555000+00:00 2020-01-17 18:17:53.029000+00:00   \n",
       "111 2020-01-31 15:21:27.338000+00:00 2020-01-31 15:21:27.947000+00:00   \n",
       "\n",
       "     metrics.r2    metrics.mae  metrics.duration_prediction   metrics.rmse  \\\n",
       "238    0.935956  104040.339809                     0.003205  134649.399348   \n",
       "106    0.935956  104040.339809                     0.003404  134649.399348   \n",
       "96     0.935111  105833.358681                     0.002732  135534.759873   \n",
       "228    0.935111  105833.358681                     0.002863  135534.759873   \n",
       "111    0.934465  105793.727897                     0.002668  136207.422483   \n",
       "\n",
       "     ...  params.activation params.nbr_iteration params.hidden_layers  \\\n",
       "238  ...               None                 None                 None   \n",
       "106  ...               None                 None                 None   \n",
       "96   ...               None                 None                 None   \n",
       "228  ...               None                 None                 None   \n",
       "111  ...               None                 None                 None   \n",
       "\n",
       "    params.nbr_neighbors params.weight_method tags.model  \\\n",
       "238                    5             distance        knn   \n",
       "106                    5             distance        knn   \n",
       "96                    10             distance        knn   \n",
       "228                   10             distance        knn   \n",
       "111                    5              uniform        knn   \n",
       "\n",
       "    tags.mlflow.source.type tags.inputs tags.mlflow.user  \\\n",
       "238                   LOCAL         all           jovyan   \n",
       "106                   LOCAL         all           jovyan   \n",
       "96                    LOCAL         all           jovyan   \n",
       "228                   LOCAL         all           jovyan   \n",
       "111                   LOCAL         all           jovyan   \n",
       "\n",
       "                               tags.mlflow.source.name  \n",
       "238  /usr/local/lib/python3.6/dist-packages/ipykern...  \n",
       "106  /usr/local/lib/python3.6/dist-packages/ipykern...  \n",
       "96   /usr/local/lib/python3.6/dist-packages/ipykern...  \n",
       "228  /usr/local/lib/python3.6/dist-packages/ipykern...  \n",
       "111  /usr/local/lib/python3.6/dist-packages/ipykern...  \n",
       "\n",
       "[5 rows x 25 columns]"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Quick sorting to get the best models based on the RMSE metric\n",
    "df_runs.sort_values([\"metrics.rmse\"], ascending = True, inplace = True)\n",
    "df_runs.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'50ee6409ad3a4778bb9d8cb59034df5d'"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Get the best one\n",
    "runid_selected = df_runs.head(1)[\"run_id\"].values[0]\n",
    "runid_selected"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "celltoolbar": "Raw Cell Format",
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}


================================================
FILE: ch06/MLflow.py
================================================
#!/usr/bin/env python
# coding: utf-8

# # mlflow-energyforecast
#
# This is a showcase for ML Flow capabilities, based on the article
# http://the-odd-dataguy.com/be-more-efficient-to-produce-ml-models-with-mlflow
# and a github https://github.com/jeanmidevacc/mlflow-energyforecast
#

# In[2]:

get_ipython().system('pip install pandas --upgrade --user')
get_ipython().system('pip install mlflow --upgrade --user')
get_ipython().system('pip install joblib --upgrade --user')
get_ipython().system('pip install numpy --upgrade --user ')
get_ipython().system('pip install scipy --upgrade --user ')
get_ipython().system('pip install scikit-learn --upgrade --user')
get_ipython().system('pip install boto3 --upgrade --user')

# In[3]:

import time
import json
import os
from joblib import Parallel, delayed

import pandas as pd
import numpy as np
import scipy

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score
from sklearn.exceptions import ConvergenceWarning

import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient

from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
simplefilter(action='ignore', category=ConvergenceWarning)

# In[4]:

# Ensure Minio access
os.environ[
    'MLFLOW_S3_ENDPOINT_URL'] = 'http://minio-service.kubeflow.svc.cluster.local:9000'
os.environ['AWS_ACCESS_KEY_ID'] = 'minio'
os.environ['AWS_SECRET_ACCESS_KEY'] = 'minio123'

# # Data preparation

# In[5]:

# Collect the data
df_nationalconsumption_electricity_daily = pd.read_csv(
    "https://raw.githubusercontent.com/jeanmidevacc/mlflow-energyforecast/master/data/rtu_data.csv"
)
df_nationalconsumption_electricity_daily.set_index(["day"], inplace=True)

# In[6]:

# Prepare the training set and the testing set
df_trainvalidate_energyconsumption = df_nationalconsumption_electricity_daily[
    df_nationalconsumption_electricity_daily["datastatus"] == "Définitif"]
del df_trainvalidate_energyconsumption["datastatus"]

df_test_energyconsumption = df_nationalconsumption_electricity_daily[
    df_nationalconsumption_electricity_daily["datastatus"] == "Consolidé"]
del df_test_energyconsumption["datastatus"]

print("Size of the training set : ", len(df_trainvalidate_energyconsumption))
print("Size of the testing set : ", len(df_test_energyconsumption))

# In[7]:

# Define the inputs and the output
output = "dailyconsumption"
allinputs = list(df_trainvalidate_energyconsumption.columns)
allinputs.remove(output)

print("Output to predict : ", output)
print("Inputs for the prediction : ", allinputs)

# In[8]:

# Build different set of featurws for the model
possible_inputs = {
    "all":
    allinputs,
    "only_allday_inputs": ["weekday", "month", "is_holiday", "week"],
    "only_allweatheravg_inputs": [
        "avg_min_temperature", "avg_max_temperature", "avg_mean_temperature",
        "wavg_min_temperature", "wavg_max_temperature", "wavg_mean_temperature"
    ],
    "only_meanweather_inputs_avg": ["avg_mean_temperature"],
    "only_meanweather_inputs_wavg": ["wavg_mean_temperature"],
}

# In[9]:

# Prepare the output of the model
array_output_train = np.array(df_trainvalidate_energyconsumption[output])
array_output_test = np.array(df_test_energyconsumption[output])

# In[10]:

# connect to remote server
remote_server_uri = "http://mlflowserver.kubeflow.svc.cluster.local:5000"
mlflow.set_tracking_uri(remote_server_uri)
# Launch the experiment on mlflow
experiment_name = "electricityconsumption-forecast"
mlflow.set_experiment(experiment_name)

# In[11]:


# Define the evaluation function that will do the computation of the different metrics of accuracy (RMSE,MAE,R2)
def evaluation_model(y_test, y_pred):

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    metrics = {
        "rmse": rmse,
        "r2": r2,
        "mae": mae,
    }

    return metrics


# # KNN regressor

# In[12]:

from sklearn.neighbors import KNeighborsRegressor


def train_knnmodel(parameters, inputs, tags, log=False):
    with mlflow.start_run(nested=True):

        # Prepare the data
        array_inputs_train = np.array(
            df_trainvalidate_energyconsumption[inputs])
        array_inputs_test = np.array(df_test_energyconsumption[inputs])

        # Build the model
        tic = time.time()
        model = KNeighborsRegressor(parameters["nbr_neighbors"],
                                    weights=parameters["weight_method"])
        model.fit(array_inputs_train, array_output_train)
        duration_training = time.time() - tic

        # Make the prediction
        tic1 = time.time()
        prediction = model.predict(array_inputs_test)
        duration_prediction = time.time() - tic1

        # Evaluate the model prediction
        metrics = evaluation_model(array_output_test, prediction)

        # Log in the console
        if log:
            print(f"KNN regressor:")
            print(parameters)
            print(metrics)

        # Log in mlflow (parameter)
        mlflow.log_params(parameters)

        # Log in mlflow (metrics)
        metrics["duration_training"] = duration_training
        metrics["duration_prediction"] = duration_prediction
        mlflow.log_metrics(metrics)

        # log in mlflow (model)
        mlflow.sklearn.log_model(model, f"model")

        # Tag the model
        mlflow.set_tags(tags)


# In[13]:

# Test the different combinations
configurations = []
for nbr_neighbors in [1, 2, 5, 10]:
    for weight_method in ['uniform', 'distance']:
        for field in possible_inputs:
            parameters = {
                "nbr_neighbors": nbr_neighbors,
                "weight_method": weight_method
            }

            tags = {"model": "knn", "inputs": field}

            configurations.append([parameters, tags])

            train_knnmodel(parameters, possible_inputs[field], tags)

# # MLP regressor

# In[14]:

from sklearn.neural_network import MLPRegressor


def train_mlpmodel(parameters, inputs, tags, log=False):
    with mlflow.start_run(nested=True):

        # Prepare the data
        array_inputs_train = np.array(
            df_trainvalidate_energyconsumption[inputs])
        array_inputs_test = np.array(df_test_energyconsumption[inputs])

        # Build the model
        tic = time.time()

        model = MLPRegressor(hidden_layer_sizes=parameters["hidden_layers"],
                             activation=parameters["activation"],
                             solver=parameters["solver"],
                             max_iter=parameters["nbr_iteration"],
                             random_state=0)

        model.fit(array_inputs_train, array_output_train)
        duration_training = time.time() - tic

        # Make the prediction
        tic1 = time.time()
        prediction = model.predict(array_inputs_test)
        duration_prediction = time.time() - tic1

        # Evaluate the model prediction
        metrics = evaluation_model(array_output_test, prediction)

        # Log in the console
        if log:
            print(f"Random forest regressor:")
            print(parameters)
            print(metrics)

        # Log in mlflow (parameter)
        mlflow.log_params(parameters)

        # Log in mlflow (metrics)
        metrics["duration_training"] = duration_training
        metrics["duration_prediction"] = duration_prediction
        mlflow.log_metrics(metrics)

        # log in mlflow (model)
        mlflow.sklearn.log_model(model, f"model")

        # Tag the model
        mlflow.set_tags(tags)


# In[15]:

for hiddenlayers in [4, 8, 16]:
    for activation in [
            "identity",
            "logistic",
    ]:
        for solver in ["lbfgs"]:
            for nbriteration in [10, 100, 1000]:
                for field in possible_inputs:
                    parameters = {
                        "hidden_layers": hiddenlayers,
                        "activation": activation,
                        "solver": solver,
                        "nbr_iteration": nbriteration
                    }

                    tags = {"model": "mlp", "inputs": field}

                    train_mlpmodel(parameters, possible_inputs[field], tags)

# # Use a handmade model (scipy approach)

# In[16]:


class PTG:
    def __init__(self, thresholds_x0, thresholds_a, thresholds_b):
        self.thresholds_x0 = thresholds_x0
        self.thresholds_a = thresholds_a
        self.thresholds_b = thresholds_b

    def get_ptgmodel(self, x, a, b, x0):
        return np.piecewise(x, [x < x0, x >= x0],
                            [lambda x: a * x + b, lambda x: a * x0 + b])

    def fit(self, dfx, y):
        x = np.array(dfx)

        # Define the bounds
        bounds_min = [thresholds_a[0], thresholds_b[0], thresholds_x0[0]]
        bounds_max = [thresholds_a[1], thresholds_b[1], thresholds_x0[1]]
        bounds = (bounds_min, bounds_max)

        # Fit a model
        popt, pcov = scipy.optimize.curve_fit(self.get_ptgmodel,
                                              x,
                                              y,
                                              bounds=bounds)

        # Get the parameter of the model
        a = popt[0]
        b = popt[1]
        x0 = popt[2]

        self.coefficients = [a, b, x0]

    def predict(self, dfx):
        x = np.array(dfx)
        predictions = []
        for elt in x:
            forecast = self.get_ptgmodel(elt, self.coefficients[0],
                                         self.coefficients[1],
                                         self.coefficients[2])
            predictions.append(forecast)
        return np.array(predictions)


def train_ptgmodel(parameters, inputs, tags, log=False):
    with mlflow.start_run(nested=True):

        # Prepare the data
        df_inputs_train = df_trainvalidate_energyconsumption[inputs[0]]
        df_inputs_test = df_test_energyconsumption[inputs[0]]

        # Build the model
        tic = time.time()

        model = PTG(parameters["thresholds_x0"], parameters["thresholds_a"],
                    parameters["thresholds_b"])

        model.fit(df_inputs_train, array_output_train)
        duration_training = time.time() - tic

        # Make the prediction
        tic1 = time.time()
        prediction = model.predict(df_inputs_test)
        duration_prediction = time.time() - tic1

        # Evaluate the model prediction
        metrics = evaluation_model(array_output_test, prediction)

        # Log in the console
        if log:
            print(f"PTG:")
            print(parameters)
            print(metrics)

        # Log in mlflow (parameter)
        mlflow.log_params(parameters)

        # Log in mlflow (metrics)
        metrics["duration_training"] = duration_training
        metrics["duration_prediction"] = duration_prediction
        mlflow.log_metrics(metrics)

        # log in mlflow (model)
        mlflow.sklearn.log_model(model, f"model")

        # Tag the model
        mlflow.set_tags(tags)


# In[17]:

# Define the parameters of the model
thresholds_x0 = [0, 20]
thresholds_a = [-200000, -50000]
thresholds_b = [1000000, 3000000]

parameters = {
    "thresholds_x0": thresholds_x0,
    "thresholds_a": thresholds_a,
    "thresholds_b": thresholds_b
}

for field in ["only_meanweather_inputs_avg", "only_meanweather_inputs_wavg"]:

    tags = {"model": "ptg", "inputs": field}

    train_ptgmodel(parameters, possible_inputs[field], tags, log=False)

# # Evaluate mlflow results

# In[18]:

# Select the run of the experiment
df_runs = mlflow.search_runs(experiment_ids="0")
print("Number of runs done : ", len(df_runs))

# In[19]:

# Quick sorting to get the best models based on the RMSE metric
df_runs.sort_values(["metrics.rmse"], ascending=True, inplace=True)
df_runs.head()

# In[20]:

# Get the best one
runid_selected = df_runs.head(1)["run_id"].values[0]
runid_selected

# In[ ]:


================================================
FILE: ch06/Metadata.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Installation and imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already up-to-date: kfmd in ./.local/lib/python3.6/site-packages (0.1.8)\n",
      "Requirement already up-to-date: pandas in ./.local/lib/python3.6/site-packages (1.0.1)\n",
      "Requirement already satisfied, skipping upgrade: python-dateutil>=2.6.1 in /usr/local/lib/python3.6/dist-packages (from pandas) (2.8.1)\n",
      "Requirement already satisfied, skipping upgrade: numpy>=1.13.3 in /usr/local/lib/python3.6/dist-packages (from pandas) (1.18.1)\n",
      "Requirement already satisfied, skipping upgrade: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas) (2019.3)\n",
      "Requirement already satisfied, skipping upgrade: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.6.1->pandas) (1.11.0)\n"
     ]
    }
   ],
   "source": [
    "!pip install kfmd --upgrade --user\n",
    "!pip install pandas --upgrade --user\n",
    "\n",
    "from kfmd import metadata\n",
    "import pandas\n",
    "from datetime import datetime\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create a workspace, run and execution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "ws1 = metadata.Workspace(\n",
    "    # Connect to metadata-service in namesapce kubeflow in k8s cluster.\n",
    "    backend_url_prefix=\"metadata-service.kubeflow.svc.cluster.local:8080\",\n",
    "    name=\"ws1\",\n",
    "    description=\"a workspace for testing\",\n",
    "    labels={\"n1\": \"v1\"})\n",
    "r = metadata.Run(\n",
    "    workspace=ws1,\n",
    "    name=\"run-\" + datetime.utcnow().isoformat(\"T\") ,\n",
    "    description=\"a run in ws_1\",\n",
    ")\n",
    "exec = metadata.Execution(\n",
    "    name = \"execution\" + datetime.utcnow().isoformat(\"T\") ,\n",
    "    workspace=ws1,\n",
    "    run=r,\n",
    "    description=\"execution example\",\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Log data set, model and its evaluation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_set = exec.log_input(\n",
    "        metadata.DataSet(\n",
    "            description=\"an example data\",\n",
    "            name=\"mytable-dump\",\n",
    "            owner=\"owner@my-company.org\",\n",
    "            uri=\"file://path/to/dataset\",\n",
    "            version=\"v1.0.0\",\n",
    "            query=\"SELECT * FROM mytable\"))\n",
    "model = exec.log_output(\n",
    "    metadata.Model(\n",
    "            name=\"MNIST\",\n",
    "            description=\"model to recognize handwritten digits\",\n",
    "            owner=\"someone@kubeflow.org\",\n",
    "            uri=\"gcs://my-bucket/mnist\",\n",
    "            model_type=\"neural network\",\n",
    "            training_framework={\n",
    "                \"name\": \"tensorflow\",\n",
    "                \"version\": \"v1.0\"\n",
    "            },\n",
    "            hyperparameters={\n",
    "                \"learning_rate\": 0.5,\n",
    "                \"layers\": [10, 3, 1],\n",
    "                \"early_stop\": True\n",
    "            },\n",
    "            version=\"v0.0.1\",\n",
    "            labels={\"mylabel\": \"l1\"}))\n",
    "metrics = exec.log_output(\n",
    "    metadata.Metrics(\n",
    "            name=\"MNIST-evaluation\",\n",
    "            description=\"validating the MNIST model to recognize handwritten digits\",\n",
    "            owner=\"someone@kubeflow.org\",\n",
    "            uri=\"gcs://my-bucket/mnist-eval.csv\",\n",
    "            data_set_id=data_set.id,\n",
    "            model_id=model.id,\n",
    "            metrics_type=metadata.Metrics.VALIDATION,\n",
    "            values={\"accuracy\": 0.95},\n",
    "            labels={\"mylabel\": \"l1\"}))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "List all the models in the workspace"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>workspace</th>\n",
       "      <th>run</th>\n",
       "      <th>create_time</th>\n",
       "      <th>description</th>\n",
       "      <th>model_type</th>\n",
       "      <th>name</th>\n",
       "      <th>owner</th>\n",
       "      <th>version</th>\n",
       "      <th>uri</th>\n",
       "      <th>training_framework</th>\n",
       "      <th>hyperparameters</th>\n",
       "      <th>labels</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>8</td>\n",
       "      <td>ws1</td>\n",
       "      <td>run-2020-02-18T00:48:10.734939</td>\n",
       "      <td>2020-02-18T00:48:13.273533Z</td>\n",
       "      <td>model to recognize handwritten digits</td>\n",
       "      <td>neural network</td>\n",
       "      <td>MNIST</td>\n",
       "      <td>someone@kubeflow.org</td>\n",
       "      <td>v0.0.1</td>\n",
       "      <td>gcs://my-bucket/mnist</td>\n",
       "      <td>{'name': 'tensorflow', 'version': 'v1.0'}</td>\n",
       "      <td>{'learning_rate': 0.5, 'layers': [10, 3, 1], '...</td>\n",
       "      <td>{'mylabel': 'l1'}</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  id workspace                             run                  create_time  \\\n",
       "0  8       ws1  run-2020-02-18T00:48:10.734939  2020-02-18T00:48:13.273533Z   \n",
       "\n",
       "                             description      model_type   name  \\\n",
       "0  model to recognize handwritten digits  neural network  MNIST   \n",
       "\n",
       "                  owner version                    uri  \\\n",
       "0  someone@kubeflow.org  v0.0.1  gcs://my-bucket/mnist   \n",
       "\n",
       "                          training_framework  \\\n",
       "0  {'name': 'tensorflow', 'version': 'v1.0'}   \n",
       "\n",
       "                                     hyperparameters             labels  \n",
       "0  {'learning_rate': 0.5, 'layers': [10, 3, 1], '...  {'mylabel': 'l1'}  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pandas.DataFrame.from_dict(ws1.list(metadata.Model.ARTIFACT_TYPE_NAME))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Get basic lineage"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "model id is 8\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(\"model id is %s\\n\" % model.id)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Find the execution that produces this model."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3\n"
     ]
    }
   ],
   "source": [
    "output_events = ws1.client.list_events2(model.id).events\n",
    "assert len(output_events) == 1\n",
    "execution_id = output_events[0].execution_id\n",
    "print(execution_id)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Find all events related to that execution."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "All events related to this model:\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>artifact_id</th>\n",
       "      <th>execution_id</th>\n",
       "      <th>path</th>\n",
       "      <th>type</th>\n",
       "      <th>milliseconds_since_epoch</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>7</td>\n",
       "      <td>3</td>\n",
       "      <td>None</td>\n",
       "      <td>INPUT</td>\n",
       "      <td>1581986893248</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>8</td>\n",
       "      <td>3</td>\n",
       "      <td>None</td>\n",
       "      <td>OUTPUT</td>\n",
       "      <td>1581986893273</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>9</td>\n",
       "      <td>3</td>\n",
       "      <td>None</td>\n",
       "      <td>OUTPUT</td>\n",
       "      <td>1581986893298</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  artifact_id execution_id  path    type milliseconds_since_epoch\n",
       "0           7            3  None   INPUT            1581986893248\n",
       "1           8            3  None  OUTPUT            1581986893273\n",
       "2           9            3  None  OUTPUT            1581986893298"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_events = ws1.client.list_events(execution_id).events\n",
    "assert len(all_events) == 3\n",
    "print(\"\\nAll events related to this model:\")\n",
    "pandas.DataFrame.from_dict([e.to_dict() for e in all_events])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}


================================================
FILE: ch06/Metadata.py
================================================
#!/usr/bin/env python
# coding: utf-8

# # Installation and imports

# In[1]:

get_ipython().system('pip install kfmd --upgrade --user')
get_ipython().system('pip install pandas --upgrade --user')

from kfmd import metadata
import pandas
from datetime import datetime

# Create a workspace, run and execution

# In[2]:

ws1 = metadata.Workspace(
    # Connect to metadata-service in namesapce kubeflow in k8s cluster.
    backend_url_prefix="metadata-service.kubeflow.svc.cluster.local:8080",
    name="ws1",
    description="a workspace for testing",
    labels={"n1": "v1"})
r = metadata.Run(
    workspace=ws1,
    name="run-" + datetime.utcnow().isoformat("T"),
    description="a run in ws_1",
)
exec = metadata.Execution(
    name="execution" + datetime.utcnow().isoformat("T"),
    workspace=ws1,
    run=r,
    description="execution example",
)

# Log data set, model and its evaluation

# In[3]:

data_set = exec.log_input(
    metadata.DataSet(description="an example data",
                     name="mytable-dump",
                     owner="owner@my-company.org",
                     uri="file://path/to/dataset",
                     version="v1.0.0",
                     query="SELECT * FROM mytable"))
model = exec.log_output(
    metadata.Model(name="MNIST",
                   description="model to recognize handwritten digits",
                   owner="someone@kubeflow.org",
                   uri="gcs://my-bucket/mnist",
                   model_type="neural network",
                   training_framework={
                       "name": "tensorflow",
                       "version": "v1.0"
                   },
                   hyperparameters={
                       "learning_rate": 0.5,
                       "layers": [10, 3, 1],
                       "early_stop": True
                   },
                   version="v0.0.1",
                   labels={"mylabel": "l1"}))
metrics = exec.log_output(
    metadata.Metrics(
        name="MNIST-evaluation",
        description=
        "validating the MNIST model to recognize handwritten digits",
        owner="someone@kubeflow.org",
        uri="gcs://my-bucket/mnist-eval.csv",
        data_set_id=data_set.id,
        model_id=model.id,
        metrics_type=metadata.Metrics.VALIDATION,
        values={"accuracy": 0.95},
        labels={"mylabel": "l1"}))

# List all the models in the workspace

# In[4]:

pandas.DataFrame.from_dict(ws1.list(metadata.Model.ARTIFACT_TYPE_NAME))

# Get basic lineage

# In[5]:

print("model id is %s\n" % model.id)

# Find the execution that produces this model.

# In[6]:

output_events = ws1.client.list_events2(model.id).events
assert len(output_events) == 1
execution_id = output_events[0].execution_id
print(execution_id)

# Find all events related to that execution.

# In[7]:

all_events = ws1.client.list_events(execution_id).events
assert len(all_events) == 3
print("\nAll events related to this model:")
pandas.DataFrame.from_dict([e.to_dict() for e in all_events])

# In[ ]:


================================================
FILE: ch06/docker/Dockerfile
================================================
# from https://github.com/flmu/mlflow-tracking-server

FROM python:3.7

RUN pip3 install --upgrade pip && \
    pip3 install mlflow --upgrade && \
    pip3 install awscli --upgrade  && \
    pip3 install boto3 --upgrade

ENV PORT 5000
ENV AWS_BUCKET bucket
ENV AWS_ACCESS_KEY_ID aws_id
ENV AWS_SECRET_ACCESS_KEY aws_key

ENV FILE_DIR /tmp/mlflow

RUN mkdir -p /opt/mlflow

COPY run.sh /opt/mlflow
RUN chmod -R 777 /opt/mlflow/

ENTRYPOINT ["/opt/mlflow/run.sh"]

================================================
FILE: ch06/docker/build.sh
================================================
#!/bin/bash

img='lightbend/mlflow'
tag='0.1'
docker build -t $img:$tag .


================================================
FILE: ch06/docker/run.sh
================================================
#!/bin/sh

set -e

if [ -z "${AWS_BUCKET}" ]; then
  echo >&2 "AWS_BUCKET must be set"
  exit 1
fi

if [ -z "${AWS_ACCESS_KEY_ID}" ]; then
  echo >&2 "AWS_ACCESS_KEY_ID must be set"
  exit 1
fi

if [ -z "${AWS_SECRET_ACCESS_KEY}" ]; then
  echo >&2 "AWS_SECRET_ACCESS_KEY must be set"
  exit 1
fi

mkdir -p "${FILE_DIR}"

mlflow server \
    --backend-store-uri "file://$FILE_DIR" \
    --default-artifact-root "s3://$AWS_BUCKET/mlflow/artifacts" \
    --host 0.0.0.0 \
    --port "$PORT"


================================================
FILE: ch06/install/mlflowchart/.helmignore
================================================
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*~
# Various IDEs
.project
.idea/
*.tmproj


================================================
FILE: ch06/install/mlflowchart/Chart.yaml
================================================
apiVersion: v1
appVersion: 0.1
description: MLFlow
maintainers:
- name: Boris Lublinsky
name: MLFLOW tracking server
version: 0.1

================================================
FILE: ch06/install/mlflowchart/templates/NOTES.txt
================================================
ML Flow tracking server is installed


================================================
FILE: ch06/install/mlflowchart/templates/_helpers.tpl
================================================
{{/* vim: set filetype=mustache: */}}
{{/*
Expand the name of the chart.
*/}}
{{- define "modelserverchart.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
{{- end -}}

{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
*/}}
{{- define "modelserverchart.fullname" -}}
{{- $name := default .Chart.Name .Values.nameOverride -}}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
{{- end -}}


================================================
FILE: ch06/install/mlflowchart/templates/mlflow.yaml
================================================
apiVersion: apps/v1
kind: Deployment
metadata:
  namespace: kubeflow
  name: mlflowserver
  labels:
    app: mlflowserver
spec:
  replicas: 1
  selector:
    matchLabels:
      app: mlflowserver
  strategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        app: mlflowserver
    spec:
      containers:
        - name: server
          image: "{{ .Values.image.server }}:{{ .Values.image.version }}"
          imagePullPolicy: "{{ .Values.image.pullPolicy }}"
          ports:
            - containerPort: 5000
              name: serving
              protocol: TCP
          env:
            - name: "MLFLOW_S3_ENDPOINT_URL"
              value: "http://minio-service.kubeflow.svc.cluster.local:9000"
            - name: "AWS_ACCESS_KEY_ID"
              valueFrom: { secretKeyRef: { name: "minioaccess", key: "AWS_ACCESS_KEY_ID" } }
            - name: "AWS_SECRET_ACCESS_KEY"
              valueFrom: { secretKeyRef: { name: "minioaccess", key: "AWS_SECRET_ACCESS_KEY" } }
            - name: "AWS_BUCKET"
              value: "mlflow"
          volumes:
            - name: secret-volume
              secret:
                secretName: minioaccess
---
apiVersion: v1
kind: Service
metadata:
  namespace: kubeflow
  name: mlflowserver
spec:
  selector:
    app: mlflowserver
  ports:
  - protocol: TCP
    port: 5000
    targetPort: 5000
---
apiVersion: networking.istio.io/v1alpha3
kind: VirtualService
metadata:
  name: mlflow-server
  namespace: kubeflow
spec:
  gateways:
    - kubeflow-gateway
  hosts:
    - '*'
  http:
    - match:
        - uri:
            prefix: /mlflow/
      rewrite:
        uri: /
      route:
        - destination:
            host: mlflowserver.kubeflow.svc.cluster.local
            port:
              number: 5000

================================================
FILE: ch06/install/mlflowchart/values.yaml
================================================
# application name is a namespace
# docker images
image:
  server: lightbend/mlflow
  pullPolicy: Always
  version: 0.1


================================================
FILE: ch10/experiment.yaml
================================================
Name:         random-example
Namespace:    kubeflow
Labels:       controller-tools.k8s.io=1.0
Annotations:  <none>
API Version:  kubeflow.org/v1alpha3
Kind:         Experiment
Metadata:
  Creation Timestamp:  2019-12-22T22:53:25Z
  Finalizers:
    update-prometheus-metrics
  Generation:        2
  Resource Version:  720692
  Self Link:         /apis/kubeflow.org/v1alpha3/namespaces/kubeflow/experiments/random-example
  UID:               dc6bc15a-250d-11ea-8cae-42010a80010f
Spec:
  Algorithm:
    Algorithm Name:        random
    Algorithm Settings:    <nil>
  Max Failed Trial Count:  3
  Max Trial Count:         12
  Metrics Collector Spec:
    Collector:
      Kind:  StdOut
  Objective:
    Additional Metric Names:
      accuracy
    Goal:                   0.99
    Objective Metric Name:  Validation-accuracy
    Type:                   maximize
  Parallel Trial Count:     3
  Parameters:
    Feasible Space:
      Max:           0.03
      Min:           0.01
    Name:            --lr
    Parameter Type:  double
    Feasible Space:
      Max:           5
      Min:           2
    Name:            --num-layers
    Parameter Type:  int
    Feasible Space:
      List:
        sgd
        adam
        ftrl
    Name:            --optimizer
    Parameter Type:  categorical
  Trial Template:
    Go Template:
      Raw Template:  apiVersion: batch/v1
kind: Job
metadata:
  name: {{.Trial}}
  namespace: {{.NameSpace}}
spec:
  template:
    spec:
      containers:
      - name: {{.Trial}}
        image: docker.io/kubeflowkatib/mxnet-mnist-example
        command:
        - "python"
        - "/mxnet/example/image-classification/train_mnist.py"
        - "--batch-size=64"
        {{- with .HyperParameters}}
        {{- range .}}
        - "{{.Name}}={{.Value}}"
        {{- end}}
        {{- end}}
      restartPolicy: Never
Status:
  Conditions:
    Last Transition Time:  2019-12-22T22:53:25Z
    Last Update Time:      2019-12-22T22:53:25Z
    Message:               Experiment is created
    Reason:                ExperimentCreated
    Status:                True
    Type:                  Created
    Last Transition Time:  2019-12-22T22:55:10Z
    Last Update Time:      2019-12-22T22:55:10Z
    Message:               Experiment is running
    Reason:                ExperimentRunning
    Status:                True
    Type:                  Running
  Current Optimal Trial:
    Observation:
      Metrics:
        Name:   Validation-accuracy
        Value:  0.981091
    Parameter Assignments:
      Name:          --lr
      Value:         0.025139701133432946
      Name:          --num-layers
      Value:         4
      Name:          --optimizer
      Value:         sgd
  Start Time:        2019-12-22T22:53:25Z
  Trials:            12
  Trials Running:    2
  Trials Succeeded:  10
Events:              <none>Type something here!


================================================
FILE: ch10/hptuning.py
================================================
# Initialize search space
# Initialize model
while not objective_reached and not bugdget_exhausted:
    # Obtain new hyperparameters
    suggestion = GetSuggestions()

    # Run trial with new hyperparameters; collect metrics
    metrics = RunTrial(suggestion)

    # Report metrics
    Report(metrics)


================================================
FILE: ch10/random.yaml
================================================
apiVersion: "kubeflow.org/v1alpha3"
kind: Experiment
metadata:
  namespace: kubeflow
  labels:
    controller-tools.k8s.io: "1.0"
  name: random-example
spec:
  objective:
    type: maximize
    goal: 0.99
    objectiveMetricName: Validation-accuracy
    additionalMetricNames:
      - Train-accuracy
  algorithm:
    algorithmName: random
  parallelTrialCount: 3
  maxTrialCount: 12
  maxFailedTrialCount: 3
  parameters:
    - name: --lr
      parameterType: double
      feasibleSpace:
        min: "0.01"
        max: "0.03"
    - name: --num-layers
      parameterType: int
      feasibleSpace:
        min: "2"
        max: "5"
    - name: --optimizer
      parameterType: categorical
      feasibleSpace:
        list:
        - sgd
        - adam
        - ftrl
  trialTemplate:
    goTemplate:
        rawTemplate: |-
          apiVersion: batch/v1
          kind: Job
          metadata:
            name: {{.Trial}}
            namespace: {{.NameSpace}}
          spec:
            template:
              spec:
                containers:
                - name: {{.Trial}}
                  image: docker.io/kubeflowkatib/mxnet-mnist
                  command:
                  - "python3"
                  - "/opt/mxnet-mnist/mnist.py"
                  - "--batch-size=64"
                  {{- with .HyperParameters}}
                  {{- range .}}
                  - "{{.Name}}={{.Value}}"
                  {{- end}}
                  {{- end}}
                restartPolicy: NeverType something here!


================================================
FILE: ch2/Dockerfile
================================================
FROM gcr.io/kubeflow-images-public/tensorflow-2.1.0-notebook-cpu:1.0.0

================================================
FILE: ch2/build-and-push.sh
================================================
#!/bin/bash
#tag::buildandpush[]
IMAGE="${CONTAINER_REGISTRY}/kubeflow/test:v1"
docker build  -t "${IMAGE}" -f Dockerfile .
docker push "${IMAGE}"
#end::buildandpush[]


================================================
FILE: ch2/query-endpoint.py
================================================
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
#     specific language governing permissions and limitations
# under the License.

#tag::scriptSetup[]
import requests
import numpy as np

from tensorflow.examples.tutorials.mnist import input_data
from matplotlib import pyplot as plt

def download_mnist():
    return input_data.read_data_sets("MNIST_data/", one_hot=True)


def gen_image(arr):
    two_d = (np.reshape(arr, (28, 28)) * 255).astype(np.uint8)
    plt.imshow(two_d, cmap=plt.cm.gray_r, interpolation='nearest')
    return plt
#end::scriptSetup[]

AMBASSADOR_API_IP = "10.53.148.167:30134"

#tag::scriptGuts[]
mnist = download_mnist()
batch_xs, batch_ys = mnist.train.next_batch(1)
chosen = 0
gen_image(batch_xs[chosen]).show()
data = batch_xs[chosen].reshape((1, 784))
features = ["X" + str(i + 1) for i in range(0, 784)]
request = {"data": {"names": features, "ndarray": data.tolist()}}
deploymentName = "mnist-classifier"
uri = "http://" + AMBASSADOR_API_IP + "/seldon/" + \
    deploymentName + "/api/v0.1/predictions"

response = requests.post(uri, json=request)
#end::scriptGuts[]
print(response.status_code)


================================================
FILE: ch2_seldon_examples/pipeline_role.yaml
================================================
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  namespace: kubeflow
  name: pipeline-runner
rules:
- apiGroups: ["machinelearning.seldon.io"]
  resources: ["seldondeployments"]
  verbs: ["*"]


================================================
FILE: ch2_seldon_examples/pipeline_rolebinding.yaml
================================================
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: pipeline-runner
  namespace: kubeflow
subjects:
- kind: ServiceAccount
  name: pipeline-runner
  namespace: kubeflow
roleRef:
  kind: Role
  name: pipeline-runner
  apiGroup: rbac.authorization.k8s.io


================================================
FILE: ch2_seldon_examples/pv-claim.yaml
================================================
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
  name: "nfs-1"
spec:
  storageClassName: manual
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 3Gi


================================================
FILE: ch2_seldon_examples/pv-volume.yaml
================================================
kind: PersistentVolume
apiVersion: v1
metadata:
  name: task-pv-volume
  labels:
    type: local
spec:
  storageClassName: manual
  capacity:
    storage: 10Gi
  accessModes:
    - ReadWriteOnce
  hostPath:
    path: "/mnt/data"

================================================
FILE: ch2_seldon_examples/request_example.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting matplotlib\n",
      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/57/4f/dd381ecf6c6ab9bcdaa8ea912e866dedc6e696756156d8ecc087e20817e2/matplotlib-3.1.1-cp36-cp36m-manylinux1_x86_64.whl (13.1MB)\n",
      "\u001b[K    100% |████████████████████████████████| 13.1MB 2.7MB/s eta 0:00:01\n",
      "\u001b[?25hRequirement already satisfied: python-dateutil>=2.1 in /opt/conda/lib/python3.6/site-packages (from matplotlib) (2.8.0)\n",
      "Collecting cycler>=0.10 (from matplotlib)\n",
      "  Downloading https://files.pythonhosted.org/packages/f7/d2/e07d3ebb2bd7af696440ce7e754c59dd546ffe1bbe732c8ab68b9c834e61/cycler-0.10.0-py2.py3-none-any.whl\n",
      "Collecting kiwisolver>=1.0.1 (from matplotlib)\n",
      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/f8/a1/5742b56282449b1c0968197f63eae486eca2c35dcd334bab75ad524e0de1/kiwisolver-1.1.0-cp36-cp36m-manylinux1_x86_64.whl (90kB)\n",
      "\u001b[K    100% |████████████████████████████████| 92kB 32.5MB/s ta 0:00:01\n",
      "\u001b[?25hCollecting pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 (from matplotlib)\n",
      "\u001b[?25l  Downloading https://files.pythonhosted.org/packages/11/fa/0160cd525c62d7abd076a070ff02b2b94de589f1a9789774f17d7c54058e/pyparsing-2.4.2-py2.py3-none-any.whl (65kB)\n",
      "\u001b[K    100% |████████████████████████████████| 71kB 25.6MB/s ta 0:00:01\n",
      "\u001b[?25hRequirement already satisfied: numpy>=1.11 in /opt/conda/lib/python3.6/site-packages (from matplotlib) (1.16.2)\n",
      "Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.6/site-packages (from python-dateutil>=2.1->matplotlib) (1.12.0)\n",
      "Requirement already satisfied: setuptools in /opt/conda/lib/python3.6/site-packages (from kiwisolver>=1.0.1->matplotlib) (40.9.0)\n",
      "Installing collected packages: cycler, kiwisolver, pyparsing, matplotlib\n",
      "Successfully installed cycler-0.10.0 kiwisolver-1.1.0 matplotlib-3.1.1 pyparsing-2.4.2\n",
      "\u001b[33mYou are using pip version 19.0.1, however version 19.2.3 is available.\n",
      "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "!pip install matplotlib"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "import numpy as np\n",
    "\n",
    "from tensorflow.examples.tutorials.mnist import input_data\n",
    "from matplotlib import pyplot as plt\n",
    "\n",
    "\n",
    "def download_mnist():\n",
    "    return input_data.read_data_sets(\"MNIST_data/\", one_hot = True)\n",
    "\n",
    "def gen_image(arr):\n",
    "    two_d = (np.reshape(arr, (28, 28)) * 255).astype(np.uint8)\n",
    "    plt.imshow(two_d,cmap=plt.cm.gray_r, interpolation='nearest')\n",
    "    return plt\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From <ipython-input-3-0613226129c0>:9: read_data_sets (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Please use alternatives such as official/mnist/dataset.py from tensorflow/models.\n",
      "WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/mnist.py:260: maybe_download (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Please write your own downloading logic.\n",
      "WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:252: _internal_retry.<locals>.wrap.<locals>.wrapped_fn (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Please use urllib or similar directly.\n",
      "Successfully downloaded train-images-idx3-ubyte.gz 9912422 bytes.\n",
      "WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/mnist.py:262: extract_images (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Please use tf.data to implement this functionality.\n",
      "Extracting MNIST_data/train-images-idx3-ubyte.gz\n",
      "Successfully downloaded train-labels-idx1-ubyte.gz 28881 bytes.\n",
      "WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/mnist.py:267: extract_labels (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Please use tf.data to implement this functionality.\n",
      "Extracting MNIST_data/train-labels-idx1-ubyte.gz\n",
      "WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/mnist.py:110: dense_to_one_hot (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Please use tf.one_hot on tensors.\n",
      "Successfully downloaded t10k-images-idx3-ubyte.gz 1648877 bytes.\n",
      "Extracting MNIST_data/t10k-images-idx3-ubyte.gz\n",
      "Successfully downloaded t10k-labels-idx1-ubyte.gz 4542 bytes.\n",
      "Extracting MNIST_data/t10k-labels-idx1-ubyte.gz\n",
      "WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/mnist.py:290: DataSet.__init__ (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Please use alternatives such as official/mnist/dataset.py from tensorflow/models.\n"
     ]
    }
   ],
   "source": [
    "mnist = download_mnist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAPsAAAD4CAYAAAAq5pAIAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAANMklEQVR4nO3dXaxV9ZnH8d9Ppr0REmE4ORDAgakYo2Ok5EhMahonZIgvIdgYTblATMxQXyCtaeIYJ1ovvMAJ0BQzklAlpaRDbWwVYohTB5uY3hCPBoUjaX0JBghyDhqiqFiVZy7Osjni2Wsf9lr7RZ7vJznZe69nrb2erPBj7b3+e++/I0IAzn3ndbsBAJ1B2IEkCDuQBGEHkiDsQBL/0MmdTZ8+PebOndvJXQKpHDx4UMePH/d4tUpht32tpF9ImiTp8YhYW7b+3LlzNTg4WGWXAEoMDAw0rLX8Mt72JEn/Lek6SZdKWm770lafD0B7VXnPvkjSmxHxdkT8TdJvJS2rpy0AdasS9lmSDo15fLhY9hW2V9ketD04MjJSYXcAqmj71fiI2BwRAxEx0NfX1+7dAWigStiPSJoz5vHsYhmAHlQl7C9Jmm97nu1vS/qhpJ31tAWgbi0PvUXE57ZXS/pfjQ69bYmIodo6A1CrSuPsEbFL0q6aegHQRnxcFkiCsANJEHYgCcIOJEHYgSQIO5AEYQeSIOxAEoQdSIKwA0kQdiAJwg4kQdiBJAg7kARhB5Ig7EAShB1IgrADSRB2IAnCDiRB2IEkCDuQBGEHkiDsQBKEHUiCsANJEHYgCcIOJEHYgSQIO5BEpSmbbR+U9KGkLyR9HhEDdTQFoH6Vwl7414g4XsPzAGgjXsYDSVQNe0j6o+2Xba8abwXbq2wP2h4cGRmpuDsAraoa9qsjYqGk6yTdbfv7Z64QEZsjYiAiBvr6+iruDkCrKoU9Io4Ut8OSnpa0qI6mANSv5bDbPt/2lC/vS1oiaX9djQGoV5Wr8f2Snrb95fP8T0Q8V0tXAGrXctgj4m1JV9TYC4A2YugNSIKwA0kQdiAJwg4kQdiBJOr4Igy+wbZt21Za/+STTzrUydl74IEHSuvr1q1rWFuxYkXd7fQ8zuxAEoQdSIKwA0kQdiAJwg4kQdiBJAg7kATj7OeANWvWNKzt2bOndNu9e/eW1j/77LPSen9/f8vbnz59unTbEydOlNabKb5+jQJndiAJwg4kQdiBJAg7kARhB5Ig7EAShB1IgnH2Djh27Fhp/eGHH670/Dt27GhYO3ToUKXnvuOOO0rrt912W2n9o48+aljbtGlT6bZPPfVUaX3+/Pml9SuvvLK0ng1ndiAJwg4kQdiBJAg7kARhB5Ig7EAShB1IgnH2Grz11lul9WXLlpXWh4aGKu1/8uTJDWu33npr6bbr168vrU+bNq20ft555eeLxx9/vGFtcHCwdNvLLrustP7cc+UzhM+ePbu0nk3TM7vtLbaHbe8fs2ya7edtv1HcTm1vmwCqmsjL+F9JuvaMZfdJ2h0R8yXtLh4D6GFNwx4RL0p6/4zFyyRtLe5vlXRjzX0BqFmrF+j6I+Jocf9dSQ1/iMz2KtuDtgdHRkZa3B2AqipfjY+IkBQl9c0RMRARA319fVV3B6BFrYb9mO2ZklTcDtfXEoB2aDXsOyWtLO6vlNT4O5YAekLTcXbb2yVdI2m67cOSfiZpraTf2b5d0juSbmlnk72ubJxbkmbNmlVarzrO/sgjjzSs3XXXXZWeu5n33nuvtL5hw4aGtZMnT5Zue/PNN5fWGUc/O03DHhHLG5QW19wLgDbi47JAEoQdSIKwA0kQdiAJwg4kwVdca9Bs2uKNGzeW1i+55JJK+2/2k8rt9Nhjj5XWDxw40LDW7Ou3N9xwQ0s9YXyc2YEkCDuQBGEHkiDsQBKEHUiCsANJEHYgCcbZO+Ciiy4qra9Zs6a0/uijj9bZzlk5depUab3Z13NnzJjRsHbnnXeWbnvVVVeV1nF2OLMDSRB2IAnCDiRB2IEkCDuQBGEHkiDsQBKMs3fApEmTSuurV68urS9durS0vnDhwrPuaaJOnDhRWn/yySdL60uWLGlYYxy9szizA0kQdiAJwg4kQdiBJAg7kARhB5Ig7EASjLP3gIsvvrhSvZ2eeeaZru0b9Wp6Zre9xfaw7f1jlj1k+4jtvcXf9e1tE0BVE3kZ/ytJ146z/OcRsaD421VvWwDq1jTsEfGipPc70AuANqpygW617deKl/lTG61ke5XtQduDIyMjFXYHoIpWw75J0nckLZB0VNL6RitGxOaIGIiIgb6+vhZ3B6CqlsIeEcci4ouIOC3pl5IW1dsWgLq1FHbbM8c8/IGk/Y3WBdAbmo6z294u6RpJ020flvQzSdfYXiApJB2U9KM29ogu2rWrfKDlnnvuKa0/+OCDdbaDCpqGPSKWj7P4iTb0AqCN+LgskARhB5Ig7EAShB1IgrADSfAV1+SGh4dL659++mlpffLkyaX1Cy644Kx7QntwZgeSIOxAEoQdSIKwA0kQdiAJwg4kQdiBJBhnT+7yyy8vrX/88cel9XvvvbfOdtBGnNmBJAg7kARhB5Ig7EAShB1IgrADSRB2IAnG2VGq2ffVFy9e3KFOUBVndiAJwg4kQdiBJAg7kARhB5Ig7EAShB1IgnH2c9z27dtL6x988EFpfcaMGXW2gy5qema3Pcf2n2y/bnvI9o+L5dNsP2/7jeJ2avvbBdCqibyM/1zSTyPiUklXSbrb9qWS7pO0OyLmS9pdPAbQo5qGPSKORsQrxf0PJR2QNEvSMklbi9W2SrqxXU0CqO6sLtDZnivpu5L2SOqPiKNF6V1J/Q22WWV70PbgyMhIhVYBVDHhsNueLOn3kn4SEV+5qhMRISnG2y4iNkfEQEQM9PX1VWoWQOsmFHbb39Jo0H8TEX8oFh+zPbOoz5RUPh0ogK5qOvRm25KekHQgIjaMKe2UtFLS2uJ2R1s6RCVDQ0Ol9VOnTpXWN27cWGc76KKJjLN/T9IKSfts7y2W3a/RkP/O9u2S3pF0S3taBFCHpmGPiD9LcoMyv1wAfEPwcVkgCcIOJEHYgSQIO5AEYQeSIOxAEoQdSIKwA0kQdiAJwg4kQdiBJAg7kARhB5Lgp6TPAWvXrm1Y27lzZ+m2CxYsKK1fccUVLfWE3sOZHUiCsANJEHYgCcIOJEHYgSQIO5AEYQeSYJz9HPDCCy80rO3bt6/Sc7/66qul9QsvvLDS86NzOLMDSRB2IAnCDiRB2IEkCDuQBGEHkiDsQBITmZ99jqRfS+qXFJI2R8QvbD8k6d8ljRSr3h8Ru9rVKBqbN29e255727ZtpfWlS5e2bd+o10Q+VPO5pJ9GxCu2p0h62fbzRe3nEbGufe0BqMtE5mc/Kulocf9D2wckzWp3YwDqdVbv2W3PlfRdSXuKRattv2Z7i+2pDbZZZXvQ9uDIyMh4qwDogAmH3fZkSb+X9JOI+EDSJknfkbRAo2f+9eNtFxGbI2IgIgb6+vpqaBlAKyYUdtvf0mjQfxMRf5CkiDgWEV9ExGlJv5S0qH1tAqiqadhtW9ITkg5ExIYxy2eOWe0HkvbX3x6Aukzkavz3JK2QtM/23mLZ/ZKW216g0eG4g5J+1JYO0dS6dY0HRI4cOVK67eLFi0vrN910U0s9ofdM5Gr8nyV5nBJj6sA3CJ+gA5Ig7EAShB1IgrADSRB2IAnCDiTBT0mfA6ZMmdKw9uyzz3awE/QyzuxAEoQdSIKwA0kQdiAJwg4kQdiBJAg7kIQjonM7s0ckvTNm0XRJxzvWwNnp1d56tS+J3lpVZ2//FBHj/v5bR8P+tZ3bgxEx0LUGSvRqb73al0RvrepUb7yMB5Ig7EAS3Q775i7vv0yv9tarfUn01qqO9NbV9+wAOqfbZ3YAHULYgSS6Enbb19r+i+03bd/XjR4asX3Q9j7be20PdrmXLbaHbe8fs2ya7edtv1HcjjvHXpd6e8j2keLY7bV9fZd6m2P7T7Zftz1k+8fF8q4eu5K+OnLcOv6e3fYkSX+V9G+SDkt6SdLyiHi9o400YPugpIGI6PoHMGx/X9JJSb+OiH8plv2XpPcjYm3xH+XUiPiPHuntIUknuz2NdzFb0cyx04xLulHSberisSvp6xZ14Lh148y+SNKbEfF2RPxN0m8lLetCHz0vIl6U9P4Zi5dJ2lrc36rRfywd16C3nhARRyPileL+h5K+nGa8q8eupK+O6EbYZ0k6NObxYfXWfO8h6Y+2X7a9qtvNjKM/Io4W99+V1N/NZsbRdBrvTjpjmvGeOXatTH9eFRfovu7qiFgo6TpJdxcvV3tSjL4H66Wx0wlN490p40wz/nfdPHatTn9eVTfCfkTSnDGPZxfLekJEHCluhyU9rd6bivrYlzPoFrfDXe7n73ppGu/xphlXDxy7bk5/3o2wvyRpvu15tr8t6YeSdnahj6+xfX5x4US2z5e0RL03FfVOSSuL+ysl7ehiL1/RK9N4N5pmXF0+dl2f/jwiOv4n6XqNXpF/S9J/dqOHBn39s6RXi7+hbvcmabtGX9Z9ptFrG7dL+kdJuyW9Ien/JE3rod62Sdon6TWNBmtml3q7WqMv0V+TtLf4u77bx66kr44cNz4uCyTBBTogCcIOJEHYgSQIO5AEYQeSIOxAEoQdSOL/AQe88PwDu2A0AAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "401\n"
     ]
    }
   ],
   "source": [
    "batch_xs, batch_ys = mnist.train.next_batch(1)\n",
    "chosen=0\n",
    "gen_image(batch_xs[chosen]).show()\n",
    "data = batch_xs[chosen].reshape((1,784))\n",
    "features = [\"X\"+str(i+1) for i in range (0,784)]\n",
    "request = {\"data\":{\"names\":features,\"ndarray\":data.tolist()}}\n",
    "deploymentName = \"mnist-classifier\"\n",
    "uri = \"http://istio-ingressgateway.istio-system.svc.cluster.local/seldon/\"+deploymentName+\"/api/v0.1/predictions\"\n",
    "\n",
    "response = requests.post(\n",
    "    uri,\n",
    "    json=request)\n",
    "\n",
    "print(response.status_code)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Origin authentication failed.\n"
     ]
    }
   ],
   "source": [
    "print(response.text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}


================================================
FILE: ch2_seldon_examples/run_example.sh
================================================
#!/bin/bash
#tag::buildPipeline[]
dsl-compile --py train_pipeline.py --output job.yaml
#end::buildPipeline[]
#tag::connectToWebUI[]
# If you're on minikube and not using a loadbalancer:
minikube service --url -n istio-system istio-ingressgateway
# If your on GCP https://<kf_app_name>.endpoints.<gcp_project_name>.cloud.goog/
# If you're on vanilla K8s
INGRESS_HOST=$(kubectl -n istio-system get service istio-ingressgateway \
		       -o jsonpath='{.status.loadBalancer.ingress[0].ip}')
export INGRESS_HOST
INGRESS_PORT=$(kubectl -n istio-system get service istio-ingressgateway \
	       -o jsonpath='{.spec.ports[?(@.name=="http2")].port}')
export INGRESS_PORT
SECURE_INGRESS_PORT=$(kubectl -n istio-system get service istio-ingressgateway \
		      -o jsonpath='{.spec.ports[?(@.name=="https")].port}')
export SECURE_INGRESS_PORT

kubectl get svc istio-ingressgateway -n istio-system
#end::connectToWebUI[]


================================================
FILE: ch2_seldon_examples/setup_example.sh
================================================
#!/bin/bash

set -ex

echo "Setting up example"

unset ch2_example_path
ch2_example_path="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
echo "Using path ${ch2_example_path} for our example path"
example_path=$(dirname "${ch2_example_path}")
#tag::generate_kf_app_p1[]
# Pick the correct config file for your platform from
# https://github.com/kubeflow/manifests/tree/[version]/kfdef
# You can download & edit the configuration at this point if you need to.
# For generic k8s with istio:
MANIFEST_BRANCH=${MANIFEST_BRANCH:-v1.0-branch}
export MANIFEST_BRANCH
MANIFEST_VERSION=${MANIFEST_VERSION:-v1.0.1}
export MANIFEST_VERSION

KF_PROJECT_NAME=${KF_PROJECT_NAME:-hello-kf-${PLATFORM}}
export KF_PROJECT_NAME
mkdir "${KF_PROJECT_NAME}"
pushd "${KF_PROJECT_NAME}"

manifest_root=https://raw.githubusercontent.com/kubeflow/manifests/
# On most enviroments this will create a "vanilla" kubeflow install using istio.
KFDEF=${manifest_root}${MANIFEST_BRANCH}/kfdef/kfctl_k8s_istio.${MANIFEST_VERSION}.yaml
#end::generate_kf_app_p1[]
# On GCP this will create a cluster with basic authentication
if [ "$PLATFORM" == "gcp" ]; then
  KFDEF=${manifest_root}${MANIFEST_BRANCH}/kfdef/kfctl_gcp_iap.${MANIFEST_VERSION}.yaml
  # Temp hack
  cp "${example_path}/kfctl_gcp_iap.v1.0.1.yaml" ./
  KFDEF=./kfctl_gcp_iap.v1.0.1.yaml
  # Set up IAP
  # TODO(holden)
  # Set up environment variables for GCP
  export PROJECT=${PROJECT:-"<your GCP project name>"}
  gcloud config set project "${PROJECT}"
  export ZONE=${ZONE:-"<your GCP zone>"}
  gcloud config set compute/zone "${ZONE}"
fi
pwd
#tag::generate_kf_app_p2[]
kfctl apply -f $KFDEF -V
echo $?

popd
#end::generate_kf_app_p2[]


# TODO(trevor): what version/tag?
#tag::cloneSeldonExample[]
# Clone the base seldon example
git clone https://github.com/kubeflow/example-seldon
#end::cloneSeldonExample[]


================================================
FILE: ch2_seldon_examples/tf_mnist_no_seldon_pipeline.py
================================================
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Kubeflow Pipelines MNIST example

Run this script to compile pipeline
"""

import kfp.dsl as dsl
import kfp.gcp as gcp
import kfp.onprem as onprem

gcs_or_pvc = 'PVC'


@dsl.pipeline(name='MNIST',
              description='A pipeline to train and serve the MNIST example.')
def mnist_pipeline(gcs_bucket=None,
                   train_steps='200',
                   learning_rate='0.01',
                   batch_size='100'):
    """
    Pipeline with three stages:
      1. train an MNIST classifier
      2. deploy a tf-serving instance to the cluster
      3. deploy a web-ui to interact with it
    """

    vop = None
    volume = None
    if gcs_or_pvc == "PVC":
        vop = dsl.VolumeOp(name="create_pvc",
                           resource_name="nfs-1",
                           modes=dsl.VOLUME_MODE_RWO,
                           size="10G")
        volume = vop.volume

    train = dsl.ContainerOp(
        name='train',
        image=
        'gcr.io/kubeflow-examples/mnist/model:v20190304-v0.2-176-g15d997b',
        arguments=[
            "/opt/model.py", "--tf-export-dir", gcs_bucket or "/mnt",
            "--tf-train-steps", train_steps, "--tf-batch-size", batch_size,
            "--tf-learning-rate", learning_rate
        ])

    serve_args = [
        '--model-export-path', gcs_bucket or "/mnt", '--server-name',
        "mnist-service"
    ]
    if gcs_or_pvc != 'GCS':
        serve_args.extend(
            ['--cluster-name', "mnist-pipeline", '--pvc-name', volume])

    serve = dsl.ContainerOp(
        name='serve',
        image='gcr.io/ml-pipeline/ml-pipeline-kubeflow-deployer:'
        '7775692adf28d6f79098e76e839986c9ee55dd61',
        arguments=serve_args)
    serve.after(train)

    webui_args = [
        '--image', 'gcr.io/kubeflow-examples/mnist/web-ui:'
        'v20190304-v0.2-176-g15d997b-pipelines', '--name', 'web-ui',
        '--container-port', '5000', '--service-port', '80', '--service-type',
        "LoadBalancer"
    ]

    web_ui = dsl.ContainerOp(
        name='web-ui',
        image='gcr.io/kubeflow-examples/mnist/deploy-service:latest',
        arguments=webui_args)
    web_ui.after(serve)

    steps = [train, serve, web_ui]
    for step in steps:
        if gcs_or_pvc == 'GCS':
            step.apply(gcp.use_gcp_secret('user-gcp-sa'))
        else:
            step.after(vop)
            step.add_pvolumes({"/mnt": volume})


if __name__ == '__main__':
    import kfp.compiler as compiler
    compiler.Compiler().compile(mnist_pipeline, __file__ + '.tar.gz')


================================================
FILE: ch2_seldon_examples/tiller_rbac.yaml
================================================
apiVersion: v1
kind: ServiceAccount
metadata:
  name: tiller
  namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: tiller
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: cluster-admin
subjects:
  - kind: ServiceAccount
    name: tiller
    namespace: kube-system

================================================
FILE: ch2_seldon_examples/train_pipeline.py
================================================
import kfp.dsl as dsl
import kfp.gcp as gcp
import kfp.onprem as onprem

from string import Template
import json


@dsl.pipeline(name='Simple sci-kit KF Pipeline',
              description='A simple end to end sci-kit seldon kf pipeline')
def mnist_train_pipeline(docker_org="index.docker.io/seldonio",
                         train_container_version="0.2",
                         serve_container_version="0.1"):

    vop = dsl.VolumeOp(name="create_pvc",
                       resource_name="nfs-1",
                       modes=dsl.VOLUME_MODE_RWO,
                       size="10G")
    volume = vop.volume
    train = dsl.ContainerOp(
        name='sk-train',
        image=
        f"{docker_org}/skmnistclassifier_trainer:{train_container_version}",
        pvolumes={"/data": volume})

    seldon_serving_json_template = Template("""
{
	"apiVersion": "machinelearning.seldon.io/v1alpha2",
	"kind": "SeldonDeployment",
	"metadata": {
		"labels": {
			"app": "seldon"
		},
		"name": "mnist-classifier"
	},
	"spec": {
		"annotations": {
			"deployment_version": "v1",
			"project_name": "MNIST Example"
		},
		"name": "mnist-classifier",
		"predictors": [
			{
				"annotations": {
					"predictor_version": "v1"
				},
				"componentSpecs": [
					{
						"spec": {
							"containers": [
								{
									"image": "$dockerreposerving:$dockertagserving",
									"imagePullPolicy": "Always",
									"name": "mnist-classifier",
									"volumeMounts": [
										{
											"mountPath": "/data",
											"name": "persistent-storage"
										}
									]
								}
							],
							"terminationGracePeriodSeconds": 1,
							"volumes": [
								{
									"name": "persistent-storage",
									"persistentVolumeClaim": {
											"claimName": "$modelpvc"
									}
								}
							]
						}
					}
				],
				"graph": {
					"children": [],
					"endpoint": {
						"type": "REST"
					},
					"name": "mnist-classifier",
					"type": "MODEL"
				},
				"name": "mnist-classifier",
				"replicas": 1
			}
		]
	}
}    
""")
    seldon_serving_json = seldon_serving_json_template.substitute({
        'dockerreposerving':
        f"{docker_org}/skmnistclassifier_runtime",
        'dockertagserving':
        str(serve_container_version),
        'modelpvc':
        vop.outputs["name"]
    })

    seldon_deployment = json.loads(seldon_serving_json)

    serve = dsl.ResourceOp(
        name='serve',
        k8s_resource=seldon_deployment,
        success_condition='status.state == Available').after(train)


# If we're called directly create an expirement and run
if __name__ == '__main__':
    pipeline_func = mnist_train_pipeline
    pipeline_filename = pipeline_func.__name__ + '.pipeline.zip'
    import kfp.compiler as compiler
    compiler.Compiler().compile(pipeline_func, pipeline_filename)
    expirement_name = "cheese"
    experiment = client.create_experiment(expirement_name)
    run_name = pipeline_func.__name__ + ' run'
    run_result = client.run_pipeline(experiment.id, run_name,
                                     pipeline_filename, arguments)
    print(run_result)


================================================
FILE: ch9/ctscans/DICOM Denoising Pipeline.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Defaulting to user installation because normal site-packages is not writeable\n",
      "Collecting kfp\n",
      "  Downloading kfp-0.5.1.tar.gz (119 kB)\n",
      "\u001b[K     |████████████████████████████████| 119 kB 3.5 MB/s eta 0:00:01\n",
      "\u001b[?25hRequirement already satisfied: PyYAML in /usr/local/lib/python3.6/dist-packages (from kfp) (5.3)\n",
      "Requirement already satisfied: google-cloud-storage>=1.13.0 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.25.0)\n",
      "Requirement already satisfied: kubernetes<12.0.0,>=8.0.0 in /usr/local/lib/python3.6/dist-packages (from kfp) (10.0.1)\n",
      "Requirement already satisfied: google-auth>=1.6.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.11.0)\n",
      "Collecting requests_toolbelt>=0.8.0\n",
      "  Downloading requests_toolbelt-0.9.1-py2.py3-none-any.whl (54 kB)\n",
      "\u001b[K     |████████████████████████████████| 54 kB 4.0 MB/s  eta 0:00:01\n",
      "\u001b[?25hRequirement already satisfied: cloudpickle in /usr/local/lib/python3.6/dist-packages (from kfp) (1.2.2)\n",
      "Collecting kfp-server-api<0.6.0,>=0.2.5\n",
      "  Downloading kfp-server-api-0.5.0.tar.gz (39 kB)\n",
      "Requirement already satisfied: jsonschema>=3.0.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (3.2.0)\n",
      "Collecting tabulate\n",
      "  Downloading tabulate-0.8.7-py3-none-any.whl (24 kB)\n",
      "Collecting click\n",
      "  Downloading click-7.1.2-py2.py3-none-any.whl (82 kB)\n",
      "\u001b[K     |████████████████████████████████| 82 kB 1.5 MB/s  eta 0:00:01\n",
      "\u001b[?25hCollecting Deprecated\n",
      "  Downloading Deprecated-1.2.9-py2.py3-none-any.whl (8.6 kB)\n",
      "Collecting strip-hints\n",
      "  Downloading strip-hints-0.1.9.tar.gz (30 kB)\n",
      "Requirement already satisfied: google-cloud-core<2.0dev,>=1.2.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (1.3.0)\n",
      "Requirement already satisfied: google-resumable-media<0.6dev,>=0.5.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (0.5.0)\n",
      "Requirement already satisfied: six>=1.9.0 in /usr/lib/python3/dist-packages (from kubernetes<12.0.0,>=8.0.0->kfp) (1.11.0)\n",
      "Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes<12.0.0,>=8.0.0->kfp) (0.57.0)\n",
      "Requirement already satisfied: python-dateutil>=2.5.3 in /usr/local/lib/python3.6/dist-packages (from kubernetes<12.0.0,>=8.0.0->kfp) (2.8.1)\n",
      "Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from kubernetes<12.0.0,>=8.0.0->kfp) (2.22.0)\n",
      "Requirement already satisfied: requests-oauthlib in /usr/local/lib/python3.6/dist-packages (from kubernetes<12.0.0,>=8.0.0->kfp) (1.3.0)\n",
      "Requirement already satisfied: setuptools>=21.0.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes<12.0.0,>=8.0.0->kfp) (45.1.0)\n",
      "Requirement already satisfied: urllib3>=1.24.2 in /usr/local/lib/python3.6/dist-packages (from kubernetes<12.0.0,>=8.0.0->kfp) (1.25.8)\n",
      "Requirement already satisfied: certifi>=14.05.14 in /usr/local/lib/python3.6/dist-packages (from kubernetes<12.0.0,>=8.0.0->kfp) (2019.11.28)\n",
      "Requirement already satisfied: rsa<4.1,>=3.1.4 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0)\n",
      "Requirement already satisfied: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0.0)\n",
      "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (0.2.8)\n",
      "Requirement already satisfied: attrs>=17.4.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (19.3.0)\n",
      "Requirement already satisfied: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (1.4.0)\n",
      "Requirement already satisfied: pyrsistent>=0.14.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (0.15.7)\n",
      "Requirement already satisfied: wrapt<2,>=1.10 in /usr/local/lib/python3.6/dist-packages (from Deprecated->kfp) (1.11.2)\n",
      "Requirement already satisfied: wheel in /usr/lib/python3/dist-packages (from strip-hints->kfp) (0.30.0)\n",
      "Requirement already satisfied: google-api-core<2.0.0dev,>=1.16.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.16.0)\n",
      "Requirement already satisfied: idna<2.9,>=2.5 in /usr/lib/python3/dist-packages (from requests->kubernetes<12.0.0,>=8.0.0->kfp) (2.6)\n",
      "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->kubernetes<12.0.0,>=8.0.0->kfp) (3.0.4)\n",
      "Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib->kubernetes<12.0.0,>=8.0.0->kfp) (3.1.0)\n",
      "Requirement already satisfied: pyasn1>=0.1.3 in /usr/local/lib/python3.6/dist-packages (from rsa<4.1,>=3.1.4->google-auth>=1.6.1->kfp) (0.4.8)\n",
      "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata; python_version < \"3.8\"->jsonschema>=3.0.1->kfp) (2.1.0)\n",
      "Requirement already satisfied: protobuf>=3.4.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (3.11.2)\n",
      "Requirement already satisfied: pytz in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (2019.3)\n",
      "Requirement already satisfied: googleapis-common-protos<2.0dev,>=1.6.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.51.0)\n",
      "Building wheels for collected packages: kfp, kfp-server-api, strip-hints\n",
      "  Building wheel for kfp (setup.py) ... \u001b[?25ldone\n",
      "\u001b[?25h  Created wheel for kfp: filename=kfp-0.5.1-py3-none-any.whl size=163151 sha256=da5b540ae9834d37659146f0576997ffd8f7a7e2b305e1eb7b2a99dd4745930b\n",
      "  Stored in directory: /home/jovyan/.cache/pip/wheels/2f/26/f9/e3836cb6e6cabd63ef912304e18a852ac29cb870a4a0b85f98\n",
      "  Building wheel for kfp-server-api (setup.py) ... \u001b[?25ldone\n",
      "\u001b[?25h  Created wheel for kfp-server-api: filename=kfp_server_api-0.5.0-py3-none-any.whl size=106319 sha256=84f55948cc254c0f836dffdfd51574a828ae8a503a2ca9198acf7a27ca2aaea7\n",
      "  Stored in directory: /home/jovyan/.cache/pip/wheels/73/36/4e/bfe2efeeea4f74f04984ebe1d44136202b72191302f4760951\n",
      "  Building wheel for strip-hints (setup.py) ... \u001b[?25ldone\n",
      "\u001b[?25h  Created wheel for strip-hints: filename=strip_hints-0.1.9-py2.py3-none-any.whl size=24671 sha256=3bcfd573a91f5f4c46d23509ac3fee9a0cf351b414e00ed505a8f71d0e6a1141\n",
      "  Stored in directory: /home/jovyan/.cache/pip/wheels/21/6d/fa/7ed7c0560e1ef39ebabd5cc0241e7fca711660bae1ad752e2b\n",
      "Successfully built kfp kfp-server-api strip-hints\n",
      "Installing collected packages: requests-toolbelt, kfp-server-api, tabulate, click, Deprecated, strip-hints, kfp\n",
      "\u001b[33m  WARNING: The script tabulate is installed in '/home/jovyan/.local/bin' which is not on PATH.\n",
      "  Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\n",
      "\u001b[33m  WARNING: The script strip-hints is installed in '/home/jovyan/.local/bin' which is not on PATH.\n",
      "  Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\n",
      "\u001b[33m  WARNING: The scripts dsl-compile and kfp are installed in '/home/jovyan/.local/bin' which is not on PATH.\n",
      "  Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\u001b[0m\n",
      "Successfully installed Deprecated-1.2.9 click-7.1.2 kfp-0.5.1 kfp-server-api-0.5.0 requests-toolbelt-0.9.1 strip-hints-0.1.9 tabulate-0.8.7\n",
      "\u001b[33mWARNING: You are using pip version 20.0.2; however, version 20.1 is available.\n",
      "You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "!pip3 install kfp\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import kfp\n",
    "import kubernetes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "container_manifest = {\n",
    "    \"apiVersion\": \"sparkoperator.k8s.io/v1beta2\",\n",
    "    \"kind\": \"SparkApplication\",\n",
    "    \"metadata\": {\n",
    "        \"name\": \"spark-app\",\n",
    "        \"namespace\": \"kubeflow\"\n",
    "    },\n",
    "    \"spec\": {\n",
    "        \"type\": \"Scala\",\n",
    "        \"mode\": \"cluster\",\n",
    "        \"image\": \"docker.io/rawkintrevo/covid-basis-vectors:0.2.0\",\n",
    "        \"imagePullPolicy\": \"Always\",\n",
    "        \"hadoopConf\": {\n",
    "            \"fs.gs.project.id\": \"kubeflow-hacky-hacky\",\n",
    "            \"fs.gs.system.bucket\": \"covid-dicoms\",\n",
    "            \"fs.gs.impl\" : \"com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem\",\n",
    "            \"google.cloud.auth.service.account.enable\": \"true\",\n",
    "            \"google.cloud.auth.service.account.json.keyfile\": \"/mnt/secrets/user-gcp-sa.json\",\n",
    "        },\n",
    "        \"mainClass\": \"org.rawkintrevo.covid.App\",\n",
    "        \"mainApplicationFile\": \"local:///covid-0.1-jar-with-dependencies.jar\", # See the Dockerfile\n",
    "        \"arguments\": [\"245\", \"15\", \"1\"],\n",
    "        \"sparkVersion\": \"2.4.5\",\n",
    "        \"restartPolicy\": {\n",
    "            \"type\": \"Never\"\n",
    "        },\n",
    "        \"driver\": {\n",
    "            \"cores\": 1,\n",
    "            \"secrets\": [\n",
    "                {\"name\": \"user-gcp-sa\",\n",
    "                 \"path\": \"/mnt/secrets\",\n",
    "                 \"secretType\": \"GCPServiceAccount\"\n",
    "                 }\n",
    "            ],\n",
    "\n",
    "            \"coreLimit\": \"1200m\",\n",
    "            \"memory\": \"512m\",\n",
    "            \"labels\": {\n",
    "                \"version\": \"2.4.5\",\n",
    "            },\n",
    "            \"serviceAccount\": \"spark-operatoroperator-sa\", # also try spark-operatoroperator-sa\n",
    "        },\n",
    "        \"executor\": {\n",
    "            \"cores\": 1,\n",
    "            \"secrets\": [\n",
    "                {\"name\": \"user-gcp-sa\",\n",
    "                 \"path\": \"/mnt/secrets\",\n",
    "                 \"secretType\": \"GCPServiceAccount\"\n",
    "                 }\n",
    "            ],\n",
    "            \"instances\": 4,\n",
    "            \"memory\": \"4084m\"\n",
    "        },\n",
    "        \"labels\": {\n",
    "            \"version\": \"2.4.5\"\n",
    "        },\n",
    "\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "from kfp.gcp import use_gcp_secret\n",
    "@kfp.dsl.pipeline(\n",
    "    name=\"Covid DICOM Pipe v2\",\n",
    "    description=\"Create Basis Vectors for Lung Images\"\n",
    ")\n",
    "def covid_dicom_pipeline():\n",
    "    vop = kfp.dsl.VolumeOp(\n",
    "        name=\"requisition-PVC\",\n",
    "        resource_name=\"datapvc\",\n",
    "        size=\"20Gi\", #10 Gi blows up...\n",
    "        modes=kfp.dsl.VOLUME_MODE_RWO\n",
    "    )\n",
    "    step1 = kfp.dsl.ContainerOp(\n",
    "        name=\"download-dicom\",\n",
    "        image=\"rawkintrevo/download-dicom:0.0.0.4\",\n",
    "        command=[\"/run.sh\"],\n",
    "        pvolumes={\"/data\": vop.volume}\n",
    "    )\n",
    "    step2 = kfp.dsl.ContainerOp(\n",
    "        name=\"convert-dicoms-to-vectors\",\n",
    "        image=\"rawkintrevo/covid-prep-dicom:0.9.5\",\n",
    "        arguments=[\n",
    "            '--bucket_name', \"covid-dicoms\",\n",
    "        ],\n",
    "        command=[\"python\", \"/program.py\"],\n",
    "        pvolumes={\"/mnt/data\": step1.pvolume}\n",
    "    ).apply(kfp.gcp.use_gcp_secret(secret_name='user-gcp-sa'))\n",
    "    rop = kfp.dsl.ResourceOp(\n",
    "        name=\"calculate-basis-vectors\",\n",
    "        k8s_resource=container_manifest,\n",
    "        action=\"create\",\n",
    "        success_condition=\"status.applicationState.state == COMPLETED\"\n",
    "    ).after(step2)\n",
    "    pyviz = kfp.dsl.ContainerOp(\n",
    "        name=\"visualize-slice-of-dicom\",\n",
    "        image=\"rawkintrevo/visualize-dicom-output:0.0.11\",\n",
    "        command=[\"python\", \"/program.py\"],\n",
    "        arguments=[\n",
    "            '--bucket_name', \"covid-dicoms\",\n",
    "        ],\n",
    "    ).apply(kfp.gcp.use_gcp_secret(secret_name='user-gcp-sa')).after(rop)\n",
    "    \n",
    "\n",
    "kfp.compiler.Compiler().compile(covid_dicom_pipeline,\"dicom-pipeline-2.zip\")\n",
    "client = kfp.Client()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "Experiment link <a href=\"/pipeline/#/experiments/details/a7292089-5186-4e53-b0bb-9264dfbb9775\" target=\"_blank\" >here</a>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "Run link <a href=\"/pipeline/#/runs/details/0f3f3d01-f6c4-4216-8e03-396c49fa040f\" target=\"_blank\" >here</a>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "\n",
    "my_experiment = client.create_experiment(name='my-experiments')\n",
    "my_run = client.run_pipeline(my_experiment.id, 'my-run1', 'dicom-pipeline-2.zip')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}


================================================
FILE: ch9/ctscans/calculate-basis-vectors/Dockerfile
================================================
FROM gcr.io/spark-operator/spark:v2.4.5-gcs-prometheus

COPY target/covid-0.1-jar-with-dependencies.jar /

## Someday soon we'll live in a world where this hack is unnessecary
# https://github.com/GoogleCloudDataproc/hadoop-connectors/issues/323
CMD rm /opt/spark/jars/gcs-connector-latest-hadoop2.jar
ADD https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop2-2.0.1.jar $SPARK_HOME/jars

ENTRYPOINT ["/opt/entrypoint.sh"]


================================================
FILE: ch9/ctscans/calculate-basis-vectors/build-component.sh
================================================
#!/usr/bin/env bash

image_name=rawkintrevo/covid-basis-vectors # Specify the image name here
image_tag=0.2.0
full_image_name=${image_name}:${image_tag}

cd "$(dirname "$0")"
docker build -t "${full_image_name}" .
docker push "$full_image_name"


================================================
FILE: ch9/ctscans/calculate-basis-vectors/pom.xml
================================================
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>org.rawkintrevo</groupId>
  <artifactId>covid</artifactId>
  <version>0.1</version>
  <inceptionYear>2020</inceptionYear>
  <properties>
    <scala.version>2.11.12</scala.version>
  </properties>

  <repositories>
    <repository>
      <id>scala-tools.org</id>
      <name>Scala-Tools Maven2 Repository</name>
      <url>http://scala-tools.org/repo-releases</url>
    </repository>
  </repositories>

  <pluginRepositories>
    <pluginRepository>
      <id>scala-tools.org</id>
      <name>Scala-Tools Maven2 Repository</name>
      <url>http://scala-tools.org/repo-releases</url>
    </pluginRepository>
  </pluginRepositories>

  <dependencies>
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>${scala.version}</version>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.4</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.specs</groupId>
      <artifactId>specs</artifactId>
      <version>1.2.5</version>
      <scope>test</scope>
    </dependency>

    <dependency>
      <groupId>org.apache.mahout</groupId>
      <artifactId>mahout-core_2.11</artifactId>
      <version>14.1-SNAPSHOT</version>
    </dependency>

    <dependency>
      <groupId>org.apache.mahout</groupId>
      <artifactId>mahout-hdfs_2.11</artifactId>
      <version>14.1-SNAPSHOT</version>
    </dependency>

    <dependency>
      <groupId>org.apache.mahout</groupId>
      <artifactId>mahout-spark_2.11</artifactId>
      <version>14.1-SNAPSHOT</version>
    </dependency>


  </dependencies>

  <build>
    <sourceDirectory>src/main/scala</sourceDirectory>
    <testSourceDirectory>src/test/scala</testSourceDirectory>
    <plugins>
      <plugin>
        <groupId>org.scala-tools</groupId>
        <artifactId>maven-scala-plugin</artifactId>
        <executions>
          <execution>
            <goals>
              <goal>compile</goal>
              <goal>testCompile</goal>
            </goals>
          </execution>
        </executions>
        <configuration>
          <scalaVersion>${scala.version}</scalaVersion>
          <args>
            <arg>-target:jvm-1.5</arg>
          </args>
        </configuration>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-eclipse-plugin</artifactId>
        <configuration>
          <downloadSources>true</downloadSources>
          <buildcommands>
            <buildcommand>ch.epfl.lamp.sdt.core.scalabuilder</buildcommand>
          </buildcommands>
          <additionalProjectnatures>
            <projectnature>ch.epfl.lamp.sdt.core.scalanature</projectnature>
          </additionalProjectnatures>
          <classpathContainers>
            <classpathContainer>org.eclipse.jdt.launching.JRE_CONTAINER</classpathContainer>
            <classpathContainer>ch.epfl.lamp.sdt.launching.SCALA_CONTAINER</classpathContainer>
          </classpathContainers>
        </configuration>
      </plugin>
      <!-- This builds the fat JAR -->
      <plugin>
        <artifactId>maven-assembly-plugin</artifactId>
        <configuration>
          <archive>
            <manifest>
              <mainClass>org.rawkintrevo.covid.App</mainClass>
            </manifest>
          </archive>
          <descriptorRefs>
            <descriptorRef>jar-with-dependencies</descriptorRef>
          </descriptorRefs>
        </configuration>
        <executions>
          <execution>
            <id>make-assembly</id>
            <phase>package</phase>
            <goals>
              <goal>single</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
  <reporting>
    <plugins>
      <plugin>
        <groupId>org.scala-tools</groupId>
        <artifactId>maven-scala-plugin</artifactId>
        <configuration>
          <scalaVersion>${scala.version}</scalaVersion>
        </configuration>
      </plugin>
    </plugins>
  </reporting>
</project>


================================================
FILE: ch9/ctscans/calculate-basis-vectors/src/main/scala/org/rawkintrevo/covid/App.scala
================================================
package org.rawkintrevo.covid

import org.apache.mahout.math._
import org.apache.mahout.math.scalabindings._
import org.apache.mahout.math.drm._
import org.apache.mahout.math.scalabindings.RLikeOps._
import org.apache.mahout.math.drm.RLikeDrmOps._
import org.apache.mahout.sparkbindings._
import org.apache.mahout.math.decompositions._
import org.apache.mahout.math.scalabindings.MahoutCollections._

import org.apache.spark.SparkContext
import org.apache.spark.SparkConf

import org.apache.spark.SparkFiles

object App {
  def main(args: Array[String]) {

    val conf:SparkConf = new SparkConf()
      .setAppName("Calculate CT Scan Basis Vectors")
      .set("spark.kryo.referenceTracking", "false")
      .set("spark.kryo.registrator", "org.apache.mahout.sparkbindings.io.MahoutKryoRegistrator")
      .set("spark.kryoserializer.buffer", "32")
      .set("spark.kryoserializer.buffer.max" , "600m")
      .set("spark.serializer",	"org.apache.spark.serializer.KryoSerializer")

    //create spark context object
    val sc = new SparkContext(conf)
    implicit val sdc: org.apache.mahout.sparkbindings.SparkDistributedContext = sc2sdc(sc)


    val pathToMatrix = "gs://covid-dicoms/s.csv"  // todo make this an arg.

    val voxelRDD:DrmRdd[Int]  = sc.textFile(pathToMatrix)
      .map(s => dvec( s.split(",")
      .map(f => f.toDouble)))
      .zipWithIndex
      .map(o => (o._2.toInt, o._1))

    val voxelDRM = drmWrap(voxelRDD)

    // k, p, q should all be cli parameters
    // k is rank of the output e.g. the number of eigenfaces we want out.
    // p is oversampling parameter,
    // and q is the number of additional power iterations
    // Read https://mahout.apache.org/users/dim-reduction/ssvd.html
    val k = args(0).toInt
    val p = args(1).toInt
    val q = args(2).toInt

    val(drmU, drmV, s) = dssvd(voxelDRM.t, k, p, q)

    val V = drmV.checkpoint().rdd.saveAsTextFile("gs://covid-dicoms/drmV")
    val U = drmU.t.checkpoint().rdd.saveAsTextFile("gs://covid-dicoms/drmU")

    sc.parallelize(s.toArray,1).saveAsTextFile("gs://covid-dicoms/s")
    println("The job is done!")
  }
}

// $SPARK_HOME/bin/spark-submit --driver-memory 4G --executor-memory 4G --class org.rawkintrevo.book.App *jar

================================================
FILE: ch9/ctscans/download-dicom/Dockerfile
================================================
FROM gcr.io/google.com/cloudsdktool/cloud-sdk:latest
#
## install gsutil lightly
#RUN  apt update \
#  && apt install -y wget
#RUN wget https://storage.googleapis.com/pub/gsutil.tar.gz
#RUN tar xfz gsutil.tar.gz -C $HOME
#ENV PATH="${PATH}:$HOME/gsutil"

COPY ./run.sh /run.sh


================================================
FILE: ch9/ctscans/download-dicom/build-component.sh
================================================
#!/usr/bin/env bash

image_name=rawkintrevo/download-dicom # Specify the image name here
image_tag=0.0.0.4
full_image_name=${image_name}:${image_tag}

cd "$(dirname "$0")"
docker build -t "${full_image_name}" .
docker push "$full_image_name"


================================================
FILE: ch9/ctscans/download-dicom/run.sh
================================================
#!/usr/bin/env bash
set -e

# 1st arg- case number (leading zero required if < 10), defaults to case1

if [ -z "${1}" ]
then
      CASE="01"
else
      CASE="${1}"
fi


echo "Downloading DICOMs"
# If not on GCP need to download this
gsutil cp gs://covid-dicoms/covid-dicoms.tar.gz /tmp/covid-dicoms.tar.gz
tar -xzf /tmp/covid-dicoms.tar.gz -C /tmp

mv "/tmp/case0${CASE}/axial" /data/dicom


================================================
FILE: ch9/ctscans/process-dicoms-into-vectors/Dockerfile
================================================
FROM pydicom/dicom:v3.6.5

# From https://github.com/HealthplusAI/python3-gdcm
RUN apt update && apt install -y python-vtk6 libvtk6-dev cmake-curses-gui swig python3-dev libpython3.7-dev
## checkinstall missing...
RUN ln -s /opt/conda/bin/* /usr/local/bin
RUN git clone --branch release git://git.code.sf.net/p/gdcm/gdcm
RUN mkdir build
RUN cd build && cmake -D CMAKE_BUILD_TYPE=Release -D CMAKE_C_FLAGS=-fPIC -D CMAKE_CXX_FLAGS=-fPIC -D GDCM_BUILD_SHARED_LIBS:BOOL=ON \
    -D GDCM_WRAP_PYTHON=ON -D PYTHON_EXECUTABLE=/usr/local/bin/python3.7 \
    -D PYTHON_INCLUDE_DIR=/usr/include/python3.7m/ -D GDCM_BUILD_SHARED_LIBS=ON -D GDCM_USE_VTK=ON ../gdcm
## They forgot this line in instuctions, but is important...
RUN cd build && make install
# checkinstall -D -y --pkgversion --pkgname=python3-gdcm --pkgversion=1  # checkinstall doesn't exist in debian?
RUN cp /usr/local/lib/gdcm.py /opt/conda/lib/python3.7/site-packages/
RUN cp /usr/local/lib/gdcmswig.py /opt/conda/lib/python3.7/site-packages/
RUN cp /usr/local/lib/_gdcmswig.so /opt/conda/lib/python3.7/site-packages/
RUN cp /usr/local/lib/libgdcm* /opt/conda/lib/python3.7/site-packages/
RUN ldconfig

# RUN mkdir /data already exists in base
# todo move these to requirements.txt
RUN pip install numpy
RUN pip install scipy
RUN pip install google-cloud-storage

ENV GOOGLE_APPLICATION_CREDENTIALS="/secret/gcp-credentials/user-gcp-sa.json"
COPY src/program.py /program.py


================================================
FILE: ch9/ctscans/process-dicoms-into-vectors/build-component.sh
================================================
#!/usr/bin/env bash

image_name=rawkintrevo/covid-prep-dicom # Specify the image name here
image_tag=0.9.5
full_image_name=${image_name}:${image_tag}

cd "$(dirname "$0")"
docker build -t "${full_image_name}" .
docker push "$full_image_name"


================================================
FILE: ch9/ctscans/process-dicoms-into-vectors/data/s.150.csv
================================================


================================================
FILE: ch9/ctscans/process-dicoms-into-vectors/process-dicoms-into-vectors.yaml
================================================
name: Process DICOMs into Vectors
description: Take a number of COVID DICOMs - output a list of vectors for DS-SVD.
inputs:
  - {name: in, type: String, description='Input file name.'}
  - {name: out, type: String, description='Output file name.'}
implementation:
  container:
    image: rawkintrevo/covid-prep-docim
    command: [
      python, /program.py,
      {inputValue:  in},
      inputValue:  out}
    ]

================================================
FILE: ch9/ctscans/process-dicoms-into-vectors/src/program.py
================================================
from os import listdir
import numpy as np
import pydicom

import argparse
from google.cloud import storage

parser = argparse.ArgumentParser(
    description='Process DICOM Images into Vectors.')
parser.add_argument('--input_dir',
                    type=str,
                    default="/mnt/data/dicom",
                    help='Directory containing DICOM Images')
parser.add_argument('--bucket_name',
                    type=str,
                    help='name of bucket to write output to.')
parser.add_argument('--output_file',
                    type=str,
                    default="s.csv",
                    help='file name of dcm converted to 2d numerical matrix')

args = parser.parse_args()


def create_3d_matrix(path):
    dicoms = [pydicom.dcmread(f"{path}/{f}") for f in listdir(path)]
    slices = [d for d in dicoms if hasattr(d, "SliceLocation")]
    slices = sorted(slices, key=lambda s: s.SliceLocation)
    ps = slices[0].PixelSpacing
    ss = slices[0].SliceThickness
    ax_aspect = ps[1] / ps[0]
    sag_aspect = ps[1] / ss
    cor_aspect = ss / ps[0]

    # create 3D array
    img_shape = list(slices[0].pixel_array.shape)
    img_shape.append(len(slices))
    img3d = np.zeros(img_shape)

    for i, s in enumerate(slices):
        img2d = s.pixel_array
        img3d[:, :, i] = img2d

    return {
        "img3d": img3d,
        "img_shape": img_shape,
        "ax_aspect": ax_aspect,
        "sag_aspect": sag_aspect,
        "cor_aspect": cor_aspect
    }


def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Uploads a file to the bucket."""
    # bucket_name = "your-bucket-name"
    # source_file_name = "local/path/to/file"
    # destination_blob_name = "storage-object-name"

    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    blob.upload_from_filename(source_file_name)

    print("File {} uploaded to {}.".format(source_file_name,
                                           destination_blob_name))


input_dir = args.input_dir
output_file = args.output_file

m = create_3d_matrix(f"{input_dir}")
np.savetxt("/tmp/s.csv",
           m['img3d'].reshape((-1, m['img_shape'][2])),
           delimiter=",")

upload_blob(args.bucket_name, "/tmp/s.csv", output_file)


================================================
FILE: ch9/ctscans/visualize-basis-vectors/Dockerfile
================================================
FROM python:3-buster

RUN pip install numpy
RUN pip install matplotlib
RUN pip install google-cloud-storage
COPY src/program.py /program.py

CMD ["python" , "/program.py"]

================================================
FILE: ch9/ctscans/visualize-basis-vectors/build-component.sh
================================================
#!/usr/bin/env bash

image_name=rawkintrevo/visualize-dicom-output # Specify the image name here
image_tag=0.0.11
full_image_name=${image_name}:${image_tag}

cd "$(dirname "$0")"
docker build -t "${full_image_name}" .
docker push "$full_image_name"


================================================
FILE: ch9/ctscans/visualize-basis-vectors/src/program.py
================================================
from ast import literal_eval

from os import listdir

import matplotlib.pyplot as plt
import numpy as np

import argparse
from google.cloud import storage

parser = argparse.ArgumentParser(
    description='Convert DRMs into DICOMs and Images')
parser.add_argument('--bucket_name',
                    type=str,
                    help='name of bucket to write output to.')
args = parser.parse_args()


def read_mahout_drm(path):
    data = {}
    counter = 0
    parts = [p for p in listdir(path) if "part"]
    for p in parts:
        with open(f"{path}/{p}", 'r') as f:
            lines = f.read().split("\n")
            for l in lines[:-1]:
                counter += 1
                t = literal_eval(l)
                arr = np.array([t[1][i] for i in range(len(t[1].keys()))])
                data[t[0]] = arr
    print(f"read {counter} lines from {path}")
    return data


def plot_3d_matrix(img3d, img_shape, ax_aspect, sag_aspect, cor_aspect):
    # plot 3 orthogonal slices
    a1 = plt.subplot(2, 2, 1)
    plt.imshow(img3d[:, :, img_shape[2] // 2])
    a1.set_aspect(ax_aspect)

    a2 = plt.subplot(2, 2, 2)
    plt.imshow(img3d[:, img_shape[1] // 2, :])
    a2.set_aspect(sag_aspect)

    a3 = plt.subplot(2, 2, 3)
    plt.imshow(img3d[img_shape[0] // 2, :, :].T)
    a3.set_aspect(cor_aspect)
    plt.show(cmap=plt.cm.bone)


def plot_2_3d_matrices(img1, img2, aspect, slice, cmap):
    a1 = plt.subplot(1, 2, 1)
    plt.imshow(img1[:, slice, :], cmap=cmap)
    a1.set_aspect(aspect)

    a2 = plt.subplot(1, 2, 2)
    plt.imshow(img2[:, slice, :], cmap=cmap)
    a2.set_aspect(aspect)


def upload_blob(bucket_name, source_file_name, destination_blob_name):
    """Uploads a file to the bucket."""
    # bucket_name = "your-bucket-name"
    # source_file_name = "local/path/to/file"
    # destination_blob_name = "storage-object-name"

    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)

    blob.upload_from_filename(source_file_name)

    print("File {} uploaded to {}.".format(source_file_name,
                                           destination_blob_name))


def download_folder(bucket_name='your-bucket-name',
                    bucket_dir='your-bucket-directory/',
                    dl_dir="local-dir/"):
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)
    blobs = bucket.list_blobs(prefix=bucket_dir)  # Get list of files
    for blob in blobs:
        filename = blob.name.replace('/', '_')
        blob.download_to_filename(dl_dir + filename)  # Download


import os

bucket_name = args.bucket_name
os.mkdir('/tmp/drmU')
os.mkdir('/tmp/drmV')
os.mkdir('/tmp/s')

download_folder(bucket_name, "drmU/", "/tmp/drmU/")
download_folder(bucket_name, "drmV/", "/tmp/drmV/")
download_folder(bucket_name, "s/", "/tmp/s/")

drmU = read_mahout_drm("/tmp/drmU")
drmV = read_mahout_drm("/tmp/drmV")

print(os.listdir("/tmp"))
print(os.listdir("/tmp/s"))

drmU_p5 = np.transpose(np.array([drmU[i] for i in range(len(drmU.keys()))]))
drmV_p5 = np.array([drmV[i] for i in range(len(drmV.keys()))])

with open(f"/tmp/s/s_part-00000", 'r') as f:
    diags = [float(d) for d in f.read().split('\n') if d != '']

recon = drmU_p5 @ np.diag(diags) @ drmV_p5.transpose()
# plot_3d_matrix(recon.transpose().reshape((512,512,301)), (512,512,301), 1.0, 0.810547, 1.2337347494963278)
composite_img = recon.transpose().reshape((512, 512, 301))

diags_orig = diags
percs = [0.001, 0.01, 0.05, 0.1, 0.3]

for p in range(len(percs)):
    perc = percs[p]
    diags = [
        diags_orig[i] if i < round(len(diags) - (len(diags) * perc)) else 0
        for i in range(len(diags))
    ]
    recon = drmU_p5 @ np.diag(diags) @ drmV_p5.transpose()
    # plot_3d_matrix(recon.transpose().reshape((512,512,301)), (512,512,301), 1.0, 0.810547, 1.2337347494963278)
    composite_img = recon.transpose().reshape((512, 512, 301))
    a1 = plt.subplot(1, 1, 1)
    plt.imshow(composite_img[:, :, 150], cmap=plt.cm.bone)
    plt.title(
        f"{perc*100}% denoised.  (k={len(diags)}, oversample=15, power_iters=2)"
    )
    a1.set_aspect(1.0)
    plt.axis('off')
    fname = f"{100-(perc*100)}%-denoised-img.png"
    plt.savefig(f"/tmp/{fname}")
    upload_blob(bucket_name, f"/tmp/{fname}", f"/output/{fname}")


================================================
FILE: ci.sh
================================================
#!/bin/bash

set -ex

# Check all the shell scripts
find ./ -iregex '^.+\.sh$' -type f -print0 | \
  xargs -0 shellcheck -e SC1091 -e SC2164 -e SC1090
# Check for cases where I use tags rather than tag
bad_tags=$(grep -r "tags::" ./ | grep -v "ci.sh:" || true)
# Look for long lines
long_lines=$(grep --include '*.sh' --exclude '*venv*' -Hnr '.\{90\}' ./ || true)
if [[ -n "$bad_tags" ]]; then
  echo "Found bad tags $bad_tags replace tags with tag"
fi
if [[ -n "$long_lines" ]]; then
  print "Found long lines:\n$long_lines"
fi
if [[ -n "$bad_tags" ]] || [[ -n "$long_lines" ]]; then
  exit 1
fi
./runthrough.sh


================================================
FILE: convert_notebooks.sh
================================================
#!/bin/bash
find . -name "*ipynb" |grep -v venv | xargs -d '\n' ipython3 nbconvert --to script


================================================
FILE: data-extraction/README.md
================================================
## Data Extraction

To successfully construct a machine learning pipeline we need to collect the data we are going to train on.
The data extraction is organized here by the different use case.


In many introduction to machine learning examples the data is pre-extracted, and sometimes even pre-cleaned.
Here we will show some ways to collect the initial data.
Once the initial training data has been extracted, we will continue on downstream with data cleaning, and
may later do some data augmentation.

================================================
FILE: data-extraction/github_comments_query.bsql
================================================
SELECT pull_request_url,
 ANY_VALUE(pull_patch_url) as pull_patch_url,
 ARRAY_AGG(comment_position) as comments_positions,
 ARRAY_AGG(diff_hunk) as diff_hunks,
 ARRAY_AGG(comment_original_position) as comments_original_positions,
 ARRAY_AGG(comment_commit_id IGNORE NULLS) as comment_commit_ids,
 ARRAY_AGG(comment_file_path IGNORE NULLS) as comment_file_paths  FROM (
   SELECT *, JSON_EXTRACT(payload, '$.action') AS action,
   JSON_EXTRACT(payload, '$.pull_request.url') AS pull_request_url,
   JSON_EXTRACT(payload, '$.pull_request.patch_url') AS pull_patch_url,
   IFNULL(JSON_EXTRACT(payload, '$.comment.original_position'), "-1") AS comment_original_position,
   IFNULL(JSON_EXTRACT(payload, '$.comment.position'), "-1") AS comment_position,
   JSON_EXTRACT(payload, '$.comment.commit_id') AS comment_commit_id,
   JSON_EXTRACT(payload, '$.comment.path') AS comment_file_path
   FROM "githubarchive.day.*"
   WHERE type = "PullRequestReviewCommentEvent")
 GROUP BY pull_request_url

================================================
FILE: data-extraction/github_issues_query.bsql
================================================
SELECT repo.name, JSON_EXTRACT(payload, '$.issue.url') 
AS url FROM (
  SELECT *, JSON_EXTRACT(payload, '$.action') AS action
  FROM "githubarchive.day.*" WHERE type = "IssuesEvent")
WHERE type = "IssuesEvent"  AND action = "\"opened\""

================================================
FILE: data-extraction/iot/basic.yaml
================================================
apiVersion: batch/v1
kind: Job
metadata:
  name: iot-data-extraction
  namespace: kubeflow
spec:
  template:
    spec:
      containers:
      - env:
        - name: GOOGLE_APPLICATION_CREDENTIALS
          value: /secret/gcp-credentials/user-gcp-sa.json
        image: IMAGE_NAME
        name: gh-data-extract-gh-job
        volumeMounts:
        - mountPath: /secret/gcp-credentials
          name: secret-volume
          readOnly: true
      restartPolicy: OnFailure
      volumes:
      - name: secret-volume
        secret:
          secretName: user-gcp-sa


================================================
FILE: data-extraction/iot/build.sh
================================================
#!/bin/bash

CONTAINER_REGISTRY="gcr.io/${PROJECT_NAME}"
#tag::buildandpush[]
TARGET="${CONTAINER_REGISTRY}/kf-steps/iot-extract:v2"
docker build . -t "${TARGET}"
docker push "${TARGET}"
#end::buildandpush[]
#tag::run[]
kubectl apply -f iot_extract_job.yaml
#end::run[]
#tag::verify[]
kubectl get jobs |grep gh-data
#end::verify[]


================================================
FILE: data-extraction/python-notebook/AddSpamassassinDockerfile
================================================
ARG base
FROM $base
# Run as root for updates
USER root
# Install Spamassassin
RUN apt-get update && \
    apt-get install -yq spamassassin spamc && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* && \
    rm -rf /var/cache/apt
# Switch back to the expected user
USER jovyan

================================================
FILE: data-extraction/python-notebook/MailingListDataPrep.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here we can install some packages our notebook needs. We can also install them in our container to speed things up & make it more reliable. But for prototyping this works great!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "jupyter": {
     "outputs_hidden": true
    }
   },
   "outputs": [],
   "source": [
    "!pip3 install --upgrade lxml\n",
    "!pip3 install --upgrade pandas\n",
    "!pip3 install --upgrade scikit-learn\n",
    "!pip3 install --upgrade scipy\n",
    "!pip3 install --upgrade tables"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can use Jupyter notebooks just like normal inside of Kubeflow"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datetime import datetime\n",
    "from requests import get\n",
    "from lxml import etree\n",
    "from time import sleep\n",
    "\n",
    "import re\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "container_registry = \"\" # Wherever you put your containers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def scrapeMailArchives(mailingList: str, year: int, month: int):\n",
    "    baseUrl = \"http://mail-archives.apache.org/mod_mbox/%s/%s.mbox/ajax/\" % (mailingList, datetime(year,month,1).strftime(\"%Y%m\"))\n",
    "    r = get(baseUrl + \"thread?0\")\n",
    "    utf8_parser = etree.XMLParser(encoding='utf-8')\n",
    "    root = etree.fromstring(r.text.replace('encoding=\"UTF-8\"', \"\"),  parser=utf8_parser)\n",
    "    output = []\n",
    "    for message in root.xpath(\"//message\"):\n",
    "        _id = message.get(\"id\")\n",
    "        linked = message.get(\"linked\")\n",
    "        depth = message.get(\"depth\")\n",
    "        fr = message.xpath(\"from\")[0].text\n",
    "        dt = message.xpath(\"date\")[0].text ## todo convert to date\n",
    "        subject = message.xpath(\"subject\")[0].text\n",
    "        r2 = get(baseUrl + _id)\n",
    "        bodyRoot = etree.fromstring(r2.text.replace('encoding=\"UTF-8\"', \"\"),  parser=utf8_parser)\n",
    "        body = bodyRoot.xpath(\"//contents\")[0].text\n",
    "        record = {\n",
    "            \"id\"        : _id,\n",
    "            \"linked\"    : linked,\n",
    "            \"depth\"     : depth,\n",
    "            \"from\"      : fr,\n",
    "            \"dt\"        : dt,\n",
    "            \"subject\"   : subject,\n",
    "            \"body\"      : body\n",
    "        }\n",
    "        output.append(record)\n",
    "        sleep(0.1)\n",
    "    return output\n",
    "\n",
    "\n",
    "def extract_links(body):\n",
    "    link_regex_str = r'(http(|s)://(.*?))([\\s\\n]|$)'\n",
    "    itr = re.finditer(link_regex_str, body, re.MULTILINE)\n",
    "    return list(map(lambda elem: elem.group(1), itr))\n",
    "\n",
    "def extract_domains(links):\n",
    "    from urllib.parse import urlparse\n",
    "    def extract_domain(link):\n",
    "        try:\n",
    "            nloc = urlparse(link).netloc\n",
    "            # We want to drop www and any extra spaces wtf nloc on the spaces.\n",
    "            regex_str = r'^(www\\.|)(.*?)\\s*$'\n",
    "            match = re.search(regex_str, nloc)\n",
    "            return match.group(2)\n",
    "        except:\n",
    "            return None\n",
    "    return list(map(extract_domain, links))\n",
    "\n",
    "def contains_python_stack_trace(body):\n",
    "    return \"Traceback (most recent call last)\" in body\n",
    "\n",
    "def contains_probably_java_stack_trace(body):\n",
    "    # Look for something based on regex\n",
    "    # Tried https://stackoverflow.com/questions/20609134/regular-expression-optional-multiline-java-stacktrace - more msg looking\n",
    "    # Tried https://stackoverflow.com/questions/3814327/regular-expression-to-parse-a-log-file-and-find-stacktraces\n",
    "    # Yes the compile is per call, but it's cached so w/e\n",
    "    import re\n",
    "    stack_regex_str = r'^\\s*(.+Exception.*):\\n(.*\\n){0,3}?(\\s+at\\s+.*\\(.*\\))+'\n",
    "    match = re.search(stack_regex_str, body, re.MULTILINE)\n",
    "    return match is not None\n",
    "\n",
    "def contains_exception_in_task(body):\n",
    "    # Look for a line along the lines of ERROR Executor: Exception in task\n",
    "    return \"ERROR Executor: Exception in task\" in body"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "datesToScrape =  [(2019, i) for i in range(1,13)]\n",
    "\n",
    "records = []\n",
    "for y,m in datesToScrape:\n",
    "    print(m,\"-\",y)\n",
    "    records += scrapeMailArchives(\"spark-dev\", y, m)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame(records)\n",
    "df['links'] = df['body'].apply(extract_links)\n",
    "df['containsPythonStackTrace'] = df['body'].apply(contains_python_stack_trace)\n",
    "df['containsJavaStackTrace'] = df['body'].apply(contains_probably_java_stack_trace)\n",
    "df['containsExceptionInTaskBody'] = df['body'].apply(contains_exception_in_task)\n",
    "\n",
    "df['domains'] = df['links'].apply(extract_domains)\n",
    "df['isThreadStart'] = df['depth'] == '0'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "bodyV = TfidfVectorizer()\n",
    "# bodyV = TfidfVectorizer(max_features=10000) #if we cared about making this 1:1 w holden's code.\n",
    "bodyFeatures = bodyV.fit_transform(df['body'])\n",
    "\n",
    "domainV = TfidfVectorizer()\n",
    "# domainV = TfidfVectorizer(max_features=100)\n",
    "\n",
    "## A couple of \"None\" domains really screwed the pooch on this one. Also, no lists just space seperated domains.\n",
    "def makeDomainsAList(d):\n",
    "    return ' '.join([a for a in d if not a is None])\n",
    "\n",
    "domainFeatures = domainV.fit_transform(df['domains'].apply(makeDomainsAList))\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "from scipy.sparse import csr_matrix, hstack\n",
    "\n",
    "data = hstack([csr_matrix(df[['containsPythonStackTrace', 'containsJavaStackTrace', 'containsExceptionInTaskBody', 'isThreadStart']].to_numpy()),\n",
    "                             bodyFeatures,\n",
    "                            domainFeatures])\n",
    "\n",
    "\n",
    "from sklearn.cluster import KMeans\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "train, test = train_test_split(data, test_size=0.1)\n",
    "\n",
    "kmeans = KMeans(n_clusters=2, random_state=42).fit(train)\n",
    "train_pred = kmeans.predict(train)\n",
    "test_pred = kmeans.predict(test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Alternatively, by structuring our code correctly we can take advantage of pipelines"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip3 install --upgrade kfp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import kfp\n",
    "import kfp.dsl as dsl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def download_data(year: int) -> str:\n",
    "    \n",
    "    from datetime import datetime\n",
    "    from lxml import etree\n",
    "    from requests import get\n",
    "    from time import sleep\n",
    "    \n",
    "    import json\n",
    "    \n",
    "    def scrapeMailArchives(mailingList: str, year: int, month: int):\n",
    "        baseUrl = \"http://mail-archives.apache.org/mod_mbox/%s/%s.mbox/ajax/\" % (mailingList, datetime(year,month,1).strftime(\"%Y%m\"))\n",
    "        r = get(baseUrl + \"thread?0\")\n",
    "        utf8_parser = etree.XMLParser(encoding='utf-8')\n",
    "        root = etree.fromstring(r.text.replace('encoding=\"UTF-8\"', \"\"),  parser=utf8_parser)\n",
    "        output = []\n",
    "        for message in root.xpath(\"//message\"):\n",
    "            _id = message.get(\"id\")\n",
    "            linked = message.get(\"linked\")\n",
    "            depth = message.get(\"depth\")\n",
    "            fr = message.xpath(\"from\")[0].text\n",
    "            dt = message.xpath(\"date\")[0].text ## todo convert to date\n",
    "            subject = message.xpath(\"subject\")[0].text\n",
    "            r2 = get(baseUrl + _id)\n",
    "            bodyRoot = etree.fromstring(r2.text.replace('encoding=\"UTF-8\"', \"\"),  parser=utf8_parser)\n",
    "            body = bodyRoot.xpath(\"//contents\")[0].text\n",
    "            record = {\n",
    "                \"id\"        : _id,\n",
    "                \"linked\"    : linked,\n",
    "                \"depth\"     : depth,\n",
    "                \"from\"      : fr,\n",
    "                \"dt\"        : dt,\n",
    "                \"subject\"   : subject,\n",
    "                \"body\"      : body\n",
    "            }\n",
    "            output.append(record)\n",
    "            sleep(0.1)\n",
    "            \n",
    "        return output\n",
    "\n",
    "    datesToScrape =  [(year, i) for i in range(1,2)]\n",
    "\n",
    "    records = []\n",
    "    ## todo, go back further\n",
    "    for y,m in datesToScrape:\n",
    "        print(m,\"-\",y)\n",
    "        records += scrapeMailArchives(\"spark-dev\", y, m)\n",
    "    import os\n",
    "    output_path = '/data_processing/data.json'\n",
    "    with open(output_path, 'w') as f:\n",
    "        json.dump(records, f)\n",
    "    \n",
    "    return output_path\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def download_tld_data() -> str:\n",
    "    from requests import get\n",
    "    import pandas as pd\n",
    "    print(\"importing io....\")\n",
    "    import io\n",
    "\n",
    "    url = \"https://pkgstore.datahub.io/core/country-list/data_csv/data/d7c9d7cfb42cb69f4422dec222dbbaa8/data_csv.csv\"\n",
    "    print(\"Getting the url\")\n",
    "    s = get(url).content\n",
    "    print(\"Converting content\")\n",
    "    df = pd.read_csv(io.StringIO(s.decode('utf-8')))\n",
    "    print(\"Writing output\")\n",
    "    output_path_hdf = '/tld_info/clean_data.hdf'\n",
    "    df.to_hdf(output_path_hdf, key=\"tld\")\n",
    "    \n",
    "    return output_path_hdf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now that we have some data, we want to get rid of any \"bad\" records"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#tag::clean_data_fun[]\n",
    "def clean_data(input_path: str) -> str:\n",
    "    import json\n",
    "    import pandas as pd\n",
    "    \n",
    "    print(\"loading records...\")\n",
    "    with open(input_path, 'r') as f:\n",
    "        records = json.load(f)\n",
    "    print(\"records loaded\")\n",
    "    \n",
    "    df = pd.DataFrame(records)\n",
    "    # Drop records without a subject, body, or sender\n",
    "    cleaned = df.dropna(subset=[\"subject\", \"body\", \"from\"])\n",
    "    \n",
    "    output_path_hdf = '/data_processing/clean_data.hdf'\n",
    "    cleaned.to_hdf(output_path_hdf, key=\"clean\")\n",
    "    \n",
    "    return output_path_hdf\n",
    "#end::clean_data_fun[]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Preparing the data\n",
    "\n",
    "Remember earlier when we did that big (and arguably pointless) classification of emails from the Apache Spark mailing list? OK, now we're going to do it again, as a \"lightweight\" Python function in a Kubeflow Pipeline.  I hope the irony of the term \"lightweight\" isn't lost on anyone, because this is pretty blatent abuse of something that was originally presented for conveinience. \n",
    "\n",
    "First note, all of the imports and declarations of helper functions MUST be with in the \"ligthweight\" function. One could argue (and they would probably be correct) that I have two steps here- feature prep and ML, and as such I should split them. I would say that's fair, but I choose not to do so at this time.  Perhaps in some scripts later on?\n",
    "\n",
    "As has been pointed out so many times before, we assume the reader either arleady understands what is going on with the KMeans clustering, or better yet, doesn't even care. I won't be digging into that right now. What I will point out- and maybe as a note to the editor, the model that is finally saved really ought to be persisted somewhere.  If the model isn't saved, then this basically pointless pipeline, is truly pointless. \n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now let's make sure we can read that data in the next step (before we write a big complicated model to do whatever torture to it)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def prepare_features(input_path: str, tld_info_path: str):\n",
    "   \n",
    "    import re\n",
    "    import pandas as pd\n",
    "    \n",
    "    print(\"loading records...\")\n",
    "    df = pd.read_hdf(input_path, key=\"clean\")\n",
    "    print(\"records loaded\")\n",
    "    \n",
    "    print(\"Loading tld info....\")\n",
    "    tld_df = pd.read_hdf(tld_info_path, key=\"tld\")\n",
    "    print(\"Loaded tld info\")\n",
    "    \n",
    "    \n",
    "    ## Note: \"Lightweight\" Python Fns mean helper code must be inside the fn. (Bad Form)\n",
    "    def extract_links(body):\n",
    "        link_regex_str = r'(http(|s)://(.*?))([\\s\\n]|$)'\n",
    "        itr = re.finditer(link_regex_str, body, re.MULTILINE)\n",
    "        return list(map(lambda elem: elem.group(1), itr))\n",
    "\n",
    "    def extract_domains(links):\n",
    "        from urllib.parse import urlparse\n",
    "        def extract_domain(link):\n",
    "            try:\n",
    "                nloc = urlparse(link).netloc\n",
    "                # We want to drop www and any extra spaces wtf nloc on the spaces.\n",
    "                regex_str = r'^(www\\.|)(.*?)\\s*$'\n",
    "                match = re.search(regex_str, nloc)\n",
    "                return match.group(2)\n",
    "            except:\n",
    "                return None\n",
    "        return list(map(extract_domain, links))\n",
    "\n",
    "    def contains_python_stack_trace(body):\n",
    "        return \"Traceback (most recent call last)\" in body\n",
    "\n",
    "    def contains_probably_java_stack_trace(body):\n",
    "        # Look for something based on regex\n",
    "        # Tried https://stackoverflow.com/questions/20609134/regular-expression-optional-multiline-java-stacktrace - more msg looking\n",
    "        # Tried https://stackoverflow.com/questions/3814327/regular-expression-to-parse-a-log-file-and-find-stacktraces\n",
    "        # Yes the compile is per call, but it's cached so w/e\n",
    "        import re\n",
    "        stack_regex_str = r'^\\s*(.+Exception.*):\\n(.*\\n){0,3}?(\\s+at\\s+.*\\(.*\\))+'\n",
    "        match = re.search(stack_regex_str, body, re.MULTILINE)\n",
    "        return match is not None\n",
    "\n",
    "    def contains_exception_in_task(body):\n",
    "        # Look for a line along the lines of ERROR Executor: Exception in task\n",
    "        return \"ERROR Executor: Exception in task\" in body\n",
    "\n",
    "    print(df.shape)\n",
    "    df['links'] = df['body'].apply(extract_links)\n",
    "    df['containsPythonStackTrace'] = df['body'].apply(contains_python_stack_trace)\n",
    "    df['containsJavaStackTrace'] = df['body'].apply(contains_probably_java_stack_trace)\n",
    "    df['containsExceptionInTaskBody'] = df['body'].apply(contains_exception_in_task)\n",
    "\n",
    "    #tag::local_mailing_list_feature_prep_fun[]\n",
    "    df['domains'] = df['links'].apply(extract_domains)\n",
    "    df['isThreadStart'] = df['depth'] == '0'\n",
    "    \n",
    "    # Arguably, you could split building the dataset away from the actual witchcraft.\n",
    "    from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "    bodyV = TfidfVectorizer()\n",
    "    bodyFeatures = bodyV.fit_transform(df['body'])\n",
    "\n",
    "    domainV = TfidfVectorizer()\n",
    "\n",
    "    ## A couple of \"None\" domains really screwed the pooch on this one.Also, no lists just space seperated domains.\n",
    "    def makeDomainsAList(d):\n",
    "        return ' '.join([a for a in d if not a is None])\n",
    "\n",
    "    domainFeatures = domainV.fit_transform(df['domains'].apply(makeDomainsAList))\n",
    "\n",
    "    from scipy.sparse import csr_matrix, hstack\n",
    "\n",
    "    data = hstack([csr_matrix(df[['containsPythonStackTrace',\n",
    "                                  'containsJavaStackTrace',\n",
    "                                  'containsExceptionInTaskBody', \n",
    "                                  'isThreadStart']].to_numpy()),\n",
    "                                 bodyFeatures,\n",
    "                                domainFeatures])\n",
    "    #end::local_mailing_list_feature_prep_fun[]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "### The Kubeflow Bit.\n",
    "\n",
    "Now we can put these two pieces together into a pipeline. Since the data is relatively small we will use a persistent volume put them together. Later on we can add training to this pipeline as well.\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Make a volume example. We redo it inside of the pipeline definition because we need to be inside\n",
    "#tag::makeVolume[]\n",
    "dvop = dsl.VolumeOp(\n",
    "    name=\"create_pvc\",\n",
    "    resource_name=\"my-pvc-2\",\n",
    "    size=\"5Gi\",\n",
    "    modes=dsl.VOLUME_MODE_RWO)\n",
    "#end::makeVolume[]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!rm local-data-prep-2.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#tag::makePipeline[]\n",
    "@kfp.dsl.pipeline(\n",
    "  name='Simple1',\n",
    "  description='Simple1'\n",
    ")\n",
    "def my_pipeline_mini(year: int):\n",
    "    dvop = dsl.VolumeOp(\n",
    "        name=\"create_pvc\",\n",
    "        resource_name=\"my-pvc-2\",\n",
    "        size=\"5Gi\",\n",
    "        modes=dsl.VOLUME_MODE_RWO)\n",
    "    tldvop = dsl.VolumeOp(\n",
    "        name=\"create_pvc\",\n",
    "        resource_name=\"tld-volume-2\",\n",
    "        size=\"100Mi\",\n",
    "        modes=dsl.VOLUME_MODE_RWO)\n",
    "    download_data_op = kfp.components.func_to_container_op(\n",
    "        download_data,\n",
    "        packages_to_install=['lxml', 'requests'])\n",
    "    download_tld_info_op = kfp.components.func_to_container_op(\n",
    "        download_tld_data,\n",
    "        packages_to_install=['requests', 'pandas>=0.24', 'tables'])\n",
    "    clean_data_op = kfp.components.func_to_container_op(\n",
    "        clean_data,\n",
    "        packages_to_install=['pandas>=0.24', 'tables'])\n",
    "\n",
    "    step1 = download_data_op(year).add_pvolumes({\"/data_processing\": dvop.volume})\n",
    "    step2 = clean_data_op(input_path=step1.output).add_pvolumes({\"/data_processing\": dvop.volume})\n",
    "    step3 = download_tld_info_op().add_pvolumes({\"/tld_info\": tldvop.volume})\n",
    "\n",
    "kfp.compiler.Compiler().compile(my_pipeline_mini, 'local-data-prep-2.zip')\n",
    "#end::makePipeline[]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!rm *.zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#tag::longPipeline[]\n",
    "@kfp.dsl.pipeline(\n",
    "  name='Simple1',\n",
    "  description='Simple1'\n",
    ")\n",
    "def my_pipeline2(year: int):\n",
    "    dvop = dsl.VolumeOp(\n",
    "        name=\"create_pvc\",\n",
    "        resource_name=\"my-pvc-2\",\n",
    "        size=\"5Gi\",\n",
    "        modes=dsl.VOLUME_MODE_RWO)\n",
    "    tldvop = dsl.VolumeOp(\n",
    "        name=\"create_pvc\",\n",
    "        resource_name=\"tld-volume-2\",\n",
    "        size=\"100Mi\",\n",
    "        modes=dsl.VOLUME_MODE_RWO)\n",
    "\n",
    "    download_data_op = kfp.components.func_to_container_op(\n",
    "        download_data,\n",
    "        packages_to_install=['lxml', 'requests'])\n",
    "    download_tld_info_op = kfp.components.func_to_container_op(\n",
    "        download_tld_data,\n",
    "        packages_to_install=['requests', 'pandas>=0.24', 'tables'])\n",
    "    clean_data_op = kfp.components.func_to_container_op(\n",
    "        clean_data,\n",
    "        packages_to_install=['pandas>=0.24', 'tables'])\n",
    "#tag::add_feature_step[]\n",
    "    prepare_features_op = kfp.components.func_to_container_op(\n",
    "        prepare_features,\n",
    "        packages_to_install=['pandas>=0.24', 'tables', 'scikit-learn'])\n",
    "#tag::end_feature_step[]\n",
    "\n",
    "    step1 = download_data_op(year).add_pvolumes({\"/data_processing\": dvop.volume})\n",
    "    step2 = clean_data_op(input_path=step1.output).add_pvolumes({\"/data_processing\": dvop.volume})\n",
    "    step3 = download_tld_info_op().add_pvolumes({\"/tld_info\": tldvop.volume})\n",
    "    step4 = prepare_features_op(input_path=step2.output, tld_info_path=step3.output).add_pvolumes({\n",
    "        \"/data_processing\": dvop.volume,\n",
    "        \"/tld_info\": tldvop.volume})\n",
    "#end::longPipeline[]\n",
    "\n",
    "kfp.compiler.Compiler().compile(my_pipeline2, 'local-data-and-feature-prep-2.zip')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "client = kfp.Client()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "my_experiment = client.create_experiment(name='local-data-prep-test-2')\n",
    "my_run = client.run_pipeline(my_experiment.id, 'local-data-prep', \n",
    "  'local-data-and-feature-prep-2.zip', params={'year': '2019'})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "If we were using Spamassasin or some other library installed in a different base container we would:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Clean data with custom container\n",
    "#tag::cleanDataWithContainer[]\n",
    "clean_data_op = kfp.components.func_to_container_op(\n",
    "    clean_data,\n",
    "    base_image=\"{0}/kubeflow/spammassisan\".format(container_registry),\n",
    "    packages_to_install=['pandas>=0.24', 'tables'])\n",
    "#end::cleanDataWithContainer[]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def train_func(input_path: String):\n",
    "    from sklearn.cluster import KMeans\n",
    "    from sklearn.model_selection import train_test_split\n",
    "\n",
    "    train, test = train_test_split(data, test_size=0.1)\n",
    "\n",
    "    kmeans = KMeans(n_clusters=2, random_state=42).fit(train)\n",
    "    train_pred = kmeans.predict(train)\n",
    "    test_pred = kmeans.predict(test)\n",
    "    print(test_pred)\n",
    "    # TODO: Dump the model somewhere you can use it later. "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "And just like that, we've done it. We've created a Kubeflow Pipeline.\n",
    "\n",
    "So let's take a moment to step back and think, \"what in the crazy-town-heck is going on here?!\".  A valid question, and well spotted.  Each \"Step\" is going to be creating a container.  Maybe I should have noted that earlier when talking about attatching volumes, beacuse if you thougth I was doing that to a function, you'd probably think me quite insane. \n",
    "\n",
    "But, if you follow this code, and create this pipeline, download it and run it, you will see each \"step\" as a seperate container, downloading data, saving it to a `PVC` then passing some parameters to a next container, which also will load the `PVC`, etc. etc.  \n",
    "\n",
    "### Using Python to Create Containers, but not like a crazy person\n",
    "\n",
    "For completeness, let's last explore how to do all of these things using annotations. \n",
    "\n",
    "The trick for the most part is to create a function that returns a `kfp.dsl.ContainerOp`.  This will point to an image, note the volumes that need to be mounted, and a number of other things. I've heard told people don't always just like creating absurdly large and fat functions to do everything in real life, so I leave this hear as an aside in case the reader is interested in it.  It's alsow worth noting that adding the `@kfp.dsl.component` annotation instructs teh Kubeflow compiler to turn on static typce checking. \n",
    "\n",
    "```\n",
    "@kfp.dsl.component\n",
    "def my_component(my_param):\n",
    "  ...\n",
    "  return kfp.dsl.ContainerOp(\n",
    "    name='My component name',\n",
    "    image='gcr.io/path/to/container/image'\n",
    "  )\n",
    "```\n",
    "\n",
    "Finally, when it comes to incorporating these components into pipelines, you would do something like this:\n",
    "\n",
    "```\n",
    "@kfp.dsl.pipeline(\n",
    "  name='My pipeline',\n",
    "  description='My machine learning pipeline'\n",
    ")\n",
    "def my_pipeline(param_1: PipelineParam, param_2: PipelineParam):\n",
    "  my_step = my_component(my_param='a')\n",
    "```\n",
    "\n",
    "Which should look exceedingly familiar as we did something very similar with our `download_data_fn` and `witchcraft_fn`.  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}


================================================
FILE: data-extraction/python-notebook/MailingListDataPrep.py
================================================
#!/usr/bin/env python
# coding: utf-8

# Here we can install some packages our notebook needs. We can also install them in our container to speed things up & make it more reliable. But for prototyping this works great!

# In[ ]:

get_ipython().system('pip3 install --upgrade lxml')
get_ipython().system('pip3 install --upgrade pandas')
get_ipython().system('pip3 install --upgrade scikit-learn')
get_ipython().system('pip3 install --upgrade scipy')
get_ipython().system('pip3 install --upgrade tables')

# We can use Jupyter notebooks just like normal inside of Kubeflow

# In[ ]:

from datetime import datetime
from requests import get
from lxml import etree
from time import sleep

import re

import pandas as pd

import os

# In[ ]:

container_registry = ""  # Wherever you put your containers

# In[ ]:


def scrapeMailArchives(mailingList: str, year: int, month: int):
    baseUrl = "http://mail-archives.apache.org/mod_mbox/%s/%s.mbox/ajax/" % (
        mailingList, datetime(year, month, 1).strftime("%Y%m"))
    r = get(baseUrl + "thread?0")
    utf8_parser = etree.XMLParser(encoding='utf-8')
    root = etree.fromstring(r.text.replace('encoding="UTF-8"', ""),
                            parser=utf8_parser)
    output = []
    for message in root.xpath("//message"):
        _id = message.get("id")
        linked = message.get("linked")
        depth = message.get("depth")
        fr = message.xpath("from")[0].text
        dt = message.xpath("date")[0].text  # todo convert to date
        subject = message.xpath("subject")[0].text
        r2 = get(baseUrl + _id)
        bodyRoot = etree.fromstring(r2.text.replace('encoding="UTF-8"', ""),
                                    parser=utf8_parser)
        body = bodyRoot.xpath("//contents")[0].text
        record = {
            "id": _id,
            "linked": linked,
            "depth": depth,
            "from": fr,
            "dt": dt,
            "subject": subject,
            "body": body
        }
        output.append(record)
        sleep(0.1)
    return output


def extract_links(body):
    link_regex_str = r'(http(|s)://(.*?))([\s\n]|$)'
    itr = re.finditer(link_regex_str, body, re.MULTILINE)
    return list(map(lambda elem: elem.group(1), itr))


def extract_domains(links):
    from urllib.parse import urlparse

    def extract_domain(link):
        try:
            nloc = urlparse(link).netloc
            # We want to drop www and any extra spaces wtf nloc on the spaces.
            regex_str = r'^(www\.|)(.*?)\s*$'
            match = re.search(regex_str, nloc)
            return match.group(2)
        except:
            return None

    return list(map(extract_domain, links))


def contains_python_stack_trace(body):
    return "Traceback (most recent call last)" in body


def contains_probably_java_stack_trace(body):
    # Look for something based on regex
    # Tried https://stackoverflow.com/questions/20609134/regular-expression-optional-multiline-java-stacktrace - more msg looking
    # Tried https://stackoverflow.com/questions/3814327/regular-expression-to-parse-a-log-file-and-find-stacktraces
    # Yes the compile is per call, but it's cached so w/e
    import re
    stack_regex_str = r'^\s*(.+Exception.*):\n(.*\n){0,3}?(\s+at\s+.*\(.*\))+'
    match = re.search(stack_regex_str, body, re.MULTILINE)
    return match is not None


def contains_exception_in_task(body):
    # Look for a line along the lines of ERROR Executor: Exception in task
    return "ERROR Executor: Exception in task" in body


# In[ ]:

datesToScrape = [(2019, i) for i in range(1, 13)]

records = []
for y, m in datesToScrape:
    print(m, "-", y)
    records += scrapeMailArchives("spark-dev", y, m)

# In[ ]:

df = pd.DataFrame(records)
df['links'] = df['body'].apply(extract_links)
df['containsPythonStackTrace'] = df['body'].apply(contains_python_stack_trace)
df['containsJavaStackTrace'] = df['body'].apply(
    contains_probably_java_stack_trace)
df['containsExceptionInTaskBody'] = df['body'].apply(
    contains_exception_in_task)

df['domains'] = df['links'].apply(extract_domains)
df['isThreadStart'] = df['depth'] == '0'

# In[ ]:

from sklearn.feature_extraction.text import TfidfVectorizer

bodyV = TfidfVectorizer()
# bodyV = TfidfVectorizer(max_features=10000) #if we cared about making this 1:1 w holden's code.
bodyFeatures = bodyV.fit_transform(df['body'])

domainV = TfidfVectorizer()
# domainV = TfidfVectorizer(max_features=100)

## A couple of "None" domains really screwed the pooch on this one. Also, no lists just space seperated domains.


def makeDomainsAList(d):
    return ' '.join([a for a in d if not a is None])


domainFeatures = domainV.fit_transform(df['domains'].apply(makeDomainsAList))

# In[ ]:

# In[ ]:

from scipy.sparse import csr_matrix, hstack

data = hstack([
    csr_matrix(df[[
        'containsPythonStackTrace', 'containsJavaStackTrace',
        'containsExceptionInTaskBody', 'isThreadStart'
    ]].to_numpy()), bodyFeatures, domainFeatures
])

from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

train, test = train_test_split(data, test_size=0.1)

kmeans = KMeans(n_clusters=2, random_state=42).fit(train)
train_pred = kmeans.predict(train)
test_pred = kmeans.predict(test)

# Alternatively, by structuring our code correctly we can take advantage of pipelines

# In[ ]:

get_ipython().system('pip3 install --upgrade kfp')

# In[ ]:

import kfp
import kfp.dsl as dsl

# In[ ]:


def download_data(year: int) -> str:

    from datetime import datetime
    from lxml import etree
    from requests import get
    from time import sleep

    import json

    def scrapeMailArchives(mailingList: str, year: int, month: int):
        baseUrl = "http://mail-archives.apache.org/mod_mbox/%s/%s.mbox/ajax/" % (
            mailingList, datetime(year, month, 1).strftime("%Y%m"))
        r = get(baseUrl + "thread?0")
        utf8_parser = etree.XMLParser(encoding='utf-8')
        root = etree.fromstring(r.text.replace('encoding="UTF-8"', ""),
                                parser=utf8_parser)
        output = []
        for message in root.xpath("//message"):
            _id = message.get("id")
            linked = message.get("linked")
            depth = message.get("depth")
            fr = message.xpath("from")[0].text
            dt = message.xpath("date")[0].text  # todo convert to date
            subject = message.xpath("subject")[0].text
            r2 = get(baseUrl + _id)
            bodyRoot = etree.fromstring(r2.text.replace(
                'encoding="UTF-8"', ""),
                                        parser=utf8_parser)
            body = bodyRoot.xpath("//contents")[0].text
            record = {
                "id": _id,
                "linked": linked,
                "depth": depth,
                "from": fr,
                "dt": dt,
                "subject": subject,
                "body": body
            }
            output.append(record)
            sleep(0.1)

        return output

    datesToScrape = [(year, i) for i in range(1, 2)]

    records = []
    ## todo, go back further
    for y, m in datesToScrape:
        print(m, "-", y)
        records += scrapeMailArchives("spark-dev", y, m)
    import os
    output_path = '/data_processing/data.json'
    with open(output_path, 'w') as f:
        json.dump(records, f)

    return output_path


# In[ ]:

# In[ ]:


def download_tld_data() -> str:
    from requests import get
    import pandas as pd
    print("importing io....")
    import io

    url = "https://pkgstore.datahub.io/core/country-list/data_csv/data/d7c9d7cfb42cb69f4422dec222dbbaa8/data_csv.csv"
    print("Getting the url")
    s = get(url).content
    print("Converting content")
    df = pd.read_csv(io.StringIO(s.decode('utf-8')))
    print("Writing output")
    output_path_hdf = '/tld_info/clean_data.hdf'
    df.to_hdf(output_path_hdf, key="tld")

    return output_path_hdf


# In[ ]:

# Now that we have some data, we want to get rid of any "bad" records

# In[ ]:


#tag::clean_data_fun[]
def clean_data(input_path: str) -> str:
    import json
    import pandas as pd

    print("loading records...")
    with open(input_path, 'r') as f:
        records = json.load(f)
    print("records loaded")

    df = pd.DataFrame(records)
    # Drop records without a subject, body, or sender
    cleaned = df.dropna(subset=["subject", "body", "from"])

    output_path_hdf = '/data_processing/clean_data.hdf'
    cleaned.to_hdf(output_path_hdf, key="clean")

    return output_path_hdf


#end::clean_data_fun[]

# ### Preparing the data
#
# Remember earlier when we did that big (and arguably pointless) classification of emails from the Apache Spark mailing list? OK, now we're going to do it again, as a "lightweight" Python function in a Kubeflow Pipeline.  I hope the irony of the term "lightweight" isn't lost on anyone, because this is pretty blatent abuse of something that was originally presented for conveinience.
#
# First note, all of the imports and declarations of helper functions MUST be with in the "ligthweight" function. One could argue (and they would probably be correct) that I have two steps here- feature prep and ML, and as such I should split them. I would say that's fair, but I choose not to do so at this time.  Perhaps in some scripts later on?
#
# As has been pointed out so many times before, we assume the reader either arleady understands what is going on with the KMeans clustering, or better yet, doesn't even care. I won't be digging into that right now. What I will point out- and maybe as a note to the editor, the model that is finally saved really ought to be persisted somewhere.  If the model isn't saved, then this basically pointless pipeline, is truly pointless.
#

# Now let's make sure we can read that data in the next step (before we write a big complicated model to do whatever torture to it).

# In[ ]:


def prepare_features(input_path: str, tld_info_path: str):

    import re
    import pandas as pd

    print("loading records...")
    df = pd.read_hdf(input_path, key="clean")
    print("records loaded")

    print("Loading tld info....")
    tld_df = pd.read_hdf(tld_info_path, key="tld")
    print("Loaded tld info")

    ## Note: "Lightweight" Python Fns mean helper code must be inside the fn. (Bad Form)

    def extract_links(body):
        link_regex_str = r'(http(|s)://(.*?))([\s\n]|$)'
        itr = re.finditer(link_regex_str, body, re.MULTILINE)
        return list(map(lambda elem: elem.group(1), itr))

    def extract_domains(links):
        from urllib.parse import urlparse

        def extract_domain(link):
            try:
                nloc = urlparse(link).netloc
                # We want to drop www and any extra spaces wtf nloc on the spaces.
                regex_str = r'^(www\.|)(.*?)\s*$'
                match = re.search(regex_str, nloc)
                return match.group(2)
            except:
                return None

        return list(map(extract_domain, links))

    def contains_python_stack_trace(body):
        return "Traceback (most recent call last)" in body

    def contains_probably_java_stack_trace(body):
        # Look for something based on regex
        # Tried https://stackoverflow.com/questions/20609134/regular-expression-optional-multiline-java-stacktrace - more msg looking
        # Tried https://stackoverflow.com/questions/3814327/regular-expression-to-parse-a-log-file-and-find-stacktraces
        # Yes the compile is per call, but it's cached so w/e
        import re
        stack_regex_str = r'^\s*(.+Exception.*):\n(.*\n){0,3}?(\s+at\s+.*\(.*\))+'
        match = re.search(stack_regex_str, body, re.MULTILINE)
        return match is not None

    def contains_exception_in_task(body):
        # Look for a line along the lines of ERROR Executor: Exception in task
        return "ERROR Executor: Exception in task" in body

    print(df.shape)
    df['links'] = df['body'].apply(extract_links)
    df['containsPythonStackTrace'] = df['body'].apply(
        contains_python_stack_trace)
    df['containsJavaStackTrace'] = df['body'].apply(
        contains_probably_java_stack_trace)
    df['containsExceptionInTaskBody'] = df['body'].apply(
        contains_exception_in_task)

    #tag::local_mailing_list_feature_prep_fun[]
    df['domains'] = df['links'].apply(extract_domains)
    df['isThreadStart'] = df['depth'] == '0'

    # Arguably, you could split building the dataset away from the actual witchcraft.
    from sklearn.feature_extraction.text import TfidfVectorizer

    bodyV = TfidfVectorizer()
    bodyFeatures = bodyV.fit_transform(df['body'])

    domainV = TfidfVectorizer()

    ## A couple of "None" domains really screwed the pooch on this one.Also, no lists just space seperated domains.
    def makeDomainsAList(d):
        return ' '.join([a for a in d if not a is None])

    domainFeatures = domainV.fit_transform(
        df['domains'].apply(makeDomainsAList))

    from scipy.sparse import csr_matrix, hstack

    data = hstack([
        csr_matrix(df[[
            'containsPythonStackTrace', 'containsJavaStackTrace',
            'containsExceptionInTaskBody', 'isThreadStart'
        ]].to_numpy()), bodyFeatures, domainFeatures
    ])
    #end::local_mailing_list_feature_prep_fun[]


#
# ### The Kubeflow Bit.
#
# Now we can put these two pieces together into a pipeline. Since the data is relatively small we will use a persistent volume put them together. Later on we can add training to this pipeline as well.
#
#

# In[ ]:

# Make a volume example. We redo it inside of the pipeline definition because we need to be inside
#tag::makeVolume[]
dvop = dsl.VolumeOp(name="create_pvc",
                    resource_name="my-pvc-2",
                    size="5Gi",
                    modes=dsl.VOLUME_MODE_RWO)
#end::makeVolume[]

# In[ ]:

get_ipython().system('rm local-data-prep-2.zip')

# In[ ]:


#tag::makePipeline[]
@kfp.dsl.pipeline(name='Simple1', description='Simple1')
def my_pipeline_mini(year: int):
    dvop = dsl.VolumeOp(name="create_pvc",
                        resource_name="my-pvc-2",
                        size="5Gi",
                        modes=dsl.VOLUME_MODE_RWO)
    tldvop = dsl.VolumeOp(name="create_pvc",
                          resource_name="tld-volume-2",
                          size="100Mi",
                          modes=dsl.VOLUME_MODE_RWO)
    download_data_op = kfp.components.func_to_container_op(
        download_data, packages_to_install=['lxml', 'requests'])
    download_tld_info_op = kfp.components.func_to_container_op(
        download_tld_data,
        packages_to_install=['requests', 'pandas>=0.24', 'tables'])
    clean_data_op = kfp.components.func_to_container_op(
        clean_data, packages_to_install=['pandas>=0.24', 'tables'])

    step1 = download_data_op(year).add_pvolumes(
        {"/data_processing": dvop.volume})
    step2 = clean_data_op(input_path=step1.output).add_pvolumes(
        {"/data_processing": dvop.volume})
    step3 = download_tld_info_op().add_pvolumes({"/tld_info": tldvop.volume})


kfp.compiler.Compiler().compile(my_pipeline_mini, 'local-data-prep-2.zip')
#end::makePipeline[]

# In[ ]:

get_ipython().system('rm *.zip')

# In[ ]:


#tag::longPipeline[]
@kfp.dsl.pipeline(name='Simple1', description='Simple1')
def my_pipeline2(year: int):
    dvop = dsl.VolumeOp(name="create_pvc",
                        resource_name="my-pvc-2",
                        size="5Gi",
                        modes=dsl.VOLUME_MODE_RWO)
    tldvop = dsl.VolumeOp(name="create_pvc",
                          resource_name="tld-volume-2",
                          size="100Mi",
                          modes=dsl.VOLUME_MODE_RWO)

    download_data_op = kfp.components.func_to_container_op(
        download_data, packages_to_install=['lxml', 'requests'])
    download_tld_info_op = kfp.components.func_to_container_op(
        download_tld_data,
        packages_to_install=['requests', 'pandas>=0.24', 'tables'])
    clean_data_op = kfp.components.func_to_container_op(
        clean_data, packages_to_install=['pandas>=0.24', 'tables'])
#tag::add_feature_step[]
    prepare_features_op = kfp.components.func_to_container_op(
        prepare_features,
        packages_to_install=['pandas>=0.24', 'tables', 'scikit-learn'])
#end::add_feature_step[]

    step1 = download_data_op(year).add_pvolumes(
        {"/data_processing": dvop.volume})
    step2 = clean_data_op(input_path=step1.output).add_pvolumes(
        {"/data_processing": dvop.volume})
    step3 = download_tld_info_op().add_pvolumes({"/tld_info": tldvop.volume})
    step4 = prepare_features_op(input_path=step2.output,
                                tld_info_path=step3.output).add_pvolumes({
                                    "/data_processing":
                                    dvop.volume,
                                    "/tld_info":
                                    tldvop.volume
                                })


#end::longPipeline[]

kfp.compiler.Compiler().compile(my_pipeline2,
                                'local-data-and-feature-prep-2.zip')

# In[ ]:

client = kfp.Client()

# In[ ]:

my_experiment = client.create_experiment(name='local-data-prep-test-2')
my_run = client.run_pipeline(my_experiment.id,
                             'local-data-prep',
                             'local-data-and-feature-prep-2.zip',
                             params={'year': '2019'})

# If we were using Spamassasin or some other library installed in a different base container we would:

# In[ ]:

# Clean data with custom container
#tag::cleanDataWithContainer[]
clean_data_op = kfp.components.func_to_container_op(
    clean_data,
    base_image="{0}/kubeflow/spammassisan".format(container_registry),
    packages_to_install=['pandas>=0.24', 'tables'])
#end::cleanDataWithContainer[]

# In[ ]:


def train_func(input_path: String):
    from sklearn.cluster import KMeans
    from sklearn.model_selection import train_test_split

    train, test = train_test_split(data, test_size=0.1)

    kmeans = KMeans(n_clusters=2, random_state=42).fit(train)
    train_pred = kmeans.predict(train)
    test_pred = kmeans.predict(test)
    print(test_pred)
    # TODO: Dump the model somewhere you can use it later.


# And just like that, we've done it. We've created a Kubeflow Pipeline.
#
# So let's take a moment to step back and think, "what in the crazy-town-heck is going on here?!".  A valid question, and well spotted.  Each "Step" is going to be creating a container.  Maybe I should have noted that earlier when talking about attatching volumes, beacuse if you thougth I was doing that to a function, you'd probably think me quite insane.
#
# But, if you follow this code, and create this pipeline, download it and run it, you will see each "step" as a seperate container, downloading data, saving it to a `PVC` then passing some parameters to a next container, which also will load the `PVC`, etc. etc.
#
# ### Using Python to Create Containers, but not like a crazy person
#
# For completeness, let's last explore how to do all of these things using annotations.
#
# The trick for the most part is to create a function that returns a `kfp.dsl.ContainerOp`.  This will point to an image, note the volumes that need to be mounted, and a number of other things. I've heard told people don't always just like creating absurdly large and fat functions to do everything in real life, so I leave this hear as an aside in case the reader is interested in it.  It's alsow worth noting that adding the `@kfp.dsl.component` annotation instructs teh Kubeflow compiler to turn on static typce checking.
#
# ```
# @kfp.dsl.component
# def my_component(my_param):
#   ...
#   return kfp.dsl.ContainerOp(
#     name='My component name',
#     image='gcr.io/path/to/container/image'
#   )
# ```
#
# Finally, when it comes to incorporating these components into pipelines, you would do something like this:
#
# ```
# @kfp.dsl.pipeline(
#   name='My pipeline',
#   description='My machine learning pipeline'
# )
# def my_pipeline(param_1: PipelineParam, param_2: PipelineParam):
#   my_step = my_component(my_param='a')
# ```
#
# Which should look exceedingly familiar as we did something very similar with our `download_data_fn` and `witchcraft_fn`.

# In[ ]:

# In[ ]:


================================================
FILE: data-extraction/python-notebook/RunNBDockerfile
================================================
# Since we used Jupyter notebooks to do the first pass extraction, we can try directly use that notebook with
# Kubeflow's pre-baked "tensorflow-notebook-image" (based on the Jupyter image) that automatically
# launches the notebooks included in the docker file. If you have multiple notebooks
# Give them names like:
# 01-mything.ipynb
# 02-step2.ipynb
# as they will be executed in lexiographical order.
#tag::spec[]
FROM gcr.io/kubeflow-images-public/tensorflow-1.6.0-notebook-cpu

COPY ./ /workdir /
#end::spec[]
#tag::deps[]
RUN pip3 install --upgrade lxml pandas
#end::deps[]


================================================
FILE: data-extraction/python-spark/Dockerfile
================================================
# Use the spark operator image as base
FROM gcr.io/spark-operator/spark-py:v2.4.5
# Install Python requirements
COPY requirements.txt /
RUN pip3 install -r /requirements.txt
# Now you can reference local:///job/my_file.py
RUN mkdir -p /job
COPY *.py /job

ENTRYPOINT ["/opt/entrypoint.sh"]

================================================
FILE: data-extraction/python-spark/LaunchSparkJobs.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip3 install --upgrade --user kfp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import kfp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import kfp.dsl as dsl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use Kubeflow's built in Spark operator\n",
    "#tag::launch_operator[]\n",
    "resource = {\n",
    "    \"apiVersion\": \"sparkoperator.k8s.io/v1beta2\",\n",
    "    \"kind\": \"SparkApplication\",\n",
    "    \"metadata\": {\n",
    "        \"name\": \"boop\",\n",
    "        \"namespace\": \"kubeflow\"\n",
    "    },\n",
    "  \"spec\": {\n",
    "      \"type\": \"Python\",\n",
    "      \"mode\": \"cluster\",\n",
    "      \"image\": \"gcr.io/boos-demo-projects-are-rad/kf-steps/kubeflow/myspark\",\n",
    "      \"imagePullPolicy\": \"Always\",\n",
    "      \"mainApplicationFile\": \"local:///job/job.py\", # See the Dockerfile OR use GCS/S3/...\n",
    "      \"sparkVersion\": \"2.4.5\",\n",
    "      \"restartPolicy\": {\n",
    "        \"type\": \"Never\"\n",
    "      },\n",
    "  \"driver\": {\n",
    "    \"cores\": 1,  \n",
    "    \"coreLimit\": \"1200m\",  \n",
    "    \"memory\": \"512m\",  \n",
    "    \"labels\": {\n",
    "      \"version\": \"2.4.5\",  \n",
    "    },      \n",
    "    \"serviceAccount\": \"spark-operatoroperator-sa\", # also try spark-operatoroperator-sa\n",
    " },\n",
    "  \"executor\": {\n",
    "    \"cores\": 1,\n",
    "    \"instances\": 2,\n",
    "    \"memory\": \"512m\"  \n",
    "  },    \n",
    "  \"labels\": {\n",
    "    \"version\": \"2.4.5\"\n",
    "  },      \n",
    "  }\n",
    "}\n",
    "\n",
    "@dsl.pipeline(\n",
    "    name=\"local Pipeline\",\n",
    "    description=\"No need to ask why.\"\n",
    ")\n",
    "def local_pipeline():\n",
    "\n",
    "    rop = dsl.ResourceOp(\n",
    "        name=\"boop\",\n",
    "        k8s_resource=resource,\n",
    "        action=\"create\",\n",
    "        success_condition=\"status.applicationState.state == COMPLETED\"\n",
    "    )\n",
    "#end::launch_operator[]\n",
    "\n",
    "import kfp.compiler as compiler\n",
    "\n",
    "compiler.Compiler().compile(local_pipeline,\"boop.zip\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "client = kfp.Client()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "my_experiment = client.create_experiment(name='boop-test-2')\n",
    "my_run = client.run_pipeline(my_experiment.id, 'boop-test', \n",
    "  'boop.zip')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}


================================================
FILE: data-extraction/python-spark/LaunchSparkJobs.py
================================================
#!/usr/bin/env python
# coding: utf-8

# In[ ]:

get_ipython().system('pip3 install --upgrade --user kfp')

# In[ ]:

import kfp

# In[ ]:

import kfp.dsl as dsl

# In[ ]:

# Use Kubeflow's built in Spark operator
#tag::launch_operator[]
resource = {
    "apiVersion": "sparkoperator.k8s.io/v1beta2",
    "kind": "SparkApplication",
    "metadata": {
        "name": "boop",
        "namespace": "kubeflow"
    },
    "spec": {
        "type": "Python",
        "mode": "cluster",
        "image": "gcr.io/boos-demo-projects-are-rad/kf-steps/kubeflow/myspark",
        "imagePullPolicy": "Always",
        # See the Dockerfile OR use GCS/S3/...
        "mainApplicationFile": "local:///job/job.py",
        "sparkVersion": "2.4.5",
        "restartPolicy": {
            "type": "Never"
        },
        "driver": {
            "cores": 1,
            "coreLimit": "1200m",
            "memory": "512m",
            "labels": {
                "version": "2.4.5",
            },
            # also try spark-operatoroperator-sa
            "serviceAccount": "spark-operatoroperator-sa",
        },
        "executor": {
            "cores": 1,
            "instances": 2,
            "memory": "512m"
        },
        "labels": {
            "version": "2.4.5"
        },
    }
}


@dsl.pipeline(name="local Pipeline", description="No need to ask why.")
def local_pipeline():

    rop = dsl.ResourceOp(
        name="boop",
        k8s_resource=resource,
        action="create",
        success_condition="status.applicationState.state == COMPLETED")


#end::launch_operator[]

import kfp.compiler as compiler

compiler.Compiler().compile(local_pipeline, "boop.zip")

# In[ ]:

client = kfp.Client()

# In[ ]:

my_experiment = client.create_experiment(name='boop-test-2')
my_run = client.run_pipeline(my_experiment.id, 'boop-test', 'boop.zip')

# In[ ]:


================================================
FILE: data-extraction/python-spark/fake_job.py
================================================
# Yes we need both these imports
#tag::imports[]
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date
from pyspark.sql.types import *
#end::imports[]
from pyspark.sql.catalog import UserDefinedFunction
import os

#tag::basic_session[]
session = SparkSession.builder.getOrCreate()
#end::basic_session[]


================================================
FILE: data-extraction/python-spark/requirements.txt
================================================
pandas


================================================
FILE: data-extraction/python-spark-notebook/AddGCSDockerfile
================================================
ARG base
FROM $base

# Set an enviroment variable for where we are going to put spark
ENV SPARK_HOME /opt/spark

# Run as root for updates
USER root

# Add access to GCS
RUN rm $SPARK_HOME/jars/guava-1*.jar
ADD http://maven-central.storage.googleapis.com/maven2/com/google/guava/guava/23.0/guava-23.0.jar $SPARK_HOME/jars
# Add the connector jar needed to access Google Cloud Storage using the Hadoop FileSystem API.
ADD https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-latest-hadoop3.jar $SPARK_HOME/jars

# Add the S3A connector
ADD https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.2.0/hadoop-aws-3.2.0.jar $SPARK_HOME/jars
ADD https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.732/aws-java-sdk-bundle-1.11.732.jar $SPARK_HOME/jars

RUN chmod -R 777 $SPARK_HOME/jars

USER 185

================================================
FILE: data-extraction/python-spark-notebook/AddPython3.6Dockerfile
================================================
ARG base
FROM $base

USER root

# Install libraries we need to build Python 3.6
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y -q \
    make build-essential libssl-dev zlib1g-dev libbz2-dev \
    libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev \
    libncursesw5-dev xz-utils tk-dev libffi-dev liblzma-dev && \
    rm -rf /var/cache/apt

# Install python3.6 to match the notebook
RUN cd /tmp && \
    wget https://www.python.org/ftp/python/3.6.10/Python-3.6.10.tgz && \
    tar -xvf Python-3.6.10.tgz && \
    cd Python-3.6.10 && \
    ./configure && \
    make -j 8 && \
    make altinstall
    
    
RUN python3.6 -m pip install pandas pyarrow==0.11.0 spacy
# We depend on Spark being on the PYTHONPATH so no pip install
USER 185

================================================
FILE: data-extraction/python-spark-notebook/Dockerfile
================================================
#tag::include[]
# See https://www.kubeflow.org/docs/notebooks/custom-notebook/
ARG base
FROM $base
ARG sparkversion
ARG sparkrelease
ARG sparkserver https://www-us.apache.org/dist/spark
# We need to run as root for updates
USER root

# Set an enviroment variable for where we are going to put spark
ENV SPARK_HOME /opt/spark

# Install java because Spark needs it
RUN apt-get update && \
    apt-get install -yq openjdk-8-jre openjdk-8-jre-headless && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Install Spark
RUN set -ex && \
    rm /bin/sh && \
    ln -sv /bin/bash /bin/sh

RUN  echo "Setting up $sparkversion"
RUN  cd /tmp && \
     (wget ${sparkserver}/spark-${sparkversion}/${sparkrelease}.tgz) && \
     cd /opt && tar -xvf /tmp/${sparkrelease}.tgz && \
     rm /tmp/${sparkrelease}.tgz && mv ${sparkrelease} spark && \
     cd spark/python && pip install -e .
#end::include[]

# Add access to GCS
RUN rm $SPARK_HOME/jars/guava-1*.jar
ADD https://maven-central.storage.googleapis.com/maven2/com/google/guava/guava/23.0/guava-23.0.jar $SPARK_HOME/jars
# Add the connector jar needed to access Google Cloud Storage using the Hadoop FileSystem API.
ADD https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-latest-hadoop3.jar $SPARK_HOME/jars

# Add the S3A connector
ADD https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.2.0/hadoop-aws-3.2.0.jar $SPARK_HOME/jars
ADD https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.732/aws-java-sdk-bundle-1.11.732.jar $SPARK_HOME/jars

#tag::include[]
# Fix permissions
WORKDIR /opt/spark/work-dir
RUN chmod -R 777 /opt/spark/


# Switch the user back, using jovyan as a user is bad but the base image
# depends on it.
USER jovyan
# Install some common tools
pip install pandas numpy scipy pyarrow
#end::include[]

================================================
FILE: data-extraction/python-spark-notebook/SparkMailingListForKF.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Yes we need both these imports\n",
    "from pyspark.sql import SparkSession\n",
    "from pyspark.sql.functions import col, to_date\n",
    "from pyspark.sql.types import *\n",
    "from pyspark.sql.types import StructField, StructType\n",
    "from pyspark.sql.catalog import UserDefinedFunction\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fs_prefix = \"s3a://kf-book-examples/mailing-lists\" # Create with mc as in ch1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# See https://medium.com/@szinck/setting-up-pyspark-jupyter-and-minio-on-kubeflow-kubernetes-aab98874794f\n",
    "#tag::configurePythonVersion[]\n",
    "os.environ[\"PYSPARK_PYTHON\"] = \"python3.6\"\n",
    "#end::configurePythonVersion[]\n",
    "session = (\n",
    "    SparkSession.builder\n",
    "    .appName(\"fetchMailingListData\")\n",
    "    .config(\"spark.executor.instances\", \"8\")\n",
    "    .config(\"spark.driver.memoryOverhead\", \"0.25\")\n",
    "    .config(\"spark.executor.memory\", \"6g\")\n",
    "    .config(\"spark.dynamicAllocation.enabled\", \"false\")\n",
    "    .config(\"spark.ui.enabled\", \"true\")\n",
    "    .config(\"spark.kubernetes.container.image\",\n",
    "           \"gcr.io/boos-demo-projects-are-rad/kubeflow/spark-worker/spark-py-36:v3.0.0-preview2-23\")\n",
    "    #tag::notebookSession[]\n",
    "    .config(\"spark.driver.bindAddress\", \"0.0.0.0\")\n",
    "    .config(\"spark.kubernetes.namespace\", \"kubeflow-programmerboo\")\n",
    "    .config(\"spark.master\", \"k8s://https://kubernetes.default\")\n",
    "    .config(\"spark.driver.host\", \n",
    "            \"spark-driver.kubeflow-programmerboo.svc.cluster.local\")\n",
    "    .config(\"spark.kubernetes.executor.annotation.sidecar.istio.io/inject\",\n",
    "            \"false\")\n",
    "    .config(\"spark.driver.port\", \"39235\")\n",
    "    .config(\"spark.blockManager.port\", \"39236\")\n",
    "    #end::notebookSession[]\n",
    "    # If using minio - see https://github.com/minio/cookbook/blob/master/docs/apache-spark-with-minio.md\n",
    "    #tag::minio[]\n",
    "    .config(\"spark.hadoop.fs.s3a.endpoint\",\n",
    "            \"minio-service.kubeflow.svc.cluster.local:9000\")\n",
    "    .config(\"fs.s3a.connection.ssl.enabled\", \"false\")\n",
    "    .config(\"fs.s3a.path.style.access\", \"true\")\n",
    "    # You can also add an account using the minio command as described in chapter 1\n",
    "    .config(\"spark.hadoop.fs.s3a.access.key\", \"minio\")\n",
    "    .config(\"spark.hadoop.fs.s3a.secret.key\", \"minio123\")\n",
    "    #end::minio[]\n",
    "    ).getOrCreate()\n",
    "sc = session.sparkContext"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Data fetch pipeline: Download mailing list data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "list_name=\"spark-user\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mailing_list_template=\"http://mail-archives.apache.org/mod_mbox/{list_name}/{date}.mbox\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Generate the possible dates"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "start_year=2019 # Change to 2002 once you've verified\n",
    "end_year=2021\n",
    "dates = [\"{:d}{:02d}\".format(year, month) for year in range(start_year, end_year) for month in range (1,12)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def download_emails(date):\n",
    "    import subprocess\n",
    "    from mailbox import mbox\n",
    "    import os\n",
    "    mbox_filename = \"{date}.mbox\".format(date=date)\n",
    "    url=mailing_list_template.format(list_name=list_name,date=date)\n",
    "    subprocess.call([\"wget\", url])\n",
    "    # Skip years that don't exist\n",
    "    if not os.path.exists(mbox_filename):\n",
    "        return []\n",
    "    mail = mbox(mbox_filename.format(date=date), create=False)\n",
    "    # LC the keys since the casing is non-consistent\n",
    "    def get_body(message):\n",
    "        content_type = message.get_content_type()\n",
    "        # Multi-part messages\n",
    "        if message.is_multipart():\n",
    "            return \"\".join(map(get_body, message.get_payload()))\n",
    "        elif \"text\" in content_type or \"html\" in content_type:\n",
    "            return message.get_payload()\n",
    "        else:\n",
    "            return \"\"\n",
    "    def message_to_dict(message):\n",
    "        ret = dict((k.lower(), v) for k, v in message.items())\n",
    "        ret[\"multipart\"] = message.is_multipart()\n",
    "        ret[\"body\"] = get_body(message)\n",
    "        return ret\n",
    "    emails = list(map(message_to_dict, mail.itervalues()))\n",
    "    os.remove(mbox_filename)\n",
    "    return emails"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: test that it works locally\n",
    "# download_emails(\"202001\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "emails_rdd = sc.parallelize(dates).flatMap(download_emails).cache()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "emails_rdd.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mailing_list_posts_mbox_df = emails_rdd.toDF(sampleRatio=1.0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cached = mailing_list_posts_mbox_df.cache()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mailing_list_posts_mbox_df.select(\"list-id\", \"In-Reply-To\").take(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "spark_mailing_list_data = mailing_list_posts_mbox_df.filter(\n",
    "    mailing_list_posts_mbox_df[\"list-id\"].contains(\"spark\")).repartition(60).cache()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "spark_mailing_list_data.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "spark_mailing_list_data.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_date_from_email_datefield(datefield):\n",
    "    if datefield is None:\n",
    "        return None\n",
    "    from datetime import datetime\n",
    "    import time\n",
    "    import email.utils\n",
    "    parsed_date = email.utils.parsedate(datefield)\n",
    "    return datetime.fromtimestamp(time.mktime((parsed_date)))\n",
    "\n",
    "\n",
    "extract_date_from_email_datefield_udf = UserDefinedFunction(\n",
    "    extract_date_from_email_datefield, StringType(), \"extract_date_from_email_datefield\")\n",
    "\n",
    "session.catalog._jsparkSession.udf().registerPython(\n",
    "    \"extract_date_from_email_datefield\",\n",
    "    extract_date_from_email_datefield_udf._judf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "spark_mailing_list_data_with_date = spark_mailing_list_data.select(\n",
    "    \"*\",\n",
    "    extract_date_from_email_datefield_udf(spark_mailing_list_data[\"Date\"]).alias(\"email_date\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Manually verify that our date parser is looking ok\n",
    "spark_mailing_list_data.select(spark_mailing_list_data[\"Date\"],\n",
    "                               extract_date_from_email_datefield_udf(spark_mailing_list_data[\"Date\"]).alias(\"email_date\")\n",
    "                              ).take(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#tag::filter_junk[]\n",
    "def is_ok(post):\n",
    "    # Your special business logic goes here\n",
    "    return True\n",
    "spark_mailing_list_data_cleaned = spark_mailing_list_data_with_date.filter(is_ok)\n",
    "#end::filter_junk[]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mailing_list_posts_in_reply_to = spark_mailing_list_data_cleaned.filter(\n",
    "    spark_mailing_list_data[\"In-Reply-To\"].isNotNull()).alias(\"mailing_list_posts_in_reply_to\")\n",
    "initial_posts = spark_mailing_list_data_cleaned.filter(\n",
    "    spark_mailing_list_data[\"In-Reply-To\"].isNull()).alias(\"initial_posts\").cache()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# See how many start-of-thread posts we have\n",
    "initial_posts.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ids_in_reply = mailing_list_posts_in_reply_to.select(\"In-Reply-To\", \"message-id\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ids_in_reply.schema"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ok now it's time to save these\n",
    "#tag::write_big_data[]\n",
    "initial_posts.write.format(\"parquet\").mode('overwrite').save(fs_prefix + \"/initial_posts\")\n",
    "ids_in_reply.write.format(\"parquet\").mode('overwrite').save(fs_prefix + \"/ids_in_reply\")\n",
    "#end::write_big_data[]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#tag::small_data[]\n",
    "initial_posts.toPandas()\n",
    "#end::small_data[]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "session.stop()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}


================================================
FILE: data-extraction/python-spark-notebook/SparkMailingListForKF.py
================================================
#!/usr/bin/env python
# coding: utf-8

# In[ ]:

# Yes we need both these imports
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date
from pyspark.sql.types import *
from pyspark.sql.types import StructField, StructType
from pyspark.sql.catalog import UserDefinedFunction
import os

# In[ ]:

# In[ ]:

fs_prefix = "s3a://kf-book-examples/mailing-lists"  # Create with mc as in ch1

# In[ ]:

# See https://medium.com/@szinck/setting-up-pyspark-jupyter-and-minio-on-kubeflow-kubernetes-aab98874794f
#tag::configurePythonVersion[]
os.environ["PYSPARK_PYTHON"] = "python3.6"
#end::configurePythonVersion[]
session = (
    SparkSession.builder.appName("fetchMailingListData").config(
        "spark.executor.instances",
        "8").config("spark.driver.memoryOverhead",
                    "0.25").config("spark.executor.memory", "6g").config(
                        "spark.dynamicAllocation.enabled", "false").
    config("spark.ui.enabled", "true").config(
        "spark.kubernetes.container.image",
        "gcr.io/boos-demo-projects-are-rad/kubeflow/spark-worker/spark-py-36:v3.0.0-preview2-23"
    )
    #tag::notebookSession[]
    .config("spark.driver.bindAddress",
            "0.0.0.0").config("spark.kubernetes.namespace",
                              "kubeflow-programmerboo").
    config("spark.master", "k8s://https://kubernetes.default").config(
        "spark.driver.host",
        "spark-driver.kubeflow-programmerboo.svc.cluster.local").config(
            "spark.kubernetes.executor.annotation.sidecar.istio.io/inject",
            "false").config("spark.driver.port",
                            "39235").config("spark.blockManager.port", "39236")
    #end::notebookSession[]
    # If using minio - see https://github.com/minio/cookbook/blob/master/docs/apache-spark-with-minio.md
    #tag::minio[]
    .config("spark.hadoop.fs.s3a.endpoint",
            "minio-service.kubeflow.svc.cluster.local:9000").config(
                "fs.s3a.connection.ssl.enabled",
                "false").config("fs.s3a.path.style.access", "true")
    # You can also add an account using the minio command as described in chapter 1
    .config("spark.hadoop.fs.s3a.access.key",
            "minio").config("spark.hadoop.fs.s3a.secret.key", "minio123")
    #end::minio[]
).getOrCreate()
sc = session.sparkContext

# In[ ]:

# Data fetch pipeline: Download mailing list data

# In[ ]:

list_name = "spark-user"

# In[ ]:

mailing_list_template = "http://mail-archives.apache.org/mod_mbox/{list_name}/{date}.mbox"

# In[ ]:

# Generate the possible dates

# In[ ]:

start_year = 2019  # Change to 2002 once you've verified
end_year = 2021
dates = [
    "{:d}{:02d}".format(year, month) for year in range(start_year, end_year)
    for month in range(1, 12)
]

# In[ ]:


def download_emails(date):
    import subprocess
    from mailbox import mbox
    import os
    mbox_filename = "{date}.mbox".format(date=date)
    url = mailing_list_template.format(list_name=list_name, date=date)
    subprocess.call(["wget", url])
    # Skip years that don't exist
    if not os.path.exists(mbox_filename):
        return []
    mail = mbox(mbox_filename.format(date=date), create=False)

    # LC the keys since the casing is non-consistent

    def get_body(message):
        content_type = message.get_content_type()
        # Multi-part messages
        if message.is_multipart():
            return "".join(map(get_body, message.get_payload()))
        elif "text" in content_type or "html" in content_type:
            return message.get_payload()
        else:
            return ""

    def message_to_dict(message):
        ret = dict((k.lower(), v) for k, v in message.items())
        ret["multipart"] = message.is_multipart()
        ret["body"] = get_body(message)
        return ret

    emails = list(map(message_to_dict, mail.itervalues()))
    os.remove(mbox_filename)
    return emails


# In[ ]:

# Optional: test that it works locally
# download_emails("202001")

# In[ ]:

emails_rdd = sc.parallelize(dates).flatMap(download_emails).cache()

# In[ ]:

emails_rdd.count()

# In[ ]:

mailing_list_posts_mbox_df = emails_rdd.toDF(sampleRatio=1.0)

# In[ ]:

cached = mailing_list_posts_mbox_df.cache()

# In[ ]:

mailing_list_posts_mbox_df.select("list-id", "In-Reply-To").take(5)

# In[ ]:

spark_mailing_list_data = mailing_list_posts_mbox_df.filter(
    mailing_list_posts_mbox_df["list-id"].contains("spark")).repartition(
        60).cache()

# In[ ]:

spark_mailing_list_data.show()

# In[ ]:

spark_mailing_list_data.printSchema()

# In[ ]:


def extract_date_from_email_datefield(datefield):
    if datefield is None:
        return None
    from datetime import datetime
    import time
    import email.utils
    parsed_date = email.utils.parsedate(datefield)
    return datetime.fromtimestamp(time.mktime((parsed_date)))


extract_date_from_email_datefield_udf = UserDefinedFunction(
    extract_date_from_email_datefield, StringType(),
    "extract_date_from_email_datefield")

session.catalog._jsparkSession.udf().registerPython(
    "extract_date_from_email_datefield",
    extract_date_from_email_datefield_udf._judf)

# In[ ]:

spark_mailing_list_data_with_date = spark_mailing_list_data.select(
    "*",
    extract_date_from_email_datefield_udf(
        spark_mailing_list_data["Date"]).alias("email_date"))

# In[ ]:

# Manually verify that our date parser is looking ok
spark_mailing_list_data.select(
    spark_mailing_list_data["Date"],
    extract_date_from_email_datefield_udf(
        spark_mailing_list_data["Date"]).alias("email_date")).take(2)

# In[ ]:


#tag::filter_junk[]
def is_ok(post):
    # Your special business logic goes here
    return True


spark_mailing_list_data_cleaned = spark_mailing_list_data_with_date.filter(
    is_ok)
#end::filter_junk[]

# In[ ]:

mailing_list_posts_in_reply_to = spark_mailing_list_data_cleaned.filter(
    spark_mailing_list_data["In-Reply-To"].isNotNull()).alias(
        "mailing_list_posts_in_reply_to")
initial_posts = spark_mailing_list_data_cleaned.filter(
    spark_mailing_list_data["In-Reply-To"].isNull()).alias(
        "initial_posts").cache()

# In[ ]:

# See how many start-of-thread posts we have
initial_posts.count()

# In[ ]:

ids_in_reply = mailing_list_posts_in_reply_to.select("In-Reply-To",
                                                     "message-id")

# In[ ]:

ids_in_reply.schema

# In[ ]:

# Ok now it's time to save these
#tag::write_big_data[]
initial_posts.write.format("parquet").mode('overwrite').save(fs_prefix +
                                                             "/initial_posts")
ids_in_reply.write.format("parquet").mode('overwrite').save(fs_prefix +
                                                            "/ids_in_reply")
#end::write_big_data[]

# In[ ]:

#tag::small_data[]
initial_posts.toPandas()
#end::small_data[]

# In[ ]:

session.stop()

# In[ ]:


================================================
FILE: data-extraction/python-spark-notebook/build.sh
================================================
#!/bin/bash
# Build a notebook with Spark 3
# Note when Spark 3 is fully released we can use gcr.io/spark-operator/spark-py:v3.0.0
set -ex
V=${V:-"23"}
REPO=${REPO:-"gcr.io/$PROJECT"}
TARGET=${TARGET:-"$REPO/kubeflow/spark-notebook:v$V"}
KF_BASE=${KF_BASE:-"gcr.io/kubeflow-images-public"}
BASE=${BASE:-"$KF_BASE/tensorflow-1.15.2-notebook-cpu:1.0.0"}
SPARK_VERSION="3.0.0-preview2"
SPARK_RELEASE="spark-3.0.0-preview2-bin-hadoop3.2"
SPARK_ARTIFACT="${SPARK_RELEASE}.tgz"
docker build . -t "${TARGET}" --build-arg sparkversion="${SPARK_VERSION}" \
       --build-arg sparkrelease="${SPARK_RELEASE}" --build-arg base="${BASE}"
docker push "${TARGET}"
# Build Spark worker image
SPARK_TARGET=${SPARK_TARGET:-"$REPO/kubeflow/spark-worker"}
if [ ! -f /tmp/${SPARK_ARTIFACT} ]; then
  pushd /tmp/
  wget "https://www-us.apache.org/dists/spark/spark-${SPARK_VERSION}/${SPARK_ARTIFACT}"
  popd
fi

tmp_dir=$(mktemp -d -t spark-build-XXXXXXXXXX)
pushd "${tmp_dir}"
tar -xvf "/tmp/${SPARK_ARTIFACT}"

pushd "${SPARK_RELEASE}"
./bin/docker-image-tool.sh -r "${SPARK_TARGET}" -t "v${SPARK_VERSION}-${V}" build
./bin/docker-image-tool.sh -r "${SPARK_TARGET}" -t "v${SPARK_VERSION}-${V}" \
			   -p kubernetes/dockerfiles/spark/bindings/python/Dockerfile \
			   build
./bin/docker-image-tool.sh -r "${SPARK_TARGET}" -t "v${SPARK_VERSION}-${V}" push
popd

popd
# Add GCS to Spark images
docker build --build-arg base="${SPARK_TARGET}/spark:v${SPARK_VERSION}-${V}" \
       -t "${SPARK_TARGET}/spark-with-gcs:v${SPARK_VERSION}-$V" -f AddGCSDockerfile .
PYSPARK_WITH_GCS="${SPARK_TARGET}/spark-py-with-gcs:v${SPARK_VERSION}-$V"
docker build --build-arg base="${SPARK_TARGET}/spark-py:v${SPARK_VERSION}-${V}" \
       -t "${PYSPARK_WITH_GCS}" -f AddGCSDockerfile .
# Add Python 3.6 to PySpark images for notebook compat
SPARK_PY36_WORKER="${SPARK_TARGET}/spark-py-36:v${SPARK_VERSION}-$V"
docker build --build-arg base="${PYSPARK_WITH_GCS}" \
       -t "${SPARK_PY36_WORKER}" -f AddPython3.6Dockerfile .

docker push "${SPARK_TARGET}/spark-with-gcs:v${SPARK_VERSION}-$V"
docker push "${SPARK_TARGET}/spark-py-with-gcs:v${SPARK_VERSION}-$V"
docker push "${SPARK_PY36_WORKER}"
rm -rf "${tmp_dir}"

echo "Spark notebook pushed to ${TARGET}"
echo "Spark py worker pushed to ${SPARK_PY36_WORKER}"


================================================
FILE: data-extraction/python-spark-notebook/dr.yaml
================================================
apiVersion: networking.istio.io/v1alpha3
kind: DestinationRule
metadata:
  name: default
  namespace: kubeflow-programmerboo
spec:
  host: '*.svc.cluster.local'
  trafficPolicy:
    tls:
      mode: DISABLE

================================================
FILE: data-extraction/python-spark-notebook/no-saprk-tls.yaml
================================================
 apiVersion: "authentication.istio.io/v1alpha1"
 kind: "Policy"
 metadata:
   name: spark-no-tls
 spec:
   targets:
   - name: spark-notebook-0

================================================
FILE: data-extraction/python-spark-notebook/spark-driver-service.yaml
================================================
apiVersion: v1
kind: Service
metadata:
  name: spark-driver
  namespace: kubeflow-programmerboo
spec:
  selector:
    notebook-name: spark-test-2
  ports:
    - port: 39235
      targetPort: 39235
      name: spark-driver-port
    - port: 39236
      targetPort: 39236
      name: spark-block-port


================================================
FILE: data-extraction/python-spark-notebook/virt_service.yaml
================================================
apiVersion: networking.istio.io/v1alpha3
kind: VirtualService
metadata:
  creationTimestamp: "2019-10-14T20:09:50Z"
  generation: 1
  name: notebook-programmerboo-spark-notebook
  namespace: programmerboo
  ownerReferences:
  - apiVersion: kubeflow.org/v1beta1
    blockOwnerDeletion: true
    controller: true
    kind: Notebook
    name: spark-notebook
    uid: 93fb0c0e-eebe-11e9-a454-42010a8e0119
  resourceVersion: "3616573"
  selfLink: /apis/networking.istio.io/v1alpha3/namespaces/programmerboo/virtualservices/notebook-programmerboo-spark-notebook
  uid: 9404145c-eebe-11e9-a454-42010a8e0119
spec:
  gateways:
  - kubeflow/kubeflow-gateway
  hosts:
  - '*'
  http:
  - match:
    - uri:
        prefix: /notebook/programmerboo/spark-notebook
    rewrite:
      uri: /notebook/programmerboo/spark-notebook
    route:
    - destination:
        host: spark-notebook.programmerboo.svc.cluster.local
        port:
          number: 80
    timeout: 300s


================================================
FILE: data-extraction/spark-hello-world/Dockerfile
================================================


================================================
FILE: data-extraction/spark-hello-world/README.md
================================================
This directory will walk you through running a Spark Hello world example with kubeflow.
It (currently) uses the master branch of Kubeflow unlike the rest of the examples
since Spark support is not yet in a released version.


================================================
FILE: data-extraction/spark-hello-world/hello_world_pipeline.py
================================================
import kfp.dsl as dsl
import kfp.gcp as gcp
import kfp.onprem as onprem

from string import Template
import json


@dsl.pipeline(name='Simple spark pipeline demo',
              description='Shows how to use Spark operator inside KF')
def spark_hello_world_pipeline(jar_location="gcs://....", tf_job_image="..."):
    spark_json_template = Template("""
{
    "apiVersion": "sparkoperator.k8s.io/v1beta2",
    "kind": "SparkApplication",
    "metadata": {
      "name": "spark-frank",
      "namespace": "kubeflow"},
    "spec": {
      "type": "Scala",
      "mode": "cluster",
      "mainApplicationFile": "$jar_location"
    }""")
    spark_json = spark_json_template.substitute({'jar_location': jar_location})
    spark_job = json.loads(spark_json)
    spark_resource = dsl.ResourceOp(
        name='spark-job',
        k8s_resource=spark_job,
        success_condition='status.state == Succeeded')
    train = dsl.ContainerOp(
        name='train',
        image=tf_job_image,
    ).after(spark_resoure)


================================================
FILE: data-extraction/spark-hello-world/lr_demo/.gitignore
================================================
*.class
*.log
build.sbt_back

# sbt specific
dist/*
target/
lib_managed/
src_managed/
project/boot/
project/plugins/project/
sbt/*.jar
mini-complete-example/sbt/*.jar

# Scala-IDE specific
.scala_dependencies

#Emacs
*~

#ignore the metastore
metastore_db/*

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
.env
.Python
env/
bin/
build/*.jar
develop-eggs/
dist/
eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.cache
nosetests.xml
coverage.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject

# Rope
.ropeproject

# Django stuff:
*.log
*.pot

# Sphinx documentation
docs/_build/

# PyCharm files
*.idea

# emacs stuff

# Autoenv
.env
*~
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
.env
.Python
env/
bin/
build/
develop-eggs/
dist/
eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.cache
nosetests.xml
coverage.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject

# Rope
.ropeproject

# Django stuff:
*.log
*.pot

# Sphinx documentation
docs/_build/

# PyCharm files
*.idea

# emacs stuff
\#*\#
\.\#*

# Autoenv
.env
*~


================================================
FILE: data-extraction/spark-hello-world/lr_demo/.travis.yml
================================================
language: scala

# These directories are cached to S3 at the end of the build
cache:
  directories:
    - $HOME/.ivy2/cache
    - $HOME/.sbt/boot/
    - $HOME/.sbt/launchers
    - $HOME/build

jdk:
  - oraclejdk8
scala:
  - 2.11.8
after_success:
  - bash <(curl -s https://codecov.io/bash)
sudo: false

================================================
FILE: data-extraction/spark-hello-world/lr_demo/README.md
================================================
A simple, bad, LR example with Spark.


================================================
FILE: data-extraction/spark-hello-world/lr_demo/build.sbt
================================================
val sparkVersion = "2.3.1"

lazy val root = (project in file(".")).

  settings(
    inThisBuild(List(
      organization := "com.introtomlwithkubeflow.spark.demo",
      scalaVersion := "2.11.12"
    )),
    name := "basic.lr",
    version := "0.0.1",

    javacOptions ++= Seq("-source", "1.8", "-target", "1.8"),
    javaOptions ++= Seq("-Xms512M", "-Xmx2048M", "-XX:MaxPermSize=2048M", "-XX:+CMSClassUnloadingEnabled"),
    scalacOptions ++= Seq("-deprecation", "-unchecked"),
    parallelExecution in Test := false,
    fork := true,

    coverageHighlighting := true,

    libraryDependencies ++= Seq(
      "org.apache.spark" %% "spark-streaming" % sparkVersion % "provided",
      "org.apache.spark" %% "spark-sql" % sparkVersion % "provided",
      "org.apache.spark" %% "spark-mllib" % sparkVersion % "provided",
      "ml.combust.mleap" %% "mleap-spark" % "0.13.0",

      "org.scalatest" %% "scalatest" % "3.0.1" % "test",
      "org.scalacheck" %% "scalacheck" % "1.13.4" % "test",
      "com.holdenkarau" %% "spark-testing-base" % "2.3.1_0.11.0" % "test"
    ),

    // uses compile classpath for the run task, including "provided" jar (cf http://stackoverflow.com/a/21803413/3827)
    run in Compile := Defaults.runTask(fullClasspath in Compile, mainClass in (Compile, run), runner in (Compile, run)).evaluated,

    scalacOptions ++= Seq("-deprecation", "-unchecked"),
    pomIncludeRepository := { x => false },

   resolvers ++= Seq(
      "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/",
      "Typesafe repository" at "http://repo.typesafe.com/typesafe/releases/",
      "Second Typesafe repo" at "http://repo.typesafe.com/typesafe/maven-releases/",
      Resolver.sonatypeRepo("public")
    ),

    pomIncludeRepository := { x => false },
        mergeStrategy in assembly := {
      case m if m.toLowerCase.endsWith("manifest.mf") => MergeStrategy.discard
      case m if m.toLowerCase.endsWith("io.netty.versions.properties") => MergeStrategy.concat
      case m if m.toLowerCase.endsWith("services") => MergeStrategy.filterDistinctLines
      case m if m.toLowerCase.endsWith("git.properties") => MergeStrategy.discard
      case m if m.toLowerCase.endsWith("reference.conf") => MergeStrategy.filterDistinctLines
        // Travis is giving a weird error on netty I don't see locally :(
      case PathList("META-INF", "io.netty.versions.properties") => MergeStrategy.first
      case PathList("META-INF", "native", xs @ _*) => MergeStrategy.deduplicate
      case PathList("META-INF", "services", xs @ _ *) => MergeStrategy.filterDistinctLines
      case PathList("META-INF", xs @ _ *) => MergeStrategy.discard
      case PathList("javax", "servlet", xs @ _*) => MergeStrategy.last
      case PathList("org", "apache", xs @ _*) => MergeStrategy.last
      case PathList("org", "jboss", xs @ _*) => MergeStrategy.last
        // Start http://queirozf.com/entries/creating-scala-fat-jars-for-spark-on-sbt-with-sbt-assembly-plugin
      case PathList("org","aopalliance", xs @ _*) => MergeStrategy.last
      case PathList("javax", "inject", xs @ _*) => MergeStrategy.last
      case PathList("javax", "servlet", xs @ _*) => MergeStrategy.last
      case PathList("javax", "activation", xs @ _*) => MergeStrategy.last
      case PathList("org", "apache", xs @ _*) => MergeStrategy.last
      case PathList("com", "google", xs @ _*) => MergeStrategy.last
      case PathList("com", "esotericsoftware", xs @ _*) => MergeStrategy.last
      case PathList("com", "codahale", xs @ _*) => MergeStrategy.last
      case PathList("com", "yammer", xs @ _*) => MergeStrategy.last
        // End http://queirozf.com/entries/creating-scala-fat-jars-for-spark-on-sbt-with-sbt-assembly-plugin
      case PathList("com", "sun", "activation", "registries", xs @ _*) => MergeStrategy.last
      case PathList("com", "sun", "activation", "viewers", xs @ _*) => MergeStrategy.last
      case "about.html"  => MergeStrategy.rename
      case "reference.conf" => MergeStrategy.concat
      case m =>
        val oldStrategy = (assemblyMergeStrategy in assembly).value
        oldStrategy(m)
    },
    assemblyShadeRules in assembly := Seq(
      ShadeRule.rename("com.google.protobuf.**" -> "shadeproto.@1").inAll
    ),


    // publish settings
    publishTo := {
      val nexus = "https://oss.sonatype.org/"
      if (isSnapshot.value)
        Some("snapshots" at nexus + "content/repositories/snapshots")
      else
        Some("releases"  at nexus + "service/local/staging/deploy/maven2")
    }
  )


================================================
FILE: data-extraction/spark-hello-world/lr_demo/project/build.properties
================================================
sbt.version=1.2.8


================================================
FILE: data-extraction/spark-hello-world/lr_demo/project/plugins.sbt
================================================
addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0")

resolvers += "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/"

resolvers += "Spark Package Main Repo" at "https://dl.bintray.com/spark-packages/maven"

addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.5.1")

addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.14.5")


================================================
FILE: data-extraction/spark-hello-world/lr_demo/sample.csv
================================================
e1,e2,label
1.0, 0.0, 1.0
2.0, 2.1, 2.0


================================================
FILE: data-extraction/spark-hello-world/lr_demo/sbt/sbt
================================================
#!/bin/bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# This script launches sbt for this project. If present it uses the system 
# version of sbt. If there is no system version of sbt it attempts to download
# sbt locally.
SBT_VERSION=0.13.15
URL1=http://typesafe.artifactoryonline.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar
URL2=http://repo.typesafe.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar
JAR=sbt/sbt-launch-${SBT_VERSION}.jar

# Download sbt launch jar if it hasn't been downloaded yet
if [ ! -f ${JAR} ]; then
  # Download
  printf "Attempting to fetch sbt\n"
  set -x
  JAR_DL=${JAR}.part
  if hash wget 2>/dev/null; then
    (wget --progress=bar ${URL1} -O ${JAR_DL} || wget --progress=bar ${URL2} -O ${JAR_DL}) && mv ${JAR_DL} ${JAR}
  elif hash axel 2>/dev/null; then
    (axel  ${URL1} -o ${JAR_DL} || axel  ${URL2} -o ${JAR_DL}) && mv ${JAR_DL} ${JAR}
  else
    printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n"
    exit -1
  fi
fi
if [ ! -f ${JAR} ]; then
  # We failed to download
  printf "Our attempt to download sbt locally to ${JAR} failed. Please install sbt manually from http://www.scala-sbt.org/\n"
  exit -1
fi
printf "Launching sbt from ${JAR}\n"
java \
  -Xmx1200m -XX:MaxPermSize=350m -XX:ReservedCodeCacheSize=256m \
  -jar ${JAR} \
  "$@"


================================================
FILE: data-extraction/spark-hello-world/lr_demo/src/main/scala/com/introtomlwithkubeflow/spark/demo/lr/TrainingApp.scala
================================================
package com.introtomlwithkubeflow.spark.demo.lr

import org.apache.spark.{SparkConf, SparkContext}


/**
  * Use this when submitting the app to a cluster with spark-submit
  * */
object TrainingApp extends App{
  val (inputFile, outputFile) = (args(0), args(1))

  // spark-submit command should supply all necessary config elements
  Runner.run(new SparkConf(), inputFile, outputFile)
}

object Runner {
  def run(conf: SparkConf, inputFile: String, outputFile: String): Unit = {
    val sc = new SparkContext(conf)
    val trainer = new TrainingPipeline(sc)
    trainer.train(inputFile, outputFile)
  }
}


================================================
FILE: data-extraction/spark-hello-world/lr_demo/src/main/scala/com/introtomlwithkubeflow/spark/demo/lr/TrainingPipeline.scala
================================================
package com.introtomlwithkubeflow.spark.demo.lr

import java.nio.file.{Files, Paths}


import ml.combust.bundle.BundleFile
import ml.combust.mleap.spark.SparkSupport._
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.ml.bundle.SparkBundleContext // Actually an mleap import
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.catalyst.ScalaReflection
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.feature._
import org.apache.spark.ml.regression._
import resource._


class TrainingPipeline(sc: SparkContext) {
  val session = SparkSession.builder().getOrCreate()
  import session.implicits._

  def train(input: String, outputFile: String) = {
    val trainingData = session.read.format("csv")
      .option("inferSchema", "true").option("header", "true").load(input)
    val vectorizer = new VectorAssembler().setInputCols(Array("e1", "e2")).setOutputCol("features")
    val lr = new GeneralizedLinearRegression()
      .setFamily("gaussian")
      .setLink("identity")
      .setMaxIter(10)
      .setRegParam(0.3)
    val pipeline = new Pipeline().setStages(Array(
      vectorizer,
      lr))
    val fit = pipeline.fit(trainingData)
    // Serialize the fit pipeline
    val resultData = fit.transform(trainingData)
    val localFile = "/tmp/mleap.zip"
    val localOutput = s"jar:file:${localFile}"
    val sbc = SparkBundleContext().withDataset(resultData)
    for(bf <- managed(BundleFile(localOutput))) {
      fit.writeBundle.save(bf)(sbc).get
    }
    // We only have one file so its k
    val modelBinary = Files.readAllBytes(Paths.get(localFile))
    val fs = FileSystem.get(sc.hadoopConfiguration)
    val out = fs.create(new Path(outputFile))
    out.write(modelBinary);
    out.close();
  }
}


================================================
FILE: data-extraction/spark-hello-world/lr_demo/src/test/scala/com/introtomlwithkubeflow/spark/demo/lr/TrainingPipelineTest.scala
================================================
package com.introtomlwithkubeflow.spark.demo.lr

/**
 * A simple test for the training pipeline
 */

import com.holdenkarau.spark.testing.{SharedSparkContext, Utils}

import org.apache.spark.sql._

import org.scalatest.FunSuite

import java.io.File

case class MyData(e1: Double, e2: Double, label: Double)

class TrainingPipelineTest extends FunSuite with SharedSparkContext {
  test("smok test"){
    val session = SparkSession.builder().getOrCreate()
    import session.implicits._

    val tempDir = Utils.createTempDir()

    val sampleDataRDD = sc.parallelize(Seq(
      MyData(1.0, 0.0, 1.0),
      MyData(2.0, 2.1, 2.0)))
    val sampleDataDS = session.createDataset(sampleDataRDD)
    val inputDataLocation = tempDir + "/input"
    val outputFile = tempDir + "/output.zip"
    sampleDataDS.write.format("csv").option("header", "true").save(inputDataLocation)

    val trainingPipeline = new TrainingPipeline(sc)
    trainingPipeline.train(inputDataLocation, outputFile)
    assert(new File(outputFile).exists())
  }
}


================================================
FILE: data-extraction/spark-hello-world/setup.sh
================================================
#!/bin/bash
set -ex

SPARK_DEMO_DIR=${SPARK_DEMO_DIR:=~/spark_demo_3}
SPARK_DEMO_GCS=${SPARK_DEMO_GCS:=gs://boo-spark-kf-demo}

# Set up kubeflow
mkdir "$SPARK_DEMO_DIR"
pushd "$SPARK_DEMO_DIR"
pwd

wget https://raw.githubusercontent.com/kubeflow/kubeflow/master/scripts/download.sh
chmod a+x download.sh
KUBEFLOW_VERSION=0.5.0
export KUBEFLOW_VERSION
./download.sh

PATH="$(pwd)/scripts":$PATH
kfctl.sh init mydemoapp --platform none
pushd mydemoapp
source env.sh
#kfctl.sh generate platform
#kfctl.sh apply platform
kfctl.sh generate k8s
kfctl.sh apply k8s
pushd ks_app
# Set up the Spark operator
ks pkg install kubeflow/spark
ks generate spark-operator spark-operator --name=spark-operator
ks apply default -c spark-operator

# Create a Spark job with the operator (Pi)
local_jar_path="local:///opt/spark/examples/jars/spark-examples_2.11-2.3.1.jar"
ks generate spark-job spark-pi --name=spark-operator \
   --applicationResource="$local_jar_path" \
   --mainClass=org.apache.spark.examples.SparkPi
ks apply default -c spark-pi

# Create a Spark job with the operator to train an LR model

pushd "$SPARK_MNIST_DIR/lr_demo"
sbt assembly
gsutil cp target/scala-2.11/basic.lr-assembly-0.0.1.jar "$SPARK_DEMO_GCS/jars"
gsutil cp sample.csv "$SPARK_DEMO_GCS/input/part0.csv"
popd

ks generate spark-job spark-lr --name=spark-operator \
   --applicationResource="$SPARK_DEMO_GCS/jars/basic.lr-assembly-0.0.1.jar" \
   --mainClass=com.introtomlwithkubeflow.spark.demo.lr.TrainingApp
   "$SPARK_DEMO_GCS/input" "$SPARK_DEMO_GCS/output"
ks apply default -c spark-lr


# Create a Spark job with the operator for data prep on the GitHub data

popd


================================================
FILE: data-extraction/spark-hello-world/spark-pi-min.yaml
================================================
apiVersion: "sparkoperator.k8s.io/v1beta2"
kind: SparkApplication
metadata:
  name: spark-pi
  namespace: kubeflow
spec:
  type: Scala
  mode: cluster
  image: "gcr.io/spark-operator/spark:v2.4.4"
  imagePullPolicy: Always
  mainClass: org.apache.spark.examples.SparkPi
  mainApplicationFile: "local:///opt/spark/examples/jars/spark-examples_2.11-2.4.4.jar"
  sparkVersion: "2.4.4"
  restartPolicy:
    type: Never
  volumes:
    - name: "test-volume"
      hostPath:
        path: "/tmp"
        type: Directory
  driver:
    cores: 1
    coreLimit: "1200m"
    memory: "512m"
    labels:
      version: 2.4.4
    volumeMounts:
      - name: "test-volume"
        mountPath: "/tmp"
  executor:
    cores: 1
    instances: 1
    memory: "512m"
    labels:
      version: 2.4.4
    volumeMounts:
      - name: "test-volume"
        mountPath: "/tmp"


================================================
FILE: data-extraction/spark-hello-world/spark-pi.yaml
================================================
apiVersion: "sparkoperator.k8s.io/v1beta2"
kind: SparkApplication
metadata:
  name: spark-pi
  namespace: kubeflow
spec:
  type: Scala
  mode: cluster
  image: "gcr.io/spark-operator/spark:v2.4.4"
  imagePullPolicy: Always
  mainClass: org.apache.spark.examples.SparkPi
  mainApplicationFile: "local:///opt/spark/examples/jars/spark-examples_2.11-2.4.4.jar"
  sparkVersion: "2.4.4"
  restartPolicy:
    type: Never
  volumes:
    - name: "test-volume"
      hostPath:
        path: "/tmp"
        type: Directory
  driver:
    cores: 1
    coreLimit: "1200m"
    memory: "512m"
    labels:
      version: 2.4.4
    serviceAccount: spark-operatoroperator-sa
    volumeMounts:
      - name: "test-volume"
        mountPath: "/tmp"
  executor:
    cores: 1
    instances: 1
    memory: "512m"
    labels:
      version: 2.4.4
    volumeMounts:
      - name: "test-volume"
        mountPath: "/tmp"


================================================
FILE: data-extraction/stack_overflow_questions.bsql
================================================
SELECT 

================================================
FILE: data-extraction/tfx/TFDV.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We start by downloading a specific release of the components because running from master is not a good way to buid \"repetable\" systems"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!wget https://github.com/kubeflow/pipelines/archive/0.2.5.tar.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!tar -xvf 0.2.5.tar.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import kfp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#tag::loadGCSDLComponent[]\n",
    "gcs_download_component = kfp.components.load_component_from_file(\n",
    "    \"pipelines-0.2.5/components/google-cloud/storage/download/component.yaml\")\n",
    "#end::loadGCSDLComponent[]\n",
    "#tag::loadTFDVAndFriendsComponents[]\n",
    "tfx_csv_gen = kfp.components.load_component_from_file(\n",
    "    \"pipelines-0.2.5/components/tfx/ExampleGen/CsvExampleGen/component.yaml\")\n",
    "tfx_statistic_gen = kfp.components.load_component_from_file(\n",
    "    \"pipelines-0.2.5/components/tfx/StatisticsGen/component.yaml\")\n",
    "tfx_schema_gen = kfp.components.load_component_from_file(\n",
    "    \"pipelines-0.2.5/components/tfx/SchemaGen/component.yaml\")\n",
    "tfx_example_validator = kfp.components.load_component_from_file(\n",
    "    \"pipelines-0.2.5/components/tfx/ExampleValidator/component.yaml\")\n",
    "#end::loadTFDVAndFriendsComponents[]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "@kfp.dsl.pipeline(\n",
    "  name='DL',\n",
    "  description='Sample DL pipeline'\n",
    ")\n",
    "def pipeline_with_dl():\n",
    "    #tag::dlOp[]\n",
    "    dl_op = gcs_download_component(\n",
    "        gcs_path=\"gs://ml-pipeline-playground/tensorflow-tfx-repo/tfx/components/testdata/external/csv\") # Your path goes here\n",
    "    #end::dlOp[]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "kfp.compiler.Compiler().compile(pipeline_with_dl, 'dl_pipeline.zip')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "client = kfp.Client()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "my_experiment = client.create_experiment(name='dl')\n",
    "my_run = client.run_pipeline(my_experiment.id, 'dl', \n",
    "  'dl_pipeline.zip')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#tag::standaloneTFDVPipeline[]\n",
    "@kfp.dsl.pipeline(\n",
    "  name='TFDV',\n",
    "  description='TF DV Pipeline'\n",
    ")\n",
    "def tfdv_pipeline():\n",
    "    # DL with wget, can use gcs instead as well\n",
    "    data_url = \"https://raw.githubusercontent.com/moorissa/medium/master/items-recommender/data/trx_data.csv\"\n",
    "    #tag::wget[]\n",
    "    fetch = kfp.dsl.ContainerOp(\n",
    "      name='download',\n",
    "      image='busybox',\n",
    "      command=['sh', '-c'],\n",
    "      arguments=[\n",
    "          'sleep 1;'\n",
    "          'mkdir -p /tmp/data;'\n",
    "          'wget '+ data_url +' -O /tmp/data/results.csv'],\n",
    "      file_outputs={'downloaded': '/tmp/data'})\n",
    "    # This expects a directory of inputs not just a single file\n",
    "    #end::wget[]\n",
    "    #tag::csv[]\n",
    "    records_example = tfx_csv_gen(input_base=fetch.output)\n",
    "    #end::csv[]\n",
    "    #tag::stats[]\n",
    "    stats = tfx_statistic_gen(input_data=records_example.output)\n",
    "    #end::stats[]\n",
    "    #tag::schema[]\n",
    "    schema_op = tfx_schema_gen(stats.output)\n",
    "    #end::schema[]\n",
    "    #tag::validate[]\n",
    "    tfx_example_validator(stats=stats.outputs['output'], schema=schema_op.outputs['output'])\n",
    "    #end::validate[]\n",
    "#end::standaloneTFDVPipeline[]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "kfp.compiler.Compiler().compile(tfdv_pipeline, 'tfdv_pipeline.zip')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "my_experiment = client.create_experiment(name='tfdv_pipeline')\n",
    "my_run = client.run_pipeline(my_experiment.id, 'tfdv', \n",
    "  'tfdv_pipeline.zip')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip3 install tfx tensorflow-data-validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#tag::importTFDV[]\n",
    "import tensorflow_data_validation as tfdv\n",
    "#end::importTFDV[]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "You can download your schema by looking at the inputs/outputs in your pipeline run for the schema gen stage"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#tag::displaySchema{}\n",
    "schema = tfdv.load_schema_text(\"schema_info_2\")\n",
    "tfdv.display_schema(schema)\n",
    "#end::displaySchema[]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#tag::loadTFT[]\n",
    "tfx_transform = kfp.components.load_component_from_file(\n",
    "    \"pipelines-0.2.5/components/tfx/Transform/component.yaml\")\n",
    "#end::loadTFT[]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "module_file=\"gcs://\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "@kfp.dsl.pipeline(\n",
    "  name='TFX',\n",
    "  description='TFX pipeline'\n",
    ")\n",
    "def tfx_pipeline():\n",
    "    # DL with wget, can use gcs instead as well\n",
    "    fetch = kfp.dsl.ContainerOp(\n",
    "      name='download',\n",
    "      image='busybox',\n",
    "      command=['sh', '-c'],\n",
    "      arguments=[\n",
    "          'sleep 1;'\n",
    "          'mkdir -p /tmp/data;'\n",
    "          'wget https://raw.githubusercontent.com/moorissa/medium/master/items-recommender/data/trx_data.csv -O /tmp/data/results.csv'],\n",
    "      file_outputs={'downloaded': '/tmp/data'})\n",
    "    records_example = tfx_csv_gen(input_base=fetch.output)\n",
    "    stats = tfx_statistic_gen(input_data=records_example.output)\n",
    "    schema_op = tfx_schema_gen(stats.output)\n",
    "    tfx_example_validator(stats=stats.outputs['output'], schema=schema_op.outputs['output'])\n",
    "    #tag::tft[]\n",
    "    transformed_output = tfx_transform(\n",
    "        input_data=records_example.output,\n",
    "        schema=schema_op.outputs['output'],\n",
    "        module_file=module_file) # Path to your TFT code on GCS/S3\n",
    "    #end::tft[]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "kfp.compiler.Compiler().compile(tfx_pipeline, 'tfx_pipeline.zip')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "my_experiment = client.create_experiment(name='tfx_pipeline')\n",
    "my_run = client.run_pipeline(my_experiment.id, 'tfx', \n",
    "  'tfx_pipeline.zip')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}


================================================
FILE: data-extraction/tfx/TFDV.py
================================================
#!/usr/bin/env python
# coding: utf-8

# We start by downloading a specific release of the components because running from master is not a good way to buid "repetable" systems

# In[ ]:

get_ipython().system(
    'wget https://github.com/kubeflow/pipelines/archive/0.2.5.tar.gz')

# In[ ]:

get_ipython().system('tar -xvf 0.2.5.tar.gz')

# In[ ]:

import kfp

# In[ ]:

# In[ ]:

#tag::loadGCSDLComponent[]
gcs_download_component = kfp.components.load_component_from_file(
    "pipelines-0.2.5/components/google-cloud/storage/download/component.yaml")
#end::loadGCSDLComponent[]
#tag::loadTFDVAndFriendsComponents[]
tfx_csv_gen = kfp.components.load_component_from_file(
    "pipelines-0.2.5/components/tfx/ExampleGen/CsvExampleGen/component.yaml")
tfx_statistic_gen = kfp.components.load_component_from_file(
    "pipelines-0.2.5/components/tfx/StatisticsGen/component.yaml")
tfx_schema_gen = kfp.components.load_component_from_file(
    "pipelines-0.2.5/components/tfx/SchemaGen/component.yaml")
tfx_example_validator = kfp.components.load_component_from_file(
    "pipelines-0.2.5/components/tfx/ExampleValidator/component.yaml")
#end::loadTFDVAndFriendsComponents[]

# In[ ]:


@kfp.dsl.pipeline(name='DL', description='Sample DL pipeline')
def pipeline_with_dl():
    #tag::dlOp[]
    dl_op = gcs_download_component(
        gcs_path=
        "gs://ml-pipeline-playground/tensorflow-tfx-repo/tfx/components/testdata/external/csv"
    )  # Your path goes here
    #end::dlOp[]


# In[ ]:

kfp.compiler.Compiler().compile(pipeline_with_dl, 'dl_pipeline.zip')

# In[ ]:

client = kfp.Client()

# In[ ]:

my_experiment = client.create_experiment(name='dl')
my_run = client.run_pipeline(my_experiment.id, 'dl', 'dl_pipeline.zip')

# In[ ]:


#tag::standaloneTFDVPipeline[]
@kfp.dsl.pipeline(name='TFDV', description='TF DV Pipeline')
def tfdv_pipeline():
    # DL with wget, can use gcs instead as well
    data_url = "https://raw.githubusercontent.com/moorissa/medium/master/items-recommender/data/trx_data.csv"
    #tag::wget[]
    fetch = kfp.dsl.ContainerOp(name='download',
                                image='busybox',
                                command=['sh', '-c'],
                                arguments=[
                                    'sleep 1;'
                                    'mkdir -p /tmp/data;'
                                    'wget ' + data_url +
                                    ' -O /tmp/data/results.csv'
                                ],
                                file_outputs={'downloaded': '/tmp/data'})
    # This expects a directory of inputs not just a single file
    #end::wget[]
    #tag::csv[]
    records_example = tfx_csv_gen(input_base=fetch.output)
    #end::csv[]
    #tag::stats[]
    stats = tfx_statistic_gen(input_data=records_example.output)
    #end::stats[]
    #tag::schema[]
    schema_op = tfx_schema_gen(stats.output)
    #end::schema[]
    #tag::validate[]
    tfx_example_validator(stats=stats.outputs['output'],
                          schema=schema_op.outputs['output'])
    #end::validate[]


#end::standaloneTFDVPipeline[]

# In[ ]:

kfp.compiler.Compiler().compile(tfdv_pipeline, 'tfdv_pipeline.zip')

# In[ ]:

my_experiment = client.create_experiment(name='tfdv_pipeline')
my_run = client.run_pipeline(my_experiment.id, 'tfdv', 'tfdv_pipeline.zip')

# In[ ]:

get_ipython().system('pip3 install tfx tensorflow-data-validation')

# In[ ]:

#tag::importTFDV[]
import tensorflow_data_validation as tfdv
#end::importTFDV[]

# You can download your schema by looking at the inputs/outputs in your pipeline run for the schema gen stage

# In[ ]:

#tag::displaySchema{}
schema = tfdv.load_schema_text("schema_info_2")
tfdv.display_schema(schema)
#end::displaySchema[]

# In[ ]:

#tag::loadTFT[]
tfx_transform = kfp.components.load_component_from_file(
    "pipelines-0.2.5/components/tfx/Transform/component.yaml")
#end::loadTFT[]

# In[ ]:

module_file = "gcs://"

# In[ ]:


@kfp.dsl.pipeline(name='TFX', description='TFX pipeline')
def tfx_pipeline():
    # DL with wget, can use gcs instead as well
    fetch = kfp.dsl.ContainerOp(
        name='download',
        image='busybox',
        command=['sh', '-c'],
        arguments=[
            'sleep 1;'
            'mkdir -p /tmp/data;'
            'wget https://raw.githubusercontent.com/moorissa/medium/master/items-recommender/data/trx_data.csv -O /tmp/data/results.csv'
        ],
        file_outputs={'downloaded': '/tmp/data'})
    records_example = tfx_csv_gen(input_base=fetch.output)
    stats = tfx_statistic_gen(input_data=records_example.output)
    schema_op = tfx_schema_gen(stats.output)
    tfx_example_validator(stats=stats.outputs['output'],
                          schema=schema_op.outputs['output'])
    #tag::tft[]
    transformed_output = tfx_transform(
        input_data=records_example.output,
        schema=schema_op.outputs['output'],
        module_file=module_file)  # Path to your TFT code on GCS/S3
    #end::tft[]


# In[ ]:

kfp.compiler.Compiler().compile(tfx_pipeline, 'tfx_pipeline.zip')

# In[ ]:

my_experiment = client.create_experiment(name='tfx_pipeline')
my_run = client.run_pipeline(my_experiment.id, 'tfx', 'tfx_pipeline.zip')

# In[ ]:


================================================
FILE: data-extraction/tfx/install_tfx.sh
================================================
#!/bin/bash
#tag::install[]
pip3 install tfx tensorflow-data-validation
#end::install[]


================================================
FILE: data-extraction/tfx/requirements.txt
================================================
tfx


================================================
FILE: data-extraction/tfx/run_on_dataflow_ex.py
================================================
#tag::example[]
generated_output_uri = root_output_uri + kfp.dsl.EXECUTION_ID_PLACEHOLDER
beam_pipeline_args = [
    '--runner=DataflowRunner',
    '--project=' + project_id,
    '--temp_location=' + root_output_uri + '/tmp'),
    '--region=' + gcp_region,
    '--disk_size_gb=50', # Adjust as needed
]

records_example = tfx_csv_gen(
    input_uri=fetch.output, # Must be on distributed storage
    beam_pipeline_args=beam_pipeline_args,
    output_examples_uri=generated_output_uri)
#end::example[]


================================================
FILE: dev-setup/install-argo.sh
================================================
#!/bin/bash
# Download the binary
curl -sLO https://github.com/argoproj/argo/releases/download/v2.8.1/argo-linux-amd64

# Make binary executable
chmod +x argo-linux-amd64

# Move binary to path
mv ./argo-linux-amd64 ~/bin/argo


================================================
FILE: dev-setup/install-kf-pipeline-sdk.sh
================================================
#!/bin/bash
# Put as inside a venv
pushd /tmp
#tag::venv[]
virtualenv kfvenv --python python3
source kfvenv/bin/activate
#end::venv[]
popd
#tag::install[]
URL=https://storage.googleapis.com/ml-pipeline/release/latest/kfp.tar.gz
pip install "${URL}" --upgrade
#end::install[]
mkdir -p ~/repos
pushd ~/repos
if [[ ! -d pipelines ]]; then
#tag::checkout_sdk[]
  git clone --single-branch --branch 0.3.0 https://github.com/kubeflow/pipelines.git
#end::checkout_sdk[]
fi
popd


================================================
FILE: dev-setup/install-kf.sh
================================================
#!/bin/bash
set -ex
#tag::install[]
PLATFORM=$(uname) # Either Linux or Darwin
export PLATFORM
mkdir -p ~/bin
#Configuration
export KUBEFLOW_TAG=1.0.1
# ^ You can also point this to a different version if you want to try
KUBEFLOW_BASE="https://api.github.com/repos/kubeflow/kfctl/releases"
# Or just go to https://github.com/kubeflow/kfctl/releases
KFCTL_URL=$(curl -s ${KUBEFLOW_BASE} |\
	      grep http |\
	      grep "${KUBEFLOW_TAG}" |\
	      grep -i "${PLATFORM}" |\
	      cut -d : -f 2,3 |\
	      tr -d '\" ' )
wget "${KFCTL_URL}"
KFCTL_FILE=${KFCTL_URL##*/}
tar -xvf "${KFCTL_FILE}"
mv ./kfctl ~/bin/
rm "${KFCTL_FILE}"
# Recommended add the scripts directory to your path
export PATH=$PATH:~/bin
#end::install[]


================================================
FILE: dev-setup/install-kubectl.sh
================================================
#!/bin/bash
#tag::ubuntu-kubectl[]
sudo snap install kubectl --classic
#end::ubuntu-kubectl[]
#tag::debian-kubectl[]
sudo apt-get update && sudo apt-get install -y apt-transport-https
curl -s https://packages.cloud.google.com/apt/doc/apt-key.gpg |\
  sudo apt-key add -
echo "deb https://apt.kubernetes.io/ kubernetes-xenial main" |\
  sudo tee -a /etc/apt/sources.list.d/kubernetes.list
sudo apt-get update
sudo apt-get install -y kubectl
#end::debian-kubectl[]
#tag::redhat-kubectl[]
cat <<EOF > /etc/yum.repos.d/kubernetes.repo
[kubernetes]
name=Kubernetes
baseurl=https://packages.cloud.google.com/yum/repos/kubernetes-el7-x86_64
enabled=1
gpgcheck=1
repo_gpgcheck=0
gpgkey=https://packages.cloud.google.com/yum/doc/rpm-package-key.gpg
EOF
yum install -y kubectl
#end::redhat-kubectl[]
#tag::osx-kubectl[]
brew install kubernetes-cli
#end::osx-kubectl[]
#tag::no-pkg-manager-kubectl[]
kubectl_release_base="https://storage.googleapis.com/kubernetes-release"
stable_url="$kubectl_release_base/release/stable.txt"
KUBECTL_VERSION=$(curl -s "$stable_url")
export KUBECTL_VERSION
curl -LO "$kubectl_release_base/$KUBECTL_VERSION/bin/$PLATFORM/amd64/kubectl"
# Now either move kubectl to /usr/bin or add it to your PATH
#end::no-pkg-manager-kubectl[]


================================================
FILE: dev-setup/install-kustomize.sh
================================================
#!/bin/bash
#tag::kustomize[]
PLATFORM=$(uname) # Either Linux or Darwin
export PLATFORM
mkdir -p ~/bin
KUSTOMIZE_URL=$(curl -s \
  https://api.github.com/repos/kubernetes-sigs/kustomize/releases/latest |\
  grep browser_download |\
  grep -i "${PLATFORM}" |\
  cut -d '"' -f 4)
wget "${KUSTOMIZE_URL}"
KUSTOMIZE_FILE=${KUSTOMIZE_URL##*/}
tar -xvf "${KUSTOMIZE_FILE}"
rm "${KUSTOMIZE_FILE}"
mv kustomize ~/bin/kustomize
chmod u+x ~/bin/kustomize
# Add this + platform/version exports to your bashrc or move the ks bin into /usr/bin
export PATH=$PATH:"~/bin"
#end::kustomize[]


================================================
FILE: dev-setup/install-microk8s.sh
================================================
#!/bin/bash
#tag::installmicrok8s[]
sudo snap install microk8s --classic
#end::installmicrok8s[]
#tag::setupmicrok8s[]
# Alias the microk8s versions of kubectl and docker so kubeflow uses them
# You will want to add this to your bashrc if you intend to use microk8s
# generally.
alias kubectl="microk8s.kubectl"
alias docker="microk8s.docker"
### Faking Docker registry, skip for production docker registry
microk8s.enable registry
export DOCKER_HOST="unix:///var/snap/microk8s/current/docker.sock"
sudo ln -s /var/snap/microk8s/current/docker.sock /var/run/docker.sock
sudo ln -s /var/snap/microk8s/common/var/lib/docker /var/lib/docker
#end::setupmicrok8s[]
#tag::bootstrapwithcanonicallabs[]
git clone https://github.com/canonical-labs/kubeflow-tools
pushd kubeflow-tools
KUBEFLOW_VERSION=0.4.1 ./install-kubeflow.sh
#end::bootstrapwithcanonicallabs[]
#tag::unaliasmicrok8s[]
unalias kubectl
unalias docker
#end::unaliasmicrok8s[]


================================================
FILE: dev-setup/jsonnet.sh
================================================
#!/bin/bash
set -e
set -x
#tag::snap[]
sudo snap install jsonnet
#end::snap[]
#tag::manual[]
export JSONNET_VERSION=0.12.1
wget https://github.com/google/jsonnet/archive/v$JSONNET_VERSION.tar.gz
# You will need to add this to your path if it is not already
tar -xvf v$JSONNET_VERSION.tar.gz
cd jsonnet-$JSONNET_VERSION
make
# Or otherwise add to your path
sudo cp jsonent /usr/bin/
#end::manual[]


================================================
FILE: feature-prep/README.md
================================================
Feature preparation is the task of converting the data into features
suitable for our machine algorithms. What makes a "feature" suitable
depends on the algorithm used.

In the `tft` directory we show feature prep using Tensorflow Transform. At the time of writing this only supports Python 2 and has limited support on non-GCP platforms, but it is rapidly improving in both areas.

================================================
FILE: feature-prep/spark/SparkMailingListFeaturePrep.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Yes we need both these imports\n",
    "from pyspark.sql import SparkSession\n",
    "from pyspark.sql.functions import col, to_date, lit, isnull\n",
    "from pyspark.sql.types import *\n",
    "from pyspark.sql.types import StructField, StructType\n",
    "from pyspark.sql.catalog import UserDefinedFunction\n",
    "from pyspark.ml.feature import *\n",
    "from pyspark.ml.pipeline import Pipeline\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fs_prefix = \"s3a://kf-book-examples/mailing-lists\" # Create with mc as in ch1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "os.environ[\"PYSPARK_PYTHON\"] = \"python3.6\"\n",
    "# See https://medium.com/@szinck/setting-up-pyspark-jupyter-and-minio-on-kubeflow-kubernetes-aab98874794f\n",
    "session = (SparkSession.builder\n",
    "           .appName(\"processMailingListData\")\n",
    "           .config(\"spark.executor.instances\", \"8\")\n",
    "           .config(\"spark.driver.memoryOverhead\", \"0.25\")\n",
    "           .config(\"spark.executor.memory\", \"10g\")\n",
    "           .config(\"spark.dynamicAllocation.enabled\", \"false\")\n",
    "           .config(\"spark.ui.enabled\", \"true\")\n",
    "           .config(\"spark.kubernetes.container.image\",\n",
    "                   \"gcr.io/boos-demo-projects-are-rad/kubeflow/spark-worker/spark-py-36:v3.0.0-preview2-23\")\n",
    "           .config(\"spark.driver.bindAddress\", \"0.0.0.0\")\n",
    "           .config(\"spark.kubernetes.namespace\", \"kubeflow-programmerboo\")\n",
    "           .config(\"spark.master\", \"k8s://https://kubernetes.default\")\n",
    "           .config(\"spark.driver.host\", \"spark-driver.kubeflow-programmerboo.svc.cluster.local\")\n",
    "           .config(\"spark.kubernetes.executor.annotation.sidecar.istio.io/inject\", \"false\")\n",
    "           .config(\"spark.driver.port\", \"39235\")\n",
    "           .config(\"spark.blockManager.port\", \"39236\")\n",
    "            # If using minio - see https://github.com/minio/cookbook/blob/master/docs/apache-spark-with-minio.md\n",
    "           .config(\"spark.hadoop.fs.s3a.endpoint\", \"minio-service.kubeflow.svc.cluster.local:9000\")\n",
    "           .config(\"fs.s3a.connection.ssl.enabled\", \"false\")\n",
    "           .config(\"fs.s3a.path.style.access\", \"true\")\n",
    "           # You can also add an account using the minio command as described in chapter 1\n",
    "           .config(\"spark.hadoop.fs.s3a.access.key\", \"minio\")\n",
    "           .config(\"spark.hadoop.fs.s3a.secret.key\", \"minio123\")\n",
    "          ).getOrCreate()\n",
    "sc = session.sparkContext"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Load data from the previous stage\n",
    "#tag::load_data[]\n",
    "initial_posts = session.read.format(\"parquet\").load(fs_prefix + \"/initial_posts\")\n",
    "ids_in_reply = session.read.format(\"parquet\").load(fs_prefix + \"/ids_in_reply\")\n",
    "#end::load_data[]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load data from the previous stage while checking the schema\n",
    "#tag::load_with_schema[]\n",
    "ids_schema = StructType([\n",
    "    StructField(\"In-Reply-To\", StringType(), nullable=True),\n",
    "    StructField(\"message-id\", StringType(),nullable=True)])\n",
    "ids_in_reply = session.read.format(\"parquet\").schema(ids_schema).load(fs_prefix + \"/ids_in_reply\")\n",
    "#end::load_with_schema[]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cache the data\n",
    "initial_posts = initial_posts.alias(\"initial_posts\").cache()\n",
    "ids_in_reply = ids_in_reply.alias(\"ids_in_reply\").cache()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# We can write random SQL -- although we need to wait for preview 3 cause it was taken out in preview1\n",
    "#tag::direct_sql[]\n",
    "#ids_in_reply.registerTempTable(\"cheese\")\n",
    "#no_text = session.sql(\"select * from cheese where body = '' AND subject = ''\")\n",
    "#end::direct_sql[]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop bad data\n",
    "#tag::drop_bad_fields[]\n",
    "initial_posts_count = initial_posts.count()\n",
    "initial_posts_cleaned = initial_posts.na.drop(how='any', subset=['body', 'from'])\n",
    "initial_posts_cleaned_count = initial_posts_cleaned.count()\n",
    "#end::drop_bad_fields[]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "initial_posts.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Start with computing the labels\n",
    "# Find the initial posts where no one replied\n",
    "posts_with_replies = (initial_posts.join(\n",
    "        ids_in_reply,\n",
    "        col(\"ids_in_reply.In-Reply-To\") == col(\"initial_posts.Message-Id\"),\n",
    "        \"left_outer\")\n",
    "       .filter(col(\"ids_in_reply.In-Reply-To\").isNotNull())).cache()\n",
    "posts_with_replies.count()\n",
    "post_ids_with_replies = (posts_with_replies\n",
    "                            .select(col(\"initial_posts.Message-Id\").alias(\"id\"))\n",
    "                            .withColumn(\"has_reply\", lit(1.0))).alias(\"post_with_replies\")\n",
    "\n",
    "joined_posts = initial_posts.join(\n",
    "    post_ids_with_replies,\n",
    "    col(\"initial_posts.Message-Id\") == col(\"post_with_replies.id\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "joined_posts.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "posts_with_labels = joined_posts.na.fill({\"has_reply\": 0.0}).cache()\n",
    "posts_with_labels.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_links(body):\n",
    "    import re\n",
    "    link_regex_str = r'(http(|s)://(.*?))([\\s\\n]|$)'\n",
    "    itr = re.finditer(link_regex_str, body, re.MULTILINE)\n",
    "    return list(map(lambda elem: elem.group(1), itr))\n",
    "\n",
    "def extract_domains(links):\n",
    "    from urllib.parse import urlparse\n",
    "    def extract_domain(link):\n",
    "        try:\n",
    "            nloc = urlparse(link).netloc\n",
    "            # We want to drop www and any extra spaces wtf nloc on the spaces.\n",
    "            regex_str = r'^(www\\.|)(.*?)\\s*$'\n",
    "            match = re.search(regex_str, nloc)\n",
    "            return match.group(2)\n",
    "        except:\n",
    "            return None\n",
    "    return list(map(extract_domain, links))\n",
    "\n",
    "def contains_python_stack_trace(body):\n",
    "    return \"Traceback (most recent call last)\" in body\n",
    "\n",
    "\n",
    "\n",
    "def contains_probably_java_stack_trace(body):\n",
    "    # Look for something based on regex\n",
    "    # Tried https://stackoverflow.com/questions/20609134/regular-expression-optional-multiline-java-stacktrace - more msg looking\n",
    "    # Tried https://stackoverflow.com/questions/3814327/regular-expression-to-parse-a-log-file-and-find-stacktraces\n",
    "    # Yes the compile is per call, but it's cached so w/e\n",
    "    import re\n",
    "    stack_regex_str = r'^\\s*(.+Exception.*):\\n(.*\\n){0,3}?(\\s+at\\s+.*\\(.*\\))+'\n",
    "    match = re.search(stack_regex_str, body, re.MULTILINE)\n",
    "    return match is not None\n",
    "\n",
    "\n",
    "def contains_exception_in_task(body):\n",
    "    # Look for a line along the lines of ERROR Executor: Exception in task \n",
    "    return \"ERROR Executor: Exception in task\" in body\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "extract_links_udf = UserDefinedFunction(\n",
    "    extract_links, ArrayType(StringType()), \"extract_links\")\n",
    "\n",
    "session.catalog._jsparkSession.udf().registerPython(\n",
    "    \"extract_links\",\n",
    "    extract_links_udf._judf)\n",
    "\n",
    "\n",
    "extract_domains_udf = UserDefinedFunction(\n",
    "    extract_domains, ArrayType(StringType()), \"extract_domains\")\n",
    "\n",
    "session.catalog._jsparkSession.udf().registerPython(\n",
    "    \"extract_domains\",\n",
    "    extract_domains_udf._judf)\n",
    "\n",
    "\n",
    "contains_python_stack_trace_udf = UserDefinedFunction(\n",
    "    contains_python_stack_trace, BooleanType(), \"contains_python_stack_trace\")\n",
    "\n",
    "session.catalog._jsparkSession.udf().registerPython(\n",
    "    \"contains_python_stack_trace\",\n",
    "    contains_python_stack_trace_udf._judf)\n",
    "\n",
    "\n",
    "contains_probably_java_stack_trace_udf = UserDefinedFunction(\n",
    "    contains_probably_java_stack_trace, BooleanType(), \"contains_probably_java_stack_trace\")\n",
    "\n",
    "session.catalog._jsparkSession.udf().registerPython(\n",
    "    \"contains_probably_java_stack_trace\",\n",
    "    contains_probably_java_stack_trace_udf._judf)\n",
    "\n",
    "\n",
    "contains_exception_in_task_udf = UserDefinedFunction(\n",
    "    contains_exception_in_task, BooleanType(), \"contains_exception_in_task\")\n",
    "\n",
    "session.catalog._jsparkSession.udf().registerPython(\n",
    "    \"contains_exception_in_task\",\n",
    "    contains_exception_in_task_udf._judf)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We could make this a transformer stage, but I'm lazy so we'll just use a UDF directly."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "annotated_spark_mailing_list_data = posts_with_labels.select(\n",
    "    \"*\",\n",
    "    extract_links_udf(posts_with_labels[\"body\"]).alias(\"links_in_email\"),\n",
    "    contains_python_stack_trace_udf(posts_with_labels.body).alias(\"contains_python_stack_trace\").cast(\"double\"),\n",
    "    contains_probably_java_stack_trace_udf(posts_with_labels.body).alias(\"contains_java_stack_trace\").cast(\"double\"),\n",
    "    contains_exception_in_task_udf(posts_with_labels.body).alias(\"contains_exception_in_task\").cast(\"double\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "annotated_spark_mailing_list_data.cache()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "annotated_spark_mailing_list_data.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "further_annotated = annotated_spark_mailing_list_data.withColumn(\n",
    "    \"domain_links\",\n",
    "    extract_domains_udf(annotated_spark_mailing_list_data.links_in_email))\n",
    "# Long story, allow mixed UDF types\n",
    "further_annotated.cache()\n",
    "further_annotated.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#tag::make_features[]\n",
    "tokenizer = Tokenizer(inputCol=\"body\", outputCol=\"body_tokens\")\n",
    "body_hashing = HashingTF(\n",
    "    inputCol=\"body_tokens\", outputCol=\"raw_body_features\",\n",
    "    numFeatures=10000)\n",
    "body_idf = IDF(\n",
    "    inputCol=\"raw_body_features\", outputCol=\"body_features\")\n",
    "body_word2Vec = Word2Vec(\n",
    "    vectorSize=5, minCount=0, numPartitions=10,\n",
    "    inputCol=\"body_tokens\", outputCol=\"body_vecs\")\n",
    "assembler = VectorAssembler(\n",
    "    inputCols=[\n",
    "        \"body_features\", \"body_vecs\", \"contains_python_stack_trace\",\n",
    "        \"contains_java_stack_trace\", \"contains_exception_in_task\"],\n",
    "    outputCol=\"features\")\n",
    "#end::make_features[]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "featureprep_pipeline = Pipeline(\n",
    "    stages=[tokenizer, body_hashing, body_idf, body_word2Vec, assembler])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "featureprep_pipeline_transformer = featureprep_pipeline.fit(further_annotated)\n",
    "preped_data = featureprep_pipeline_transformer.transform(further_annotated)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "featureprep_pipeline_transformer.write().save(fs_prefix+\"/feature_prep-2\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "preped_data.write.format(\"parquet\").mode(\"overwrite\").save(fs_prefix+\"/prepared_data\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}


================================================
FILE: feature-prep/spark/SparkMailingListFeaturePrep.py
================================================
#!/usr/bin/env python
# coding: utf-8

# In[ ]:

# Yes we need both these imports
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, lit, isnull
from pyspark.sql.types import *
from pyspark.sql.types import StructField, StructType
from pyspark.sql.catalog import UserDefinedFunction
from pyspark.ml.feature import *
from pyspark.ml.pipeline import Pipeline
import os

# In[ ]:

# In[ ]:

fs_prefix = "s3a://kf-book-examples/mailing-lists"  # Create with mc as in ch1

# In[ ]:

os.environ["PYSPARK_PYTHON"] = "python3.6"
# See https://medium.com/@szinck/setting-up-pyspark-jupyter-and-minio-on-kubeflow-kubernetes-aab98874794f
session = (
    SparkSession.builder.appName("processMailingListData").config(
        "spark.executor.instances",
        "8").config("spark.driver.memoryOverhead",
                    "0.25").config("spark.executor.memory", "10g").config(
                        "spark.dynamicAllocation.enabled", "false").
    config("spark.ui.enabled", "true").config(
        "spark.kubernetes.container.image",
        "gcr.io/boos-demo-projects-are-rad/kubeflow/spark-worker/spark-py-36:v3.0.0-preview2-23"
    ).config("spark.driver.bindAddress",
             "0.0.0.0").config("spark.kubernetes.namespace",
                               "kubeflow-programmerboo").
    config("spark.master", "k8s://https://kubernetes.default").config(
        "spark.driver.host",
        "spark-driver.kubeflow-programmerboo.svc.cluster.local").config(
            "spark.kubernetes.executor.annotation.sidecar.istio.io/inject",
            "false").config("spark.driver.port",
                            "39235").config("spark.blockManager.port", "39236")
    # If using minio - see https://github.com/minio/cookbook/blob/master/docs/apache-spark-with-minio.md
    .config("spark.hadoop.fs.s3a.endpoint",
            "minio-service.kubeflow.svc.cluster.local:9000").config(
                "fs.s3a.connection.ssl.enabled",
                "false").config("fs.s3a.path.style.access", "true")
    # You can also add an account using the minio command as described in chapter 1
    .config("spark.hadoop.fs.s3a.access.key",
            "minio").config("spark.hadoop.fs.s3a.secret.key",
                            "minio123")).getOrCreate()
sc = session.sparkContext

# In[ ]:

#Load data from the previous stage
#tag::load_data[]
initial_posts = session.read.format("parquet").load(fs_prefix +
                                                    "/initial_posts")
ids_in_reply = session.read.format("parquet").load(fs_prefix + "/ids_in_reply")
#end::load_data[]

# In[ ]:

# Load data from the previous stage while checking the schema
#tag::load_with_schema[]
ids_schema = StructType([
    StructField("In-Reply-To", StringType(), nullable=True),
    StructField("message-id", StringType(), nullable=True)
])
ids_in_reply = session.read.format("parquet").schema(ids_schema).load(
    fs_prefix + "/ids_in_reply")
#end::load_with_schema[]

# In[ ]:

# Cache the data
initial_posts = initial_posts.alias("initial_posts").cache()
ids_in_reply = ids_in_reply.alias("ids_in_reply").cache()

# In[ ]:

# We can write random SQL -- although we need to wait for preview 3 cause it was taken out in preview1
#tag::direct_sql[]
#ids_in_reply.registerTempTable("cheese")
#no_text = session.sql("select * from cheese where body = '' AND subject = ''")
#end::direct_sql[]

# In[ ]:

# Drop bad data
#tag::drop_bad_fields[]
initial_posts_count = initial_posts.count()
initial_posts_cleaned = initial_posts.na.drop(how='any',
                                              subset=['body', 'from'])
initial_posts_cleaned_count = initial_posts_cleaned.count()
#end::drop_bad_fields[]

# In[ ]:

initial_posts.show()

# In[ ]:

# Start with computing the labels
# Find the initial posts where no one replied
posts_with_replies = (initial_posts.join(
    ids_in_reply,
    col("ids_in_reply.In-Reply-To") == col("initial_posts.Message-Id"),
    "left_outer").filter(col("ids_in_reply.In-Reply-To").isNotNull())).cache()
posts_with_replies.count()
post_ids_with_replies = (posts_with_replies.select(
    col("initial_posts.Message-Id").alias("id")).withColumn(
        "has_reply", lit(1.0))).alias("post_with_replies")

joined_posts = initial_posts.join(
    post_ids_with_replies,
    col("initial_posts.Message-Id") == col("post_with_replies.id"))

# In[ ]:

joined_posts.show()

# In[ ]:

posts_with_labels = joined_posts.na.fill({"has_reply": 0.0}).cache()
posts_with_labels.count()

# In[ ]:


def extract_links(body):
    import re
    link_regex_str = r'(http(|s)://(.*?))([\s\n]|$)'
    itr = re.finditer(link_regex_str, body, re.MULTILINE)
    return list(map(lambda elem: elem.group(1), itr))


def extract_domains(links):
    from urllib.parse import urlparse

    def extract_domain(link):
        try:
            nloc = urlparse(link).netloc
            # We want to drop www and any extra spaces wtf nloc on the spaces.
            regex_str = r'^(www\.|)(.*?)\s*$'
            match = re.search(regex_str, nloc)
            return match.group(2)
        except:
            return None

    return list(map(extract_domain, links))


def contains_python_stack_trace(body):
    return "Traceback (most recent call last)" in body


def contains_probably_java_stack_trace(body):
    # Look for something based on regex
    # Tried https://stackoverflow.com/questions/20609134/regular-expression-optional-multiline-java-stacktrace - more msg looking
    # Tried https://stackoverflow.com/questions/3814327/regular-expression-to-parse-a-log-file-and-find-stacktraces
    # Yes the compile is per call, but it's cached so w/e
    import re
    stack_regex_str = r'^\s*(.+Exception.*):\n(.*\n){0,3}?(\s+at\s+.*\(.*\))+'
    match = re.search(stack_regex_str, body, re.MULTILINE)
    return match is not None


def contains_exception_in_task(body):
    # Look for a line along the lines of ERROR Executor: Exception in task
    return "ERROR Executor: Exception in task" in body


# In[ ]:

extract_links_udf = UserDefinedFunction(extract_links, ArrayType(StringType()),
                                        "extract_links")

session.catalog._jsparkSession.udf().registerPython("extract_links",
                                                    extract_links_udf._judf)

extract_domains_udf = UserDefinedFunction(extract_domains,
                                          ArrayType(StringType()),
                                          "extract_domains")

session.catalog._jsparkSession.udf().registerPython("extract_domains",
                                                    extract_domains_udf._judf)

contains_python_stack_trace_udf = UserDefinedFunction(
    contains_python_stack_trace, BooleanType(), "contains_python_stack_trace")

session.catalog._jsparkSession.udf().registerPython(
    "contains_python_stack_trace", contains_python_stack_trace_udf._judf)

contains_probably_java_stack_trace_udf = UserDefinedFunction(
    contains_probably_java_stack_trace, BooleanType(),
    "contains_probably_java_stack_trace")

session.catalog._jsparkSession.udf().registerPython(
    "contains_probably_java_stack_trace",
    contains_probably_java_stack_trace_udf._judf)

contains_exception_in_task_udf = UserDefinedFunction(
    contains_exception_in_task, BooleanType(), "contains_exception_in_task")

session.catalog._jsparkSession.udf().registerPython(
    "contains_exception_in_task", contains_exception_in_task_udf._judf)

# We could make this a transformer stage, but I'm lazy so we'll just use a UDF directly.

# In[ ]:

annotated_spark_mailing_list_data = posts_with_labels.select(
    "*",
    extract_links_udf(posts_with_labels["body"]).alias("links_in_email"),
    contains_python_stack_trace_udf(posts_with_labels.body).alias(
        "contains_python_stack_trace").cast("double"),
    contains_probably_java_stack_trace_udf(posts_with_labels.body).alias(
        "contains_java_stack_trace").cast("double"),
    contains_exception_in_task_udf(posts_with_labels.body).alias(
        "contains_exception_in_task").cast("double"))

# In[ ]:

annotated_spark_mailing_list_data.cache()

# In[ ]:

annotated_spark_mailing_list_data.show()

# In[ ]:

further_annotated = annotated_spark_mailing_list_data.withColumn(
    "domain_links",
    extract_domains_udf(annotated_spark_mailing_list_data.links_in_email))
# Long story, allow mixed UDF types
further_annotated.cache()
further_annotated.count()

# In[ ]:

#tag::make_features[]
tokenizer = Tokenizer(inputCol="body", outputCol="body_tokens")
body_hashing = HashingTF(inputCol="body_tokens",
                         outputCol="raw_body_features",
                         numFeatures=10000)
body_idf = IDF(inputCol="raw_body_features", outputCol="body_features")
body_word2Vec = Word2Vec(vectorSize=5,
                         minCount=0,
                         numPartitions=10,
                         inputCol="body_tokens",
                         outputCol="body_vecs")
assembler = VectorAssembler(inputCols=[
    "body_features", "body_vecs", "contains_python_stack_trace",
    "contains_java_stack_trace", "contains_exception_in_task"
],
                            outputCol="features")
#end::make_features[]

# In[ ]:

featureprep_pipeline = Pipeline(
    stages=[tokenizer, body_hashing, body_idf, body_word2Vec, assembler])

# In[ ]:

featureprep_pipeline_transformer = featureprep_pipeline.fit(further_annotated)
preped_data = featureprep_pipeline_transformer.transform(further_annotated)

# In[ ]:

featureprep_pipeline_transformer.write().save(fs_prefix + "/feature_prep-2")

# In[ ]:

preped_data.write.format("parquet").mode("overwrite").save(fs_prefix +
                                                           "/prepared_data")


================================================
FILE: feature-prep/tft/requirements.txt
================================================
tfx
tensorflow
apache-beam


================================================
FILE: feature-prep/tft/transform.py
================================================
#tag::imports[]
import tensorflow as tf
import tensorflow_transform as tft
from tensorflow_transform.tf_metadata import schema_utils
#end::imports[]

#tag::entry_point[]


def preprocessing_fn(inputs):
    #end::entry_point[]
    #tag::logic[]
    outputs = {}
    # TFT business logic goes here
    outputs["body_stuff"] = tft.compute_and_apply_vocabulary(inputs["body"],
                                                             top_k=1000)
    return outputs


#end::logic[]


================================================
FILE: gcp-setup/cloudshell_scrip.sh
================================================
#!/bin/bash
# Note: this only works inside of cloudshell!
#tag::cloudshell_script[]
G_SOURCES="https://source.developers.google.com/p"
cloudshell_open \
  --repo_url "$G_SOURCES/$PROJECTID/r/$PROJECTID-$DEPLOYMENTNAME-config"\
  --dir"v$KUBEFLOWVERSION/kubeflow/kf_util" \
  --page "editor" \
  --tutorial "conn.md"
#end::cloudshell_script[]


================================================
FILE: gcp-setup/setup-gcp.sh
================================================
#!/bin/bash
#tag::ubuntu[]
apt-get install google-cloud-sdk
#end::ubuntu[]
apt-get remove google-cloud-sdk
#tag::general[]
curl https://sdk.cloud.google.com | bash
#end::general[]
#tag::enable_container_apis[]
gcloud services enable container.googleapis.com
#end::enable_container_apis[]
PROJECT_ID="boos-demo-projects-are-rad"
#tag::configure_cloud_sdk[]
gcloud auth login # Launches a web browser to login with
gcloud config set project "$PROJECT_ID" #Project ID is your Google project ID
#end::configure_cloud_sdk[]
ZONE="us-central1-a" # For TPU access
CLUSTER_NAME="ci-cluster"
#tag::launch_cluster[]
gcloud beta container clusters create $CLUSTER_NAME \
       --zone $ZONE \
       --machine-type "n1-standard-4" \
       --disk-type "pd-standard" \
       --disk-size "100" \
       --scopes "https://www.googleapis.com/auth/cloud-platform" \
       --addons HorizontalPodAutoscaling,HttpLoadBalancing \
       --enable-autoupgrade \
       --enable-autorepair \
       --enable-autoscaling --min-nodes 1 --max-nodes 10 --num-nodes 2
#end::launch_cluster[]
#tag::delete_cluster[]
gcloud beta container clusters delete $CLUSTER_NAME --zone $ZONE
#end::delete_cluster[]


================================================
FILE: kfctl_gcp_iap.v1.0.1.yaml
================================================
apiVersion: kfdef.apps.kubeflow.org/v1
kind: KfDef
metadata:
  namespace: kubeflow
spec:
  applications:
  - kustomizeConfig:
      parameters:
      - name: namespace
        value: istio-system
      repoRef:
        name: manifests
        path: istio/istio-crds
    name: istio-crds
  - kustomizeConfig:
      parameters:
      - name: namespace
        value: istio-system
      repoRef:
        name: manifests
        path: istio/istio-install
    name: istio-install
  - kustomizeConfig:
      parameters:
      - name: namespace
        value: istio-system
      repoRef:
        name: manifests
        path: istio/cluster-local-gateway
    name: cluster-local-gateway
  - kustomizeConfig:
      parameters:
      - name: namespace
        value: istio-system
      repoRef:
        name: manifests
        path: istio/kfserving-gateway
    name: kfserving-gateway
  - kustomizeConfig:
      parameters:
      - name: clusterRbacConfig
        value: 'ON'
      repoRef:
        name: manifests
        path: istio/istio
    name: istio
  - kustomizeConfig:
      repoRef:
        name: manifests
        path: application/application-crds
    name: application-crds
  - kustomizeConfig:
      overlays:
      - application
      repoRef:
        name: manifests
        path: application/application
    name: application
  - kustomizeConfig:
      parameters:
      - name: namespace
        value: cert-manager
      repoRef:
        name: manifests
        path: cert-manager/cert-manager-crds
    name: cert-manager-crds
  - kustomizeConfig:
      parameters:
      - name: namespace
        value: kube-system
      repoRef:
        name: manifests
        path: cert-manager/cert-manager-kube-system-resources
    name: cert-manager-kube-system-resources
  - kustomizeConfig:
      overlays:
      - self-signed
      - application
      parameters:
      - name: namespace
        value: cert-manager
      repoRef:
        name: manifests
        path: cert-manager/cert-manager
    name: cert-manager
  - kustomizeConfig:
      repoRef:
        name: manifests
        path: kubeflow-roles
    name: kubeflow-roles
  - kustomizeConfig:
      repoRef:
        name: manifests
        path: metacontroller
    name: metacontroller
  - kustomizeConfig:
      overlays:
      - istio
      - application
      repoRef:
        name: manifests
        path: argo
    name: argo
  - kustomizeConfig:
      overlays:
      - istio
      - application
      parameters:
      - name: userid-header
        value: X-Goog-Authenticated-User-Email
      - name: userid-prefix
        value: 'accounts.google.com:'
      repoRef:
        name: manifests
        path: common/centraldashboard
    name: centraldashboard
  - kustomizeConfig:
      overlays:
      - application
      repoRef:
        name: manifests
        path: admission-webhook/webhook
    name: webhook
  - kustomizeConfig:
      overlays:
      - application
      parameters:
      - name: webhookNamePrefix
        value: admission-webhook-
      repoRef:
        name: manifests
        path: admission-webhook/bootstrap
    name: bootstrap
  - kustomizeConfig:
      overlays:
      - istio
      - application
      parameters:
      - name: userid-header
        value: X-Goog-Authenticated-User-Email
      - name: userid-prefix
        value: 'accounts.google.com:'
      repoRef:
        name: manifests
        path: jupyter/jupyter-web-app
    name: jupyter-web-app
  - kustomizeConfig:
      overlays:
      - application
      repoRef:
        name: manifests
        path: spark/spark-operator
    name: spark-operator
  - kustomizeConfig:
      overlays:
      - istio
      - application
      - db
      repoRef:
        name: manifests
        path: metadata
    name: metadata
  - kustomizeConfig:
      overlays:
      - istio
      - application
      parameters:
      - name: injectGcpCredentials
        value: 'true'
      repoRef:
        name: manifests
        path: jupyter/notebook-controller
    name: notebook-controller
  - kustomizeConfig:
      overlays:
      - application
      repoRef:
        name: manifests
        path: pytorch-job/pytorch-job-crds
    name: pytorch-job-crds
  - kustomizeConfig:
      overlays:
      - application
      repoRef:
        name: manifests
        path: pytorch-job/pytorch-operator
    name: pytorch-operator
  - kustomizeConfig:
      overlays:
      - application
      parameters:
      - name: namespace
        value: knative-serving
      repoRef:
        name: manifests
        path: knative/knative-serving-crds
    name: knative-crds
  - kustomizeConfig:
      overlays:
      - application
      parameters:
      - name: namespace
        value: knative-serving
      repoRef:
        name: manifests
        path: knative/knative-serving-install
    name: knative-install
  - kustomizeConfig:
      overlays:
      - application
      repoRef:
        name: manifests
        path: kfserving/kfserving-crds
    name: kfserving-crds
  - kustomizeConfig:
      overlays:
      - application
      repoRef:
        name: manifests
        path: kfserving/kfserving-install
    name: kfserving-install
  - kustomizeConfig:
      overlays:
      - application
      parameters:
      - name: usageId
        value: '7439583937720421527'
      - name: reportUsage
        value: 'true'
      repoRef:
        name: manifests
        path: common/spartakus
    name: spartakus
  - kustomizeConfig:
      overlays:
      - istio
      repoRef:
        name: manifests
        path: tensorboard
    name: tensorboard
  - kustomizeConfig:
      overlays:
      - application
      repoRef:
        name: manifests
        path: tf-training/tf-job-crds
    name: tf-job-crds
  - kustomizeConfig:
      overlays:
      - application
      repoRef:
        name: manifests
        path: tf-training/tf-job-operator
    name: tf-job-operator
  - kustomizeConfig:
      overlays:
      - application
      repoRef:
        name: manifests
        path: katib/katib-crds
    name: katib-crds
  - kustomizeConfig:
      overlays:
      - application
      - istio
      repoRef:
        name: manifests
        path: katib/katib-controller
    name: katib-controller
  - kustomizeConfig:
      overlays:
      - application
      repoRef:
        name: manifests
        path: pipeline/api-service
    name: api-service
  - kustomizeConfig:
      overlays:
      - minioPd
      - application
      parameters:
      - name: minioPd
        value: test1-storage-artifact-store
      - name: minioPvName
        value: minio-pv
      - name: minioPvcName
        value: minio-pv-claim
      repoRef:
        name: manifests
        path: pipeline/minio
    name: minio
  - kustomizeConfig:
      overlays:
      - mysqlPd
      - application
      parameters:
      - name: mysqlPd
        value: test1-storage-metadata-store
      - name: mysqlPvName
        value: mysql-pv
      - name: mysqlPvcName
        value: mysql-pv-claim
      repoRef:
        name: manifests
        path: pipeline/mysql
    name: mysql
  - kustomizeConfig:
      overlays:
      - application
      repoRef:
        name: manifests
        path: pipeline/persistent-agent
    name: persistent-agent
  - kustomizeConfig:
      overlays:
      - application
      repoRef:
        name: manifests
        path: pipeline/pipelines-runner
    name: pipelines-runner
  - kustomizeConfig:
      overlays:
      - gcp
      - istio
      - application
      repoRef:
        name: manifests
        path: pipeline/pipelines-ui
    name: pipelines-ui
  - kustomizeConfig:
      overlays:
      - application
      repoRef:
        name: manifests
        path: pipeline/pipelines-viewer
    name: pipelines-viewer
  - kustomizeConfig:
      overlays:
      - application
      repoRef:
        name: manifests
        path: pipeline/scheduledworkflow
    name: scheduledworkflow
  - kustomizeConfig:
      overlays:
      - application
      repoRef:
        name: manifests
        path: pipeline/pipeline-visualization-service
    name: pipeline-visualization-service
  - kustomizeConfig:
      overlays:
      - application
      repoRef:
        name: manifests
        path: gcp/cloud-endpoints
    name: cloud-endpoints
  - kustomizeConfig:
      overlays:
      - application
      - istio
      parameters:
      - name: admin
      - name: userid-header
        value: X-Goog-Authenticated-User-Email
      - name: userid-prefix
        value: 'accounts.google.com:'
      repoRef:
        name: manifests
        path: profiles
    name: profiles
  - kustomizeConfig:
      overlays:
      - application
      repoRef:
        name: manifests
        path: gcp/gpu-driver
    name: gpu-driver
  - kustomizeConfig:
      overlays:
      - managed-cert
      - application
      parameters:
      - name: namespace
        value: istio-system
      - name: ipName
        value: test1-ip
      - name: hostname
      repoRef:
        name: manifests
        path: gcp/iap-ingress
    name: iap-ingress
  - kustomizeConfig:
      overlays:
      - application
      repoRef:
        name: manifests
        path: seldon/seldon-core-operator
    name: seldon-core-operator
  - kustomizeConfig:
      parameters:
      - name: user
      - name: profile-name
        value: anonymous
      repoRef:
        name: manifests
        path: default-install
    name: default-install
  plugins:
  - kind: KfGcpPlugin
    metadata:
      creationTimestamp: null
      name: gcp
    spec:
      createPipelinePersistentStorage: true
      deploymentManagerConfig:
        repoRef:
          name: manifests
          path: gcp/deployment_manager_configs
      enableWorkloadIdentity: true
      skipInitProject: true
      useBasicAuth: false
  repos:
  - name: manifests
    uri: https://github.com/holdenk/manifests/archive/fix-spark-crd.tar.gz
  version: v1.0.1


================================================
FILE: pipelines/ControlStructures.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Simple Control structure\n",
    "\n",
    "Shows how to use conditional execution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already up-to-date: kfp in ./.local/lib/python3.6/site-packages (0.2.2.1)\n",
      "Requirement already satisfied, skipping upgrade: jsonschema>=3.0.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (3.2.0)\n",
      "Requirement already satisfied, skipping upgrade: google-cloud-storage>=1.13.0 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.25.0)\n",
      "Requirement already satisfied, skipping upgrade: python-dateutil in /usr/local/lib/python3.6/dist-packages (from kfp) (2.8.1)\n",
      "Requirement already satisfied, skipping upgrade: google-auth>=1.6.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.11.0)\n",
      "Requirement already satisfied, skipping upgrade: argo-models==2.2.1a in ./.local/lib/python3.6/site-packages (from kfp) (2.2.1a0)\n",
      "Requirement already satisfied, skipping upgrade: urllib3<1.25,>=1.15 in ./.local/lib/python3.6/site-packages (from kfp) (1.24.3)\n",
      "Requirement already satisfied, skipping upgrade: cloudpickle==1.1.1 in ./.local/lib/python3.6/site-packages (from kfp) (1.1.1)\n",
      "Requirement already satisfied, skipping upgrade: kubernetes<=10.0.0,>=8.0.0 in ./.local/lib/python3.6/site-packages (from kfp) (10.0.0)\n",
      "Requirement already satisfied, skipping upgrade: click==7.0 in ./.local/lib/python3.6/site-packages (from kfp) (7.0)\n",
      "Requirement already satisfied, skipping upgrade: certifi in /usr/local/lib/python3.6/dist-packages (from kfp) (2019.11.28)\n",
      "Requirement already satisfied, skipping upgrade: PyYAML in /usr/local/lib/python3.6/dist-packages (from kfp) (5.3)\n",
      "Requirement already satisfied, skipping upgrade: kfp-server-api<=0.1.40,>=0.1.18 in ./.local/lib/python3.6/site-packages (from kfp) (0.1.40)\n",
      "Requirement already satisfied, skipping upgrade: six>=1.10 in /usr/lib/python3/dist-packages (from kfp) (1.11.0)\n",
      "Requirement already satisfied, skipping upgrade: PyJWT>=1.6.4 in ./.local/lib/python3.6/site-packages (from kfp) (1.7.1)\n",
      "Requirement already satisfied, skipping upgrade: Deprecated in ./.local/lib/python3.6/site-packages (from kfp) (1.2.7)\n",
      "Requirement already satisfied, skipping upgrade: requests-toolbelt>=0.8.0 in ./.local/lib/python3.6/site-packages (from kfp) (0.9.1)\n",
      "Requirement already satisfied, skipping upgrade: cryptography>=2.4.2 in ./.local/lib/python3.6/site-packages (from kfp) (2.8)\n",
      "Requirement already satisfied, skipping upgrade: tabulate==0.8.3 in ./.local/lib/python3.6/site-packages (from kfp) (0.8.3)\n",
      "Requirement already satisfied, skipping upgrade: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (1.4.0)\n",
      "Requirement already satisfied, skipping upgrade: attrs>=17.4.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (19.3.0)\n",
      "Requirement already satisfied, skipping upgrade: pyrsistent>=0.14.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (0.15.7)\n",
      "Requirement already satisfied, skipping upgrade: setuptools in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (45.1.0)\n",
      "Requirement already satisfied, skipping upgrade: google-resumable-media<0.6dev,>=0.5.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (0.5.0)\n",
      "Requirement already satisfied, skipping upgrade: google-cloud-core<2.0dev,>=1.2.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (1.3.0)\n",
      "Requirement already satisfied, skipping upgrade: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0.0)\n",
      "Requirement already satisfied, skipping upgrade: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (0.2.8)\n",
      "Requirement already satisfied, skipping upgrade: rsa<4.1,>=3.1.4 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0)\n",
      "Requirement already satisfied, skipping upgrade: requests-oauthlib in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (1.3.0)\n",
      "Requirement already satisfied, skipping upgrade: requests in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (2.22.0)\n",
      "Requirement already satisfied, skipping upgrade: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (0.57.0)\n",
      "Requirement already satisfied, skipping upgrade: wrapt<2,>=1.10 in /usr/local/lib/python3.6/dist-packages (from Deprecated->kfp) (1.11.2)\n",
      "Requirement already satisfied, skipping upgrade: cffi!=1.11.3,>=1.8 in ./.local/lib/python3.6/site-packages (from cryptography>=2.4.2->kfp) (1.14.0)\n",
      "Requirement already satisfied, skipping upgrade: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata; python_version < \"3.8\"->jsonschema>=3.0.1->kfp) (2.1.0)\n",
      "Requirement already satisfied, skipping upgrade: google-api-core<2.0.0dev,>=1.16.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.16.0)\n",
      "Requirement already satisfied, skipping upgrade: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.6/dist-packages (from pyasn1-modules>=0.2.1->google-auth>=1.6.1->kfp) (0.4.8)\n",
      "Requirement already satisfied, skipping upgrade: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib->kubernetes<=10.0.0,>=8.0.0->kfp) (3.1.0)\n",
      "Requirement already satisfied, skipping upgrade: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->kubernetes<=10.0.0,>=8.0.0->kfp) (3.0.4)\n",
      "Requirement already satisfied, skipping upgrade: idna<2.9,>=2.5 in /usr/lib/python3/dist-packages (from requests->kubernetes<=10.0.0,>=8.0.0->kfp) (2.6)\n",
      "Requirement already satisfied, skipping upgrade: pycparser in ./.local/lib/python3.6/site-packages (from cffi!=1.11.3,>=1.8->cryptography>=2.4.2->kfp) (2.19)\n",
      "Requirement already satisfied, skipping upgrade: protobuf>=3.4.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (3.11.2)\n",
      "Requirement already satisfied, skipping upgrade: pytz in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (2019.3)\n",
      "Requirement already satisfied, skipping upgrade: googleapis-common-protos<2.0dev,>=1.6.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.51.0)\n"
     ]
    }
   ],
   "source": [
    "!pip install kfp --upgrade --user\n",
    "\n",
    "import kfp\n",
    "from kfp import dsl\n",
    "from kfp.components import func_to_container_op, InputPath, OutputPath"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "@func_to_container_op\n",
    "def get_random_int_op(minimum: int, maximum: int) -> int:\n",
    "    \"\"\"Generate a random number between minimum and maximum (inclusive).\"\"\"\n",
    "    import random\n",
    "    result = random.randint(minimum, maximum)\n",
    "    print(result)\n",
    "    return result\n",
    "\n",
    "@func_to_container_op\n",
    "def process_small_op(data: int):\n",
    "    \"\"\"Process small numbers.\"\"\"\n",
    "    print(\"Processing small result\", data)\n",
    "    return\n",
    "\n",
    "@func_to_container_op\n",
    "def process_medium_op(data: int):\n",
    "    \"\"\"Process medium numbers.\"\"\"\n",
    "    print(\"Processing medium result\", data)\n",
    "    return\n",
    "\n",
    "@func_to_container_op\n",
    "def process_large_op(data: int):\n",
    "    \"\"\"Process large numbers.\"\"\"\n",
    "    print(\"Processing large result\", data)\n",
    "    return"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Conditional pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "@dsl.pipeline(\n",
    "    name='Conditional execution pipeline',\n",
    "    description='Shows how to use dsl.Condition().'\n",
    ")\n",
    "def conditional_pipeline():\n",
    "    number = get_random_int_op(0, 100).output\n",
    "    with dsl.Condition(number < 10):\n",
    "        process_small_op(number)\n",
    "    with dsl.Condition(number > 10 and number < 50):\n",
    "        process_medium_op(number)\n",
    "    with dsl.Condition(number > 50):\n",
    "        process_large_op(number)\n",
    "        "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Submit the pipeline for execution:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "Experiment link <a href=\"/pipeline/#/experiments/details/2abe16d1-fa2e-4f49-a3a5-acad8d36790d\" target=\"_blank\" >here</a>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "Run link <a href=\"/pipeline/#/runs/details/293a92c5-50b2-4a96-bbd4-ebc85106f337\" target=\"_blank\" >here</a>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "RunPipelineResult(run_id=293a92c5-50b2-4a96-bbd4-ebc85106f337)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "kfp.Client().create_run_from_pipeline_func(conditional_pipeline, arguments={})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}


================================================
FILE: pipelines/Lightweight Pipeline.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already up-to-date: kfp in ./.local/lib/python3.6/site-packages (0.2.2.1)\n",
      "Requirement already satisfied, skipping upgrade: PyYAML in /usr/local/lib/python3.6/dist-packages (from kfp) (5.3)\n",
      "Requirement already satisfied, skipping upgrade: requests-toolbelt>=0.8.0 in ./.local/lib/python3.6/site-packages (from kfp) (0.9.1)\n",
      "Requirement already satisfied, skipping upgrade: click==7.0 in ./.local/lib/python3.6/site-packages (from kfp) (7.0)\n",
      "Requirement already satisfied, skipping upgrade: Deprecated in ./.local/lib/python3.6/site-packages (from kfp) (1.2.7)\n",
      "Requirement already satisfied, skipping upgrade: kubernetes<=10.0.0,>=8.0.0 in ./.local/lib/python3.6/site-packages (from kfp) (10.0.0)\n",
      "Requirement already satisfied, skipping upgrade: six>=1.10 in /usr/lib/python3/dist-packages (from kfp) (1.11.0)\n",
      "Requirement already satisfied, skipping upgrade: certifi in /usr/local/lib/python3.6/dist-packages (from kfp) (2019.11.28)\n",
      "Requirement already satisfied, skipping upgrade: google-cloud-storage>=1.13.0 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.25.0)\n",
      "Requirement already satisfied, skipping upgrade: PyJWT>=1.6.4 in ./.local/lib/python3.6/site-packages (from kfp) (1.7.1)\n",
      "Requirement already satisfied, skipping upgrade: cryptography>=2.4.2 in ./.local/lib/python3.6/site-packages (from kfp) (2.8)\n",
      "Requirement already satisfied, skipping upgrade: google-auth>=1.6.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.11.0)\n",
      "Requirement already satisfied, skipping upgrade: cloudpickle==1.1.1 in ./.local/lib/python3.6/site-packages (from kfp) (1.1.1)\n",
      "Requirement already satisfied, skipping upgrade: kfp-server-api<=0.1.40,>=0.1.18 in ./.local/lib/python3.6/site-packages (from kfp) (0.1.40)\n",
      "Requirement already satisfied, skipping upgrade: argo-models==2.2.1a in ./.local/lib/python3.6/site-packages (from kfp) (2.2.1a0)\n",
      "Requirement already satisfied, skipping upgrade: tabulate==0.8.3 in ./.local/lib/python3.6/site-packages (from kfp) (0.8.3)\n",
      "Requirement already satisfied, skipping upgrade: python-dateutil in /usr/local/lib/python3.6/dist-packages (from kfp) (2.8.1)\n",
      "Requirement already satisfied, skipping upgrade: urllib3<1.25,>=1.15 in ./.local/lib/python3.6/site-packages (from kfp) (1.24.3)\n",
      "Requirement already satisfied, skipping upgrade: jsonschema>=3.0.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (3.2.0)\n",
      "Requirement already satisfied, skipping upgrade: requests<3.0.0,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from requests-toolbelt>=0.8.0->kfp) (2.22.0)\n",
      "Requirement already satisfied, skipping upgrade: wrapt<2,>=1.10 in /usr/local/lib/python3.6/dist-packages (from Deprecated->kfp) (1.11.2)\n",
      "Requirement already satisfied, skipping upgrade: setuptools>=21.0.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (45.1.0)\n",
      "Requirement already satisfied, skipping upgrade: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (0.57.0)\n",
      "Requirement already satisfied, skipping upgrade: requests-oauthlib in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (1.3.0)\n",
      "Requirement already satisfied, skipping upgrade: google-cloud-core<2.0dev,>=1.2.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (1.3.0)\n",
      "Requirement already satisfied, skipping upgrade: google-resumable-media<0.6dev,>=0.5.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (0.5.0)\n",
      "Requirement already satisfied, skipping upgrade: cffi!=1.11.3,>=1.8 in ./.local/lib/python3.6/site-packages (from cryptography>=2.4.2->kfp) (1.14.0)\n",
      "Requirement already satisfied, skipping upgrade: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0.0)\n",
      "Requirement already satisfied, skipping upgrade: rsa<4.1,>=3.1.4 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0)\n",
      "Requirement already satisfied, skipping upgrade: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (0.2.8)\n",
      "Requirement already satisfied, skipping upgrade: attrs>=17.4.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (19.3.0)\n",
      "Requirement already satisfied, skipping upgrade: pyrsistent>=0.14.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (0.15.7)\n",
      "Requirement already satisfied, skipping upgrade: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (1.4.0)\n",
      "Requirement already satisfied, skipping upgrade: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.0.1->requests-toolbelt>=0.8.0->kfp) (3.0.4)\n",
      "Requirement already satisfied, skipping upgrade: idna<2.9,>=2.5 in /usr/lib/python3/dist-packages (from requests<3.0.0,>=2.0.1->requests-toolbelt>=0.8.0->kfp) (2.6)\n",
      "Requirement already satisfied, skipping upgrade: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib->kubernetes<=10.0.0,>=8.0.0->kfp) (3.1.0)\n",
      "Requirement already satisfied, skipping upgrade: google-api-core<2.0.0dev,>=1.16.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.16.0)\n",
      "Requirement already satisfied, skipping upgrade: pycparser in ./.local/lib/python3.6/site-packages (from cffi!=1.11.3,>=1.8->cryptography>=2.4.2->kfp) (2.19)\n",
      "Requirement already satisfied, skipping upgrade: pyasn1>=0.1.3 in /usr/local/lib/python3.6/dist-packages (from rsa<4.1,>=3.1.4->google-auth>=1.6.1->kfp) (0.4.8)\n",
      "Requirement already satisfied, skipping upgrade: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata; python_version < \"3.8\"->jsonschema>=3.0.1->kfp) (2.1.0)\n",
      "Requirement already satisfied, skipping upgrade: googleapis-common-protos<2.0dev,>=1.6.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.51.0)\n",
      "Requirement already satisfied, skipping upgrade: pytz in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (2019.3)\n",
      "Requirement already satisfied, skipping upgrade: protobuf>=3.4.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (3.11.2)\n"
     ]
    }
   ],
   "source": [
    "!pip install kfp --upgrade --user\n",
    "\n",
    "import kfp \n",
    "from kfp import compiler\n",
    "import kfp.dsl as dsl\n",
    "import kfp.notebook\n",
    "import kfp.components as comp\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Simple function that just add two numbers:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Define a Python function\n",
    "def add(a: float, b: float) -> float:\n",
    "   '''Calculates sum of two arguments'''\n",
    "   return a + b"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Convert the function to a pipeline operation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "add_op = comp.func_to_container_op(add)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A bit more advanced function which demonstrates how to use imports, helper functions and produce multiple outputs."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from typing import NamedTuple\n",
    "def my_divmod(dividend: float, divisor:float) -> NamedTuple('MyDivmodOutput', [('quotient', float), ('remainder', float)]):\n",
    "    '''Divides two numbers and calculate  the quotient and remainder'''\n",
    "    #Imports inside a component function:\n",
    "    import numpy as np\n",
    "\n",
    "    #This function demonstrates how to use nested functions inside a component function:\n",
    "    def divmod_helper(dividend, divisor):\n",
    "        return np.divmod(dividend, divisor)\n",
    "\n",
    "    (quotient, remainder) = divmod_helper(dividend, divisor)\n",
    "\n",
    "    from collections import namedtuple\n",
    "    divmod_output = namedtuple('MyDivmodOutput', ['quotient', 'remainder'])\n",
    "    return divmod_output(quotient, remainder)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Test running the python function directly"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "MyDivmodOutput(quotient=14, remainder=2)"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "my_divmod(100, 7)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Convert the function to a pipeline operation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "divmod_op = comp.func_to_container_op(my_divmod, base_image='tensorflow/tensorflow:1.14.0-py3')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Define the pipeline\n",
    "Pipeline function has to be decorated with the @dsl.pipeline decorator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "@dsl.pipeline(\n",
    "   name='Calculation pipeline',\n",
    "   description='A toy pipeline that performs arithmetic calculations.'\n",
    ")\n",
    "def calc_pipeline(\n",
    "   a='a',\n",
    "   b='7',\n",
    "   c='17',\n",
    "):\n",
    "    #Passing pipeline parameter and a constant value as operation arguments\n",
    "    add_task = add_op(a, 4) #Returns a dsl.ContainerOp class instance. \n",
    "    \n",
    "    #Passing a task output reference as operation arguments\n",
    "    #For an operation with a single return value, the output reference can be accessed using `task.output` or `task.outputs['output_name']` syntax\n",
    "    divmod_task = divmod_op(add_task.output, b)\n",
    "\n",
    "    #For an operation with a multiple return values, the output references can be accessed using `task.outputs['output_name']` syntax\n",
    "    result_task = add_op(divmod_task.outputs['quotient'], c)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Submit the pipeline for execution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "Experiment link <a href=\"/pipeline/#/experiments/details/2abe16d1-fa2e-4f49-a3a5-acad8d36790d\" target=\"_blank\" >here</a>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "Run link <a href=\"/pipeline/#/runs/details/87276776-0c3a-4d4e-99d0-4563b7f42fa5\" target=\"_blank\" >here</a>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "RunPipelineResult(run_id=87276776-0c3a-4d4e-99d0-4563b7f42fa5)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "client = kfp.Client()\n",
    "\n",
    "#Specify pipeline argument values\n",
    "arguments = {'a': '7', 'b': '8'}\n",
    "\n",
    "#Submit a pipeline run\n",
    "client.create_run_from_pipeline_func(calc_pipeline, arguments=arguments)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}


================================================
FILE: pipelines/RecommenderPipeline.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Kubeflow pipeline\n",
    "This is a fairly simple pipeline, containing sequential steps:\n",
    "\n",
    "1. Update data - implemented by lightbend/recommender-data-update-publisher:0.2 image\n",
    "2. Run model training. Ideally we would run TFJob, but due to the current limitations for pipelines, we will directly use an image implementing training lightbend/ml-tf-recommender:0.1\n",
    "3. Update serving model - implemented by lightbend/recommender-model-publisher:0.2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already up-to-date: kubernetes in ./.local/lib/python3.6/site-packages (10.0.1)\n",
      "Requirement already satisfied, skipping upgrade: pyyaml>=3.12 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (5.3)\n",
      "Requirement already satisfied, skipping upgrade: six>=1.9.0 in /usr/lib/python3/dist-packages (from kubernetes) (1.11.0)\n",
      "Requirement already satisfied, skipping upgrade: urllib3>=1.24.2 in ./.local/lib/python3.6/site-packages (from kubernetes) (1.24.3)\n",
      "Requirement already satisfied, skipping upgrade: requests-oauthlib in /usr/local/lib/python3.6/dist-packages (from kubernetes) (1.3.0)\n",
      "Requirement already satisfied, skipping upgrade: certifi>=14.05.14 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (2019.11.28)\n",
      "Requirement already satisfied, skipping upgrade: python-dateutil>=2.5.3 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (2.8.1)\n",
      "Requirement already satisfied, skipping upgrade: setuptools>=21.0.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (45.1.0)\n",
      "Requirement already satisfied, skipping upgrade: requests in /usr/local/lib/python3.6/dist-packages (from kubernetes) (2.22.0)\n",
      "Requirement already satisfied, skipping upgrade: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (0.57.0)\n",
      "Requirement already satisfied, skipping upgrade: google-auth>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (1.11.0)\n",
      "Requirement already satisfied, skipping upgrade: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib->kubernetes) (3.1.0)\n",
      "Requirement already satisfied, skipping upgrade: idna<2.9,>=2.5 in /usr/lib/python3/dist-packages (from requests->kubernetes) (2.6)\n",
      "Requirement already satisfied, skipping upgrade: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->kubernetes) (3.0.4)\n",
      "Requirement already satisfied, skipping upgrade: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.0.1->kubernetes) (0.2.8)\n",
      "Requirement already satisfied, skipping upgrade: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.0.1->kubernetes) (4.0.0)\n",
      "Requirement already satisfied, skipping upgrade: rsa<4.1,>=3.1.4 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.0.1->kubernetes) (4.0)\n",
      "Requirement already satisfied, skipping upgrade: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.6/dist-packages (from pyasn1-modules>=0.2.1->google-auth>=1.0.1->kubernetes) (0.4.8)\n",
      "Requirement already up-to-date: kfp in ./.local/lib/python3.6/site-packages (0.2.2.1)\n",
      "Requirement already satisfied, skipping upgrade: PyJWT>=1.6.4 in ./.local/lib/python3.6/site-packages (from kfp) (1.7.1)\n",
      "Requirement already satisfied, skipping upgrade: requests-toolbelt>=0.8.0 in ./.local/lib/python3.6/site-packages (from kfp) (0.9.1)\n",
      "Requirement already satisfied, skipping upgrade: python-dateutil in /usr/local/lib/python3.6/dist-packages (from kfp) (2.8.1)\n",
      "Requirement already satisfied, skipping upgrade: PyYAML in /usr/local/lib/python3.6/dist-packages (from kfp) (5.3)\n",
      "Requirement already satisfied, skipping upgrade: kfp-server-api<=0.1.40,>=0.1.18 in ./.local/lib/python3.6/site-packages (from kfp) (0.1.40)\n",
      "Requirement already satisfied, skipping upgrade: google-cloud-storage>=1.13.0 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.25.0)\n",
      "Requirement already satisfied, skipping upgrade: jsonschema>=3.0.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (3.2.0)\n",
      "Requirement already satisfied, skipping upgrade: Deprecated in ./.local/lib/python3.6/site-packages (from kfp) (1.2.7)\n",
      "Requirement already satisfied, skipping upgrade: google-auth>=1.6.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.11.0)\n",
      "Collecting kubernetes<=10.0.0,>=8.0.0\n",
      "  Using cached kubernetes-10.0.0-py2.py3-none-any.whl (1.5 MB)\n",
      "Requirement already satisfied, skipping upgrade: argo-models==2.2.1a in ./.local/lib/python3.6/site-packages (from kfp) (2.2.1a0)\n",
      "Requirement already satisfied, skipping upgrade: urllib3<1.25,>=1.15 in ./.local/lib/python3.6/site-packages (from kfp) (1.24.3)\n",
      "Requirement already satisfied, skipping upgrade: certifi in /usr/local/lib/python3.6/dist-packages (from kfp) (2019.11.28)\n",
      "Requirement already satisfied, skipping upgrade: tabulate==0.8.3 in ./.local/lib/python3.6/site-packages (from kfp) (0.8.3)\n",
      "Requirement already satisfied, skipping upgrade: click==7.0 in ./.local/lib/python3.6/site-packages (from kfp) (7.0)\n",
      "Requirement already satisfied, skipping upgrade: cloudpickle==1.1.1 in ./.local/lib/python3.6/site-packages (from kfp) (1.1.1)\n",
      "Requirement already satisfied, skipping upgrade: six>=1.10 in /usr/lib/python3/dist-packages (from kfp) (1.11.0)\n",
      "Requirement already satisfied, skipping upgrade: cryptography>=2.4.2 in ./.local/lib/python3.6/site-packages (from kfp) (2.8)\n",
      "Requirement already satisfied, skipping upgrade: requests<3.0.0,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from requests-toolbelt>=0.8.0->kfp) (2.22.0)\n",
      "Requirement already satisfied, skipping upgrade: google-resumable-media<0.6dev,>=0.5.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (0.5.0)\n",
      "Requirement already satisfied, skipping upgrade: google-cloud-core<2.0dev,>=1.2.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (1.3.0)\n",
      "Requirement already satisfied, skipping upgrade: setuptools in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (45.1.0)\n",
      "Requirement already satisfied, skipping upgrade: attrs>=17.4.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (19.3.0)\n",
      "Requirement already satisfied, skipping upgrade: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (1.4.0)\n",
      "Requirement already satisfied, skipping upgrade: pyrsistent>=0.14.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (0.15.7)\n",
      "Requirement already satisfied, skipping upgrade: wrapt<2,>=1.10 in /usr/local/lib/python3.6/dist-packages (from Deprecated->kfp) (1.11.2)\n",
      "Requirement already satisfied, skipping upgrade: rsa<4.1,>=3.1.4 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0)\n",
      "Requirement already satisfied, skipping upgrade: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0.0)\n",
      "Requirement already satisfied, skipping upgrade: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (0.2.8)\n",
      "Requirement already satisfied, skipping upgrade: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (0.57.0)\n",
      "Requirement already satisfied, skipping upgrade: requests-oauthlib in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (1.3.0)\n",
      "Requirement already satisfied, skipping upgrade: cffi!=1.11.3,>=1.8 in ./.local/lib/python3.6/site-packages (from cryptography>=2.4.2->kfp) (1.14.0)\n",
      "Requirement already satisfied, skipping upgrade: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.0.1->requests-toolbelt>=0.8.0->kfp) (3.0.4)\n",
      "Requirement already satisfied, skipping upgrade: idna<2.9,>=2.5 in /usr/lib/python3/dist-packages (from requests<3.0.0,>=2.0.1->requests-toolbelt>=0.8.0->kfp) (2.6)\n",
      "Requirement already satisfied, skipping upgrade: google-api-core<2.0.0dev,>=1.16.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.16.0)\n",
      "Requirement already satisfied, skipping upgrade: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata; python_version < \"3.8\"->jsonschema>=3.0.1->kfp) (2.1.0)\n",
      "Requirement already satisfied, skipping upgrade: pyasn1>=0.1.3 in /usr/local/lib/python3.6/dist-packages (from rsa<4.1,>=3.1.4->google-auth>=1.6.1->kfp) (0.4.8)\n",
      "Requirement already satisfied, skipping upgrade: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib->kubernetes<=10.0.0,>=8.0.0->kfp) (3.1.0)\n",
      "Requirement already satisfied, skipping upgrade: pycparser in ./.local/lib/python3.6/site-packages (from cffi!=1.11.3,>=1.8->cryptography>=2.4.2->kfp) (2.19)\n",
      "Requirement already satisfied, skipping upgrade: googleapis-common-protos<2.0dev,>=1.6.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.51.0)\n",
      "Requirement already satisfied, skipping upgrade: protobuf>=3.4.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (3.11.2)\n",
      "Requirement already satisfied, skipping upgrade: pytz in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (2019.3)\n",
      "Installing collected packages: kubernetes\n",
      "  Attempting uninstall: kubernetes\n",
      "    Found existing installation: kubernetes 10.0.1\n",
      "    Uninstalling kubernetes-10.0.1:\n",
      "      Successfully uninstalled kubernetes-10.0.1\n",
      "Successfully installed kubernetes-10.0.0\n"
     ]
    }
   ],
   "source": [
    "!pip install kubernetes --upgrade --user\n",
    "!pip install kfp --upgrade --user\n",
    "\n",
    "\n",
    "import kfp  # the Pipelines SDK.  This library is included with the notebook image.\n",
    "from kfp import compiler\n",
    "import kfp.dsl as dsl\n",
    "import kfp.notebook\n",
    "from kubernetes import client as k8s_client"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Create/Get an Experiment in the Kubeflow Pipeline System\n",
    "The Kubeflow Pipeline system requires an \"Experiment\" to group pipeline runs. You can create a new experiment, or call client.list_experiments() to get existing ones."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "client = kfp.Client()\n",
    "client.list_experiments()\n",
    "#exp = client.create_experiment(name='mdupdate')\n",
    "exp = client.get_experiment(experiment_name ='mdupdate')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Define a Pipeline\n",
    "Authoring a pipeline is like authoring a normal Python function. The pipeline function describes the topology of the pipeline.\n",
    "\n",
    "Each step in the pipeline is typically a ContainerOp --- a simple class or function describing how to interact with a docker container image. In the pipeline, all the container images referenced in the pipeline are already built."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "@dsl.pipeline(\n",
    "  name='Recommender model update',\n",
    "  description='Demonstrate usage of pipelines for multi-step model update'\n",
    ")\n",
    "def recommender_pipeline():\n",
    "    # Load new data\n",
    "  data = dsl.ContainerOp(\n",
    "      name='updatedata',\n",
    "      image='lightbend/recommender-data-update-publisher:0.2') \\\n",
    "    .add_env_variable(k8s_client.V1EnvVar(name='MINIO_URL',value='http://minio-service.kubeflow.svc.cluster.local:9000')) \\\n",
    "    .add_env_variable(k8s_client.V1EnvVar(name='MINIO_KEY', value='minio')) \\\n",
    "    .add_env_variable(k8s_client.V1EnvVar(name='MINIO_SECRET', value='minio123'))\n",
    "    # Train the model\n",
    "  train = dsl.ContainerOp(\n",
    "      name='trainmodel',\n",
    "      image='lightbend/ml-tf-recommender:0.1') \\\n",
    "    .add_env_variable(k8s_client.V1EnvVar(name='MINIO_URL',value='minio-service.kubeflow.svc.cluster.local:9000')) \\\n",
    "    .add_env_variable(k8s_client.V1EnvVar(name='MINIO_KEY', value='minio')) \\\n",
    "    .add_env_variable(k8s_client.V1EnvVar(name='MINIO_SECRET', value='minio123'))\n",
    "  train.after(data)\n",
    "    # Publish new model model\n",
    "  publish = dsl.ContainerOp(\n",
    "      name='publishmodel',\n",
    "      image='lightbend/recommender-model-publisher:0.2') \\\n",
    "    .add_env_variable(k8s_client.V1EnvVar(name='MINIO_URL',value='http://minio-service.kubeflow.svc.cluster.local:9000')) \\\n",
    "    .add_env_variable(k8s_client.V1EnvVar(name='MINIO_KEY', value='minio')) \\\n",
    "    .add_env_variable(k8s_client.V1EnvVar(name='MINIO_SECRET', value='minio123')) \\\n",
    "    .add_env_variable(k8s_client.V1EnvVar(name='KAFKA_BROKERS', value='cloudflow-kafka-brokers.cloudflow.svc.cluster.local:9092')) \\\n",
    "    .add_env_variable(k8s_client.V1EnvVar(name='DEFAULT_RECOMMENDER_URL', value='http://recommendermodelserver.kubeflow.svc.cluster.local:8501')) \\\n",
    "    .add_env_variable(k8s_client.V1EnvVar(name='ALTERNATIVE_RECOMMENDER_URL', value='http://recommendermodelserver1.kubeflow.svc.cluster.local:8501'))\n",
    "  publish.after(train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Compile pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "compiler.Compiler().compile(recommender_pipeline, 'pipeline.tar.gz')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Submit an experiment run"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "Run link <a href=\"/pipeline/#/runs/details/df24284c-c7a1-480e-91b6-398bd352f164\" target=\"_blank\" >here</a>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "run = client.run_pipeline(exp.id, 'pipeline1', 'pipeline.tar.gz')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}


================================================
FILE: pipelines/download_components.sh
================================================
#!/bin/bash
#tag::dlPipelineRelease[]
wget https://github.com/kubeflow/pipelines/archive/0.2.5.tar.gz
tar -xvf 0.2.5.tar.gz
#end::dlPipelineRelease[]


================================================
FILE: recommender/Dockerfile
================================================
FROM  tensorflow/tensorflow:1.12.0-devel-py3
RUN pip3 install --upgrade pip
RUN pip3 install pandas --upgrade
RUN pip3 install keras --upgrade
RUN pip3 install minio --upgrade
RUN mkdir -p /opt/kubeflow
COPY Recommender_Kubeflow.py /opt/kubeflow/
ENTRYPOINT ["python3", "/opt/kubeflow/Recommender_Kubeflow.py"]


================================================
FILE: recommender/Recommender_Kubeflow.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# This is implementation of the Recommender training\n",
    "\n",
    "This implementation takes a list of users and their purchasing history to calculate prediction\n",
    "on the probability that they would by a certain product.\n",
    "The implementation is structured in 2 parts:\n",
    "1. Build rating matrix based on the purchasing history. The implementation is based on this blog post\n",
    "https://medium.com/datadriveninvestor/how-to-build-a-recommendation-system-for-purchase-data-step-by-step-d6d7a78800b6\n",
    "2. Build collabarative filtering model based on the rating matrix. The implementation is based on this project https://github.com/Piyushdharkar/Collaborative-Filtering-Using-Keras \n",
    "\n",
    "Implementation is leveraging Minio for storing both source data and result models\n",
    "\n",
    "It also uses Python kubernetes client for re starting model server pod\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 1. Install libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting pandas\n",
      "  Downloading pandas-1.0.1-cp36-cp36m-manylinux1_x86_64.whl (10.1 MB)\n",
      "\u001b[K     |████████████████████████████████| 10.1 MB 3.2 MB/s eta 0:00:01\n",
      "\u001b[?25hRequirement already satisfied, skipping upgrade: numpy>=1.13.3 in /usr/local/lib/python3.6/dist-packages (from pandas) (1.18.1)\n",
      "Requirement already satisfied, skipping upgrade: python-dateutil>=2.6.1 in /usr/local/lib/python3.6/dist-packages (from pandas) (2.8.1)\n",
      "Requirement already satisfied, skipping upgrade: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas) (2019.3)\n",
      "Requirement already satisfied, skipping upgrade: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.6.1->pandas) (1.11.0)\n",
      "Installing collected packages: pandas\n",
      "Successfully installed pandas-1.0.1\n",
      "Collecting keras\n",
      "  Downloading Keras-2.3.1-py2.py3-none-any.whl (377 kB)\n",
      "\u001b[K     |████████████████████████████████| 377 kB 3.2 MB/s eta 0:00:01\n",
      "\u001b[?25hRequirement already satisfied, skipping upgrade: h5py in /usr/local/lib/python3.6/dist-packages (from keras) (2.10.0)\n",
      "Requirement already satisfied, skipping upgrade: numpy>=1.9.1 in /usr/local/lib/python3.6/dist-packages (from keras) (1.18.1)\n",
      "Requirement already satisfied, skipping upgrade: keras-applications>=1.0.6 in /usr/local/lib/python3.6/dist-packages (from keras) (1.0.8)\n",
      "Requirement already satisfied, skipping upgrade: scipy>=0.14 in /usr/local/lib/python3.6/dist-packages (from keras) (1.4.1)\n",
      "Requirement already satisfied, skipping upgrade: keras-preprocessing>=1.0.5 in /usr/local/lib/python3.6/dist-packages (from keras) (1.1.0)\n",
      "Requirement already satisfied, skipping upgrade: six>=1.9.0 in /usr/lib/python3/dist-packages (from keras) (1.11.0)\n",
      "Requirement already satisfied, skipping upgrade: pyyaml in /usr/local/lib/python3.6/dist-packages (from keras) (5.3)\n",
      "Installing collected packages: keras\n",
      "Successfully installed keras-2.3.1\n",
      "Collecting minio\n",
      "  Downloading minio-5.0.7-py2.py3-none-any.whl (71 kB)\n",
      "\u001b[K     |████████████████████████████████| 71 kB 1.9 MB/s eta 0:00:011\n",
      "\u001b[?25hRequirement already satisfied, skipping upgrade: python-dateutil in /usr/local/lib/python3.6/dist-packages (from minio) (2.8.1)\n",
      "Requirement already satisfied, skipping upgrade: urllib3 in ./.local/lib/python3.6/site-packages (from minio) (1.24.3)\n",
      "Requirement already satisfied, skipping upgrade: pytz in /usr/local/lib/python3.6/dist-packages (from minio) (2019.3)\n",
      "Requirement already satisfied, skipping upgrade: certifi in /usr/local/lib/python3.6/dist-packages (from minio) (2019.11.28)\n",
      "Collecting configparser\n",
      "  Downloading configparser-4.0.2-py2.py3-none-any.whl (22 kB)\n",
      "Requirement already satisfied, skipping upgrade: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil->minio) (1.11.0)\n",
      "Installing collected packages: configparser, minio\n",
      "Successfully installed configparser-4.0.2 minio-5.0.7\n",
      "Collecting kubernetes\n",
      "  Downloading kubernetes-10.0.1-py2.py3-none-any.whl (1.5 MB)\n",
      "\u001b[K     |████████████████████████████████| 1.5 MB 3.4 MB/s eta 0:00:01\n",
      "\u001b[?25hRequirement already satisfied, skipping upgrade: certifi>=14.05.14 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (2019.11.28)\n",
      "Requirement already satisfied, skipping upgrade: python-dateutil>=2.5.3 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (2.8.1)\n",
      "Requirement already satisfied, skipping upgrade: pyyaml>=3.12 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (5.3)\n",
      "Requirement already satisfied, skipping upgrade: six>=1.9.0 in /usr/lib/python3/dist-packages (from kubernetes) (1.11.0)\n",
      "Requirement already satisfied, skipping upgrade: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (0.57.0)\n",
      "Requirement already satisfied, skipping upgrade: urllib3>=1.24.2 in ./.local/lib/python3.6/site-packages (from kubernetes) (1.24.3)\n",
      "Requirement already satisfied, skipping upgrade: setuptools>=21.0.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (45.1.0)\n",
      "Requirement already satisfied, skipping upgrade: requests in /usr/local/lib/python3.6/dist-packages (from kubernetes) (2.22.0)\n",
      "Requirement already satisfied, skipping upgrade: google-auth>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (1.11.0)\n",
      "Requirement already satisfied, skipping upgrade: requests-oauthlib in /usr/local/lib/python3.6/dist-packages (from kubernetes) (1.3.0)\n",
      "Requirement already satisfied, skipping upgrade: idna<2.9,>=2.5 in /usr/lib/python3/dist-packages (from requests->kubernetes) (2.6)\n",
      "Requirement already satisfied, skipping upgrade: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->kubernetes) (3.0.4)\n",
      "Requirement already satisfied, skipping upgrade: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.0.1->kubernetes) (0.2.8)\n",
      "Requirement already satisfied, skipping upgrade: rsa<4.1,>=3.1.4 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.0.1->kubernetes) (4.0)\n",
      "Requirement already satisfied, skipping upgrade: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.0.1->kubernetes) (4.0.0)\n",
      "Requirement already satisfied, skipping upgrade: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib->kubernetes) (3.1.0)\n",
      "Requirement already satisfied, skipping upgrade: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.6/dist-packages (from pyasn1-modules>=0.2.1->google-auth>=1.0.1->kubernetes) (0.4.8)\n",
      "\u001b[31mERROR: kfp 0.2.2.1 has requirement kubernetes<=10.0.0,>=8.0.0, but you'll have kubernetes 10.0.1 which is incompatible.\u001b[0m\n",
      "Installing collected packages: kubernetes\n",
      "  Attempting uninstall: kubernetes\n",
      "    Found existing installation: kubernetes 10.0.0\n",
      "    Uninstalling kubernetes-10.0.0:\n",
      "      Successfully uninstalled kubernetes-10.0.0\n",
      "Successfully installed kubernetes-10.0.1\n",
      "Collecting kfmd\n",
      "  Downloading kfmd-0.1.8.tar.gz (29 kB)\n",
      "Building wheels for collected packages: kfmd\n",
      "  Building wheel for kfmd (setup.py) ... \u001b[?25ldone\n",
      "\u001b[?25h  Created wheel for kfmd: filename=kfmd-0.1.8-py3-none-any.whl size=65919 sha256=c65ab8ff649134dbe6c8391743d5361546e5b29e6df9c0ff13915c99b67be1e7\n",
      "  Stored in directory: /home/jovyan/.cache/pip/wheels/54/6b/5c/f063f501d5c632c93566ed967f2f0c36bad3b384d68c83aa65\n",
      "Successfully built kfmd\n",
      "Installing collected packages: kfmd\n",
      "Successfully installed kfmd-0.1.8\n"
     ]
    }
   ],
   "source": [
    "!pip install pandas --upgrade --user\n",
    "!pip install keras --upgrade --user\n",
    "!pip install minio --upgrade --user\n",
    "!pip install kubernetes --upgrade --user\n",
    "!pip install kfmd --upgrade --user"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import time\n",
    "from minio import Minio\n",
    "from keras.models import Model\n",
    "from keras.layers import *\n",
    "from keras.losses import *\n",
    "import tensorflow as tf\n",
    "import os\n",
    "from kfmd import metadata\n",
    "from datetime import datetime\n",
    "from keras import backend as K\n",
    "from kubernetes import client as k8s_client, config as k8s_config\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create a workspace, run and execution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "execTime = datetime.utcnow().isoformat(\"T\")\n",
    "ws = metadata.Workspace(\n",
    "    # Connect to metadata-service in namesapce kubeflow in k8s cluster.\n",
    "    backend_url_prefix=\"metadata-service.kubeflow.svc.cluster.local:8080\",\n",
    "    name=\"recommender\",\n",
    "    description=\"a workspace for saving recommender experiments\")\n",
    "r = metadata.Run(\n",
    "    workspace=ws,\n",
    "    name=\"run-\" + execTime ,\n",
    "    description=\"recommender run\",\n",
    ")\n",
    "exec = metadata.Execution(\n",
    "    name = \"execution\" + execTime ,\n",
    "    workspace=ws,\n",
    "    run=r,\n",
    "    description=\"recommender ML execution\",\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 2. Read data\n",
    "\n",
    "For reading data we are using two diffierent approaches:\n",
    "1. We use Tensorflow build in support to write resulting model to Minio\n",
    "2. We use Minio APIs to read source data using Pandas. We could of use Boto APIs here instead."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Minio parameters : URL  minio-service.kubeflow.svc.cluster.local:9000  key  minio  secret  minio123\n"
     ]
    }
   ],
   "source": [
    "minio_endpoint = os.environ.get('MINIO_URL', 'minio-service.kubeflow.svc.cluster.local:9000')\n",
    "minio_key = os.environ.get('MINIO_KEY', 'minio')\n",
    "minio_secret = os.environ.get('MINIO_SECRET', 'minio123')\n",
    "\n",
    "print('Minio parameters : URL ', minio_endpoint, ' key ', minio_key, ' secret ', minio_secret)\n",
    "\n",
    "os.environ['AWS_ACCESS_KEY_ID'] = minio_key\n",
    "os.environ['AWS_SECRET_ACCESS_KEY'] = minio_secret\n",
    "os.environ['AWS_REGION'] = 'us-west-1'\n",
    "os.environ['S3_REGION'] = 'us-west-1'\n",
    "os.environ['S3_ENDPOINT'] = minio_endpoint\n",
    "os.environ['S3_USE_HTTPS'] = '0'\n",
    "os.environ['S3_VERIFY_SSL'] = '0'\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "minioClient = Minio(minio_endpoint,\n",
    "                    access_key=minio_key,\n",
    "                    secret_key=minio_secret,\n",
    "                    secure=False)\n",
    "\n",
    "minioClient.fget_object('data', 'recommender/users.csv', '/tmp/users.csv')\n",
    "customers = pd.read_csv('/tmp/users.csv')\n",
    "minioClient.fget_object('data', 'recommender/transactions.csv', '/tmp/transactions.csv')\n",
    "transactions = pd.read_csv('/tmp/transactions.csv')\n",
    "\n",
    "#Log experiment data set\n",
    "data_set = exec.log_input(\n",
    "        metadata.DataSet(\n",
    "            description=\"recommender current transactions and customers\",\n",
    "            name=\"Current transactions and customers\",\n",
    "            version=execTime,\n",
    "            uri=\"minio:/tmp/transactions.csv; minio:/tmp/users.csv\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(1000, 1)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>customerId</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1553</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>20400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>19750</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>6334</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>27773</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   customerId\n",
       "0        1553\n",
       "1       20400\n",
       "2       19750\n",
       "3        6334\n",
       "4       27773"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(customers.shape)\n",
    "customers.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(62483, 2)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>customerId</th>\n",
       "      <th>products</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>2|2|23|68|68|111|29|86|107|152</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>111|107|29|11|11|11|33|23</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>164|227</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>2|2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   customerId                        products\n",
       "0           0                              20\n",
       "1           1  2|2|23|68|68|111|29|86|107|152\n",
       "2           2       111|107|29|11|11|11|33|23\n",
       "3           3                         164|227\n",
       "4           5                             2|2"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "print(transactions.shape)\n",
    "transactions.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 3 Data preparation\n",
    "\n",
    "Our goal here is to break down each list of items in the products column into rows \n",
    "and count the number of products bought by a user"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>customerId</th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>20.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>23.0</td>\n",
       "      <td>68.0</td>\n",
       "      <td>68.0</td>\n",
       "      <td>111.0</td>\n",
       "      <td>29.0</td>\n",
       "      <td>86.0</td>\n",
       "      <td>107.0</td>\n",
       "      <td>152.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   customerId     0    1     2     3     4      5     6     7      8      9\n",
       "0           0  20.0  NaN   NaN   NaN   NaN    NaN   NaN   NaN    NaN    NaN\n",
       "1           1   2.0  2.0  23.0  68.0  68.0  111.0  29.0  86.0  107.0  152.0"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 1: split product items\n",
    "transactions['products'] = transactions['products'].apply(lambda x: [int(i) for i in x.split('|')])\n",
    "transactions.head(2).set_index('customerId')['products'].apply(pd.Series).reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>customerId</th>\n",
       "      <th>productId</th>\n",
       "      <th>purchase_count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>20.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>23.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>29.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>68.0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>1</td>\n",
       "      <td>86.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>1</td>\n",
       "      <td>107.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>1</td>\n",
       "      <td>111.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>1</td>\n",
       "      <td>152.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   customerId  productId  purchase_count\n",
       "0           0       20.0               1\n",
       "1           1        2.0               2\n",
       "2           1       23.0               1\n",
       "3           1       29.0               1\n",
       "4           1       68.0               2\n",
       "5           1       86.0               1\n",
       "6           1      107.0               1\n",
       "7           1      111.0               1\n",
       "8           1      152.0               1"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 2: organize a given table into a dataframe with customerId, single productId, and purchase count\n",
    "pd.melt(transactions.head(2).set_index('customerId')['products'].apply(pd.Series).reset_index(), \n",
    "             id_vars=['customerId'],\n",
    "             value_name='products') \\\n",
    "    .dropna().drop(['variable'], axis=1) \\\n",
    "    .groupby(['customerId', 'products']) \\\n",
    "    .agg({'products': 'count'}) \\\n",
    "    .rename(columns={'products': 'purchase_count'}) \\\n",
    "    .reset_index() \\\n",
    "    .rename(columns={'products': 'productId'})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3.1 Create data with user, item, and target field"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(133585, 3)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>customerId</th>\n",
       "      <th>productId</th>\n",
       "      <th>purchase_count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>13</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>19</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0</td>\n",
       "      <td>20</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0</td>\n",
       "      <td>31</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   customerId  productId  purchase_count\n",
       "0           0          1               2\n",
       "1           0         13               1\n",
       "2           0         19               3\n",
       "3           0         20               1\n",
       "4           0         31               2"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = pd.melt(transactions.set_index('customerId')['products'].apply(pd.Series).reset_index(), \n",
    "             id_vars=['customerId'],\n",
    "             value_name='products') \\\n",
    "    .dropna().drop(['variable'], axis=1) \\\n",
    "    .groupby(['customerId', 'products']) \\\n",
    "    .agg({'products': 'count'}) \\\n",
    "    .rename(columns={'products': 'purchase_count'}) \\\n",
    "    .reset_index() \\\n",
    "    .rename(columns={'products': 'productId'})\n",
    "data['productId'] = data['productId'].astype(np.int64)\n",
    "\n",
    "print(data.shape)\n",
    "data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3.2 Normalize item values across users"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>productId</th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>...</th>\n",
       "      <th>290</th>\n",
       "      <th>291</th>\n",
       "      <th>292</th>\n",
       "      <th>293</th>\n",
       "      <th>294</th>\n",
       "      <th>295</th>\n",
       "      <th>296</th>\n",
       "      <th>297</th>\n",
       "      <th>298</th>\n",
       "      <th>299</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>customerId</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>NaN</td>\n",
       "      <td>2.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>6.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 300 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "productId   0    1    2    3    4    5    6    7    8    9    ...  290  291  \\\n",
       "customerId                                                    ...             \n",
       "0           NaN  2.0  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN   \n",
       "1           NaN  NaN  6.0  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN   \n",
       "2           NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN   \n",
       "3           NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN   \n",
       "4           NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN   \n",
       "\n",
       "productId   292  293  294  295  296  297  298  299  \n",
       "customerId                                          \n",
       "0           NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  \n",
       "1           NaN  1.0  NaN  NaN  1.0  NaN  NaN  NaN  \n",
       "2           NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  \n",
       "3           NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  \n",
       "4           NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  \n",
       "\n",
       "[5 rows x 300 columns]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_matrix = pd.pivot_table(data, values='purchase_count', index='customerId', columns='productId')\n",
    "df_matrix.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(24429, 300)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th>productId</th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>...</th>\n",
       "      <th>290</th>\n",
       "      <th>291</th>\n",
       "      <th>292</th>\n",
       "      <th>293</th>\n",
       "      <th>294</th>\n",
       "      <th>295</th>\n",
       "      <th>296</th>\n",
       "      <th>297</th>\n",
       "      <th>298</th>\n",
       "      <th>299</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>customerId</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>NaN</td>\n",
       "      <td>0.1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.166667</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 300 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "productId   0    1         2    3    4    5    6    7    8    9    ...  290  \\\n",
       "customerId                                                         ...        \n",
       "0           NaN  0.1       NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN   \n",
       "1           NaN  NaN  0.166667  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN   \n",
       "2           NaN  NaN       NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN   \n",
       "3           NaN  NaN       NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN   \n",
       "4           NaN  NaN       NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN   \n",
       "\n",
       "productId   291  292  293  294  295  296  297  298  299  \n",
       "customerId                                               \n",
       "0           NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  \n",
       "1           NaN  NaN  0.0  NaN  NaN  0.0  NaN  NaN  NaN  \n",
       "2           NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  \n",
       "3           NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  \n",
       "4           NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  \n",
       "\n",
       "[5 rows x 300 columns]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_matrix_norm = (df_matrix-df_matrix.min())/(df_matrix.max()-df_matrix.min())\n",
    "print(df_matrix_norm.shape)\n",
    "df_matrix_norm.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(133585, 3)\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>customerId</th>\n",
       "      <th>productId</th>\n",
       "      <th>scaled_purchase_freq</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>9</td>\n",
       "      <td>0</td>\n",
       "      <td>0.133333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>25</td>\n",
       "      <td>0</td>\n",
       "      <td>0.133333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>33</td>\n",
       "      <td>0</td>\n",
       "      <td>0.133333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>36</td>\n",
       "      <td>0</td>\n",
       "      <td>0.133333</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>43</th>\n",
       "      <td>44</td>\n",
       "      <td>0</td>\n",
       "      <td>0.133333</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    customerId productId  scaled_purchase_freq\n",
       "9            9         0              0.133333\n",
       "25          25         0              0.133333\n",
       "32          33         0              0.133333\n",
       "35          36         0              0.133333\n",
       "43          44         0              0.133333"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# create a table for input to the modeling\n",
    "\n",
    "d = df_matrix_norm.reset_index()\n",
    "d.index.names = ['scaled_purchase_freq']\n",
    "data_norm = pd.melt(d, id_vars=['customerId'], value_name='scaled_purchase_freq').dropna()\n",
    "print(data_norm.shape)\n",
    "data_norm.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 4 Preparing data for learning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "28606\n",
      "300\n",
      "[    9    25    33 ... 26873 26998 28066]\n",
      "[  0   0   0 ... 299 299 299]\n",
      "[0.13333333 0.13333333 0.13333333 ... 0.         0.         0.        ]\n"
     ]
    }
   ],
   "source": [
    "customer_idxs = np.array(data_norm.customerId, dtype = np.int)\n",
    "product_idxs = np.array(data_norm.productId, dtype = np.int)\n",
    "\n",
    "ratings = np.array(data_norm.scaled_purchase_freq)\n",
    "\n",
    "n_customers = int(data_norm['customerId'].drop_duplicates().max()) + 1\n",
    "n_products = int(data_norm['productId'].drop_duplicates().max()) + 1\n",
    "n_factors = 50\n",
    "\n",
    "input_shape = (1,)\n",
    "\n",
    "print(n_customers)\n",
    "print(n_products)\n",
    "print(customer_idxs)\n",
    "print(product_idxs)\n",
    "print(ratings)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.1 Tensorflow Session"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create TF session and set it in Keras\n",
    "sess = tf.Session()\n",
    "K.set_session(sess)\n",
    "K.set_learning_phase(1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.2 Model Class"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "class DeepCollaborativeFiltering(Model):\n",
    "    def __init__(self, n_customers, n_products, n_factors, p_dropout = 0.2):\n",
    "        x1 = Input(shape = (1,), name=\"user\")\n",
    "\n",
    "        P = Embedding(n_customers, n_factors, input_length = 1)(x1)\n",
    "        P = Reshape((n_factors,))(P)\n",
    "\n",
    "        x2 = Input(shape = (1,), name=\"product\")\n",
    "\n",
    "        Q = Embedding(n_products, n_factors, input_length = 1)(x2)\n",
    "        Q = Reshape((n_factors,))(Q)\n",
    "\n",
    "        x = concatenate([P, Q], axis=1)\n",
    "        x = Dropout(p_dropout)(x)\n",
    "\n",
    "        x = Dense(n_factors)(x)\n",
    "        x = Activation('relu')(x)\n",
    "        x = Dropout(p_dropout)(x)\n",
    "\n",
    "        output = Dense(1)(x)       \n",
    "        \n",
    "        super(DeepCollaborativeFiltering, self).__init__([x1, x2], output)\n",
    "    \n",
    "    def rate(self, customer_idxs, product_idxs):\n",
    "        if (type(customer_idxs) == int and type(product_idxs) == int):\n",
    "            return self.predict([np.array(customer_idxs).reshape((1,)), np.array(product_idxs).reshape((1,))])\n",
    "        \n",
    "        if (type(customer_idxs) == str and type(product_idxs) == str):\n",
    "            return self.predict([np.array(customerMapping[customer_idxs]).reshape((1,)), np.array(productMapping[product_idxs]).reshape((1,))])\n",
    "        \n",
    "        return self.predict([\n",
    "            np.array([customerMapping[customer_idx] for customer_idx in customer_idxs]), \n",
    "            np.array([productMapping[product_idx] for product_idx in product_idxs])\n",
    "        ])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.3 Hyperparameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "bs = 64\n",
    "val_per = 0.25\n",
    "epochs = 3"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4.4 Model Definition"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/resource_variable_ops.py:1630: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "If using Keras pass *_constraint arguments to layers.\n",
      "Model: \"deepcollaborativefiltering_1\"\n",
      "__________________________________________________________________________________________________\n",
      "Layer (type)                    Output Shape         Param #     Connected to                     \n",
      "==================================================================================================\n",
      "user (InputLayer)               (None, 1)            0                                            \n",
      "__________________________________________________________________________________________________\n",
      "product (InputLayer)            (None, 1)            0                                            \n",
      "__________________________________________________________________________________________________\n",
      "embedding_1 (Embedding)         (None, 1, 50)        1430300     user[0][0]                       \n",
      "__________________________________________________________________________________________________\n",
      "embedding_2 (Embedding)         (None, 1, 50)        15000       product[0][0]                    \n",
      "__________________________________________________________________________________________________\n",
      "reshape_1 (Reshape)             (None, 50)           0           embedding_1[0][0]                \n",
      "__________________________________________________________________________________________________\n",
      "reshape_2 (Reshape)             (None, 50)           0           embedding_2[0][0]                \n",
      "__________________________________________________________________________________________________\n",
      "concatenate_1 (Concatenate)     (None, 100)          0           reshape_1[0][0]                  \n",
      "                                                                 reshape_2[0][0]                  \n",
      "__________________________________________________________________________________________________\n",
      "dropout_1 (Dropout)             (None, 100)          0           concatenate_1[0][0]              \n",
      "__________________________________________________________________________________________________\n",
      "dense_1 (Dense)                 (None, 50)           5050        dropout_1[0][0]                  \n",
      "__________________________________________________________________________________________________\n",
      "activation_1 (Activation)       (None, 50)           0           dense_1[0][0]                    \n",
      "__________________________________________________________________________________________________\n",
      "dropout_2 (Dropout)             (None, 50)           0           activation_1[0][0]               \n",
      "__________________________________________________________________________________________________\n",
      "dense_2 (Dense)                 (None, 1)            51          dropout_2[0][0]                  \n",
      "==================================================================================================\n",
      "Total params: 1,450,401\n",
      "Trainable params: 1,450,401\n",
      "Non-trainable params: 0\n",
      "__________________________________________________________________________________________________\n"
     ]
    }
   ],
   "source": [
    "model = DeepCollaborativeFiltering(n_customers, n_products, n_factors)\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 5 Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/math_grad.py:1424: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Use tf.where in 2.0, which has the same broadcast rule as np.where\n",
      "WARNING:tensorflow:From /home/jovyan/.local/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:422: The name tf.global_variables is deprecated. Please use tf.compat.v1.global_variables instead.\n",
      "\n",
      "Train on 100188 samples, validate on 33397 samples\n",
      "Epoch 1/3\n",
      "100188/100188 [==============================] - 14s 142us/step - loss: 0.0105 - val_loss: 0.0184\n",
      "Epoch 2/3\n",
      "100188/100188 [==============================] - 14s 137us/step - loss: 0.0091 - val_loss: 0.0187\n",
      "Epoch 3/3\n",
      "100188/100188 [==============================] - 14s 139us/step - loss: 0.0078 - val_loss: 0.0193\n",
      "Done training!\n"
     ]
    }
   ],
   "source": [
    "model.compile(optimizer = 'adam', loss = mean_squared_logarithmic_error)\n",
    "model.fit(x = [customer_idxs, product_idxs], y = ratings, batch_size = bs, epochs = epochs, validation_split = val_per)\n",
    "print('Done training!')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5.1 Log model and metrics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "logmodel = exec.log_output(\n",
    "    metadata.Model(\n",
    "            name=\"DeepCollaborativeFiltering\",\n",
    "            description=\"Model for product recommender\",\n",
    "            uri=\"\",\n",
    "            model_type=\"neural network\",\n",
    "            version=execTime,\n",
    "            training_framework={\n",
    "                \"name\": \"tensorflow\",\n",
    "                \"version\": \"v1.14\"\n",
    "            },\n",
    "            hyperparameters={\n",
    "                \"batch_size\" : 64,\n",
    "                \"validation_split\" : 0.25,\n",
    "                \"layers\": [n_customers, n_products, n_factors],\n",
    "                \"epochs\" : 3\n",
    "            }))\n",
    "metrics = exec.log_output(\n",
    "    metadata.Metrics(\n",
    "            name=\"Model for product recommender evaluation\",\n",
    "            description=\"Validating of the recommender model\",\n",
    "            uri=\"\",\n",
    "            version=execTime,\n",
    "            data_set_id=data_set.id,\n",
    "            model_id=logmodel.id))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 6 Get current output directory for model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Exporting trained model to s3://models/recommender/1/\n"
     ]
    }
   ],
   "source": [
    "directorystream = minioClient.get_object('data', 'recommender/directory.txt')\n",
    "directory = \"\"\n",
    "for d in directorystream.stream(32*1024):\n",
    "    directory += d.decode('utf-8')\n",
    "arg_version = \"1\"    \n",
    "export_path = 's3://models/' + directory + '/' + arg_version + '/'\n",
    "print ('Exporting trained model to', export_path)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6.1 Export models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From <ipython-input-22-58b1f5cc64c6>:2: build_tensor_info (from tensorflow.python.saved_model.utils_impl) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.utils.build_tensor_info or tf.compat.v1.saved_model.build_tensor_info.\n",
      "tensor_info_users user:0\n",
      "tensor_info_products product:0\n",
      "tensor_info_pred dense_2/BiasAdd:0\n"
     ]
    }
   ],
   "source": [
    "# inputs/outputs\n",
    "tensor_info_users = tf.saved_model.utils.build_tensor_info(model.input[0])\n",
    "tensor_info_products = tf.saved_model.utils.build_tensor_info(model.input[1])\n",
    "tensor_info_pred = tf.saved_model.utils.build_tensor_info(model.output)\n",
    "\n",
    "print (\"tensor_info_users\", tensor_info_users.name)\n",
    "print (\"tensor_info_products\", tensor_info_products.name)\n",
    "print (\"tensor_info_pred\", tensor_info_pred.name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From <ipython-input-23-da3077ef7d92>:14: calling SavedModelBuilder.add_meta_graph_and_variables (from tensorflow.python.saved_model.builder_impl) with legacy_init_op is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Pass your op to the equivalent parameter main_op instead.\n",
      "INFO:tensorflow:No assets to save.\n",
      "INFO:tensorflow:No assets to write.\n",
      "INFO:tensorflow:SavedModel written to: s3://models/recommender/1/saved_model.pb\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "b's3://models/recommender/1/saved_model.pb'"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# signature\n",
    "prediction_signature = (tf.saved_model.signature_def_utils.build_signature_def(\n",
    "        inputs={\"users\": tensor_info_users, \"products\": tensor_info_products},\n",
    "        outputs={\"predictions\": tensor_info_pred},\n",
    "        method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME))\n",
    "# export\n",
    "legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op')\n",
    "builder = tf.saved_model.builder.SavedModelBuilder(export_path)\n",
    "builder.add_meta_graph_and_variables(\n",
    "      sess, [tf.saved_model.tag_constants.SERVING],\n",
    "      signature_def_map={\n",
    "           tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: prediction_signature,\n",
    "      },\n",
    "      legacy_init_op=legacy_init_op)\n",
    "builder.save()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 7 Restarting of the model serving server\n",
    "\n",
    "In order for a new model to take effect it is also necessary to restart a model server.\n",
    "The issue here is that we are not changing the model version version and as a result, \n",
    "the model will not be updated. To ensure model update, we are here restarting a server -\n",
    "simply killing the running instance, and as a server is installed using deployment, the instance\n",
    "will be recreated. Additionally for pods operations to work correctly from the notebook,\n",
    "it is necessary to create permissions allowing for access to pods in another namespace. \n",
    "Look at the podaccessroles.yaml for details."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "pod prefix  recommendermodelserver-\n",
      "pod namespace  kubeflow\n"
     ]
    }
   ],
   "source": [
    "recommender = \"recommendermodelserver-\"\n",
    "if directory == \"recommender1\":\n",
    "    recommender = \"recommendermodelserver1-\"\n",
    "print(\"pod prefix \", recommender) \n",
    "\n",
    "namespace = \"kubeflow\"\n",
    "print(\"pod namespace \", namespace) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Current pod name  recommendermodelserver-6d5d5c654-snl99\n"
     ]
    }
   ],
   "source": [
    "# Get full pod name for the current model\n",
    "\n",
    "k8s_config.load_incluster_config()\n",
    "\n",
    "v1 = k8s_client.CoreV1Api()\n",
    "\n",
    "pod_list = v1.list_namespaced_pod(namespace)\n",
    "pod = [item.metadata.name for item in pod_list.items if recommender in item.metadata.name][0]\n",
    "print(\"Current pod name \", pod)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Done deleting\n"
     ]
    }
   ],
   "source": [
    "# Delete pod, so that it gets recreated\n",
    "v1.delete_namespaced_pod(pod, namespace, grace_period_seconds=0)\n",
    "\n",
    "print(\"Done deleting\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "New pod name  recommendermodelserver-6d5d5c654-xvxf7\n"
     ]
    }
   ],
   "source": [
    "# Verify that the new instance was created\n",
    "time.sleep(20)\n",
    "pod_list = v1.list_namespaced_pod(namespace)\n",
    "pod = [item.metadata.name for item in pod_list.items if recommender in item.metadata.name][0]\n",
    "print(\"New pod name \", pod)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}


================================================
FILE: recommender/Recommender_Kubeflow.py
================================================
#!/usr/bin/env python
# coding: utf-8

# # This is implementation of the Recommender training
#
# This implementation takes a list of users and their purchasing history to calculate prediction
# on the probability that they would by a certain product.
# The implementation is structured in 2 parts:
# 1. Build rating matrix based on the purchasing history. The implementation is based on this blog post
# https://medium.com/datadriveninvestor/how-to-build-a-recommendation-system-for-purchase-data-step-by-step-d6d7a78800b6
# 2. Build collabarative filtering model based on the rating matrix. The implementation is based on this project https://github.com/Piyushdharkar/Collaborative-Filtering-Using-Keras
#
# Implementation is leveraging Minio for storing both source data and result models
#
# It also uses Python kubernetes client for re starting model server pod
#

# # 1. Install libraries

# In[1]:

get_ipython().system('pip install pandas --upgrade --user')
get_ipython().system('pip install keras --upgrade --user')
get_ipython().system('pip install minio --upgrade --user')
get_ipython().system('pip install kubernetes --upgrade --user')
get_ipython().system('pip install kfmd --upgrade --user')

# ## imports

# In[2]:

import pandas as pd
import numpy as np
import time
from minio import Minio
from keras.models import Model
from keras.layers import *
from keras.losses import *
import tensorflow as tf
import os
from kfmd import metadata
from datetime import datetime
from keras import backend as K
from kubernetes import client as k8s_client, config as k8s_config

# Create a workspace, run and execution

# In[3]:

execTime = datetime.utcnow().isoformat("T")
ws = metadata.Workspace(
    # Connect to metadata-service in namesapce kubeflow in k8s cluster.
    backend_url_prefix="metadata-service.kubeflow.svc.cluster.local:8080",
    name="recommender",
    description="a workspace for saving recommender experiments")
r = metadata.Run(
    workspace=ws,
    name="run-" + execTime,
    description="recommender run",
)
exec = metadata.Execution(
    name="execution" + execTime,
    workspace=ws,
    run=r,
    description="recommender ML execution",
)

# # 2. Read data
#
# For reading data we are using two diffierent approaches:
# 1. We use Tensorflow build in support to write resulting model to Minio
# 2. We use Minio APIs to read source data using Pandas. We could of use Boto APIs here instead.

# In[4]:

minio_endpoint = os.environ.get(
    'MINIO_URL', 'minio-service.kubeflow.svc.cluster.local:9000')
minio_key = os.environ.get('MINIO_KEY', 'minio')
minio_secret = os.environ.get('MINIO_SECRET', 'minio123')

print('Minio parameters : URL ', minio_endpoint, ' key ', minio_key,
      ' secret ', minio_secret)

os.environ['AWS_ACCESS_KEY_ID'] = minio_key
os.environ['AWS_SECRET_ACCESS_KEY'] = minio_secret
os.environ['AWS_REGION'] = 'us-west-1'
os.environ['S3_REGION'] = 'us-west-1'
os.environ['S3_ENDPOINT'] = minio_endpoint
os.environ['S3_USE_HTTPS'] = '0'
os.environ['S3_VERIFY_SSL'] = '0'

# In[5]:

minioClient = Minio(minio_endpoint,
                    access_key=minio_key,
                    secret_key=minio_secret,
                    secure=False)

minioClient.fget_object('data', 'recommender/users.csv', '/tmp/users.csv')
customers = pd.read_csv('/tmp/users.csv')
minioClient.fget_object('data', 'recommender/transactions.csv',
                        '/tmp/transactions.csv')
transactions = pd.read_csv('/tmp/transactions.csv')

#Log experiment data set
data_set = exec.log_input(
    metadata.DataSet(
        description="recommender current transactions and customers",
        name="Current transactions and customers",
        version=execTime,
        uri="minio:/tmp/transactions.csv; minio:/tmp/users.csv"))

# In[6]:

print(customers.shape)
customers.head()

# In[7]:

print(transactions.shape)
transactions.head()

# # 3 Data preparation
#
# Our goal here is to break down each list of items in the products column into rows
# and count the number of products bought by a user

# In[8]:

# 1: split product items
transactions['products'] = transactions['products'].apply(
    lambda x: [int(i) for i in x.split('|')])
transactions.head(2).set_index('customerId')['products'].apply(
    pd.Series).reset_index()

# In[9]:

# 2: organize a given table into a dataframe with customerId, single productId, and purchase count
pd.melt(transactions.head(2).set_index('customerId')['products'].apply(pd.Series).reset_index(),
             id_vars=['customerId'],
             value_name='products') \
    .dropna().drop(['variable'], axis=1) \
    .groupby(['customerId', 'products']) \
    .agg({'products': 'count'}) \
    .rename(columns={'products': 'purchase_count'}) \
    .reset_index() \
    .rename(columns={'products': 'productId'})

# ## 3.1 Create data with user, item, and target field

# In[10]:


data = pd.melt(transactions.set_index('customerId')['products'].apply(pd.Series).reset_index(),
             id_vars=['customerId'],
             value_name='products') \
    .dropna().drop(['variable'], axis=1) \
    .groupby(['customerId', 'products']) \
    .agg({'products': 'count'}) \
    .rename(columns={'products': 'purchase_count'}) \
    .reset_index() \
    .rename(columns={'products': 'productId'})
data['productId'] = data['productId'].astype(np.int64)

print(data.shape)
data.head()

# ## 3.2 Normalize item values across users

# In[11]:

df_matrix = pd.pivot_table(data,
                           values='purchase_count',
                           index='customerId',
                           columns='productId')
df_matrix.head()

# In[12]:


df_matrix_norm = (df_matrix - df_matrix.min()) / \
                  (df_matrix.max() - df_matrix.min())
print(df_matrix_norm.shape)
df_matrix_norm.head()

# In[13]:

# create a table for input to the modeling

d = df_matrix_norm.reset_index()
d.index.names = ['scaled_purchase_freq']
data_norm = pd.melt(d,
                    id_vars=['customerId'],
                    value_name='scaled_purchase_freq').dropna()
print(data_norm.shape)
data_norm.head()

# # 4 Preparing data for learning

# In[14]:

customer_idxs = np.array(data_norm.customerId, dtype=np.int)
product_idxs = np.array(data_norm.productId, dtype=np.int)

ratings = np.array(data_norm.scaled_purchase_freq)

n_customers = int(data_norm['customerId'].drop_duplicates().max()) + 1
n_products = int(data_norm['productId'].drop_duplicates().max()) + 1
n_factors = 50

input_shape = (1, )

print(n_customers)
print(n_products)
print(customer_idxs)
print(product_idxs)
print(ratings)

# ## 4.1 Tensorflow Session

# In[15]:

# create TF session and set it in Keras
sess = tf.Session()
K.set_session(sess)
K.set_learning_phase(1)

# ## 4.2 Model Class

# In[16]:


class DeepCollaborativeFiltering(Model):
    def __init__(self, n_customers, n_products, n_factors, p_dropout=0.2):
        x1 = Input(shape=(1, ), name="user")

        P = Embedding(n_customers, n_factors, input_length=1)(x1)
        P = Reshape((n_factors, ))(P)

        x2 = Input(shape=(1, ), name="product")

        Q = Embedding(n_products, n_factors, input_length=1)(x2)
        Q = Reshape((n_factors, ))(Q)

        x = concatenate([P, Q], axis=1)
        x = Dropout(p_dropout)(x)

        x = Dense(n_factors)(x)
        x = Activation('relu')(x)
        x = Dropout(p_dropout)(x)

        output = Dense(1)(x)

        super(DeepCollaborativeFiltering, self).__init__([x1, x2], output)

    def rate(self, customer_idxs, product_idxs):
        if (type(customer_idxs) == int and type(product_idxs) == int):
            return self.predict([
                np.array(customer_idxs).reshape((1, )),
                np.array(product_idxs).reshape((1, ))
            ])

        if (type(customer_idxs) == str and type(product_idxs) == str):
            return self.predict([
                np.array(customerMapping[customer_idxs]).reshape((1, )),
                np.array(productMapping[product_idxs]).reshape((1, ))
            ])

        return self.predict([
            np.array([
                customerMapping[customer_idx] for customer_idx in customer_idxs
            ]),
            np.array(
                [productMapping[product_idx] for product_idx in product_idxs])
        ])


# ## 4.3 Hyperparameters

# In[17]:

bs = 64
val_per = 0.25
epochs = 3

# ## 4.4 Model Definition

# In[18]:

model = DeepCollaborativeFiltering(n_customers, n_products, n_factors)
model.summary()

# # 5 Training

# In[19]:

model.compile(optimizer='adam', loss=mean_squared_logarithmic_error)
model.fit(x=[customer_idxs, product_idxs],
          y=ratings,
          batch_size=bs,
          epochs=epochs,
          validation_split=val_per)
print('Done training!')

# ## 5.1 Log model and metrics

# In[20]:

logmodel = exec.log_output(
    metadata.Model(name="DeepCollaborativeFiltering",
                   description="Model for product recommender",
                   uri="",
                   model_type="neural network",
                   version=execTime,
                   training_framework={
                       "name": "tensorflow",
                       "version": "v1.14"
                   },
                   hyperparameters={
                       "batch_size": 64,
                       "validation_split": 0.25,
                       "layers": [n_customers, n_products, n_factors],
                       "epochs": 3
                   }))
metrics = exec.log_output(
    metadata.Metrics(name="Model for product recommender evaluation",
                     description="Validating of the recommender model",
                     uri="",
                     version=execTime,
                     data_set_id=data_set.id,
                     model_id=logmodel.id))

# # 6 Get current output directory for model

# In[21]:

directorystream = minioClient.get_object('data', 'recommender/directory.txt')
directory = ""
for d in directorystream.stream(32 * 1024):
    directory += d.decode('utf-8')
arg_version = "1"
export_path = 's3://models/' + directory + '/' + arg_version + '/'
print('Exporting trained model to', export_path)

# ## 6.1 Export models

# In[22]:

# inputs/outputs
tensor_info_users = tf.saved_model.utils.build_tensor_info(model.input[0])
tensor_info_products = tf.saved_model.utils.build_tensor_info(model.input[1])
tensor_info_pred = tf.saved_model.utils.build_tensor_info(model.output)

print("tensor_info_users", tensor_info_users.name)
print("tensor_info_products", tensor_info_products.name)
print("tensor_info_pred", tensor_info_pred.name)

# In[23]:

# signature
prediction_signature = (tf.saved_model.signature_def_utils.build_signature_def(
    inputs={
        "users": tensor_info_users,
        "products": tensor_info_products
    },
    outputs={"predictions": tensor_info_pred},
    method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME))
# export
legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op')
builder = tf.saved_model.builder.SavedModelBuilder(export_path)
builder.add_meta_graph_and_variables(
    sess, [tf.saved_model.tag_constants.SERVING],
    signature_def_map={
        tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
        prediction_signature,
    },
    legacy_init_op=legacy_init_op)
builder.save()

# # 7 Restarting of the model serving server
#
# In order for a new model to take effect it is also necessary to restart a model server.
# The issue here is that we are not changing the model version version and as a result,
# the model will not be updated. To ensure model update, we are here restarting a server -
# simply killing the running instance, and as a server is installed using deployment, the instance
# will be recreated. Additionally for pods operations to work correctly from the notebook,
# it is necessary to create permissions allowing for access to pods in another namespace.
# Look at the podaccessroles.yaml for details.

# In[24]:

recommender = "recommendermodelserver-"
if directory == "recommender1":
    recommender = "recommendermodelserver1-"
print("pod prefix ", recommender)

namespace = "kubeflow"
print("pod namespace ", namespace)

# In[26]:

# Get full pod name for the current model

k8s_config.load_incluster_config()

v1 = k8s_client.CoreV1Api()

pod_list = v1.list_namespaced_pod(namespace)
pod = [
    item.metadata.name for item in pod_list.items
    if recommender in item.metadata.name
][0]
print("Current pod name ", pod)

# In[27]:

# Delete pod, so that it gets recreated
v1.delete_namespaced_pod(pod, namespace, grace_period_seconds=0)

print("Done deleting")

# In[28]:

# Verify that the new instance was created
time.sleep(20)
pod_list = v1.list_namespaced_pod(namespace)
pod = [
    item.metadata.name for item in pod_list.items
    if recommender in item.metadata.name
][0]
print("New pod name ", pod)

# In[ ]:


================================================
FILE: recommender/docker/Dockerfile
================================================
FROM  tensorflow/tensorflow:1.15.0-py3
RUN pip3 install --upgrade pip
RUN pip3 install pandas --upgrade
RUN pip3 install keras --upgrade
RUN pip3 install minio --upgrade
RUN pip3 install kubernetes --upgrade
RUN pip3 install kfmd --upgrade

RUN mkdir -p /opt/kubeflow
COPY Recommender_Kubeflow.py /opt/kubeflow/
ENTRYPOINT ["python3", "/opt/kubeflow/Recommender_Kubeflow.py"]

================================================
FILE: recommender/docker/build.sh
================================================
#!/bin/bash

img='lightbend/ml-tf-recommender'
tag='0.1'
docker build -t $img:$tag .


================================================
FILE: recommender/tfservingchart/.helmignore
================================================
# Patterns to ignore when building packages.
# This supports shell glob matching, relative path matching, and
# negation (prefixed with !). Only one pattern per line.
.DS_Store
# Common VCS dirs
.git/
.gitignore
.bzr/
.bzrignore
.hg/
.hgignore
.svn/
# Common backup files
*.swp
*.bak
*.tmp
*~
# Various IDEs
.project
.idea/
*.tmproj


================================================
FILE: recommender/tfservingchart/Chart.yaml
================================================
apiVersion: v1
appVersion: 1.14.0
description: TF Serving
maintainers:
- name: Boris Lublinsky
name: TF Serving Recommender model server
version: 1.0.0

================================================
FILE: recommender/tfservingchart/templates/NOTES.txt
================================================
Kubeflow Model serving components : tfserving is installed


================================================
FILE: recommender/tfservingchart/templates/_helpers.tpl
================================================
{{/* vim: set filetype=mustache: */}}
{{/*
Expand the name of the chart.
*/}}
{{- define "modelserverchart.name" -}}
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}}
{{- end -}}

{{/*
Create a default fully qualified app name.
We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).
*/}}
{{- define "modelserverchart.fullname" -}}
{{- $name := default .Chart.Name .Values.nameOverride -}}
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}}
{{- end -}}


================================================
FILE: recommender/tfservingchart/templates/minioaccess.yaml
================================================
apiVersion: v1
kind: Secret
metadata:
  name: minioaccess
  namespace: kubeflow
data:
  AWS_ACCESS_KEY_ID: bWluaW8=
  AWS_SECRET_ACCESS_KEY: bWluaW8xMjM=

================================================
FILE: recommender/tfservingchart/templates/tfserving.yaml
================================================
apiVersion: apps/v1
kind: Deployment
metadata:
  namespace: kubeflow
  name: recommendermodelserver
  labels:
    app: recommendermodelserver
spec:
  replicas: 1
  selector:
    matchLabels:
      app: recommendermodelserver
  strategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        app: recommendermodelserver
    spec:
      containers:
        - name: serving
          image: "{{ .Values.image.server }}:{{ .Values.image.version }}"
          imagePullPolicy: "{{ .Values.image.pullPolicy }}"
          ports:
            - containerPort: 8500
              name: grpc
              protocol: TCP
            - containerPort: 8501
              name: http
              protocol: TCP
          readinessProbe:
            tcpSocket:
              port: http
            initialDelaySeconds: 15
            timeoutSeconds: 1
          livenessProbe:
            initialDelaySeconds: 30
            periodSeconds: 30
            tcpSocket:
              port: htttp
          resources:
            limits:
              cpu: "2"
              memory: 2Gi
            requests:
              cpu: "1"
              memory: 1Gi
          env:
            - name: "AWS_REGION"
              value: "us-west-1"
            - name: "S3_REGION"
              value: "us-west-1"
            - name: "S3_ENDPOINT"
              value: "minio-service.kubeflow.svc.cluster.local:9000"
            - name: "S3_USE_HTTPS"
              value: "0"
            - name: "S3_VERIFY_SSL"
              value: "0"
            - name: "AWS_ACCESS_KEY_ID"
              valueFrom: { secretKeyRef: { name: "minioaccess", key: "AWS_ACCESS_KEY_ID" } }
            - name: "AWS_SECRET_ACCESS_KEY"
              valueFrom: { secretKeyRef: { name: "minioaccess", key: "AWS_SECRET_ACCESS_KEY" } }
            - name: "MODEL_BASE_PATH"
              value: "s3://models"
            - name: "MODEL_NAME"
              value: "recommender"
          volumes:
            - name: secret-volume
              secret:
                secretName: minioaccess
---
apiVersion: v1
kind: Service
metadata:
  namespace: kubeflow
  name: recommendermodelserver
spec:
  selector:
    app: recommendermodelserver
  ports:
    - name: grpc
      protocol: TCP
      port: 8500
      targetPort: 8500
    - name: http
      protocol: TCP
      port: 8501
      targetPort: 8501


================================================
FILE: recommender/tfservingchart/templates/tfserving1.yaml
================================================
apiVersion: apps/v1
kind: Deployment
metadata:
  namespace: kubeflow
  name: recommendermodelserver1
  labels:
    app: recommendermodelserver1
spec:
  replicas: 1
  selector:
    matchLabels:
      app: recommendermodelserver1
  strategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        app: recommendermodelserver1
    spec:
      containers:
        - name: serving
          image: "{{ .Values.image.server }}:{{ .Values.image.version }}"
          imagePullPolicy: "{{ .Values.image.pullPolicy }}"
          ports:
            - containerPort: 8500
              name: grpc
              protocol: TCP
            - containerPort: 8501
              name: http
              protocol: TCP
          readinessProbe:
            tcpSocket:
              port: http
            initialDelaySeconds: 15
            timeoutSeconds: 1
          livenessProbe:
            initialDelaySeconds: 30
            periodSeconds: 30
            tcpSocket:
              port: htttp
          resources:
            limits:
              cpu: "2"
              memory: 2Gi
            requests:
              cpu: "1"
              memory: 1Gi
          env:
            - name: "AWS_REGION"
              value: "us-west-1"
            - name: "S3_REGION"
              value: "us-west-1"
            - name: "S3_ENDPOINT"
              value: "minio-service.kubeflow.svc.cluster.local:9000"
            - name: "S3_USE_HTTPS"
              value: "0"
            - name: "S3_VERIFY_SSL"
              value: "0"
            - name: "AWS_ACCESS_KEY_ID"
              valueFrom: { secretKeyRef: { name: "minioaccess", key: "AWS_ACCESS_KEY_ID" } }
            - name: "AWS_SECRET_ACCESS_KEY"
              valueFrom: { secretKeyRef: { name: "minioaccess", key: "AWS_SECRET_ACCESS_KEY" } }
            - name: "MODEL_BASE_PATH"
              value: "s3://models"
            - name: "MODEL_NAME"
              value: "recommender1"
          volumes:
            - name: secret-volume
              secret:
                secretName: minioaccess
---
apiVersion: v1
kind: Service
metadata:
  namespace: kubeflow
  name: recommendermodelserver1
spec:
  selector:
    app: recommendermodelserver1
  ports:
    - name: grpc
      protocol: TCP
      port: 8500
      targetPort: 8500
    - name: http
      protocol: TCP
      port: 8501
      targetPort: 8501


================================================
FILE: recommender/tfservingchart/values.yaml
================================================
# application name is a namespace
# docker images
image:
  server: tensorflow/serving
  pullPolicy: Always
  version: 1.15.0


================================================
FILE: runthrough.sh
================================================
#!/bin/bash
set -ex
example_repo_home="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
KF_PLATFORM=${KF_PLATFORM:-minikube}
export KF_PLATFORM

if [ "$PLATFORM" == "gcp" ]; then
  # In GCP we also need a default zone
  gcloud config set compute/zone us-west1-b
fi

pushd dev-setup
command -v kfctl >/dev/null 2>&1 || source install-kf.sh
command -v kustomize >/dev/null 2>&1 || source install-kustomize.sh
command -v argo >/dev/null 2>&1 || source install-argo.sh
source install-kf-pipeline-sdk.sh
popd
mkdir -p /tmp/abc
pushd /tmp/abc
source "${example_repo_home}/ch2_seldon_examples/setup_example.sh"
popd
# rm -rf /tmp/abc


================================================
FILE: scikitLearn/python/IncomePrediction.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Income prediction\n",
    "based on Seldon's implementation\n",
    "https://github.com/SeldonIO/alibi/blob/master/examples/anchor_tabular_adult.ipynb and\n",
    "https://github.com/SeldonIO/alibi/blob/5aec3ab4ce651ca2249bf849ecb434371c9278e4/alibi/datasets.py#L183"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already up-to-date: pandas in ./.local/lib/python3.6/site-packages (1.0.3)\n",
      "Requirement already satisfied, skipping upgrade: numpy>=1.13.3 in /usr/local/lib/python3.6/dist-packages (from pandas) (1.18.1)\n",
      "Requirement already satisfied, skipping upgrade: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas) (2019.3)\n",
      "Requirement already satisfied, skipping upgrade: python-dateutil>=2.6.1 in /usr/local/lib/python3.6/dist-packages (from pandas) (2.8.1)\n",
      "Requirement already satisfied, skipping upgrade: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.6.1->pandas) (1.11.0)\n",
      "Requirement already up-to-date: scikit-learn in ./.local/lib/python3.6/site-packages (0.22.2.post1)\n",
      "Requirement already satisfied, skipping upgrade: joblib>=0.11 in ./.local/lib/python3.6/site-packages (from scikit-learn) (0.14.1)\n",
      "Requirement already satisfied, skipping upgrade: numpy>=1.11.0 in /usr/local/lib/python3.6/dist-packages (from scikit-learn) (1.18.1)\n",
      "Requirement already satisfied, skipping upgrade: scipy>=0.17.0 in /usr/local/lib/python3.6/dist-packages (from scikit-learn) (1.4.1)\n",
      "Requirement already up-to-date: alibi in ./.local/lib/python3.6/site-packages (0.4.0)\n",
      "Requirement already satisfied, skipping upgrade: scikit-learn in ./.local/lib/python3.6/site-packages (from alibi) (0.22.2.post1)\n",
      "Requirement already satisfied, skipping upgrade: attrs in /usr/local/lib/python3.6/dist-packages (from alibi) (19.3.0)\n",
      "Requirement already satisfied, skipping upgrade: beautifulsoup4 in ./.local/lib/python3.6/site-packages (from alibi) (4.8.2)\n",
      "Requirement already satisfied, skipping upgrade: spacy in ./.local/lib/python3.6/site-packages (from alibi) (2.2.4)\n",
      "Requirement already satisfied, skipping upgrade: shap in ./.local/lib/python3.6/site-packages (from alibi) (0.35.0)\n",
      "Requirement already satisfied, skipping upgrade: scipy in /usr/local/lib/python3.6/dist-packages (from alibi) (1.4.1)\n",
      "Requirement already satisfied, skipping upgrade: requests in /usr/local/lib/python3.6/dist-packages (from alibi) (2.22.0)\n",
      "Requirement already satisfied, skipping upgrade: numpy in /usr/local/lib/python3.6/dist-packages (from alibi) (1.18.1)\n",
      "Requirement already satisfied, skipping upgrade: Pillow in ./.local/lib/python3.6/site-packages (from alibi) (7.0.0)\n",
      "Requirement already satisfied, skipping upgrade: tensorflow<2.0 in /usr/local/lib/python3.6/dist-packages (from alibi) (1.15.2)\n",
      "Requirement already satisfied, skipping upgrade: pandas in ./.local/lib/python3.6/site-packages (from alibi) (1.0.3)\n",
      "Requirement already satisfied, skipping upgrade: prettyprinter in ./.local/lib/python3.6/site-packages (from alibi) (0.18.0)\n",
      "Requirement already satisfied, skipping upgrade: scikit-image in ./.local/lib/python3.6/site-packages (from alibi) (0.16.2)\n",
      "Requirement already satisfied, skipping upgrade: joblib>=0.11 in ./.local/lib/python3.6/site-packages (from scikit-learn->alibi) (0.14.1)\n",
      "Requirement already satisfied, skipping upgrade: soupsieve>=1.2 in ./.local/lib/python3.6/site-packages (from beautifulsoup4->alibi) (2.0)\n",
      "Requirement already satisfied, skipping upgrade: srsly<1.1.0,>=1.0.2 in ./.local/lib/python3.6/site-packages (from spacy->alibi) (1.0.2)\n",
      "Requirement already satisfied, skipping upgrade: preshed<3.1.0,>=3.0.2 in ./.local/lib/python3.6/site-packages (from spacy->alibi) (3.0.2)\n",
      "Requirement already satisfied, skipping upgrade: plac<1.2.0,>=0.9.6 in ./.local/lib/python3.6/site-packages (from spacy->alibi) (1.1.3)\n",
      "Requirement already satisfied, skipping upgrade: blis<0.5.0,>=0.4.0 in ./.local/lib/python3.6/site-packages (from spacy->alibi) (0.4.1)\n",
      "Requirement already satisfied, skipping upgrade: cymem<2.1.0,>=2.0.2 in ./.local/lib/python3.6/site-packages (from spacy->alibi) (2.0.3)\n",
      "Requirement already satisfied, skipping upgrade: tqdm<5.0.0,>=4.38.0 in ./.local/lib/python3.6/site-packages (from spacy->alibi) (4.43.0)\n",
      "Requirement already satisfied, skipping upgrade: catalogue<1.1.0,>=0.0.7 in ./.local/lib/python3.6/site-packages (from spacy->alibi) (1.0.0)\n",
      "Requirement already satisfied, skipping upgrade: thinc==7.4.0 in ./.local/lib/python3.6/site-packages (from spacy->alibi) (7.4.0)\n",
      "Requirement already satisfied, skipping upgrade: murmurhash<1.1.0,>=0.28.0 in ./.local/lib/python3.6/site-packages (from spacy->alibi) (1.0.2)\n",
      "Requirement already satisfied, skipping upgrade: setuptools in /usr/local/lib/python3.6/dist-packages (from spacy->alibi) (45.1.0)\n",
      "Requirement already satisfied, skipping upgrade: wasabi<1.1.0,>=0.4.0 in ./.local/lib/python3.6/site-packages (from spacy->alibi) (0.6.0)\n",
      "Requirement already satisfied, skipping upgrade: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->alibi) (2019.11.28)\n",
      "Requirement already satisfied, skipping upgrade: idna<2.9,>=2.5 in /usr/lib/python3/dist-packages (from requests->alibi) (2.6)\n",
      "Requirement already satisfied, skipping upgrade: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in ./.local/lib/python3.6/site-packages (from requests->alibi) (1.24.3)\n",
      "Requirement already satisfied, skipping upgrade: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->alibi) (3.0.4)\n",
      "Requirement already satisfied, skipping upgrade: keras-preprocessing>=1.0.5 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2.0->alibi) (1.1.0)\n",
      "Requirement already satisfied, skipping upgrade: gast==0.2.2 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2.0->alibi) (0.2.2)\n",
      "Requirement already satisfied, skipping upgrade: wheel>=0.26; python_version >= \"3\" in /usr/lib/python3/dist-packages (from tensorflow<2.0->alibi) (0.30.0)\n",
      "Requirement already satisfied, skipping upgrade: six>=1.10.0 in /usr/lib/python3/dist-packages (from tensorflow<2.0->alibi) (1.11.0)\n",
      "Requirement already satisfied, skipping upgrade: termcolor>=1.1.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2.0->alibi) (1.1.0)\n",
      "Requirement already satisfied, skipping upgrade: astor>=0.6.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2.0->alibi) (0.8.1)\n",
      "Requirement already satisfied, skipping upgrade: keras-applications>=1.0.8 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2.0->alibi) (1.0.8)\n",
      "Requirement already satisfied, skipping upgrade: opt-einsum>=2.3.2 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2.0->alibi) (3.1.0)\n",
      "Requirement already satisfied, skipping upgrade: protobuf>=3.6.1 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2.0->alibi) (3.11.2)\n",
      "Requirement already satisfied, skipping upgrade: tensorflow-estimator==1.15.1 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2.0->alibi) (1.15.1)\n",
      "Requirement already satisfied, skipping upgrade: google-pasta>=0.1.6 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2.0->alibi) (0.1.8)\n",
      "Requirement already satisfied, skipping upgrade: wrapt>=1.11.1 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2.0->alibi) (1.11.2)\n",
      "Requirement already satisfied, skipping upgrade: grpcio>=1.8.6 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2.0->alibi) (1.26.0)\n",
      "Requirement already satisfied, skipping upgrade: tensorboard<1.16.0,>=1.15.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2.0->alibi) (1.15.0)\n",
      "Requirement already satisfied, skipping upgrade: absl-py>=0.7.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2.0->alibi) (0.9.0)\n",
      "Requirement already satisfied, skipping upgrade: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas->alibi) (2019.3)\n",
      "Requirement already satisfied, skipping upgrade: python-dateutil>=2.6.1 in /usr/local/lib/python3.6/dist-packages (from pandas->alibi) (2.8.1)\n",
      "Requirement already satisfied, skipping upgrade: Pygments>=2.2.0 in /usr/local/lib/python3.6/dist-packages (from prettyprinter->alibi) (2.5.2)\n",
      "Requirement already satisfied, skipping upgrade: colorful>=0.4.0 in ./.local/lib/python3.6/site-packages (from prettyprinter->alibi) (0.5.4)\n",
      "Requirement already satisfied, skipping upgrade: networkx>=2.0 in ./.local/lib/python3.6/site-packages (from scikit-image->alibi) (2.4)\n",
      "Requirement already satisfied, skipping upgrade: imageio>=2.3.0 in ./.local/lib/python3.6/site-packages (from scikit-image->alibi) (2.8.0)\n",
      "Requirement already satisfied, skipping upgrade: matplotlib!=3.0.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from scikit-image->alibi) (3.1.2)\n",
      "Requirement already satisfied, skipping upgrade: PyWavelets>=0.4.0 in ./.local/lib/python3.6/site-packages (from scikit-image->alibi) (1.1.1)\n",
      "Requirement already satisfied, skipping upgrade: importlib-metadata>=0.20; python_version < \"3.8\" in /usr/local/lib/python3.6/dist-packages (from catalogue<1.1.0,>=0.0.7->spacy->alibi) (1.4.0)\n",
      "Requirement already satisfied, skipping upgrade: h5py in /usr/local/lib/python3.6/dist-packages (from keras-applications>=1.0.8->tensorflow<2.0->alibi) (2.10.0)\n",
      "Requirement already satisfied, skipping upgrade: werkzeug>=0.11.15 in /usr/local/lib/python3.6/dist-packages (from tensorboard<1.16.0,>=1.15.0->tensorflow<2.0->alibi) (0.16.1)\n",
      "Requirement already satisfied, skipping upgrade: markdown>=2.6.8 in /usr/local/lib/python3.6/dist-packages (from tensorboard<1.16.0,>=1.15.0->tensorflow<2.0->alibi) (3.1.1)\n",
      "Requirement already satisfied, skipping upgrade: decorator>=4.3.0 in /usr/local/lib/python3.6/dist-packages (from networkx>=2.0->scikit-image->alibi) (4.4.1)\n",
      "Requirement already satisfied, skipping upgrade: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib!=3.0.0,>=2.0.0->scikit-image->alibi) (1.1.0)\n",
      "Requirement already satisfied, skipping upgrade: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib!=3.0.0,>=2.0.0->scikit-image->alibi) (0.10.0)\n",
      "Requirement already satisfied, skipping upgrade: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib!=3.0.0,>=2.0.0->scikit-image->alibi) (2.4.6)\n",
      "Requirement already satisfied, skipping upgrade: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata>=0.20; python_version < \"3.8\"->catalogue<1.1.0,>=0.0.7->spacy->alibi) (2.1.0)\r\n"
     ]
    }
   ],
   "source": [
    "!pip install pandas --upgrade --user\n",
    "!pip install scikit-learn --upgrade --user\n",
    "!pip install alibi --upgrade --user"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.impute import SimpleImputer\n",
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder\n",
    "from alibi.explainers import AnchorTabular\n",
    "from alibi.datasets import fetch_adult\n",
    "from alibi.utils.data import Bunch, gen_category_map\n",
    "from typing import Tuple, Union\n",
    "import requests\n",
    "from requests import RequestException\n",
    "from io import BytesIO, StringIO"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Fetching and preprocessing data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def fetch_adult(features_drop: list = None, return_X_y: bool = False, url_id: int = 0) -> Union[Bunch, Tuple[np.ndarray, np.ndarray]]:\n",
    "    \"\"\"\n",
    "    Downloads and pre-processes 'adult' dataset.\n",
    "    More info: http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/\n",
    "    Parameters\n",
    "    ----------\n",
    "    features_drop\n",
    "        List of features to be dropped from dataset, by default drops [\"fnlwgt\", \"Education-Num\"]\n",
    "    return_X_y\n",
    "        If true, return features X and labels y as numpy arrays, if False return a Bunch object\n",
    "    url_id\n",
    "        Index specifying which URL to use for downloading\n",
    "    Returns\n",
    "    -------\n",
    "    Bunch\n",
    "        Dataset, labels, a list of features and a dictionary containing a list with the potential categories\n",
    "        for each categorical feature where the key refers to the feature column.\n",
    "    (data, target)\n",
    "        Tuple if ``return_X_y`` is true\n",
    "    \"\"\"\n",
    "    ADULT_URLS = ['https://storage.googleapis.com/seldon-datasets/adult/adult.data',\n",
    "              'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',\n",
    "              'http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.data']\n",
    "    if features_drop is None:\n",
    "        features_drop = [\"fnlwgt\", \"Education-Num\"]\n",
    "\n",
    "    # download data\n",
    "    dataset_url = ADULT_URLS[url_id]\n",
    "    raw_features = ['Age', 'Workclass', 'fnlwgt', 'Education', 'Education-Num', 'Marital Status',\n",
    "                    'Occupation', 'Relationship', 'Race', 'Sex', 'Capital Gain', 'Capital Loss',\n",
    "                    'Hours per week', 'Country', 'Target']\n",
    "    try:\n",
    "        resp = requests.get(dataset_url)\n",
    "        resp.raise_for_status()\n",
    "    except RequestException:\n",
    "        logger.exception(\"Could not connect, URL may be out of service\")\n",
    "        raise\n",
    "\n",
    "    raw_data = pd.read_csv(StringIO(resp.text), names=raw_features, delimiter=', ', engine='python').fillna('?')\n",
    "\n",
    "    # get labels, features and drop unnecessary features\n",
    "    labels = (raw_data['Target'] == '>50K').astype(int).values\n",
    "    features_drop += ['Target']\n",
    "    data = raw_data.drop(features_drop, axis=1)\n",
    "    features = list(data.columns)\n",
    "\n",
    "    # map categorical features\n",
    "    education_map = {\n",
    "        '10th': 'Dropout', '11th': 'Dropout', '12th': 'Dropout', '1st-4th':\n",
    "            'Dropout', '5th-6th': 'Dropout', '7th-8th': 'Dropout', '9th':\n",
    "            'Dropout', 'Preschool': 'Dropout', 'HS-grad': 'High School grad',\n",
    "        'Some-college': 'High School grad', 'Masters': 'Masters',\n",
    "        'Prof-school': 'Prof-School', 'Assoc-acdm': 'Associates',\n",
    "        'Assoc-voc': 'Associates'\n",
    "    }\n",
    "    occupation_map = {\n",
    "        \"Adm-clerical\": \"Admin\", \"Armed-Forces\": \"Military\",\n",
    "        \"Craft-repair\": \"Blue-Collar\", \"Exec-managerial\": \"White-Collar\",\n",
    "        \"Farming-fishing\": \"Blue-Collar\", \"Handlers-cleaners\":\n",
    "            \"Blue-Collar\", \"Machine-op-inspct\": \"Blue-Collar\", \"Other-service\":\n",
    "            \"Service\", \"Priv-house-serv\": \"Service\", \"Prof-specialty\":\n",
    "            \"Professional\", \"Protective-serv\": \"Other\", \"Sales\":\n",
    "            \"Sales\", \"Tech-support\": \"Other\", \"Transport-moving\":\n",
    "            \"Blue-Collar\"\n",
    "    }\n",
    "    country_map = {\n",
    "        'Cambodia': 'SE-Asia', 'Canada': 'British-Commonwealth', 'China':\n",
    "            'China', 'Columbia': 'South-America', 'Cuba': 'Other',\n",
    "        'Dominican-Republic': 'Latin-America', 'Ecuador': 'South-America',\n",
    "        'El-Salvador': 'South-America', 'England': 'British-Commonwealth',\n",
    "        'France': 'Euro_1', 'Germany': 'Euro_1', 'Greece': 'Euro_2',\n",
    "        'Guatemala': 'Latin-America', 'Haiti': 'Latin-America',\n",
    "        'Holand-Netherlands': 'Euro_1', 'Honduras': 'Latin-America',\n",
    "        'Hong': 'China', 'Hungary': 'Euro_2', 'India':\n",
    "            'British-Commonwealth', 'Iran': 'Other', 'Ireland':\n",
    "            'British-Commonwealth', 'Italy': 'Euro_1', 'Jamaica':\n",
    "            'Latin-America', 'Japan': 'Other', 'Laos': 'SE-Asia', 'Mexico':\n",
    "            'Latin-America', 'Nicaragua': 'Latin-America',\n",
    "        'Outlying-US(Guam-USVI-etc)': 'Latin-America', 'Peru':\n",
    "            'South-America', 'Philippines': 'SE-Asia', 'Poland': 'Euro_2',\n",
    "        'Portugal': 'Euro_2', 'Puerto-Rico': 'Latin-America', 'Scotland':\n",
    "            'British-Commonwealth', 'South': 'Euro_2', 'Taiwan': 'China',\n",
    "        'Thailand': 'SE-Asia', 'Trinadad&Tobago': 'Latin-America',\n",
    "        'United-States': 'United-States', 'Vietnam': 'SE-Asia'\n",
    "    }\n",
    "    married_map = {\n",
    "        'Never-married': 'Never-Married', 'Married-AF-spouse': 'Married',\n",
    "        'Married-civ-spouse': 'Married', 'Married-spouse-absent':\n",
    "            'Separated', 'Separated': 'Separated', 'Divorced':\n",
    "            'Separated', 'Widowed': 'Widowed'\n",
    "    }\n",
    "    mapping = {'Education': education_map, 'Occupation': occupation_map, 'Country': country_map,\n",
    "               'Marital Status': married_map}\n",
    "\n",
    "    data_copy = data.copy()\n",
    "    for f, f_map in mapping.items():\n",
    "        data_tmp = data_copy[f].values\n",
    "        for key, value in f_map.items():\n",
    "            data_tmp[data_tmp == key] = value\n",
    "        data[f] = data_tmp\n",
    "\n",
    "    # get categorical features and apply labelencoding\n",
    "    categorical_features = [f for f in features if data[f].dtype == 'O']\n",
    "    category_map = {}\n",
    "    for f in categorical_features:\n",
    "        le = LabelEncoder()\n",
    "        data_tmp = le.fit_transform(data[f].values)\n",
    "        data[f] = data_tmp\n",
    "        category_map[features.index(f)] = list(le.classes_)\n",
    "\n",
    "    # only return data values\n",
    "    data = data.values\n",
    "    target_names = ['<=50K', '>50K']\n",
    "\n",
    "    if return_X_y:\n",
    "        return data, labels\n",
    "\n",
    "    return Bunch(data=data, target=labels, feature_names=features, target_names=target_names, category_map=category_map)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load adult dataset\n",
    "The fetch_adult function returns a Bunch object containing the features, the targets, the feature names and a mapping of categorical variables to numbers which are required for formatting the output of the Anchor explainer."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['data', 'target', 'feature_names', 'target_names', 'category_map'])"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "adult = fetch_adult()\n",
    "adult.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = adult.data\n",
    "target = adult.target\n",
    "feature_names = adult.feature_names\n",
    "category_map = adult.category_map"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Define shuffled training and test set"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.random.seed(0)\n",
    "data_perm = np.random.permutation(np.c_[data, target])\n",
    "data = data_perm[:,:-1]\n",
    "target = data_perm[:,-1]\n",
    "idx = 30000\n",
    "X_train,Y_train = data[:idx,:], target[:idx]\n",
    "X_test, Y_test = data[idx+1:,:], target[idx+1:]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Create feature transformation pipeline\n",
    "Create feature pre-processor. Needs to have 'fit' and 'transform' methods. Different types of pre-processing can be applied to all or part of the features. In the example below we will standardize ordinal features and apply one-hot-encoding to categorical features.\n",
    "\n",
    "Ordinal features:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "ordinal_features = [x for x in range(len(feature_names)) if x not in list(category_map.keys())]\n",
    "ordinal_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),\n",
    "                                      ('scaler', StandardScaler())])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Categorical features:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "categorical_features = list(category_map.keys())\n",
    "categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),\n",
    "                                          ('onehot', OneHotEncoder(handle_unknown='ignore'))])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Combine and fit:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,\n",
       "                  transformer_weights=None,\n",
       "                  transformers=[('num',\n",
       "                                 Pipeline(memory=None,\n",
       "                                          steps=[('imputer',\n",
       "                                                  SimpleImputer(add_indicator=False,\n",
       "                                                                copy=True,\n",
       "                                                                fill_value=None,\n",
       "                                                                missing_values=nan,\n",
       "                                                                strategy='median',\n",
       "                                                                verbose=0)),\n",
       "                                                 ('scaler',\n",
       "                                                  StandardScaler(copy=True,\n",
       "                                                                 with_mean=True,\n",
       "                                                                 with_std=True))],\n",
       "                                          verbose=False),\n",
       "                                 [0, 8, 9, 10]),\n",
       "                                ('cat',\n",
       "                                 Pipeline(memory=None,\n",
       "                                          steps=[('imputer',\n",
       "                                                  SimpleImputer(add_indicator=False,\n",
       "                                                                copy=True,\n",
       "                                                                fill_value=None,\n",
       "                                                                missing_values=nan,\n",
       "                                                                strategy='median',\n",
       "                                                                verbose=0)),\n",
       "                                                 ('onehot',\n",
       "                                                  OneHotEncoder(categories='auto',\n",
       "                                                                drop=None,\n",
       "                                                                dtype=<class 'numpy.float64'>,\n",
       "                                                                handle_unknown='ignore',\n",
       "                                                                sparse=True))],\n",
       "                                          verbose=False),\n",
       "                                 [1, 2, 3, 4, 5, 6, 7, 11])],\n",
       "                  verbose=False)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "preprocessor = ColumnTransformer(transformers=[('num', ordinal_transformer, ordinal_features),\n",
    "                                               ('cat', categorical_transformer, categorical_features)])\n",
    "preprocessor.fit(X_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Train Random Forest model\n",
    "Fit on pre-processed (imputing, OHE, standardizing) data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n",
       "                       criterion='gini', max_depth=None, max_features='auto',\n",
       "                       max_leaf_nodes=None, max_samples=None,\n",
       "                       min_impurity_decrease=0.0, min_impurity_split=None,\n",
       "                       min_samples_leaf=1, min_samples_split=2,\n",
       "                       min_weight_fraction_leaf=0.0, n_estimators=50,\n",
       "                       n_jobs=None, oob_score=False, random_state=None,\n",
       "                       verbose=0, warm_start=False)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.random.seed(0)\n",
    "clf = RandomForestClassifier(n_estimators=50)\n",
    "clf.fit(preprocessor.transform(X_train), Y_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Define predict function"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Train accuracy:  0.9655333333333334\n",
      "Test accuracy:  0.855859375\n"
     ]
    }
   ],
   "source": [
    "predict_fn = lambda x: clf.predict(preprocessor.transform(x))\n",
    "print('Train accuracy: ', accuracy_score(Y_train, predict_fn(X_train)))\n",
    "print('Test accuracy: ', accuracy_score(Y_test, predict_fn(X_test)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Initialize and fit anchor explainer for tabular data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "explainer = AnchorTabular(predict_fn, feature_names, categorical_names=category_map, seed=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Discretize the ordinal features into quartiles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "AnchorTabular(meta={\n",
       "    'name': 'AnchorTabular',\n",
       "    'type': ['blackbox'],\n",
       "    'explanations': ['local'],\n",
       "    'params': {'seed': 1, 'disc_perc': [25, 50, 75]}\n",
       "})"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "explainer.fit(X_train, disc_perc=[25, 50, 75])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Getting an anchor\n",
    "Below, we get an anchor for the prediction of the first observation in the test set. An anchor is a sufficient condition - that is, when the anchor holds, the prediction should be the same as the prediction for this instance."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Prediction:  <=50K\n"
     ]
    }
   ],
   "source": [
    "idx = 0\n",
    "class_names = adult.target_names\n",
    "print('Prediction: ', class_names[explainer.predictor(X_test[idx].reshape(1, -1))[0]])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We set the precision threshold to 0.95. This means that predictions on observations where the anchor holds will be the same as the prediction on the explained instance at least 95% of the time."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Anchor: Marital Status = Separated AND Sex = Female\n",
      "Precision: 0.95\n",
      "Coverage: 0.18\n"
     ]
    }
   ],
   "source": [
    "explanation = explainer.explain(X_test[idx], threshold=0.95)\n",
    "print('Anchor: %s' % (' AND '.join(explanation.anchor)))\n",
    "print('Precision: %.2f' % explanation.precision)\n",
    "print('Coverage: %.2f' % explanation.coverage)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# ...or not?\n",
    "Let's try getting an anchor for a different observation in the test set - one for the which the prediction is >50K."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Prediction:  >50K\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Could not find an result satisfying the 0.95 precision constraint. Now returning the best non-eligible result.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Anchor: Capital Loss > 0.00 AND Relationship = Husband AND Marital Status = Married AND Age > 37.00 AND Race = White AND Country = United-States AND Sex = Male\n",
      "Precision: 0.71\n",
      "Coverage: 0.05\n"
     ]
    }
   ],
   "source": [
    "idx = 6\n",
    "class_names = adult.target_names\n",
    "print('Prediction: ', class_names[explainer.predictor(X_test[idx].reshape(1, -1))[0]])\n",
    "\n",
    "explanation = explainer.explain(X_test[idx], threshold=0.95)\n",
    "print('Anchor: %s' % (' AND '.join(explanation.anchor)))\n",
    "print('Precision: %.2f' % explanation.precision)\n",
    "print('Coverage: %.2f' % explanation.coverage)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Notice how no anchor is found!\n",
    "\n",
    "This is due to the imbalanced dataset (roughly 25:75 high:low earner proportion), so during the sampling stage feature ranges corresponding to low-earners will be oversampled. This is a feature because it can point out an imbalanced dataset, but it can also be fixed by producing balanced datasets to enable anchors to be found for either class."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}