[
  {
    "path": ".circleci/config.yml",
    "content": "version: 2\n\napt-run:  &apt-install\n  name: Install apt packages\n  command: |\n    sudo apt-get -qq update\n    sudo apt-get install -y \\\n      shellcheck\n\njobs:\n  build:\n    working_directory: ~/mermaid-starter\n    docker:\n      - image: circleci/python:3.6-jessie-node-browsers-legacy\n    steps:\n      - checkout\n      - run: *apt-install\n      - run:\n        name: Run our basic shell CI\n        command: ./ci.sh"
  },
  {
    "path": ".gitignore",
    "content": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.idea\n.Python\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\nwheels/\n*.egg-info/\n.installed.cfg\n*.egg\nMANIFEST\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*.cover\n.hypothesis/\n.pytest_cache/\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\nlocal_settings.py\ndb.sqlite3\n\n# Flask stuff:\ninstance/\n.webassets-cache\n\n# Scrapy stuff:\n.scrapy\n\n# Sphinx documentation\ndocs/_build/\n\n# PyBuilder\ntarget/\n\n# Jupyter Notebook\n.ipynb_checkpoints\n\n# pyenv\n.python-version\n\n# celery beat schedule file\ncelerybeat-schedule\n\n# SageMath parsed files\n*.sage.py\n\n# Environments\n.env\n.venv\nenv/\nvenv/\nENV/\nenv.bak/\nvenv.bak/\n\n# Spyder project settings\n.spyderproject\n.spyproject\n\n# Rope project settings\n.ropeproject\n\n# mkdocs documentation\n/site\n\n# mypy\n.mypy_cache/\n\n# Emacs\n*~\n\n# Ignore kfctl's downloaded\nkfctl*.t*z"
  },
  {
    "path": ".travis.yaml",
    "content": "language: generic\nsudo: true\naddons:\n  apt:\n    packages:\n     - shellcheck\nscript:\n  - ./ci.sh"
  },
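  {
    "path": "ci.sh",
    "content": "#!/bin/bash\n# Hypothetical sketch of the ci.sh entry point invoked by the CircleCI and\n# Travis configs above; the repository's real ci.sh is not shown in this\n# listing. Both CI setups install shellcheck first, so a minimal version\n# simply lints every tracked shell script.\nset -ex\ngit ls-files -z -- '*.sh' | xargs -0 shellcheck\n"
  },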
  {
    "path": "LICENSE",
    "content": "                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. 
For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. 
Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"[]\"\n      replaced with your own identifying information. (Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright [yyyy] [name of copyright owner]\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License.\n"
  },
  {
    "path": "README.md",
    "content": "# intro-to-ml-with-kubeflow-examples\nExamples for the Intro to ML with Kubeflow book\n"
  },
  {
    "path": "autopep_stuff.sh",
    "content": "#!/bin/bash\n# autopep8 a bunch of things that we can\nautopep8 -i -r ./ \\\n\t --select E101,E202,E201,E203,E211,E221,E222,E223,E224,E225,E226,E227,\\\n\t E228,E231,E241,E242,E251,E252,E262,E271,E272,E273,E274,E301,E302,E303,\\\n\t E304,E305,E306,E501,E502,E711,E712,E713,E714,E721,E722,E731,W291,W293,\\\n\t W391,W601,W602,W603,W604,W690\\\n\t -j 0 --exclude \"*venv*\"\n# Then we use YAPF because it does a better job on long-lines\nyapf -i -r ./ --exclude \"*venv*\"\n"
  },
  {
    "path": "ch03/example_secret.yaml",
    "content": "apiVersion: v1\nkind: Secret\nmetadata:\n  name: minioaccess\n  namespace: mynamespace\ndata:\n  AWS_ACCESS_KEY_ID: xxxxxxxxxx\n  AWS_SECRET_ACCESS_KEY: xxxxxxxxxxxxxxxxxxxxx\n"
  },
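  {
    "path": "ch03/create_secret_example.sh",
    "content": "#!/bin/bash\n# Illustrative sketch (not part of the book's code): the values under data:\n# in example_secret.yaml must be base64 encoded. Rather than hand-encoding\n# them, kubectl can build the secret directly from literals; the minio /\n# minio123 values are the Kubeflow install defaults used in minio.sh.\nset -ex\nkubectl create secret generic minioaccess -n mynamespace \\\n  --from-literal=AWS_ACCESS_KEY_ID=minio \\\n  --from-literal=AWS_SECRET_ACCESS_KEY=minio123\n"
  },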
  {
    "path": "ch03/linux_install.sh",
    "content": "#!/bin/bash\n#tag::installMCLinux[]\npushd ~/bin\nwget https://dl.min.io/client/mc/release/linux-amd64/mc\nchmod a+x mc\n#end::installMCLinux[]\n"
  },
  {
    "path": "ch03/mac_install.sh",
    "content": "#!/bin/bash\n#tag::installMCMac[]\nbrew install minio/stable/minio\n#end::installMCMac[]\n"
  },
  {
    "path": "ch03/minio.sh",
    "content": "#!/bin/bash\nset -ex\n\n# Minio runs on port 9000 (both UI and service) so expose locally to use cli or UI\n#tag::fwdMinio[]\nkubectl port-forward -n kubeflow svc/minio-service 9000:9000 &\n#end::fwdMinio[]\n\n# Give it a spell to settle\nsleep 10\n\n# Kubeflow creates a minio user with password minio123 at install\n#tag::configMC[]\nmc config host add minio http://localhost:9000 minio minio123\n#end::configMC[]\n\n#tag::listMC[]\nmc ls minio\n#end::listMC[]\n# Output [2018-12-13 18:23:41 CST]     0B mlpipeline/\n\n# Make a new bucket for our work\n#tag::makeBucket[]\nmc mb minio/kf-book-examples\n#end::makeBucket[]\n"
  },
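  {
    "path": "ch03/mc_copy_example.sh",
    "content": "#!/bin/bash\n# Illustrative sketch (not part of the book's code): once minio.sh has\n# configured the 'minio' host alias and created the kf-book-examples bucket,\n# mc can copy objects in and out of it. data.csv is a hypothetical local file.\nset -ex\nmc cp data.csv minio/kf-book-examples/data.csv\n# List the bucket contents to confirm the upload\nmc ls minio/kf-book-examples\n"
  },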
  {
    "path": "ch04/code/ControlStructures.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Simple Control structure\\n\",\n    \"\\n\",\n    \"Shows how to use conditional execution\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Requirement already up-to-date: kfp in ./.local/lib/python3.6/site-packages (0.2.2.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: jsonschema>=3.0.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (3.2.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: google-cloud-storage>=1.13.0 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.25.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: python-dateutil in /usr/local/lib/python3.6/dist-packages (from kfp) (2.8.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: google-auth>=1.6.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.11.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: argo-models==2.2.1a in ./.local/lib/python3.6/site-packages (from kfp) (2.2.1a0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: urllib3<1.25,>=1.15 in ./.local/lib/python3.6/site-packages (from kfp) (1.24.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: cloudpickle==1.1.1 in ./.local/lib/python3.6/site-packages (from kfp) (1.1.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: kubernetes<=10.0.0,>=8.0.0 in ./.local/lib/python3.6/site-packages (from kfp) (10.0.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: click==7.0 in ./.local/lib/python3.6/site-packages (from kfp) (7.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: certifi in /usr/local/lib/python3.6/dist-packages (from kfp) (2019.11.28)\\n\",\n      \"Requirement already satisfied, skipping upgrade: PyYAML in /usr/local/lib/python3.6/dist-packages (from kfp) (5.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: kfp-server-api<=0.1.40,>=0.1.18 in ./.local/lib/python3.6/site-packages (from kfp) (0.1.40)\\n\",\n      \"Requirement already satisfied, skipping upgrade: six>=1.10 in /usr/lib/python3/dist-packages (from kfp) (1.11.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: PyJWT>=1.6.4 in ./.local/lib/python3.6/site-packages (from kfp) (1.7.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: Deprecated in ./.local/lib/python3.6/site-packages (from kfp) (1.2.7)\\n\",\n      \"Requirement already satisfied, skipping upgrade: requests-toolbelt>=0.8.0 in ./.local/lib/python3.6/site-packages (from kfp) (0.9.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: cryptography>=2.4.2 in ./.local/lib/python3.6/site-packages (from kfp) (2.8)\\n\",\n      \"Requirement already satisfied, skipping upgrade: tabulate==0.8.3 in ./.local/lib/python3.6/site-packages (from kfp) (0.8.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: importlib-metadata; python_version < \\\"3.8\\\" in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (1.4.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: attrs>=17.4.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (19.3.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pyrsistent>=0.14.0 in /usr/local/lib/python3.6/dist-packages (from 
jsonschema>=3.0.1->kfp) (0.15.7)\\n\",\n      \"Requirement already satisfied, skipping upgrade: setuptools in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (45.1.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: google-resumable-media<0.6dev,>=0.5.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (0.5.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: google-cloud-core<2.0dev,>=1.2.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (1.3.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (0.2.8)\\n\",\n      \"Requirement already satisfied, skipping upgrade: rsa<4.1,>=3.1.4 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: requests-oauthlib in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (1.3.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: requests in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (2.22.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (0.57.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: wrapt<2,>=1.10 in /usr/local/lib/python3.6/dist-packages (from Deprecated->kfp) (1.11.2)\\n\",\n      \"Requirement already satisfied, skipping upgrade: cffi!=1.11.3,>=1.8 in ./.local/lib/python3.6/site-packages (from cryptography>=2.4.2->kfp) (1.14.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata; python_version < \\\"3.8\\\"->jsonschema>=3.0.1->kfp) (2.1.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: google-api-core<2.0.0dev,>=1.16.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.16.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.6/dist-packages (from pyasn1-modules>=0.2.1->google-auth>=1.6.1->kfp) (0.4.8)\\n\",\n      \"Requirement already satisfied, skipping upgrade: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib->kubernetes<=10.0.0,>=8.0.0->kfp) (3.1.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->kubernetes<=10.0.0,>=8.0.0->kfp) (3.0.4)\\n\",\n      \"Requirement already satisfied, skipping upgrade: idna<2.9,>=2.5 in /usr/lib/python3/dist-packages (from requests->kubernetes<=10.0.0,>=8.0.0->kfp) (2.6)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pycparser in ./.local/lib/python3.6/site-packages (from cffi!=1.11.3,>=1.8->cryptography>=2.4.2->kfp) (2.19)\\n\",\n      \"Requirement already satisfied, skipping upgrade: protobuf>=3.4.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (3.11.2)\\n\",\n      \"Requirement already satisfied, skipping 
upgrade: pytz in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (2019.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: googleapis-common-protos<2.0dev,>=1.6.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.51.0)\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"!pip install kfp --upgrade --user\\n\",\n    \"\\n\",\n    \"import kfp\\n\",\n    \"from kfp import dsl\\n\",\n    \"from kfp.components import func_to_container_op, InputPath, OutputPath\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Functions\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 2,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"@func_to_container_op\\n\",\n    \"def get_random_int_op(minimum: int, maximum: int) -> int:\\n\",\n    \"    \\\"\\\"\\\"Generate a random number between minimum and maximum (inclusive).\\\"\\\"\\\"\\n\",\n    \"    import random\\n\",\n    \"    result = random.randint(minimum, maximum)\\n\",\n    \"    print(result)\\n\",\n    \"    return result\\n\",\n    \"\\n\",\n    \"@func_to_container_op\\n\",\n    \"def process_small_op(data: int):\\n\",\n    \"    \\\"\\\"\\\"Process small numbers.\\\"\\\"\\\"\\n\",\n    \"    print(\\\"Processing small result\\\", data)\\n\",\n    \"    return\\n\",\n    \"\\n\",\n    \"@func_to_container_op\\n\",\n    \"def process_medium_op(data: int):\\n\",\n    \"    \\\"\\\"\\\"Process medium numbers.\\\"\\\"\\\"\\n\",\n    \"    print(\\\"Processing medium result\\\", data)\\n\",\n    \"    return\\n\",\n    \"\\n\",\n    \"@func_to_container_op\\n\",\n    \"def process_large_op(data: int):\\n\",\n    \"    \\\"\\\"\\\"Process large numbers.\\\"\\\"\\\"\\n\",\n    \"    print(\\\"Processing large result\\\", data)\\n\",\n    \"    return\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Conditional pipeline\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 3,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"@dsl.pipeline(\\n\",\n    \"    name='Conditional execution pipeline',\\n\",\n    \"    description='Shows how to use dsl.Condition().'\\n\",\n    \")\\n\",\n    \"def conditional_pipeline():\\n\",\n    \"    number = get_random_int_op(0, 100).output\\n\",\n    \"    with dsl.Condition(number < 10):\\n\",\n    \"        process_small_op(number)\\n\",\n    \"    # dsl.Condition takes a single comparison; Python's 'and' would\\n\",\n    \"    # silently keep only its second operand, so nest conditions instead.\\n\",\n    \"    with dsl.Condition(number >= 10):\\n\",\n    \"        with dsl.Condition(number < 50):\\n\",\n    \"            process_medium_op(number)\\n\",\n    \"    with dsl.Condition(number >= 50):\\n\",\n    \"        process_large_op(number)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Submit the pipeline for execution:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 4,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/html\": [\n       \"Experiment link <a href=\\\"/pipeline/#/experiments/details/2abe16d1-fa2e-4f49-a3a5-acad8d36790d\\\" target=\\\"_blank\\\" >here</a>\"\n      ],\n      \"text/plain\": [\n       \"<IPython.core.display.HTML object>\"\n      ]\n     },\n     \"metadata\": {},\n     \"output_type\": \"display_data\"\n    },\n    {\n     \"data\": {\n      \"text/html\": [\n       \"Run 
link <a href=\\\"/pipeline/#/runs/details/293a92c5-50b2-4a96-bbd4-ebc85106f337\\\" target=\\\"_blank\\\" >here</a>\"\n      ],\n      \"text/plain\": [\n       \"<IPython.core.display.HTML object>\"\n      ]\n     },\n     \"metadata\": {},\n     \"output_type\": \"display_data\"\n    },\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"RunPipelineResult(run_id=293a92c5-50b2-4a96-bbd4-ebc85106f337)\"\n      ]\n     },\n     \"execution_count\": 4,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"kfp.Client().create_run_from_pipeline_func(conditional_pipeline, arguments={})\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": []\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"Python 3\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.9\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "ch04/code/ControlStructures.py",
    "content": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Simple Control structure\n#\n# Shows how to use conditional execution\n\n# In[1]:\n\nget_ipython().system('pip install kfp --upgrade --user')\n\nimport kfp\nfrom kfp import dsl\nfrom kfp.components import func_to_container_op, InputPath, OutputPath\n\n# # Functions\n\n# In[2]:\n\n\n@func_to_container_op\ndef get_random_int_op(minimum: int, maximum: int) -> int:\n    \"\"\"Generate a random number between minimum and maximum (inclusive).\"\"\"\n    import random\n    result = random.randint(minimum, maximum)\n    print(result)\n    return result\n\n\n@func_to_container_op\ndef process_small_op(data: int):\n    \"\"\"Process small numbers.\"\"\"\n    print(\"Processing small result\", data)\n    return\n\n\n@func_to_container_op\ndef process_medium_op(data: int):\n    \"\"\"Process medium numbers.\"\"\"\n    print(\"Processing medium result\", data)\n    return\n\n\n@func_to_container_op\ndef process_large_op(data: int):\n    \"\"\"Process large numbers.\"\"\"\n    print(\"Processing large result\", data)\n    return\n\n\n# # Conditional pipeline\n\n# In[3]:\n\n\n@dsl.pipeline(name='Conditional execution pipeline',\n              description='Shows how to use dsl.Condition().')\ndef conditional_pipeline():\n    number = get_random_int_op(0, 100).output\n    with dsl.Condition(number < 10):\n        process_small_op(number)\n    with dsl.Condition(number > 10 and number < 50):\n        process_medium_op(number)\n    with dsl.Condition(number > 50):\n        process_large_op(number)\n\n\n# # Submit the pipeline for execution:\n\n# In[4]:\n\nkfp.Client().create_run_from_pipeline_func(conditional_pipeline, arguments={})\n\n# In[ ]:\n"
  },
  {
    "path": "ch04/code/Lightweight Pipeline.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Setup\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Requirement already up-to-date: kfp in ./.local/lib/python3.6/site-packages (0.2.2.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: PyYAML in /usr/local/lib/python3.6/dist-packages (from kfp) (5.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: requests-toolbelt>=0.8.0 in ./.local/lib/python3.6/site-packages (from kfp) (0.9.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: click==7.0 in ./.local/lib/python3.6/site-packages (from kfp) (7.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: Deprecated in ./.local/lib/python3.6/site-packages (from kfp) (1.2.7)\\n\",\n      \"Requirement already satisfied, skipping upgrade: kubernetes<=10.0.0,>=8.0.0 in ./.local/lib/python3.6/site-packages (from kfp) (10.0.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: six>=1.10 in /usr/lib/python3/dist-packages (from kfp) (1.11.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: certifi in /usr/local/lib/python3.6/dist-packages (from kfp) (2019.11.28)\\n\",\n      \"Requirement already satisfied, skipping upgrade: google-cloud-storage>=1.13.0 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.25.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: PyJWT>=1.6.4 in ./.local/lib/python3.6/site-packages (from kfp) (1.7.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: cryptography>=2.4.2 in ./.local/lib/python3.6/site-packages (from kfp) (2.8)\\n\",\n      \"Requirement already satisfied, skipping upgrade: google-auth>=1.6.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.11.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: cloudpickle==1.1.1 in ./.local/lib/python3.6/site-packages (from kfp) (1.1.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: kfp-server-api<=0.1.40,>=0.1.18 in ./.local/lib/python3.6/site-packages (from kfp) (0.1.40)\\n\",\n      \"Requirement already satisfied, skipping upgrade: argo-models==2.2.1a in ./.local/lib/python3.6/site-packages (from kfp) (2.2.1a0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: tabulate==0.8.3 in ./.local/lib/python3.6/site-packages (from kfp) (0.8.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: python-dateutil in /usr/local/lib/python3.6/dist-packages (from kfp) (2.8.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: urllib3<1.25,>=1.15 in ./.local/lib/python3.6/site-packages (from kfp) (1.24.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: jsonschema>=3.0.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (3.2.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: requests<3.0.0,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from requests-toolbelt>=0.8.0->kfp) (2.22.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: wrapt<2,>=1.10 in /usr/local/lib/python3.6/dist-packages (from Deprecated->kfp) (1.11.2)\\n\",\n      \"Requirement already satisfied, skipping upgrade: setuptools>=21.0.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (45.1.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: 
websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (0.57.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: requests-oauthlib in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (1.3.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: google-cloud-core<2.0dev,>=1.2.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (1.3.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: google-resumable-media<0.6dev,>=0.5.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (0.5.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: cffi!=1.11.3,>=1.8 in ./.local/lib/python3.6/site-packages (from cryptography>=2.4.2->kfp) (1.14.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: rsa<4.1,>=3.1.4 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (0.2.8)\\n\",\n      \"Requirement already satisfied, skipping upgrade: attrs>=17.4.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (19.3.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pyrsistent>=0.14.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (0.15.7)\\n\",\n      \"Requirement already satisfied, skipping upgrade: importlib-metadata; python_version < \\\"3.8\\\" in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (1.4.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.0.1->requests-toolbelt>=0.8.0->kfp) (3.0.4)\\n\",\n      \"Requirement already satisfied, skipping upgrade: idna<2.9,>=2.5 in /usr/lib/python3/dist-packages (from requests<3.0.0,>=2.0.1->requests-toolbelt>=0.8.0->kfp) (2.6)\\n\",\n      \"Requirement already satisfied, skipping upgrade: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib->kubernetes<=10.0.0,>=8.0.0->kfp) (3.1.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: google-api-core<2.0.0dev,>=1.16.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.16.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pycparser in ./.local/lib/python3.6/site-packages (from cffi!=1.11.3,>=1.8->cryptography>=2.4.2->kfp) (2.19)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pyasn1>=0.1.3 in /usr/local/lib/python3.6/dist-packages (from rsa<4.1,>=3.1.4->google-auth>=1.6.1->kfp) (0.4.8)\\n\",\n      \"Requirement already satisfied, skipping upgrade: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata; python_version < \\\"3.8\\\"->jsonschema>=3.0.1->kfp) (2.1.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: googleapis-common-protos<2.0dev,>=1.6.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.51.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pytz in 
/usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (2019.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: protobuf>=3.4.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (3.11.2)\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"!pip install kfp --upgrade --user\\n\",\n    \"\\n\",\n    \"import kfp\\n\",\n    \"from kfp import compiler\\n\",\n    \"import kfp.dsl as dsl\\n\",\n    \"import kfp.notebook\\n\",\n    \"import kfp.components as comp\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Simple function that just adds two numbers:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 2,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"#Define a Python function\\n\",\n    \"def add(a: float, b: float) -> float:\\n\",\n    \"    '''Calculates sum of two arguments'''\\n\",\n    \"    return a + b\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Convert the function to a pipeline operation\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 3,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"add_op = comp.func_to_container_op(add)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"A slightly more advanced function that demonstrates how to use imports, helper functions, and multiple outputs.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 4,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"from typing import NamedTuple\\n\",\n    \"def my_divmod(dividend: float, divisor: float) -> NamedTuple('MyDivmodOutput', [('quotient', float), ('remainder', float)]):\\n\",\n    \"    '''Divides two numbers and calculates the quotient and remainder'''\\n\",\n    \"    #Imports inside a component function:\\n\",\n    \"    import numpy as np\\n\",\n    \"\\n\",\n    \"    #This function demonstrates how to use nested functions inside a component function:\\n\",\n    \"    def divmod_helper(dividend, divisor):\\n\",\n    \"        return np.divmod(dividend, divisor)\\n\",\n    \"\\n\",\n    \"    (quotient, remainder) = divmod_helper(dividend, divisor)\\n\",\n    \"\\n\",\n    \"    from collections import namedtuple\\n\",\n    \"    divmod_output = namedtuple('MyDivmodOutput', ['quotient', 'remainder'])\\n\",\n    \"    return divmod_output(quotient, remainder)\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Test running the Python function directly\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 5,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"MyDivmodOutput(quotient=14, remainder=2)\"\n      ]\n     },\n     \"execution_count\": 5,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"my_divmod(100, 7)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Convert the function to a pipeline operation\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 6,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"divmod_op = comp.func_to_container_op(my_divmod, 
base_image='tensorflow/tensorflow:1.14.0-py3')\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Define the pipeline.\\n\",\n    \"The pipeline function has to be decorated with the @dsl.pipeline decorator.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 7,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"@dsl.pipeline(\\n\",\n    \"   name='Calculation pipeline',\\n\",\n    \"   description='A toy pipeline that performs arithmetic calculations.'\\n\",\n    \")\\n\",\n    \"def calc_pipeline(\\n\",\n    \"   a='1',\\n\",\n    \"   b='7',\\n\",\n    \"   c='17',\\n\",\n    \"):\\n\",\n    \"    #Passing pipeline parameter and a constant value as operation arguments\\n\",\n    \"    add_task = add_op(a, 4) #Returns a dsl.ContainerOp class instance. \\n\",\n    \"    \\n\",\n    \"    #Passing a task output reference as operation arguments\\n\",\n    \"    #For an operation with a single return value, the output reference can be accessed using `task.output` or `task.outputs['output_name']` syntax\\n\",\n    \"    divmod_task = divmod_op(add_task.output, b)\\n\",\n    \"\\n\",\n    \"    #For an operation with multiple return values, the output references can be accessed using `task.outputs['output_name']` syntax\\n\",\n    \"    result_task = add_op(divmod_task.outputs['quotient'], c)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Submit the pipeline for execution\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 8,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/html\": [\n       \"Experiment link <a href=\\\"/pipeline/#/experiments/details/2abe16d1-fa2e-4f49-a3a5-acad8d36790d\\\" target=\\\"_blank\\\" >here</a>\"\n      ],\n      \"text/plain\": [\n       \"<IPython.core.display.HTML object>\"\n      ]\n     },\n     \"metadata\": {},\n     \"output_type\": \"display_data\"\n    },\n    {\n     \"data\": {\n      \"text/html\": [\n       \"Run link <a href=\\\"/pipeline/#/runs/details/87276776-0c3a-4d4e-99d0-4563b7f42fa5\\\" target=\\\"_blank\\\" >here</a>\"\n      ],\n      \"text/plain\": [\n       \"<IPython.core.display.HTML object>\"\n      ]\n     },\n     \"metadata\": {},\n     \"output_type\": \"display_data\"\n    },\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"RunPipelineResult(run_id=87276776-0c3a-4d4e-99d0-4563b7f42fa5)\"\n      ]\n     },\n     \"execution_count\": 8,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"client = kfp.Client()\\n\",\n    \"\\n\",\n    \"#Specify pipeline argument values\\n\",\n    \"arguments = {'a': '7', 'b': '8'}\\n\",\n    \"\\n\",\n    \"#Submit a pipeline run\\n\",\n    \"client.create_run_from_pipeline_func(calc_pipeline, arguments=arguments)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": []\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"Python 3\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.9\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 
2\n}\n"
  },
  {
    "path": "ch04/code/Lightweight Pipeline.py",
    "content": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Setup\n\n# In[1]:\n\nget_ipython().system('pip install kfp --upgrade --user')\n\nimport kfp\nfrom kfp import compiler\nimport kfp.dsl as dsl\nimport kfp.notebook\nimport kfp.components as comp\n\n# Simple function that just add two numbers:\n\n# In[2]:\n\n\n#Define a Python function\ndef add(a: float, b: float) -> float:\n    '''Calculates sum of two arguments'''\n    return a + b\n\n\n# Convert the function to a pipeline operation\n\n# In[3]:\n\nadd_op = comp.func_to_container_op(add)\n\n# A bit more advanced function which demonstrates how to use imports, helper functions and produce multiple outputs.\n\n# In[4]:\n\nfrom typing import NamedTuple\n\n\ndef my_divmod(\n    dividend: float, divisor: float\n) -> NamedTuple('MyDivmodOutput', [('quotient', float), ('remainder', float)]):\n    '''Divides two numbers and calculate  the quotient and remainder'''\n    #Imports inside a component function:\n    import numpy as np\n\n    #This function demonstrates how to use nested functions inside a component function:\n    def divmod_helper(dividend, divisor):\n        return np.divmod(dividend, divisor)\n\n    (quotient, remainder) = divmod_helper(dividend, divisor)\n\n    from collections import namedtuple\n    divmod_output = namedtuple('MyDivmodOutput', ['quotient', 'remainder'])\n    return divmod_output(quotient, remainder)\n\n\n# Test running the python function directly\n\n# In[5]:\n\nmy_divmod(100, 7)\n\n# Convert the function to a pipeline operation\n\n# In[6]:\n\ndivmod_op = comp.func_to_container_op(\n    my_divmod, base_image='tensorflow/tensorflow:1.14.0-py3')\n\n# Define the pipeline\n# Pipeline function has to be decorated with the @dsl.pipeline decorator\n\n# In[7]:\n\n\n@dsl.pipeline(\n    name='Calculation pipeline',\n    description='A toy pipeline that performs arithmetic calculations.')\ndef calc_pipeline(\n    a='a',\n    b='7',\n    c='17',\n):\n    #Passing pipeline parameter and a constant value as operation arguments\n    add_task = add_op(a, 4)  # Returns a dsl.ContainerOp class instance.\n\n    #Passing a task output reference as operation arguments\n    #For an operation with a single return value, the output reference can be accessed using `task.output` or `task.outputs['output_name']` syntax\n    divmod_task = divmod_op(add_task.output, b)\n\n    #For an operation with a multiple return values, the output references can be accessed using `task.outputs['output_name']` syntax\n    result_task = add_op(divmod_task.outputs['quotient'], c)\n\n\n# Submit the pipeline for execution\n\n# In[8]:\n\nclient = kfp.Client()\n\n#Specify pipeline argument values\narguments = {'a': '7', 'b': '8'}\n\n#Submit a pipeline run\nclient.create_run_from_pipeline_func(calc_pipeline, arguments=arguments)\n\n# In[ ]:\n"
  },
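  {
    "path": "ch04/code/compile_pipeline_example.py",
    "content": "#!/usr/bin/env python\n# Illustrative sketch (not part of the book's code): instead of submitting a\n# run directly with create_run_from_pipeline_func, a pipeline can be compiled\n# to an archive and uploaded through the Kubeflow Pipelines UI.\nimport kfp.dsl as dsl\nimport kfp.components as comp\nfrom kfp import compiler\n\n\ndef add(a: float, b: float) -> float:\n    '''Calculates sum of two arguments'''\n    return a + b\n\n\nadd_op = comp.func_to_container_op(add)\n\n\n@dsl.pipeline(\n    name='Tiny calculation pipeline',\n    description='Sketch: compile a pipeline instead of running it directly.')\ndef tiny_pipeline(a='1', b='7'):\n    add_op(a, b)\n\n\nif __name__ == '__main__':\n    compiler.Compiler().compile(tiny_pipeline, 'tiny_pipeline.tar.gz')\n"
  },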
  {
    "path": "ch04/code/RecommenderPipeline.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Kubeflow pipeline\\n\",\n    \"This is a fairly simple pipeline, containing sequential steps:\\n\",\n    \"\\n\",\n    \"1. Update data - implemented by lightbend/recommender-data-update-publisher:0.2 image\\n\",\n    \"2. Run model training. Ideally we would run TFJob, but due to the current limitations for pipelines, we will directly use an image implementing training lightbend/ml-tf-recommender:0.1\\n\",\n    \"3. Update serving model - implemented by lightbend/recommender-model-publisher:0.2\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Setup\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Requirement already up-to-date: kubernetes in ./.local/lib/python3.6/site-packages (10.0.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pyyaml>=3.12 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (5.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: six>=1.9.0 in /usr/lib/python3/dist-packages (from kubernetes) (1.11.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: urllib3>=1.24.2 in ./.local/lib/python3.6/site-packages (from kubernetes) (1.24.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: requests-oauthlib in /usr/local/lib/python3.6/dist-packages (from kubernetes) (1.3.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: certifi>=14.05.14 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (2019.11.28)\\n\",\n      \"Requirement already satisfied, skipping upgrade: python-dateutil>=2.5.3 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (2.8.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: setuptools>=21.0.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (45.1.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: requests in /usr/local/lib/python3.6/dist-packages (from kubernetes) (2.22.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (0.57.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: google-auth>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (1.11.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib->kubernetes) (3.1.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: idna<2.9,>=2.5 in /usr/lib/python3/dist-packages (from requests->kubernetes) (2.6)\\n\",\n      \"Requirement already satisfied, skipping upgrade: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->kubernetes) (3.0.4)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.0.1->kubernetes) (0.2.8)\\n\",\n      \"Requirement already satisfied, skipping upgrade: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.0.1->kubernetes) (4.0.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: rsa<4.1,>=3.1.4 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.0.1->kubernetes) (4.0)\\n\",\n    
  \"Requirement already satisfied, skipping upgrade: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.6/dist-packages (from pyasn1-modules>=0.2.1->google-auth>=1.0.1->kubernetes) (0.4.8)\\n\",\n      \"Requirement already up-to-date: kfp in ./.local/lib/python3.6/site-packages (0.2.2.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: PyJWT>=1.6.4 in ./.local/lib/python3.6/site-packages (from kfp) (1.7.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: requests-toolbelt>=0.8.0 in ./.local/lib/python3.6/site-packages (from kfp) (0.9.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: python-dateutil in /usr/local/lib/python3.6/dist-packages (from kfp) (2.8.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: PyYAML in /usr/local/lib/python3.6/dist-packages (from kfp) (5.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: kfp-server-api<=0.1.40,>=0.1.18 in ./.local/lib/python3.6/site-packages (from kfp) (0.1.40)\\n\",\n      \"Requirement already satisfied, skipping upgrade: google-cloud-storage>=1.13.0 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.25.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: jsonschema>=3.0.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (3.2.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: Deprecated in ./.local/lib/python3.6/site-packages (from kfp) (1.2.7)\\n\",\n      \"Requirement already satisfied, skipping upgrade: google-auth>=1.6.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.11.0)\\n\",\n      \"Collecting kubernetes<=10.0.0,>=8.0.0\\n\",\n      \"  Using cached kubernetes-10.0.0-py2.py3-none-any.whl (1.5 MB)\\n\",\n      \"Requirement already satisfied, skipping upgrade: argo-models==2.2.1a in ./.local/lib/python3.6/site-packages (from kfp) (2.2.1a0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: urllib3<1.25,>=1.15 in ./.local/lib/python3.6/site-packages (from kfp) (1.24.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: certifi in /usr/local/lib/python3.6/dist-packages (from kfp) (2019.11.28)\\n\",\n      \"Requirement already satisfied, skipping upgrade: tabulate==0.8.3 in ./.local/lib/python3.6/site-packages (from kfp) (0.8.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: click==7.0 in ./.local/lib/python3.6/site-packages (from kfp) (7.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: cloudpickle==1.1.1 in ./.local/lib/python3.6/site-packages (from kfp) (1.1.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: six>=1.10 in /usr/lib/python3/dist-packages (from kfp) (1.11.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: cryptography>=2.4.2 in ./.local/lib/python3.6/site-packages (from kfp) (2.8)\\n\",\n      \"Requirement already satisfied, skipping upgrade: requests<3.0.0,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from requests-toolbelt>=0.8.0->kfp) (2.22.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: google-resumable-media<0.6dev,>=0.5.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (0.5.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: google-cloud-core<2.0dev,>=1.2.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (1.3.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: setuptools in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (45.1.0)\\n\",\n      \"Requirement 
already satisfied, skipping upgrade: attrs>=17.4.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (19.3.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: importlib-metadata; python_version < \\\"3.8\\\" in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (1.4.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pyrsistent>=0.14.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (0.15.7)\\n\",\n      \"Requirement already satisfied, skipping upgrade: wrapt<2,>=1.10 in /usr/local/lib/python3.6/dist-packages (from Deprecated->kfp) (1.11.2)\\n\",\n      \"Requirement already satisfied, skipping upgrade: rsa<4.1,>=3.1.4 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (0.2.8)\\n\",\n      \"Requirement already satisfied, skipping upgrade: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (0.57.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: requests-oauthlib in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (1.3.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: cffi!=1.11.3,>=1.8 in ./.local/lib/python3.6/site-packages (from cryptography>=2.4.2->kfp) (1.14.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.0.1->requests-toolbelt>=0.8.0->kfp) (3.0.4)\\n\",\n      \"Requirement already satisfied, skipping upgrade: idna<2.9,>=2.5 in /usr/lib/python3/dist-packages (from requests<3.0.0,>=2.0.1->requests-toolbelt>=0.8.0->kfp) (2.6)\\n\",\n      \"Requirement already satisfied, skipping upgrade: google-api-core<2.0.0dev,>=1.16.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.16.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata; python_version < \\\"3.8\\\"->jsonschema>=3.0.1->kfp) (2.1.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pyasn1>=0.1.3 in /usr/local/lib/python3.6/dist-packages (from rsa<4.1,>=3.1.4->google-auth>=1.6.1->kfp) (0.4.8)\\n\",\n      \"Requirement already satisfied, skipping upgrade: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib->kubernetes<=10.0.0,>=8.0.0->kfp) (3.1.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pycparser in ./.local/lib/python3.6/site-packages (from cffi!=1.11.3,>=1.8->cryptography>=2.4.2->kfp) (2.19)\\n\",\n      \"Requirement already satisfied, skipping upgrade: googleapis-common-protos<2.0dev,>=1.6.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.51.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: protobuf>=3.4.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (3.11.2)\\n\",\n      \"Requirement 
already satisfied, skipping upgrade: pytz in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (2019.3)\\n\",\n      \"Installing collected packages: kubernetes\\n\",\n      \"  Attempting uninstall: kubernetes\\n\",\n      \"    Found existing installation: kubernetes 10.0.1\\n\",\n      \"    Uninstalling kubernetes-10.0.1:\\n\",\n      \"      Successfully uninstalled kubernetes-10.0.1\\n\",\n      \"Successfully installed kubernetes-10.0.0\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"!pip install kubernetes --upgrade --user\\n\",\n    \"!pip install kfp --upgrade --user\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"import kfp  # the Pipelines SDK.  This library is included with the notebook image.\\n\",\n    \"from kfp import compiler\\n\",\n    \"import kfp.dsl as dsl\\n\",\n    \"import kfp.notebook\\n\",\n    \"from kubernetes import client as k8s_client\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Create/Get an Experiment in the Kubeflow Pipeline System\\n\",\n    \"The Kubeflow Pipeline system requires an \\\"Experiment\\\" to group pipeline runs. You can create a new experiment, or call client.list_experiments() to get existing ones.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 3,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"client = kfp.Client()\\n\",\n    \"client.list_experiments()\\n\",\n    \"#exp = client.create_experiment(name='mdupdate')\\n\",\n    \"exp = client.get_experiment(experiment_name ='mdupdate')\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Define a Pipeline\\n\",\n    \"Authoring a pipeline is like authoring a normal Python function. The pipeline function describes the topology of the pipeline.\\n\",\n    \"\\n\",\n    \"Each step in the pipeline is typically a ContainerOp --- a simple class or function describing how to interact with a docker container image. 
All the container images referenced in the pipeline are already built.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 4,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"@dsl.pipeline(\\n\",\n    \"  name='Recommender model update',\\n\",\n    \"  description='Demonstrate usage of pipelines for multi-step model update'\\n\",\n    \")\\n\",\n    \"def recommender_pipeline():\\n\",\n    \"  # Load new data\\n\",\n    \"  data = dsl.ContainerOp(\\n\",\n    \"      name='updatedata',\\n\",\n    \"      image='lightbend/recommender-data-update-publisher:0.2') \\\\\\n\",\n    \"    .add_env_variable(k8s_client.V1EnvVar(name='MINIO_URL',value='http://minio-service.kubeflow.svc.cluster.local:9000')) \\\\\\n\",\n    \"    .add_env_variable(k8s_client.V1EnvVar(name='MINIO_KEY', value='minio')) \\\\\\n\",\n    \"    .add_env_variable(k8s_client.V1EnvVar(name='MINIO_SECRET', value='minio123'))\\n\",\n    \"  # Train the model\\n\",\n    \"  train = dsl.ContainerOp(\\n\",\n    \"      name='trainmodel',\\n\",\n    \"      image='lightbend/ml-tf-recommender:0.1') \\\\\\n\",\n    \"    .add_env_variable(k8s_client.V1EnvVar(name='MINIO_URL',value='minio-service.kubeflow.svc.cluster.local:9000')) \\\\\\n\",\n    \"    .add_env_variable(k8s_client.V1EnvVar(name='MINIO_KEY', value='minio')) \\\\\\n\",\n    \"    .add_env_variable(k8s_client.V1EnvVar(name='MINIO_SECRET', value='minio123'))\\n\",\n    \"  train.after(data)\\n\",\n    \"  # Publish new model\\n\",\n    \"  publish = dsl.ContainerOp(\\n\",\n    \"      name='publishmodel',\\n\",\n    \"      image='lightbend/recommender-model-publisher:0.2') \\\\\\n\",\n    \"    .add_env_variable(k8s_client.V1EnvVar(name='MINIO_URL',value='http://minio-service.kubeflow.svc.cluster.local:9000')) \\\\\\n\",\n    \"    .add_env_variable(k8s_client.V1EnvVar(name='MINIO_KEY', value='minio')) \\\\\\n\",\n    \"    .add_env_variable(k8s_client.V1EnvVar(name='MINIO_SECRET', value='minio123')) \\\\\\n\",\n    \"    .add_env_variable(k8s_client.V1EnvVar(name='KAFKA_BROKERS', value='cloudflow-kafka-brokers.cloudflow.svc.cluster.local:9092')) \\\\\\n\",\n    \"    .add_env_variable(k8s_client.V1EnvVar(name='DEFAULT_RECOMMENDER_URL', value='http://recommendermodelserver.kubeflow.svc.cluster.local:8501')) \\\\\\n\",\n    \"    .add_env_variable(k8s_client.V1EnvVar(name='ALTERNATIVE_RECOMMENDER_URL', value='http://recommendermodelserver1.kubeflow.svc.cluster.local:8501'))\\n\",\n    \"  publish.after(train)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Compile pipeline\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 5,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"compiler.Compiler().compile(recommender_pipeline, 'pipeline.tar.gz')\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Submit an experiment run\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 6,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/html\": [\n       \"Run link <a href=\\\"/pipeline/#/runs/details/df24284c-c7a1-480e-91b6-398bd352f164\\\" target=\\\"_blank\\\" >here</a>\"\n      ],\n      \"text/plain\": [\n       \"<IPython.core.display.HTML object>\"\n      ]\n     },\n     \"metadata\": {},\n     \"output_type\": \"display_data\"\n    }\n   ],\n   \"source\": [\n    \"run = client.run_pipeline(exp.id, 'pipeline1', 
'pipeline.tar.gz')\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": []\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"Python 3\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.9\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "ch04/code/RecommenderPipeline.py",
    "content": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Kubeflow pipeline\n# This is a fairly simple pipeline, containing sequential steps:\n#\n# 1. Update data - implemented by lightbend/recommender-data-update-publisher:0.2 image\n# 2. Run model training. Ideally we would run TFJob, but due to the current limitations for pipelines, we will directly use an image implementing training lightbend/ml-tf-recommender:0.1\n# 3. Update serving model - implemented by lightbend/recommender-model-publisher:0.2\n\n# # Setup\n\n# In[1]:\n\nget_ipython().system('pip install kubernetes --upgrade --user')\nget_ipython().system('pip install kfp --upgrade --user')\n\n# the Pipelines SDK.  This library is included with the notebook image.\nimport kfp\nfrom kfp import compiler\nimport kfp.dsl as dsl\nimport kfp.notebook\nfrom kubernetes import client as k8s_client\n\n# # Create/Get an Experiment in the Kubeflow Pipeline System\n# The Kubeflow Pipeline system requires an \"Experiment\" to group pipeline runs. You can create a new experiment, or call client.list_experiments() to get existing ones.\n\n# In[3]:\n\nclient = kfp.Client()\nclient.list_experiments()\n#exp = client.create_experiment(name='mdupdate')\nexp = client.get_experiment(experiment_name='mdupdate')\n\n# # Define a Pipeline\n# Authoring a pipeline is like authoring a normal Python function. The pipeline function describes the topology of the pipeline.\n#\n# Each step in the pipeline is typically a ContainerOp --- a simple class or function describing how to interact with a docker container image. In the pipeline, all the container images referenced in the pipeline are already built.\n\n# In[4]:\n\n\n@dsl.pipeline(\n    name='Recommender model update',\n    description='Demonstrate usage of pipelines for multi-step model update')\ndef recommender_pipeline():\n    # Load new data\n    data = dsl.ContainerOp(\n        name='updatedata',\n        image='lightbend/recommender-data-update-publisher:0.2') \\\n      .add_env_variable(k8s_client.V1EnvVar(name='MINIO_URL', value='http://minio-service.kubeflow.svc.cluster.local:9000')) \\\n      .add_env_variable(k8s_client.V1EnvVar(name='MINIO_KEY', value='minio')) \\\n      .add_env_variable(k8s_client.V1EnvVar(name='MINIO_SECRET', value='minio123'))\n    # Train the model\n    train = dsl.ContainerOp(\n        name='trainmodel',\n        image='lightbend/ml-tf-recommender:0.1') \\\n      .add_env_variable(k8s_client.V1EnvVar(name='MINIO_URL', value='minio-service.kubeflow.svc.cluster.local:9000')) \\\n      .add_env_variable(k8s_client.V1EnvVar(name='MINIO_KEY', value='minio')) \\\n      .add_env_variable(k8s_client.V1EnvVar(name='MINIO_SECRET', value='minio123'))\n    train.after(data)\n    # Publish new model model\n    publish = dsl.ContainerOp(\n        name='publishmodel',\n        image='lightbend/recommender-model-publisher:0.2') \\\n      .add_env_variable(k8s_client.V1EnvVar(name='MINIO_URL', value='http://minio-service.kubeflow.svc.cluster.local:9000')) \\\n      .add_env_variable(k8s_client.V1EnvVar(name='MINIO_KEY', value='minio')) \\\n      .add_env_variable(k8s_client.V1EnvVar(name='MINIO_SECRET', value='minio123')) \\\n      .add_env_variable(k8s_client.V1EnvVar(name='KAFKA_BROKERS', value='cloudflow-kafka-brokers.cloudflow.svc.cluster.local:9092')) \\\n      .add_env_variable(k8s_client.V1EnvVar(name='DEFAULT_RECOMMENDER_URL', value='http://recommendermodelserver.kubeflow.svc.cluster.local:8501')) \\\n      .add_env_variable(k8s_client.V1EnvVar(name='ALTERNATIVE_RECOMMENDER_URL', 
value='http://recommendermodelserver1.kubeflow.svc.cluster.local:8501'))\n    publish.after(train)\n\n\n# # Compile pipeline\n\n# In[5]:\n\ncompiler.Compiler().compile(recommender_pipeline, 'pipeline.tar.gz')\n\n# # Submit an experiment run\n\n# In[6]:\n\nrun = client.run_pipeline(exp.id, 'pipeline1', 'pipeline.tar.gz')\n\n# In[ ]:\n"
  },
  {
    "path": "ch04/code/download_components.sh",
    "content": "#!/bin/bash\n#tag::dlPipelineRelease[]\nwget https://github.com/kubeflow/pipelines/archive/0.2.5.tar.gz\ntar -xvf 0.2.5.tar.gz\n#end::dlPipelineRelease[]\n"
  },
  {
    "path": "ch04/install/deployment.yaml",
    "content": "apiVersion: extensions/v1beta1\nkind: Deployment\nmetadata:\n  labels:\n    app: argo-ui\n    app.kubernetes.io/component: argo\n    app.kubernetes.io/instance: argo-v2.3.0\n    app.kubernetes.io/managed-by: kfctl\n    app.kubernetes.io/name: argo\n    app.kubernetes.io/part-of: kubeflow\n    app.kubernetes.io/version: v2.3.0\n    kustomize.component: argo\n  name: argo-ui\n  namespace: kubeflow\nspec:\n  progressDeadlineSeconds: 600\n  replicas: 1\n  revisionHistoryLimit: 10\n  selector:\n    matchLabels:\n      app: argo-ui\n      app.kubernetes.io/component: argo\n      app.kubernetes.io/instance: argo-v2.3.0\n      app.kubernetes.io/managed-by: kfctl\n      app.kubernetes.io/name: argo\n      app.kubernetes.io/part-of: kubeflow\n      app.kubernetes.io/version: v2.3.0\n      kustomize.component: argo\n  strategy:\n    rollingUpdate:\n      maxSurge: 25%\n      maxUnavailable: 25%\n    type: RollingUpdate\n  template:\n    metadata:\n      annotations:\n        sidecar.istio.io/inject: \"false\"\n      creationTimestamp: null\n      labels:\n        app: argo-ui\n        app.kubernetes.io/component: argo\n        app.kubernetes.io/instance: argo-v2.3.0\n        app.kubernetes.io/managed-by: kfctl\n        app.kubernetes.io/name: argo\n        app.kubernetes.io/part-of: kubeflow\n        app.kubernetes.io/version: v2.3.0\n        kustomize.component: argo\n    spec:\n      containers:\n        - env:\n            - name: ARGO_NAMESPACE\n              valueFrom:\n                fieldRef:\n                  apiVersion: v1\n                  fieldPath: metadata.namespace\n            - name: IN_CLUSTER\n              value: \"true\"\n            - name: ENABLE_WEB_CONSOLE\n              value: \"true\"\n            - name: BASE_HREF\n              value: /\n          image: argoproj/argoui:v2.3.0\n          imagePullPolicy: IfNotPresent\n          name: argo-ui\n          ports:\n            - containerPort: 8001\n              name: ui\n              protocol: TCP\n          readinessProbe:\n            failureThreshold: 3\n            httpGet:\n              path: /\n              port: 8001\n              scheme: HTTP\n            periodSeconds: 10\n            successThreshold: 1\n            timeoutSeconds: 1\n          resources: {}\n          terminationMessagePath: /dev/termination-log\n          terminationMessagePolicy: File\n      dnsPolicy: ClusterFirst\n      restartPolicy: Always\n      schedulerName: default-scheduler\n      securityContext: {}\n      serviceAccount: argo-ui\n      serviceAccountName: argo-ui\n      terminationGracePeriodSeconds: 30"
  },
  {
    "path": "ch04/install/virtualservice.yaml",
    "content": "apiVersion: networking.istio.io/v1alpha3\nkind: VirtualService\nmetadata:\n  name: argo-ui\n  namespace: kubeflow\nspec:\n  gateways:\n    - kubeflow-gateway\n  hosts:\n    - '*'\n  http:\n    - match:\n        - uri:\n            prefix: /argo/\n      rewrite:\n        uri: /\n      route:\n        - destination:\n            host: argo-ui.kubeflow.svc.cluster.local\n            port:\n              number: 80"
  },
  {
    "path": "ch06/MLflow.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# mlflow-energyforecast\\n\",\n    \"\\n\",\n    \"This is a showcase for ML Flow capabilities, based on the article\\n\",\n    \"http://the-odd-dataguy.com/be-more-efficient-to-produce-ml-models-with-mlflow\\n\",\n    \"and a github https://github.com/jeanmidevacc/mlflow-energyforecast\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 2,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Collecting pandas\\n\",\n      \"\\u001b[?25l  Downloading https://files.pythonhosted.org/packages/12/d1/a6502c2f5c15b50f5dd579fc1c52b47edf6f2e9f682aed917dd7565b3e60/pandas-1.0.0-cp36-cp36m-manylinux1_x86_64.whl (10.1MB)\\n\",\n      \"\\u001b[K     |████████████████████████████████| 10.1MB 3.2MB/s eta 0:00:01\\n\",\n      \"\\u001b[?25hRequirement already satisfied, skipping upgrade: numpy>=1.13.3 in ./.local/lib/python3.6/site-packages (from pandas) (1.18.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: python-dateutil>=2.6.1 in /usr/local/lib/python3.6/dist-packages (from pandas) (2.8.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas) (2019.2)\\n\",\n      \"Requirement already satisfied, skipping upgrade: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.6.1->pandas) (1.11.0)\\n\",\n      \"Installing collected packages: pandas\\n\",\n      \"  Found existing installation: pandas 0.25.3\\n\",\n      \"    Uninstalling pandas-0.25.3:\\n\",\n      \"      Successfully uninstalled pandas-0.25.3\\n\",\n      \"Successfully installed pandas-1.0.0\\n\",\n      \"\\u001b[33mWARNING: You are using pip version 19.1.1, however version 20.0.2 is available.\\n\",\n      \"You should consider upgrading via the 'pip install --upgrade pip' command.\\u001b[0m\\n\",\n      \"Collecting mlflow\\n\",\n      \"\\u001b[?25l  Downloading https://files.pythonhosted.org/packages/65/33/5fe1559f7eb95e1fa2077df747ada7fd225045bad4e76bcdb53605e4b937/mlflow-1.6.0.tar.gz (15.9MB)\\n\",\n      \"\\u001b[K     |████████████████████████████████| 15.9MB 3.0MB/s eta 0:00:01\\n\",\n      \"\\u001b[?25hRequirement already satisfied, skipping upgrade: alembic in ./.local/lib/python3.6/site-packages (from mlflow) (1.3.2)\\n\",\n      \"Requirement already satisfied, skipping upgrade: click>=7.0 in /usr/local/lib/python3.6/dist-packages (from mlflow) (7.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: cloudpickle in ./.local/lib/python3.6/site-packages (from mlflow) (1.1.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: databricks-cli>=0.8.7 in ./.local/lib/python3.6/site-packages (from mlflow) (0.9.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: requests>=2.17.3 in /usr/local/lib/python3.6/dist-packages (from mlflow) (2.22.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: six>=1.10.0 in /usr/lib/python3/dist-packages (from mlflow) (1.11.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: Flask in ./.local/lib/python3.6/site-packages (from mlflow) (1.1.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: numpy in ./.local/lib/python3.6/site-packages (from mlflow) (1.18.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pandas in ./.local/lib/python3.6/site-packages (from mlflow) 
(1.0.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: python-dateutil in /usr/local/lib/python3.6/dist-packages (from mlflow) (2.8.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: protobuf>=3.6.0 in /usr/local/lib/python3.6/dist-packages (from mlflow) (3.8.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: gitpython>=2.1.0 in ./.local/lib/python3.6/site-packages (from mlflow) (3.0.5)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pyyaml in /usr/local/lib/python3.6/dist-packages (from mlflow) (5.1.2)\\n\",\n      \"Requirement already satisfied, skipping upgrade: querystring_parser in ./.local/lib/python3.6/site-packages (from mlflow) (1.2.4)\\n\",\n      \"Requirement already satisfied, skipping upgrade: simplejson in ./.local/lib/python3.6/site-packages (from mlflow) (3.17.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: docker>=4.0.0 in /usr/local/lib/python3.6/dist-packages (from mlflow) (4.0.2)\\n\",\n      \"Requirement already satisfied, skipping upgrade: entrypoints in /usr/local/lib/python3.6/dist-packages (from mlflow) (0.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: sqlparse in ./.local/lib/python3.6/site-packages (from mlflow) (0.3.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: sqlalchemy in ./.local/lib/python3.6/site-packages (from mlflow) (1.3.12)\\n\",\n      \"Requirement already satisfied, skipping upgrade: gorilla in ./.local/lib/python3.6/site-packages (from mlflow) (0.3.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: prometheus-flask-exporter in ./.local/lib/python3.6/site-packages (from mlflow) (0.12.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: gunicorn in ./.local/lib/python3.6/site-packages (from mlflow) (20.0.4)\\n\",\n      \"Requirement already satisfied, skipping upgrade: Mako in ./.local/lib/python3.6/site-packages (from alembic->mlflow) (1.1.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: python-editor>=0.3 in ./.local/lib/python3.6/site-packages (from alembic->mlflow) (1.0.4)\\n\",\n      \"Requirement already satisfied, skipping upgrade: configparser>=0.3.5 in ./.local/lib/python3.6/site-packages (from databricks-cli>=0.8.7->mlflow) (4.0.2)\\n\",\n      \"Requirement already satisfied, skipping upgrade: tabulate>=0.7.7 in /usr/local/lib/python3.6/dist-packages (from databricks-cli>=0.8.7->mlflow) (0.8.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests>=2.17.3->mlflow) (3.0.4)\\n\",\n      \"Requirement already satisfied, skipping upgrade: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests>=2.17.3->mlflow) (2019.9.11)\\n\",\n      \"Requirement already satisfied, skipping upgrade: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests>=2.17.3->mlflow) (1.24.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: idna<2.9,>=2.5 in /usr/lib/python3/dist-packages (from requests>=2.17.3->mlflow) (2.6)\\n\",\n      \"Requirement already satisfied, skipping upgrade: Jinja2>=2.10.1 in /usr/local/lib/python3.6/dist-packages (from Flask->mlflow) (2.10.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: itsdangerous>=0.24 in ./.local/lib/python3.6/site-packages (from Flask->mlflow) (1.1.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: Werkzeug>=0.15 in 
/usr/local/lib/python3.6/dist-packages (from Flask->mlflow) (0.15.4)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas->mlflow) (2019.2)\\n\",\n      \"Requirement already satisfied, skipping upgrade: setuptools in /usr/local/lib/python3.6/dist-packages (from protobuf>=3.6.0->mlflow) (41.0.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: gitdb2>=2.0.0 in ./.local/lib/python3.6/site-packages (from gitpython>=2.1.0->mlflow) (2.0.6)\\n\",\n      \"Requirement already satisfied, skipping upgrade: websocket-client>=0.32.0 in /usr/local/lib/python3.6/dist-packages (from docker>=4.0.0->mlflow) (0.56.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: prometheus-client in /usr/local/lib/python3.6/dist-packages (from prometheus-flask-exporter->mlflow) (0.7.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: MarkupSafe>=0.9.2 in /usr/local/lib/python3.6/dist-packages (from Mako->alembic->mlflow) (1.1.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: smmap2>=2.0.0 in ./.local/lib/python3.6/site-packages (from gitdb2>=2.0.0->gitpython>=2.1.0->mlflow) (2.0.5)\\n\",\n      \"Building wheels for collected packages: mlflow\\n\",\n      \"  Building wheel for mlflow (setup.py) ... \\u001b[?25ldone\\n\",\n      \"\\u001b[?25h  Stored in directory: /home/jovyan/.cache/pip/wheels/46/4e/83/e58b14b6d2d494783e31690de9572c5777882f675f480374b6\\n\",\n      \"Successfully built mlflow\\n\",\n      \"Installing collected packages: mlflow\\n\",\n      \"  Found existing installation: mlflow 1.5.0\\n\",\n      \"    Uninstalling mlflow-1.5.0:\\n\",\n      \"      Successfully uninstalled mlflow-1.5.0\\n\",\n      \"\\u001b[33m  WARNING: The script mlflow is installed in '/home/jovyan/.local/bin' which is not on PATH.\\n\",\n      \"  Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\\u001b[0m\\n\",\n      \"Successfully installed mlflow-1.6.0\\n\",\n      \"\\u001b[33mWARNING: You are using pip version 19.1.1, however version 20.0.2 is available.\\n\",\n      \"You should consider upgrading via the 'pip install --upgrade pip' command.\\u001b[0m\\n\",\n      \"Requirement already up-to-date: joblib in ./.local/lib/python3.6/site-packages (0.14.1)\\n\",\n      \"\\u001b[33mWARNING: You are using pip version 19.1.1, however version 20.0.2 is available.\\n\",\n      \"You should consider upgrading via the 'pip install --upgrade pip' command.\\u001b[0m\\n\",\n      \"Requirement already up-to-date: numpy in ./.local/lib/python3.6/site-packages (1.18.1)\\n\",\n      \"\\u001b[33mWARNING: You are using pip version 19.1.1, however version 20.0.2 is available.\\n\",\n      \"You should consider upgrading via the 'pip install --upgrade pip' command.\\u001b[0m\\n\",\n      \"Requirement already up-to-date: scipy in ./.local/lib/python3.6/site-packages (1.4.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: numpy>=1.13.3 in ./.local/lib/python3.6/site-packages (from scipy) (1.18.1)\\n\",\n      \"\\u001b[33mWARNING: You are using pip version 19.1.1, however version 20.0.2 is available.\\n\",\n      \"You should consider upgrading via the 'pip install --upgrade pip' command.\\u001b[0m\\n\",\n      \"Requirement already up-to-date: scikit-learn in ./.local/lib/python3.6/site-packages (0.22.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: numpy>=1.11.0 in ./.local/lib/python3.6/site-packages 
(from scikit-learn) (1.18.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: scipy>=0.17.0 in ./.local/lib/python3.6/site-packages (from scikit-learn) (1.4.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: joblib>=0.11 in ./.local/lib/python3.6/site-packages (from scikit-learn) (0.14.1)\\n\",\n      \"\\u001b[33mWARNING: You are using pip version 19.1.1, however version 20.0.2 is available.\\n\",\n      \"You should consider upgrading via the 'pip install --upgrade pip' command.\\u001b[0m\\n\",\n      \"Collecting boto3\\n\",\n      \"\\u001b[?25l  Downloading https://files.pythonhosted.org/packages/d5/57/e9675a5a8d0ee586594ff19cb9a601334fbf24fa2fb29052d2a900ee5d23/boto3-1.11.9-py2.py3-none-any.whl (128kB)\\n\",\n      \"\\u001b[K     |████████████████████████████████| 133kB 3.5MB/s eta 0:00:01\\n\",\n      \"\\u001b[?25hCollecting botocore<1.15.0,>=1.14.9 (from boto3)\\n\",\n      \"\\u001b[?25l  Downloading https://files.pythonhosted.org/packages/64/4c/b0b0d3b6f84a05f9135051b56d3eb8708012a289c4b82ee21c8c766f47b5/botocore-1.14.9-py2.py3-none-any.whl (5.9MB)\\n\",\n      \"\\u001b[K     |████████████████████████████████| 5.9MB 11.6MB/s eta 0:00:01\\n\",\n      \"\\u001b[?25hRequirement already satisfied, skipping upgrade: jmespath<1.0.0,>=0.7.1 in ./.local/lib/python3.6/site-packages (from boto3) (0.9.4)\\n\",\n      \"Requirement already satisfied, skipping upgrade: s3transfer<0.4.0,>=0.3.0 in ./.local/lib/python3.6/site-packages (from boto3) (0.3.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: python-dateutil<3.0.0,>=2.1 in /usr/local/lib/python3.6/dist-packages (from botocore<1.15.0,>=1.14.9->boto3) (2.8.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: docutils<0.16,>=0.10 in ./.local/lib/python3.6/site-packages (from botocore<1.15.0,>=1.14.9->boto3) (0.15.2)\\n\",\n      \"Requirement already satisfied, skipping upgrade: urllib3<1.26,>=1.20 in /usr/local/lib/python3.6/dist-packages (from botocore<1.15.0,>=1.14.9->boto3) (1.24.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.15.0,>=1.14.9->boto3) (1.11.0)\\n\",\n      \"Installing collected packages: botocore, boto3\\n\",\n      \"  Found existing installation: botocore 1.14.4\\n\",\n      \"    Uninstalling botocore-1.14.4:\\n\",\n      \"      Successfully uninstalled botocore-1.14.4\\n\",\n      \"  Found existing installation: boto3 1.11.4\\n\",\n      \"    Uninstalling boto3-1.11.4:\\n\",\n      \"      Successfully uninstalled boto3-1.11.4\\n\",\n      \"Successfully installed boto3-1.11.9 botocore-1.14.9\\n\",\n      \"\\u001b[33mWARNING: You are using pip version 19.1.1, however version 20.0.2 is available.\\n\",\n      \"You should consider upgrading via the 'pip install --upgrade pip' command.\\u001b[0m\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"!pip install pandas --upgrade --user\\n\",\n    \"!pip install mlflow --upgrade --user\\n\",\n    \"!pip install joblib --upgrade --user\\n\",\n    \"!pip install numpy --upgrade --user \\n\",\n    \"!pip install scipy --upgrade --user \\n\",\n    \"!pip install scikit-learn --upgrade --user\\n\",\n    \"!pip install boto3 --upgrade --user\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 3,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import time\\n\",\n    \"import json\\n\",\n    \"import os\\n\",\n    \"from joblib import Parallel, delayed\\n\",\n    
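\"# NOTE: joblib's Parallel/delayed are imported here but the training loops below run sequentially\\n\",\n    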
\"\\n\",\n    \"import pandas as pd\\n\",\n    \"import numpy as np\\n\",\n    \"import scipy\\n\",\n    \"\\n\",\n    \"from sklearn.model_selection import train_test_split, KFold\\n\",\n    \"from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score\\n\",\n    \"from sklearn.exceptions import ConvergenceWarning\\n\",\n    \"\\n\",\n    \"import mlflow\\n\",\n    \"import mlflow.sklearn\\n\",\n    \"from  mlflow.tracking import MlflowClient\\n\",\n    \"\\n\",\n    \"from warnings import simplefilter\\n\",\n    \"simplefilter(action='ignore', category = FutureWarning)\\n\",\n    \"simplefilter(action='ignore', category = ConvergenceWarning)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 4,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Ensure Minio access\\n\",\n    \"os.environ['MLFLOW_S3_ENDPOINT_URL'] = 'http://minio-service.kubeflow.svc.cluster.local:9000'\\n\",\n    \"os.environ['AWS_ACCESS_KEY_ID'] = 'minio'\\n\",\n    \"os.environ['AWS_SECRET_ACCESS_KEY'] = 'minio123'\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Data preparation\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 5,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Collect the data \\n\",\n    \"df_nationalconsumption_electricity_daily = pd.read_csv(\\\"https://raw.githubusercontent.com/jeanmidevacc/mlflow-energyforecast/master/data/rtu_data.csv\\\")\\n\",\n    \"df_nationalconsumption_electricity_daily.set_index([\\\"day\\\"], inplace = True)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 6,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Size of the training set :  1081\\n\",\n      \"Size of the testing set :  233\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# Prepare the training set and the testing set\\n\",\n    \"df_trainvalidate_energyconsumption = df_nationalconsumption_electricity_daily[df_nationalconsumption_electricity_daily[\\\"datastatus\\\"] == \\\"Définitif\\\"]\\n\",\n    \"del df_trainvalidate_energyconsumption[\\\"datastatus\\\"]\\n\",\n    \"\\n\",\n    \"df_test_energyconsumption = df_nationalconsumption_electricity_daily[df_nationalconsumption_electricity_daily[\\\"datastatus\\\"] == \\\"Consolidé\\\"]\\n\",\n    \"del df_test_energyconsumption[\\\"datastatus\\\"]\\n\",\n    \"\\n\",\n    \"print(\\\"Size of the training set : \\\",len(df_trainvalidate_energyconsumption))\\n\",\n    \"print(\\\"Size of the testing set : \\\",len(df_test_energyconsumption))\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 7,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Output to predict :  dailyconsumption\\n\",\n      \"Inputs for the prediction :  ['weekday', 'week', 'month', 'year', 'avg_min_temperature', 'avg_max_temperature', 'avg_mean_temperature', 'wavg_min_temperature', 'wavg_max_temperature', 'wavg_mean_temperature', 'is_holiday']\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# Define the inputs and the output\\n\",\n    \"output = \\\"dailyconsumption\\\"\\n\",\n    \"allinputs = list(df_trainvalidate_energyconsumption.columns)\\n\",\n    \"allinputs.remove(output)\\n\",\n    \"\\n\",\n    \"print(\\\"Output to predict : \\\", output)\\n\",\n    \"print(\\\"Inputs for the 
prediction : \\\", allinputs)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 8,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Build different set of featurws for the model\\n\",\n    \"possible_inputs = {\\n\",\n    \"    \\\"all\\\" : allinputs,\\n\",\n    \"    \\\"only_allday_inputs\\\" : [\\\"weekday\\\", \\\"month\\\", \\\"is_holiday\\\", \\\"week\\\"],\\n\",\n    \"    \\\"only_allweatheravg_inputs\\\" : [\\\"avg_min_temperature\\\", \\\"avg_max_temperature\\\", \\\"avg_mean_temperature\\\",\\\"wavg_min_temperature\\\", \\\"wavg_max_temperature\\\", \\\"wavg_mean_temperature\\\"],\\n\",\n    \"    \\\"only_meanweather_inputs_avg\\\" : [\\\"avg_mean_temperature\\\"],\\n\",\n    \"    \\\"only_meanweather_inputs_wavg\\\" : [\\\"wavg_mean_temperature\\\"],\\n\",\n    \"}\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 9,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Prepare the output of the model\\n\",\n    \"array_output_train = np.array(df_trainvalidate_energyconsumption[output])\\n\",\n    \"array_output_test = np.array(df_test_energyconsumption[output])\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 10,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# connect to remote server\\n\",\n    \"remote_server_uri = \\\"http://mlflowserver.kubeflow.svc.cluster.local:5000\\\"\\n\",\n    \"mlflow.set_tracking_uri(remote_server_uri)\\n\",\n    \"# Launch the experiment on mlflow\\n\",\n    \"experiment_name = \\\"electricityconsumption-forecast\\\"\\n\",\n    \"mlflow.set_experiment(experiment_name)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 11,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Define the evaluation function that will do the computation of the different metrics of accuracy (RMSE,MAE,R2)\\n\",\n    \"def evaluation_model(y_test, y_pred):\\n\",\n    \"\\n\",\n    \"    rmse = np.sqrt(mean_squared_error(y_test, y_pred))\\n\",\n    \"    mae = mean_absolute_error(y_test, y_pred)\\n\",\n    \"    r2 = r2_score(y_test, y_pred)\\n\",\n    \"\\n\",\n    \"    metrics = {\\n\",\n    \"        \\\"rmse\\\" : rmse,\\n\",\n    \"        \\\"r2\\\" : r2,\\n\",\n    \"        \\\"mae\\\" : mae,\\n\",\n    \"    }\\n\",\n    \"    \\n\",\n    \"    return metrics\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# KNN regressor\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 12,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"from sklearn.neighbors import KNeighborsRegressor\\n\",\n    \"\\n\",\n    \"def train_knnmodel(parameters, inputs, tags, log = False):\\n\",\n    \"    with mlflow.start_run(nested = True):\\n\",\n    \"        \\n\",\n    \"        # Prepare the data\\n\",\n    \"        array_inputs_train = np.array(df_trainvalidate_energyconsumption[inputs])\\n\",\n    \"        array_inputs_test = np.array(df_test_energyconsumption[inputs])\\n\",\n    \"        \\n\",\n    \"        \\n\",\n    \"        # Build the model\\n\",\n    \"        tic = time.time()\\n\",\n    \"        model = KNeighborsRegressor(parameters[\\\"nbr_neighbors\\\"], weights = parameters[\\\"weight_method\\\"])\\n\",\n    \"        model.fit(array_inputs_train, array_output_train)\\n\",\n    \"        duration_training = time.time() - tic\\n\",\n    \"\\n\",\n    \"        # Make the prediction\\n\",\n    \"        tic1 
= time.time()\\n\",\n    \"        prediction = model.predict(array_inputs_test)\\n\",\n    \"        duration_prediction = time.time() - tic1\\n\",\n    \"\\n\",\n    \"        # Evaluate the model prediction\\n\",\n    \"        metrics = evaluation_model(array_output_test, prediction)\\n\",\n    \"\\n\",\n    \"        # Log in the console\\n\",\n    \"        if log:\\n\",\n    \"            print(f\\\"KNN regressor:\\\")\\n\",\n    \"            print(parameters)\\n\",\n    \"            print(metrics)\\n\",\n    \"\\n\",\n    \"        # Log in mlflow (parameter)\\n\",\n    \"        mlflow.log_params(parameters)\\n\",\n    \"\\n\",\n    \"        # Log in mlflow (metrics)\\n\",\n    \"        metrics[\\\"duration_training\\\"] = duration_training\\n\",\n    \"        metrics[\\\"duration_prediction\\\"] = duration_prediction\\n\",\n    \"        mlflow.log_metrics(metrics)\\n\",\n    \"\\n\",\n    \"        # log in mlflow (model)\\n\",\n    \"        mlflow.sklearn.log_model(model, f\\\"model\\\")\\n\",\n    \"                \\n\",\n    \"        # Tag the model\\n\",\n    \"        mlflow.set_tags(tags)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 13,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Test the different combinations\\n\",\n    \"configurations = []\\n\",\n    \"for nbr_neighbors in [1,2,5,10]:\\n\",\n    \"    for weight_method in ['uniform','distance']:\\n\",\n    \"        for field in possible_inputs:\\n\",\n    \"            parameters = {\\n\",\n    \"                \\\"nbr_neighbors\\\" : nbr_neighbors,\\n\",\n    \"                \\\"weight_method\\\" : weight_method\\n\",\n    \"            }\\n\",\n    \"\\n\",\n    \"            tags = {\\n\",\n    \"                \\\"model\\\" : \\\"knn\\\",\\n\",\n    \"                \\\"inputs\\\" : field\\n\",\n    \"            }\\n\",\n    \"            \\n\",\n    \"            configurations.append([parameters, tags])\\n\",\n    \"\\n\",\n    \"            train_knnmodel(parameters, possible_inputs[field], tags)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# MLP regressor\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 14,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"from sklearn.neural_network import MLPRegressor\\n\",\n    \"\\n\",\n    \"def train_mlpmodel(parameters, inputs, tags, log = False):\\n\",\n    \"    with mlflow.start_run(nested = True):\\n\",\n    \"        \\n\",\n    \"        # Prepare the data\\n\",\n    \"        array_inputs_train = np.array(df_trainvalidate_energyconsumption[inputs])\\n\",\n    \"        array_inputs_test = np.array(df_test_energyconsumption[inputs])\\n\",\n    \"        \\n\",\n    \"        # Build the model\\n\",\n    \"        tic = time.time()\\n\",\n    \"\\n\",\n    \"        model = MLPRegressor(\\n\",\n    \"            hidden_layer_sizes = parameters[\\\"hidden_layers\\\"],\\n\",\n    \"            activation = parameters[\\\"activation\\\"],\\n\",\n    \"            solver = parameters[\\\"solver\\\"],\\n\",\n    \"            max_iter = parameters[\\\"nbr_iteration\\\"],\\n\",\n    \"            random_state = 0)\\n\",\n    \"        \\n\",\n    \"        model.fit(array_inputs_train, array_output_train)\\n\",\n    \"        duration_training = time.time() - tic\\n\",\n    \"\\n\",\n    \"        # Make the prediction\\n\",\n    \"        tic1 = time.time()\\n\",\n    \"        prediction = 
model.predict(array_inputs_test)\\n\",\n    \"        duration_prediction = time.time() - tic1\\n\",\n    \"\\n\",\n    \"        # Evaluate the model prediction\\n\",\n    \"        metrics = evaluation_model(array_output_test, prediction)\\n\",\n    \"\\n\",\n    \"        # Log in the console\\n\",\n    \"        if log:\\n\",\n    \"            print(f\\\"MLP regressor:\\\")\\n\",\n    \"            print(parameters)\\n\",\n    \"            print(metrics)\\n\",\n    \"    \\n\",\n    \"        # Log in mlflow (parameter)\\n\",\n    \"        mlflow.log_params(parameters)\\n\",\n    \"\\n\",\n    \"        # Log in mlflow (metrics)\\n\",\n    \"        metrics[\\\"duration_training\\\"] = duration_training\\n\",\n    \"        metrics[\\\"duration_prediction\\\"] = duration_prediction\\n\",\n    \"        mlflow.log_metrics(metrics)\\n\",\n    \"\\n\",\n    \"        # log in mlflow (model)\\n\",\n    \"        mlflow.sklearn.log_model(model, f\\\"model\\\")\\n\",\n    \"        \\n\",\n    \"        # Tag the model\\n\",\n    \"        mlflow.set_tags(tags)        \"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 15,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"for hiddenlayers in [4,8,16]:\\n\",\n    \"    for activation in [\\\"identity\\\",\\\"logistic\\\",]:\\n\",\n    \"        for solver in [\\\"lbfgs\\\"]:\\n\",\n    \"            for nbriteration in [10,100,1000]:\\n\",\n    \"                for field in possible_inputs:\\n\",\n    \"                    parameters = {\\n\",\n    \"                        \\\"hidden_layers\\\" : hiddenlayers,\\n\",\n    \"                        \\\"activation\\\" : activation,\\n\",\n    \"                        \\\"solver\\\" : solver,\\n\",\n    \"                        \\\"nbr_iteration\\\" : nbriteration\\n\",\n    \"                    }\\n\",\n    \"\\n\",\n    \"                    tags = {\\n\",\n    \"                        \\\"model\\\" : \\\"mlp\\\",\\n\",\n    \"                        \\\"inputs\\\" : field\\n\",\n    \"                    }\\n\",\n    \"\\n\",\n    \"                    train_mlpmodel(parameters, possible_inputs[field], tags)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Use a handmade model (scipy approach)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 16,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"class PTG:\\n\",\n    \"    def __init__(self, thresholds_x0, thresholds_a, thresholds_b):\\n\",\n    \"        self.thresholds_x0 = thresholds_x0\\n\",\n    \"        self.thresholds_a = thresholds_a\\n\",\n    \"        self.thresholds_b = thresholds_b\\n\",\n    \"        \\n\",\n    \"    def get_ptgmodel(self, x, a, b, x0):\\n\",\n    \"        return np.piecewise(x, [x < x0, x >= x0], [lambda x: a*x + b , lambda x : a*x0 + b])\\n\",\n    \"        \\n\",\n    \"    def fit(self, dfx, y):\\n\",\n    \"        x = np.array(dfx)\\n\",\n    \"        \\n\",\n    \"        # Define the bounds\\n\",\n    \"        bounds_min = [self.thresholds_a[0], self.thresholds_b[0], self.thresholds_x0[0]]\\n\",\n    \"        bounds_max = [self.thresholds_a[1], self.thresholds_b[1], self.thresholds_x0[1]]\\n\",\n    \"        bounds = (bounds_min, bounds_max)\\n\",\n    \"\\n\",\n    \"        # Fit a model\\n\",\n    \"        popt, pcov = scipy.optimize.curve_fit(self.get_ptgmodel, x, y, bounds = bounds)\\n\",\n    \"\\n\",\n    \"        # Get the parameter of the model\\n\",\n    \"        
a = popt[0]\\n\",\n    \"        b = popt[1]\\n\",\n    \"        x0 = popt[2]\\n\",\n    \"        \\n\",\n    \"        self.coefficients = [a, b, x0]\\n\",\n    \"        \\n\",\n    \"    def predict(self,dfx):\\n\",\n    \"        x = np.array(dfx)\\n\",\n    \"        predictions = []\\n\",\n    \"        for elt in x:\\n\",\n    \"            forecast = self.get_ptgmodel(elt, self.coefficients[0], self.coefficients[1], self.coefficients[2])\\n\",\n    \"            predictions.append(forecast)\\n\",\n    \"        return np.array(predictions)\\n\",\n    \"        \\n\",\n    \"def train_ptgmodel(parameters, inputs, tags, log = False):\\n\",\n    \"    with mlflow.start_run(nested = True):\\n\",\n    \"        \\n\",\n    \"        # Prepare the data\\n\",\n    \"        df_inputs_train = df_trainvalidate_energyconsumption[inputs[0]]\\n\",\n    \"        df_inputs_test = df_test_energyconsumption[inputs[0]]\\n\",\n    \"        \\n\",\n    \"        \\n\",\n    \"        # Build the model\\n\",\n    \"        tic = time.time()\\n\",\n    \"        \\n\",\n    \"        model = PTG(parameters[\\\"thresholds_x0\\\"], parameters[\\\"thresholds_a\\\"], parameters[\\\"thresholds_b\\\"])\\n\",\n    \"        \\n\",\n    \"        model.fit(df_inputs_train, array_output_train)\\n\",\n    \"        duration_training = time.time() - tic\\n\",\n    \"\\n\",\n    \"        # Make the prediction\\n\",\n    \"        tic1 = time.time()\\n\",\n    \"        prediction = model.predict(df_inputs_test)\\n\",\n    \"        duration_prediction = time.time() - tic1\\n\",\n    \"\\n\",\n    \"        # Evaluate the model prediction\\n\",\n    \"        metrics = evaluation_model(array_output_test, prediction)\\n\",\n    \"\\n\",\n    \"        # Log in the console\\n\",\n    \"        if log:\\n\",\n    \"            print(f\\\"PTG:\\\")\\n\",\n    \"            print(parameters)\\n\",\n    \"            print(metrics)\\n\",\n    \"    \\n\",\n    \"        # Log in mlflow (parameter)\\n\",\n    \"        mlflow.log_params(parameters)  \\n\",\n    \"\\n\",\n    \"        # Log in mlflow (metrics)\\n\",\n    \"        metrics[\\\"duration_training\\\"] = duration_training\\n\",\n    \"        metrics[\\\"duration_prediction\\\"] = duration_prediction\\n\",\n    \"        mlflow.log_metrics(metrics)\\n\",\n    \"\\n\",\n    \"        # log in mlflow (model)\\n\",\n    \"        mlflow.sklearn.log_model(model, f\\\"model\\\")\\n\",\n    \"        \\n\",\n    \"        # Tag the model\\n\",\n    \"        mlflow.set_tags(tags)           \"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 17,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Define the parameters of the model\\n\",\n    \"thresholds_x0 = [0, 20]\\n\",\n    \"thresholds_a = [-200000, -50000]\\n\",\n    \"thresholds_b = [1000000, 3000000]\\n\",\n    \"\\n\",\n    \"parameters = {\\n\",\n    \"    \\\"thresholds_x0\\\" : thresholds_x0,\\n\",\n    \"    \\\"thresholds_a\\\" : thresholds_a,\\n\",\n    \"    \\\"thresholds_b\\\" : thresholds_b\\n\",\n    \"}\\n\",\n    \"\\n\",\n    \"for field in [\\\"only_meanweather_inputs_avg\\\", \\\"only_meanweather_inputs_wavg\\\"]:\\n\",\n    \"    \\n\",\n    \"    tags = {\\n\",\n    \"        \\\"model\\\" : \\\"ptg\\\",\\n\",\n    \"        \\\"inputs\\\" : field\\n\",\n    \"    }\\n\",\n    \"    \\n\",\n    \"    train_ptgmodel(parameters, possible_inputs[field], tags, log = False)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   
\"source\": [\n    \"# Evaluate mlflow results\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 18,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Number of runs done :  272\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# Select the run of the experiment\\n\",\n    \"df_runs = mlflow.search_runs(experiment_ids=\\\"0\\\")\\n\",\n    \"print(\\\"Number of runs done : \\\", len(df_runs))\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 19,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/html\": [\n       \"<div>\\n\",\n       \"<style scoped>\\n\",\n       \"    .dataframe tbody tr th:only-of-type {\\n\",\n       \"        vertical-align: middle;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe tbody tr th {\\n\",\n       \"        vertical-align: top;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe thead th {\\n\",\n       \"        text-align: right;\\n\",\n       \"    }\\n\",\n       \"</style>\\n\",\n       \"<table border=\\\"1\\\" class=\\\"dataframe\\\">\\n\",\n       \"  <thead>\\n\",\n       \"    <tr style=\\\"text-align: right;\\\">\\n\",\n       \"      <th></th>\\n\",\n       \"      <th>run_id</th>\\n\",\n       \"      <th>experiment_id</th>\\n\",\n       \"      <th>status</th>\\n\",\n       \"      <th>artifact_uri</th>\\n\",\n       \"      <th>start_time</th>\\n\",\n       \"      <th>end_time</th>\\n\",\n       \"      <th>metrics.r2</th>\\n\",\n       \"      <th>metrics.mae</th>\\n\",\n       \"      <th>metrics.duration_prediction</th>\\n\",\n       \"      <th>metrics.rmse</th>\\n\",\n       \"      <th>...</th>\\n\",\n       \"      <th>params.activation</th>\\n\",\n       \"      <th>params.nbr_iteration</th>\\n\",\n       \"      <th>params.hidden_layers</th>\\n\",\n       \"      <th>params.nbr_neighbors</th>\\n\",\n       \"      <th>params.weight_method</th>\\n\",\n       \"      <th>tags.model</th>\\n\",\n       \"      <th>tags.mlflow.source.type</th>\\n\",\n       \"      <th>tags.inputs</th>\\n\",\n       \"      <th>tags.mlflow.user</th>\\n\",\n       \"      <th>tags.mlflow.source.name</th>\\n\",\n       \"    </tr>\\n\",\n       \"  </thead>\\n\",\n       \"  <tbody>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>238</th>\\n\",\n       \"      <td>50ee6409ad3a4778bb9d8cb59034df5d</td>\\n\",\n       \"      <td>0</td>\\n\",\n       \"      <td>FINISHED</td>\\n\",\n       \"      <td>s3://mlflow/mlflow/artifacts/0/50ee6409ad3a477...</td>\\n\",\n       \"      <td>2020-01-17 18:17:47.448000+00:00</td>\\n\",\n       \"      <td>2020-01-17 18:17:47.929000+00:00</td>\\n\",\n       \"      <td>0.935956</td>\\n\",\n       \"      <td>104040.339809</td>\\n\",\n       \"      <td>0.003205</td>\\n\",\n       \"      <td>134649.399348</td>\\n\",\n       \"      <td>...</td>\\n\",\n       \"      <td>None</td>\\n\",\n       \"      <td>None</td>\\n\",\n       \"      <td>None</td>\\n\",\n       \"      <td>5</td>\\n\",\n       \"      <td>distance</td>\\n\",\n       \"      <td>knn</td>\\n\",\n       \"      <td>LOCAL</td>\\n\",\n       \"      <td>all</td>\\n\",\n       \"      <td>jovyan</td>\\n\",\n       \"      <td>/usr/local/lib/python3.6/dist-packages/ipykern...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>106</th>\\n\",\n       \"      <td>614bcf7042ca465c8d86296f12ac9c09</td>\\n\",\n       \"      
<td>0</td>\\n\",\n       \"      <td>FINISHED</td>\\n\",\n       \"      <td>s3://mlflow/mlflow/artifacts/0/614bcf7042ca465...</td>\\n\",\n       \"      <td>2020-01-31 15:21:29.978000+00:00</td>\\n\",\n       \"      <td>2020-01-31 15:21:30.503000+00:00</td>\\n\",\n       \"      <td>0.935956</td>\\n\",\n       \"      <td>104040.339809</td>\\n\",\n       \"      <td>0.003404</td>\\n\",\n       \"      <td>134649.399348</td>\\n\",\n       \"      <td>...</td>\\n\",\n       \"      <td>None</td>\\n\",\n       \"      <td>None</td>\\n\",\n       \"      <td>None</td>\\n\",\n       \"      <td>5</td>\\n\",\n       \"      <td>distance</td>\\n\",\n       \"      <td>knn</td>\\n\",\n       \"      <td>LOCAL</td>\\n\",\n       \"      <td>all</td>\\n\",\n       \"      <td>jovyan</td>\\n\",\n       \"      <td>/usr/local/lib/python3.6/dist-packages/ipykern...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>96</th>\\n\",\n       \"      <td>b05667486f7d45779d23519eb0dbe24f</td>\\n\",\n       \"      <td>0</td>\\n\",\n       \"      <td>FINISHED</td>\\n\",\n       \"      <td>s3://mlflow/mlflow/artifacts/0/b05667486f7d457...</td>\\n\",\n       \"      <td>2020-01-31 15:21:35.424000+00:00</td>\\n\",\n       \"      <td>2020-01-31 15:21:35.922000+00:00</td>\\n\",\n       \"      <td>0.935111</td>\\n\",\n       \"      <td>105833.358681</td>\\n\",\n       \"      <td>0.002732</td>\\n\",\n       \"      <td>135534.759873</td>\\n\",\n       \"      <td>...</td>\\n\",\n       \"      <td>None</td>\\n\",\n       \"      <td>None</td>\\n\",\n       \"      <td>None</td>\\n\",\n       \"      <td>10</td>\\n\",\n       \"      <td>distance</td>\\n\",\n       \"      <td>knn</td>\\n\",\n       \"      <td>LOCAL</td>\\n\",\n       \"      <td>all</td>\\n\",\n       \"      <td>jovyan</td>\\n\",\n       \"      <td>/usr/local/lib/python3.6/dist-packages/ipykern...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>228</th>\\n\",\n       \"      <td>d279d728946e4b74811203a842d79df3</td>\\n\",\n       \"      <td>0</td>\\n\",\n       \"      <td>FINISHED</td>\\n\",\n       \"      <td>s3://mlflow/mlflow/artifacts/0/d279d728946e4b7...</td>\\n\",\n       \"      <td>2020-01-17 18:17:52.555000+00:00</td>\\n\",\n       \"      <td>2020-01-17 18:17:53.029000+00:00</td>\\n\",\n       \"      <td>0.935111</td>\\n\",\n       \"      <td>105833.358681</td>\\n\",\n       \"      <td>0.002863</td>\\n\",\n       \"      <td>135534.759873</td>\\n\",\n       \"      <td>...</td>\\n\",\n       \"      <td>None</td>\\n\",\n       \"      <td>None</td>\\n\",\n       \"      <td>None</td>\\n\",\n       \"      <td>10</td>\\n\",\n       \"      <td>distance</td>\\n\",\n       \"      <td>knn</td>\\n\",\n       \"      <td>LOCAL</td>\\n\",\n       \"      <td>all</td>\\n\",\n       \"      <td>jovyan</td>\\n\",\n       \"      <td>/usr/local/lib/python3.6/dist-packages/ipykern...</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>111</th>\\n\",\n       \"      <td>88af21719e0a408b91448f7ddd27e84c</td>\\n\",\n       \"      <td>0</td>\\n\",\n       \"      <td>FINISHED</td>\\n\",\n       \"      <td>s3://mlflow/mlflow/artifacts/0/88af21719e0a408...</td>\\n\",\n       \"      <td>2020-01-31 15:21:27.338000+00:00</td>\\n\",\n       \"      <td>2020-01-31 15:21:27.947000+00:00</td>\\n\",\n       \"      <td>0.934465</td>\\n\",\n       \"      <td>105793.727897</td>\\n\",\n       \"      <td>0.002668</td>\\n\",\n       \"      
<td>136207.422483</td>\\n\",\n       \"      <td>...</td>\\n\",\n       \"      <td>None</td>\\n\",\n       \"      <td>None</td>\\n\",\n       \"      <td>None</td>\\n\",\n       \"      <td>5</td>\\n\",\n       \"      <td>uniform</td>\\n\",\n       \"      <td>knn</td>\\n\",\n       \"      <td>LOCAL</td>\\n\",\n       \"      <td>all</td>\\n\",\n       \"      <td>jovyan</td>\\n\",\n       \"      <td>/usr/local/lib/python3.6/dist-packages/ipykern...</td>\\n\",\n       \"    </tr>\\n\",\n       \"  </tbody>\\n\",\n       \"</table>\\n\",\n       \"<p>5 rows × 25 columns</p>\\n\",\n       \"</div>\"\n      ],\n      \"text/plain\": [\n       \"                               run_id experiment_id    status  \\\\\\n\",\n       \"238  50ee6409ad3a4778bb9d8cb59034df5d             0  FINISHED   \\n\",\n       \"106  614bcf7042ca465c8d86296f12ac9c09             0  FINISHED   \\n\",\n       \"96   b05667486f7d45779d23519eb0dbe24f             0  FINISHED   \\n\",\n       \"228  d279d728946e4b74811203a842d79df3             0  FINISHED   \\n\",\n       \"111  88af21719e0a408b91448f7ddd27e84c             0  FINISHED   \\n\",\n       \"\\n\",\n       \"                                          artifact_uri  \\\\\\n\",\n       \"238  s3://mlflow/mlflow/artifacts/0/50ee6409ad3a477...   \\n\",\n       \"106  s3://mlflow/mlflow/artifacts/0/614bcf7042ca465...   \\n\",\n       \"96   s3://mlflow/mlflow/artifacts/0/b05667486f7d457...   \\n\",\n       \"228  s3://mlflow/mlflow/artifacts/0/d279d728946e4b7...   \\n\",\n       \"111  s3://mlflow/mlflow/artifacts/0/88af21719e0a408...   \\n\",\n       \"\\n\",\n       \"                          start_time                         end_time  \\\\\\n\",\n       \"238 2020-01-17 18:17:47.448000+00:00 2020-01-17 18:17:47.929000+00:00   \\n\",\n       \"106 2020-01-31 15:21:29.978000+00:00 2020-01-31 15:21:30.503000+00:00   \\n\",\n       \"96  2020-01-31 15:21:35.424000+00:00 2020-01-31 15:21:35.922000+00:00   \\n\",\n       \"228 2020-01-17 18:17:52.555000+00:00 2020-01-17 18:17:53.029000+00:00   \\n\",\n       \"111 2020-01-31 15:21:27.338000+00:00 2020-01-31 15:21:27.947000+00:00   \\n\",\n       \"\\n\",\n       \"     metrics.r2    metrics.mae  metrics.duration_prediction   metrics.rmse  \\\\\\n\",\n       \"238    0.935956  104040.339809                     0.003205  134649.399348   \\n\",\n       \"106    0.935956  104040.339809                     0.003404  134649.399348   \\n\",\n       \"96     0.935111  105833.358681                     0.002732  135534.759873   \\n\",\n       \"228    0.935111  105833.358681                     0.002863  135534.759873   \\n\",\n       \"111    0.934465  105793.727897                     0.002668  136207.422483   \\n\",\n       \"\\n\",\n       \"     ...  params.activation params.nbr_iteration params.hidden_layers  \\\\\\n\",\n       \"238  ...               None                 None                 None   \\n\",\n       \"106  ...               None                 None                 None   \\n\",\n       \"96   ...               None                 None                 None   \\n\",\n       \"228  ...               None                 None                 None   \\n\",\n       \"111  ...               
None                 None                 None   \\n\",\n       \"\\n\",\n       \"    params.nbr_neighbors params.weight_method tags.model  \\\\\\n\",\n       \"238                    5             distance        knn   \\n\",\n       \"106                    5             distance        knn   \\n\",\n       \"96                    10             distance        knn   \\n\",\n       \"228                   10             distance        knn   \\n\",\n       \"111                    5              uniform        knn   \\n\",\n       \"\\n\",\n       \"    tags.mlflow.source.type tags.inputs tags.mlflow.user  \\\\\\n\",\n       \"238                   LOCAL         all           jovyan   \\n\",\n       \"106                   LOCAL         all           jovyan   \\n\",\n       \"96                    LOCAL         all           jovyan   \\n\",\n       \"228                   LOCAL         all           jovyan   \\n\",\n       \"111                   LOCAL         all           jovyan   \\n\",\n       \"\\n\",\n       \"                               tags.mlflow.source.name  \\n\",\n       \"238  /usr/local/lib/python3.6/dist-packages/ipykern...  \\n\",\n       \"106  /usr/local/lib/python3.6/dist-packages/ipykern...  \\n\",\n       \"96   /usr/local/lib/python3.6/dist-packages/ipykern...  \\n\",\n       \"228  /usr/local/lib/python3.6/dist-packages/ipykern...  \\n\",\n       \"111  /usr/local/lib/python3.6/dist-packages/ipykern...  \\n\",\n       \"\\n\",\n       \"[5 rows x 25 columns]\"\n      ]\n     },\n     \"execution_count\": 19,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"# Quick sorting to get the best models based on the RMSE metric\\n\",\n    \"df_runs.sort_values([\\\"metrics.rmse\\\"], ascending = True, inplace = True)\\n\",\n    \"df_runs.head()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 20,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"'50ee6409ad3a4778bb9d8cb59034df5d'\"\n      ]\n     },\n     \"execution_count\": 20,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"# Get the best one\\n\",\n    \"runid_selected = df_runs.head(1)[\\\"run_id\\\"].values[0]\\n\",\n    \"runid_selected\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": []\n  }\n ],\n \"metadata\": {\n  \"celltoolbar\": \"Raw Cell Format\",\n  \"kernelspec\": {\n   \"display_name\": \"Python 3\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.8\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "ch06/MLflow.py",
    "content": "#!/usr/bin/env python\n# coding: utf-8\n\n# # mlflow-energyforecast\n#\n# This is a showcase for ML Flow capabilities, based on the article\n# http://the-odd-dataguy.com/be-more-efficient-to-produce-ml-models-with-mlflow\n# and a github https://github.com/jeanmidevacc/mlflow-energyforecast\n#\n\n# In[2]:\n\nget_ipython().system('pip install pandas --upgrade --user')\nget_ipython().system('pip install mlflow --upgrade --user')\nget_ipython().system('pip install joblib --upgrade --user')\nget_ipython().system('pip install numpy --upgrade --user ')\nget_ipython().system('pip install scipy --upgrade --user ')\nget_ipython().system('pip install scikit-learn --upgrade --user')\nget_ipython().system('pip install boto3 --upgrade --user')\n\n# In[3]:\n\nimport time\nimport json\nimport os\nfrom joblib import Parallel, delayed\n\nimport pandas as pd\nimport numpy as np\nimport scipy\n\nfrom sklearn.model_selection import train_test_split, KFold\nfrom sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score\nfrom sklearn.exceptions import ConvergenceWarning\n\nimport mlflow\nimport mlflow.sklearn\nfrom mlflow.tracking import MlflowClient\n\nfrom warnings import simplefilter\nsimplefilter(action='ignore', category=FutureWarning)\nsimplefilter(action='ignore', category=ConvergenceWarning)\n\n# In[4]:\n\n# Ensure Minio access\nos.environ[\n    'MLFLOW_S3_ENDPOINT_URL'] = 'http://minio-service.kubeflow.svc.cluster.local:9000'\nos.environ['AWS_ACCESS_KEY_ID'] = 'minio'\nos.environ['AWS_SECRET_ACCESS_KEY'] = 'minio123'\n\n# # Data preparation\n\n# In[5]:\n\n# Collect the data\ndf_nationalconsumption_electricity_daily = pd.read_csv(\n    \"https://raw.githubusercontent.com/jeanmidevacc/mlflow-energyforecast/master/data/rtu_data.csv\"\n)\ndf_nationalconsumption_electricity_daily.set_index([\"day\"], inplace=True)\n\n# In[6]:\n\n# Prepare the training set and the testing set\ndf_trainvalidate_energyconsumption = df_nationalconsumption_electricity_daily[\n    df_nationalconsumption_electricity_daily[\"datastatus\"] == \"Définitif\"]\ndel df_trainvalidate_energyconsumption[\"datastatus\"]\n\ndf_test_energyconsumption = df_nationalconsumption_electricity_daily[\n    df_nationalconsumption_electricity_daily[\"datastatus\"] == \"Consolidé\"]\ndel df_test_energyconsumption[\"datastatus\"]\n\nprint(\"Size of the training set : \", len(df_trainvalidate_energyconsumption))\nprint(\"Size of the testing set : \", len(df_test_energyconsumption))\n\n# In[7]:\n\n# Define the inputs and the output\noutput = \"dailyconsumption\"\nallinputs = list(df_trainvalidate_energyconsumption.columns)\nallinputs.remove(output)\n\nprint(\"Output to predict : \", output)\nprint(\"Inputs for the prediction : \", allinputs)\n\n# In[8]:\n\n# Build different set of featurws for the model\npossible_inputs = {\n    \"all\":\n    allinputs,\n    \"only_allday_inputs\": [\"weekday\", \"month\", \"is_holiday\", \"week\"],\n    \"only_allweatheravg_inputs\": [\n        \"avg_min_temperature\", \"avg_max_temperature\", \"avg_mean_temperature\",\n        \"wavg_min_temperature\", \"wavg_max_temperature\", \"wavg_mean_temperature\"\n    ],\n    \"only_meanweather_inputs_avg\": [\"avg_mean_temperature\"],\n    \"only_meanweather_inputs_wavg\": [\"wavg_mean_temperature\"],\n}\n\n# In[9]:\n\n# Prepare the output of the model\narray_output_train = np.array(df_trainvalidate_energyconsumption[output])\narray_output_test = np.array(df_test_energyconsumption[output])\n\n# In[10]:\n\n# connect to remote 
server\nremote_server_uri = \"http://mlflowserver.kubeflow.svc.cluster.local:5000\"\nmlflow.set_tracking_uri(remote_server_uri)\n# Launch the experiment on mlflow\nexperiment_name = \"electricityconsumption-forecast\"\nmlflow.set_experiment(experiment_name)\n\n# In[11]:\n\n\n# Define the evaluation function that will do the computation of the different metrics of accuracy (RMSE,MAE,R2)\ndef evaluation_model(y_test, y_pred):\n\n    rmse = np.sqrt(mean_squared_error(y_test, y_pred))\n    mae = mean_absolute_error(y_test, y_pred)\n    r2 = r2_score(y_test, y_pred)\n\n    metrics = {\n        \"rmse\": rmse,\n        \"r2\": r2,\n        \"mae\": mae,\n    }\n\n    return metrics\n\n\n# # KNN regressor\n\n# In[12]:\n\nfrom sklearn.neighbors import KNeighborsRegressor\n\n\ndef train_knnmodel(parameters, inputs, tags, log=False):\n    with mlflow.start_run(nested=True):\n\n        # Prepare the data\n        array_inputs_train = np.array(\n            df_trainvalidate_energyconsumption[inputs])\n        array_inputs_test = np.array(df_test_energyconsumption[inputs])\n\n        # Build the model\n        tic = time.time()\n        model = KNeighborsRegressor(parameters[\"nbr_neighbors\"],\n                                    weights=parameters[\"weight_method\"])\n        model.fit(array_inputs_train, array_output_train)\n        duration_training = time.time() - tic\n\n        # Make the prediction\n        tic1 = time.time()\n        prediction = model.predict(array_inputs_test)\n        duration_prediction = time.time() - tic1\n\n        # Evaluate the model prediction\n        metrics = evaluation_model(array_output_test, prediction)\n\n        # Log in the console\n        if log:\n            print(f\"KNN regressor:\")\n            print(parameters)\n            print(metrics)\n\n        # Log in mlflow (parameter)\n        mlflow.log_params(parameters)\n\n        # Log in mlflow (metrics)\n        metrics[\"duration_training\"] = duration_training\n        metrics[\"duration_prediction\"] = duration_prediction\n        mlflow.log_metrics(metrics)\n\n        # log in mlflow (model)\n        mlflow.sklearn.log_model(model, f\"model\")\n\n        # Tag the model\n        mlflow.set_tags(tags)\n\n\n# In[13]:\n\n# Test the different combinations\nconfigurations = []\nfor nbr_neighbors in [1, 2, 5, 10]:\n    for weight_method in ['uniform', 'distance']:\n        for field in possible_inputs:\n            parameters = {\n                \"nbr_neighbors\": nbr_neighbors,\n                \"weight_method\": weight_method\n            }\n\n            tags = {\"model\": \"knn\", \"inputs\": field}\n\n            configurations.append([parameters, tags])\n\n            train_knnmodel(parameters, possible_inputs[field], tags)\n\n# # MLP regressor\n\n# In[14]:\n\nfrom sklearn.neural_network import MLPRegressor\n\n\ndef train_mlpmodel(parameters, inputs, tags, log=False):\n    with mlflow.start_run(nested=True):\n\n        # Prepare the data\n        array_inputs_train = np.array(\n            df_trainvalidate_energyconsumption[inputs])\n        array_inputs_test = np.array(df_test_energyconsumption[inputs])\n\n        # Build the model\n        tic = time.time()\n\n        model = MLPRegressor(hidden_layer_sizes=parameters[\"hidden_layers\"],\n                             activation=parameters[\"activation\"],\n                             solver=parameters[\"solver\"],\n                             max_iter=parameters[\"nbr_iteration\"],\n                             random_state=0)\n\n        
model.fit(array_inputs_train, array_output_train)\n        duration_training = time.time() - tic\n\n        # Make the prediction\n        tic1 = time.time()\n        prediction = model.predict(array_inputs_test)\n        duration_prediction = time.time() - tic1\n\n        # Evaluate the model prediction\n        metrics = evaluation_model(array_output_test, prediction)\n\n        # Log in the console\n        if log:\n            print(f\"MLP regressor:\")\n            print(parameters)\n            print(metrics)\n\n        # Log in mlflow (parameter)\n        mlflow.log_params(parameters)\n\n        # Log in mlflow (metrics)\n        metrics[\"duration_training\"] = duration_training\n        metrics[\"duration_prediction\"] = duration_prediction\n        mlflow.log_metrics(metrics)\n\n        # log in mlflow (model)\n        mlflow.sklearn.log_model(model, f\"model\")\n\n        # Tag the model\n        mlflow.set_tags(tags)\n\n\n# In[15]:\n\nfor hiddenlayers in [4, 8, 16]:\n    for activation in [\n            \"identity\",\n            \"logistic\",\n    ]:\n        for solver in [\"lbfgs\"]:\n            for nbriteration in [10, 100, 1000]:\n                for field in possible_inputs:\n                    parameters = {\n                        \"hidden_layers\": hiddenlayers,\n                        \"activation\": activation,\n                        \"solver\": solver,\n                        \"nbr_iteration\": nbriteration\n                    }\n\n                    tags = {\"model\": \"mlp\", \"inputs\": field}\n\n                    train_mlpmodel(parameters, possible_inputs[field], tags)\n\n# # Use a handmade model (scipy approach)\n\n# In[16]:\n\n\nclass PTG:\n    def __init__(self, thresholds_x0, thresholds_a, thresholds_b):\n        self.thresholds_x0 = thresholds_x0\n        self.thresholds_a = thresholds_a\n        self.thresholds_b = thresholds_b\n\n    def get_ptgmodel(self, x, a, b, x0):\n        return np.piecewise(x, [x < x0, x >= x0],\n                            [lambda x: a * x + b, lambda x: a * x0 + b])\n\n    def fit(self, dfx, y):\n        x = np.array(dfx)\n\n        # Define the bounds (use the instance attributes, not the globals)\n        bounds_min = [\n            self.thresholds_a[0], self.thresholds_b[0], self.thresholds_x0[0]\n        ]\n        bounds_max = [\n            self.thresholds_a[1], self.thresholds_b[1], self.thresholds_x0[1]\n        ]\n        bounds = (bounds_min, bounds_max)\n\n        # Fit a model\n        popt, pcov = scipy.optimize.curve_fit(self.get_ptgmodel,\n                                              x,\n                                              y,\n                                              bounds=bounds)\n\n        # Get the parameters of the model\n        a = popt[0]\n        b = popt[1]\n        x0 = popt[2]\n\n        self.coefficients = [a, b, x0]\n\n    def predict(self, dfx):\n        x = np.array(dfx)\n        predictions = []\n        for elt in x:\n            forecast = self.get_ptgmodel(elt, self.coefficients[0],\n                                         self.coefficients[1],\n                                         self.coefficients[2])\n            predictions.append(forecast)\n        return np.array(predictions)\n\n\ndef train_ptgmodel(parameters, inputs, tags, log=False):\n    with mlflow.start_run(nested=True):\n\n        # Prepare the data\n        df_inputs_train = df_trainvalidate_energyconsumption[inputs[0]]\n        df_inputs_test = df_test_energyconsumption[inputs[0]]\n\n        # Build the model\n        tic = time.time()\n\n        model = PTG(parameters[\"thresholds_x0\"], 
parameters[\"thresholds_a\"],\n                    parameters[\"thresholds_b\"])\n\n        model.fit(df_inputs_train, array_output_train)\n        duration_training = time.time() - tic\n\n        # Make the prediction\n        tic1 = time.time()\n        prediction = model.predict(df_inputs_test)\n        duration_prediction = time.time() - tic1\n\n        # Evaluate the model prediction\n        metrics = evaluation_model(array_output_test, prediction)\n\n        # Log in the console\n        if log:\n            print(f\"PTG:\")\n            print(parameters)\n            print(metrics)\n\n        # Log in mlflow (parameter)\n        mlflow.log_params(parameters)\n\n        # Log in mlflow (metrics)\n        metrics[\"duration_training\"] = duration_training\n        metrics[\"duration_prediction\"] = duration_prediction\n        mlflow.log_metrics(metrics)\n\n        # log in mlflow (model)\n        mlflow.sklearn.log_model(model, f\"model\")\n\n        # Tag the model\n        mlflow.set_tags(tags)\n\n\n# In[17]:\n\n# Define the parameters of the model\nthresholds_x0 = [0, 20]\nthresholds_a = [-200000, -50000]\nthresholds_b = [1000000, 3000000]\n\nparameters = {\n    \"thresholds_x0\": thresholds_x0,\n    \"thresholds_a\": thresholds_a,\n    \"thresholds_b\": thresholds_b\n}\n\nfor field in [\"only_meanweather_inputs_avg\", \"only_meanweather_inputs_wavg\"]:\n\n    tags = {\"model\": \"ptg\", \"inputs\": field}\n\n    train_ptgmodel(parameters, possible_inputs[field], tags, log=False)\n\n# # Evaluate mlflow results\n\n# In[18]:\n\n# Select the run of the experiment\ndf_runs = mlflow.search_runs(experiment_ids=\"0\")\nprint(\"Number of runs done : \", len(df_runs))\n\n# In[19]:\n\n# Quick sorting to get the best models based on the RMSE metric\ndf_runs.sort_values([\"metrics.rmse\"], ascending=True, inplace=True)\ndf_runs.head()\n\n# In[20]:\n\n# Get the best one\nrunid_selected = df_runs.head(1)[\"run_id\"].values[0]\nrunid_selected\n\n# In[ ]:\n"
  },
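  {
    "path": "ch06/MLflow_load_best.py",
    "content": "#!/usr/bin/env python\n# coding: utf-8\n\n# NOTE: illustrative sketch, not part of the original example; the file name\n# is hypothetical. Assuming the runs logged by MLflow.py, it shows how the\n# model of the best run could be loaded back from the tracking server and\n# used for a prediction.\n\nimport os\n\nimport numpy as np\nimport mlflow\nimport mlflow.sklearn\n\n# Same Minio and tracking-server configuration as MLflow.py\nos.environ[\n    'MLFLOW_S3_ENDPOINT_URL'] = 'http://minio-service.kubeflow.svc.cluster.local:9000'\nos.environ['AWS_ACCESS_KEY_ID'] = 'minio'\nos.environ['AWS_SECRET_ACCESS_KEY'] = 'minio123'\nmlflow.set_tracking_uri(\"http://mlflowserver.kubeflow.svc.cluster.local:5000\")\n\n# Pick the best run by RMSE, exactly as at the end of MLflow.py\ndf_runs = mlflow.search_runs(experiment_ids=\"0\")\ndf_runs.sort_values([\"metrics.rmse\"], ascending=True, inplace=True)\nrunid_selected = df_runs.head(1)[\"run_id\"].values[0]\n\n# Models were logged with mlflow.sklearn.log_model(model, \"model\"),\n# so they can be reloaded through a runs:/ URI.\nmodel = mlflow.sklearn.load_model(f\"runs:/{runid_selected}/model\")\n\n# The feature vector must match the input set the run was trained on; this\n# 4-feature example assumes the \"only_allday_inputs\" layout\n# (weekday, month, is_holiday, week) and is purely illustrative.\nsample = np.array([[2, 6, 0, 23]])\nprint(model.predict(sample))\n"
  },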
  {
    "path": "ch06/Metadata.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Installation and imports\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Requirement already up-to-date: kfmd in ./.local/lib/python3.6/site-packages (0.1.8)\\n\",\n      \"Requirement already up-to-date: pandas in ./.local/lib/python3.6/site-packages (1.0.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: python-dateutil>=2.6.1 in /usr/local/lib/python3.6/dist-packages (from pandas) (2.8.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: numpy>=1.13.3 in /usr/local/lib/python3.6/dist-packages (from pandas) (1.18.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas) (2019.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.6.1->pandas) (1.11.0)\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"!pip install kfmd --upgrade --user\\n\",\n    \"!pip install pandas --upgrade --user\\n\",\n    \"\\n\",\n    \"from kfmd import metadata\\n\",\n    \"import pandas\\n\",\n    \"from datetime import datetime\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Create a workspace, run and execution\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 2,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"ws1 = metadata.Workspace(\\n\",\n    \"    # Connect to metadata-service in namesapce kubeflow in k8s cluster.\\n\",\n    \"    backend_url_prefix=\\\"metadata-service.kubeflow.svc.cluster.local:8080\\\",\\n\",\n    \"    name=\\\"ws1\\\",\\n\",\n    \"    description=\\\"a workspace for testing\\\",\\n\",\n    \"    labels={\\\"n1\\\": \\\"v1\\\"})\\n\",\n    \"r = metadata.Run(\\n\",\n    \"    workspace=ws1,\\n\",\n    \"    name=\\\"run-\\\" + datetime.utcnow().isoformat(\\\"T\\\") ,\\n\",\n    \"    description=\\\"a run in ws_1\\\",\\n\",\n    \")\\n\",\n    \"exec = metadata.Execution(\\n\",\n    \"    name = \\\"execution\\\" + datetime.utcnow().isoformat(\\\"T\\\") ,\\n\",\n    \"    workspace=ws1,\\n\",\n    \"    run=r,\\n\",\n    \"    description=\\\"execution example\\\",\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Log data set, model and its evaluation\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 3,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"data_set = exec.log_input(\\n\",\n    \"        metadata.DataSet(\\n\",\n    \"            description=\\\"an example data\\\",\\n\",\n    \"            name=\\\"mytable-dump\\\",\\n\",\n    \"            owner=\\\"owner@my-company.org\\\",\\n\",\n    \"            uri=\\\"file://path/to/dataset\\\",\\n\",\n    \"            version=\\\"v1.0.0\\\",\\n\",\n    \"            query=\\\"SELECT * FROM mytable\\\"))\\n\",\n    \"model = exec.log_output(\\n\",\n    \"    metadata.Model(\\n\",\n    \"            name=\\\"MNIST\\\",\\n\",\n    \"            description=\\\"model to recognize handwritten digits\\\",\\n\",\n    \"            owner=\\\"someone@kubeflow.org\\\",\\n\",\n    \"            uri=\\\"gcs://my-bucket/mnist\\\",\\n\",\n    \"            
model_type=\\\"neural network\\\",\\n\",\n    \"            training_framework={\\n\",\n    \"                \\\"name\\\": \\\"tensorflow\\\",\\n\",\n    \"                \\\"version\\\": \\\"v1.0\\\"\\n\",\n    \"            },\\n\",\n    \"            hyperparameters={\\n\",\n    \"                \\\"learning_rate\\\": 0.5,\\n\",\n    \"                \\\"layers\\\": [10, 3, 1],\\n\",\n    \"                \\\"early_stop\\\": True\\n\",\n    \"            },\\n\",\n    \"            version=\\\"v0.0.1\\\",\\n\",\n    \"            labels={\\\"mylabel\\\": \\\"l1\\\"}))\\n\",\n    \"metrics = exec.log_output(\\n\",\n    \"    metadata.Metrics(\\n\",\n    \"            name=\\\"MNIST-evaluation\\\",\\n\",\n    \"            description=\\\"validating the MNIST model to recognize handwritten digits\\\",\\n\",\n    \"            owner=\\\"someone@kubeflow.org\\\",\\n\",\n    \"            uri=\\\"gcs://my-bucket/mnist-eval.csv\\\",\\n\",\n    \"            data_set_id=data_set.id,\\n\",\n    \"            model_id=model.id,\\n\",\n    \"            metrics_type=metadata.Metrics.VALIDATION,\\n\",\n    \"            values={\\\"accuracy\\\": 0.95},\\n\",\n    \"            labels={\\\"mylabel\\\": \\\"l1\\\"}))\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"List all the models in the workspace\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 4,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/html\": [\n       \"<div>\\n\",\n       \"<style scoped>\\n\",\n       \"    .dataframe tbody tr th:only-of-type {\\n\",\n       \"        vertical-align: middle;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe tbody tr th {\\n\",\n       \"        vertical-align: top;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe thead th {\\n\",\n       \"        text-align: right;\\n\",\n       \"    }\\n\",\n       \"</style>\\n\",\n       \"<table border=\\\"1\\\" class=\\\"dataframe\\\">\\n\",\n       \"  <thead>\\n\",\n       \"    <tr style=\\\"text-align: right;\\\">\\n\",\n       \"      <th></th>\\n\",\n       \"      <th>id</th>\\n\",\n       \"      <th>workspace</th>\\n\",\n       \"      <th>run</th>\\n\",\n       \"      <th>create_time</th>\\n\",\n       \"      <th>description</th>\\n\",\n       \"      <th>model_type</th>\\n\",\n       \"      <th>name</th>\\n\",\n       \"      <th>owner</th>\\n\",\n       \"      <th>version</th>\\n\",\n       \"      <th>uri</th>\\n\",\n       \"      <th>training_framework</th>\\n\",\n       \"      <th>hyperparameters</th>\\n\",\n       \"      <th>labels</th>\\n\",\n       \"    </tr>\\n\",\n       \"  </thead>\\n\",\n       \"  <tbody>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>0</th>\\n\",\n       \"      <td>8</td>\\n\",\n       \"      <td>ws1</td>\\n\",\n       \"      <td>run-2020-02-18T00:48:10.734939</td>\\n\",\n       \"      <td>2020-02-18T00:48:13.273533Z</td>\\n\",\n       \"      <td>model to recognize handwritten digits</td>\\n\",\n       \"      <td>neural network</td>\\n\",\n       \"      <td>MNIST</td>\\n\",\n       \"      <td>someone@kubeflow.org</td>\\n\",\n       \"      <td>v0.0.1</td>\\n\",\n       \"      <td>gcs://my-bucket/mnist</td>\\n\",\n       \"      <td>{'name': 'tensorflow', 'version': 'v1.0'}</td>\\n\",\n       \"      <td>{'learning_rate': 0.5, 'layers': [10, 3, 1], '...</td>\\n\",\n       \"      <td>{'mylabel': 'l1'}</td>\\n\",\n       \"    </tr>\\n\",\n       \" 
 </tbody>\\n\",\n       \"</table>\\n\",\n       \"</div>\"\n      ],\n      \"text/plain\": [\n       \"  id workspace                             run                  create_time  \\\\\\n\",\n       \"0  8       ws1  run-2020-02-18T00:48:10.734939  2020-02-18T00:48:13.273533Z   \\n\",\n       \"\\n\",\n       \"                             description      model_type   name  \\\\\\n\",\n       \"0  model to recognize handwritten digits  neural network  MNIST   \\n\",\n       \"\\n\",\n       \"                  owner version                    uri  \\\\\\n\",\n       \"0  someone@kubeflow.org  v0.0.1  gcs://my-bucket/mnist   \\n\",\n       \"\\n\",\n       \"                          training_framework  \\\\\\n\",\n       \"0  {'name': 'tensorflow', 'version': 'v1.0'}   \\n\",\n       \"\\n\",\n       \"                                     hyperparameters             labels  \\n\",\n       \"0  {'learning_rate': 0.5, 'layers': [10, 3, 1], '...  {'mylabel': 'l1'}  \"\n      ]\n     },\n     \"execution_count\": 4,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"pandas.DataFrame.from_dict(ws1.list(metadata.Model.ARTIFACT_TYPE_NAME))\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Get basic lineage\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 5,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"model id is 8\\n\",\n      \"\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"print(\\\"model id is %s\\\\n\\\" % model.id)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Find the execution that produces this model.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 6,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"3\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"output_events = ws1.client.list_events2(model.id).events\\n\",\n    \"assert len(output_events) == 1\\n\",\n    \"execution_id = output_events[0].execution_id\\n\",\n    \"print(execution_id)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Find all events related to that execution.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 7,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"\\n\",\n      \"All events related to this model:\\n\"\n     ]\n    },\n    {\n     \"data\": {\n      \"text/html\": [\n       \"<div>\\n\",\n       \"<style scoped>\\n\",\n       \"    .dataframe tbody tr th:only-of-type {\\n\",\n       \"        vertical-align: middle;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe tbody tr th {\\n\",\n       \"        vertical-align: top;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe thead th {\\n\",\n       \"        text-align: right;\\n\",\n       \"    }\\n\",\n       \"</style>\\n\",\n       \"<table border=\\\"1\\\" class=\\\"dataframe\\\">\\n\",\n       \"  <thead>\\n\",\n       \"    <tr style=\\\"text-align: right;\\\">\\n\",\n       \"      <th></th>\\n\",\n       \"      <th>artifact_id</th>\\n\",\n       \"      <th>execution_id</th>\\n\",\n       \"      <th>path</th>\\n\",\n       \"      <th>type</th>\\n\",\n       
\"      <th>milliseconds_since_epoch</th>\\n\",\n       \"    </tr>\\n\",\n       \"  </thead>\\n\",\n       \"  <tbody>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>0</th>\\n\",\n       \"      <td>7</td>\\n\",\n       \"      <td>3</td>\\n\",\n       \"      <td>None</td>\\n\",\n       \"      <td>INPUT</td>\\n\",\n       \"      <td>1581986893248</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>1</th>\\n\",\n       \"      <td>8</td>\\n\",\n       \"      <td>3</td>\\n\",\n       \"      <td>None</td>\\n\",\n       \"      <td>OUTPUT</td>\\n\",\n       \"      <td>1581986893273</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>2</th>\\n\",\n       \"      <td>9</td>\\n\",\n       \"      <td>3</td>\\n\",\n       \"      <td>None</td>\\n\",\n       \"      <td>OUTPUT</td>\\n\",\n       \"      <td>1581986893298</td>\\n\",\n       \"    </tr>\\n\",\n       \"  </tbody>\\n\",\n       \"</table>\\n\",\n       \"</div>\"\n      ],\n      \"text/plain\": [\n       \"  artifact_id execution_id  path    type milliseconds_since_epoch\\n\",\n       \"0           7            3  None   INPUT            1581986893248\\n\",\n       \"1           8            3  None  OUTPUT            1581986893273\\n\",\n       \"2           9            3  None  OUTPUT            1581986893298\"\n      ]\n     },\n     \"execution_count\": 7,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"all_events = ws1.client.list_events(execution_id).events\\n\",\n    \"assert len(all_events) == 3\\n\",\n    \"print(\\\"\\\\nAll events related to this model:\\\")\\n\",\n    \"pandas.DataFrame.from_dict([e.to_dict() for e in all_events])\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": []\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"Python 3\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.9\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "ch06/Metadata.py",
    "content": "#!/usr/bin/env python\n# coding: utf-8\n\n# # Installation and imports\n\n# In[1]:\n\nget_ipython().system('pip install kfmd --upgrade --user')\nget_ipython().system('pip install pandas --upgrade --user')\n\nfrom kfmd import metadata\nimport pandas\nfrom datetime import datetime\n\n# Create a workspace, run and execution\n\n# In[2]:\n\nws1 = metadata.Workspace(\n    # Connect to metadata-service in namesapce kubeflow in k8s cluster.\n    backend_url_prefix=\"metadata-service.kubeflow.svc.cluster.local:8080\",\n    name=\"ws1\",\n    description=\"a workspace for testing\",\n    labels={\"n1\": \"v1\"})\nr = metadata.Run(\n    workspace=ws1,\n    name=\"run-\" + datetime.utcnow().isoformat(\"T\"),\n    description=\"a run in ws_1\",\n)\nexec = metadata.Execution(\n    name=\"execution\" + datetime.utcnow().isoformat(\"T\"),\n    workspace=ws1,\n    run=r,\n    description=\"execution example\",\n)\n\n# Log data set, model and its evaluation\n\n# In[3]:\n\ndata_set = exec.log_input(\n    metadata.DataSet(description=\"an example data\",\n                     name=\"mytable-dump\",\n                     owner=\"owner@my-company.org\",\n                     uri=\"file://path/to/dataset\",\n                     version=\"v1.0.0\",\n                     query=\"SELECT * FROM mytable\"))\nmodel = exec.log_output(\n    metadata.Model(name=\"MNIST\",\n                   description=\"model to recognize handwritten digits\",\n                   owner=\"someone@kubeflow.org\",\n                   uri=\"gcs://my-bucket/mnist\",\n                   model_type=\"neural network\",\n                   training_framework={\n                       \"name\": \"tensorflow\",\n                       \"version\": \"v1.0\"\n                   },\n                   hyperparameters={\n                       \"learning_rate\": 0.5,\n                       \"layers\": [10, 3, 1],\n                       \"early_stop\": True\n                   },\n                   version=\"v0.0.1\",\n                   labels={\"mylabel\": \"l1\"}))\nmetrics = exec.log_output(\n    metadata.Metrics(\n        name=\"MNIST-evaluation\",\n        description=\n        \"validating the MNIST model to recognize handwritten digits\",\n        owner=\"someone@kubeflow.org\",\n        uri=\"gcs://my-bucket/mnist-eval.csv\",\n        data_set_id=data_set.id,\n        model_id=model.id,\n        metrics_type=metadata.Metrics.VALIDATION,\n        values={\"accuracy\": 0.95},\n        labels={\"mylabel\": \"l1\"}))\n\n# List all the models in the workspace\n\n# In[4]:\n\npandas.DataFrame.from_dict(ws1.list(metadata.Model.ARTIFACT_TYPE_NAME))\n\n# Get basic lineage\n\n# In[5]:\n\nprint(\"model id is %s\\n\" % model.id)\n\n# Find the execution that produces this model.\n\n# In[6]:\n\noutput_events = ws1.client.list_events2(model.id).events\nassert len(output_events) == 1\nexecution_id = output_events[0].execution_id\nprint(execution_id)\n\n# Find all events related to that execution.\n\n# In[7]:\n\nall_events = ws1.client.list_events(execution_id).events\nassert len(all_events) == 3\nprint(\"\\nAll events related to this model:\")\npandas.DataFrame.from_dict([e.to_dict() for e in all_events])\n\n# In[ ]:\n"
  },
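  {
    "path": "ch06/Metadata_list_artifacts.py",
    "content": "#!/usr/bin/env python\n# coding: utf-8\n\n# NOTE: illustrative sketch, not part of the original example; the file name\n# is hypothetical. Following the pattern Metadata.py uses to list models, it\n# lists the data sets and metrics logged into the same workspace, assuming\n# kfmd's DataSet and Metrics classes expose ARTIFACT_TYPE_NAME as Model does.\n\nfrom kfmd import metadata\nimport pandas\n\n# Reattach to the workspace created by Metadata.py\nws1 = metadata.Workspace(\n    backend_url_prefix=\"metadata-service.kubeflow.svc.cluster.local:8080\",\n    name=\"ws1\",\n    description=\"a workspace for testing\",\n    labels={\"n1\": \"v1\"})\n\n# Each artifact type can be listed the same way Metadata.py lists models.\nprint(pandas.DataFrame.from_dict(ws1.list(metadata.DataSet.ARTIFACT_TYPE_NAME)))\nprint(pandas.DataFrame.from_dict(ws1.list(metadata.Metrics.ARTIFACT_TYPE_NAME)))\n"
  },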
  {
    "path": "ch06/docker/Dockerfile",
    "content": "# from https://github.com/flmu/mlflow-tracking-server\n\nFROM python:3.7\n\nRUN pip3 install --upgrade pip && \\\n    pip3 install mlflow --upgrade && \\\n    pip3 install awscli --upgrade  && \\\n    pip3 install boto3 --upgrade\n\nENV PORT 5000\nENV AWS_BUCKET bucket\nENV AWS_ACCESS_KEY_ID aws_id\nENV AWS_SECRET_ACCESS_KEY aws_key\n\nENV FILE_DIR /tmp/mlflow\n\nRUN mkdir -p /opt/mlflow\n\nCOPY run.sh /opt/mlflow\nRUN chmod -R 777 /opt/mlflow/\n\nENTRYPOINT [\"/opt/mlflow/run.sh\"]"
  },
  {
    "path": "ch06/docker/build.sh",
    "content": "#!/bin/bash\n\nimg='lightbend/mlflow'\ntag='0.1'\ndocker build -t $img:$tag .\n\n"
  },
  {
    "path": "ch06/docker/run.sh",
    "content": "#!/bin/sh\n\nset -e\n\nif [ -z \"${AWS_BUCKET}\" ]; then\n  echo >&2 \"AWS_BUCKET must be set\"\n  exit 1\nfi\n\nif [ -z \"${AWS_ACCESS_KEY_ID}\" ]; then\n  echo >&2 \"AWS_ACCESS_KEY_ID must be set\"\n  exit 1\nfi\n\nif [ -z \"${AWS_SECRET_ACCESS_KEY}\" ]; then\n  echo >&2 \"AWS_SECRET_ACCESS_KEY must be set\"\n  exit 1\nfi\n\nmkdir -p \"${FILE_DIR}\"\n\nmlflow server \\\n    --backend-store-uri \"file://$FILE_DIR\" \\\n    --default-artifact-root \"s3://$AWS_BUCKET/mlflow/artifacts\" \\\n    --host 0.0.0.0 \\\n    --port \"$PORT\"\n"
  },
  {
    "path": "ch06/install/mlflowchart/.helmignore",
    "content": "# Patterns to ignore when building packages.\n# This supports shell glob matching, relative path matching, and\n# negation (prefixed with !). Only one pattern per line.\n.DS_Store\n# Common VCS dirs\n.git/\n.gitignore\n.bzr/\n.bzrignore\n.hg/\n.hgignore\n.svn/\n# Common backup files\n*.swp\n*.bak\n*.tmp\n*~\n# Various IDEs\n.project\n.idea/\n*.tmproj\n"
  },
  {
    "path": "ch06/install/mlflowchart/Chart.yaml",
    "content": "apiVersion: v1\nappVersion: 0.1\ndescription: MLFlow\nmaintainers:\n- name: Boris Lublinsky\nname: MLFLOW tracking server\nversion: 0.1"
  },
  {
    "path": "ch06/install/mlflowchart/templates/NOTES.txt",
    "content": "ML Flow tracking server is installed\n"
  },
  {
    "path": "ch06/install/mlflowchart/templates/_helpers.tpl",
    "content": "{{/* vim: set filetype=mustache: */}}\n{{/*\nExpand the name of the chart.\n*/}}\n{{- define \"modelserverchart.name\" -}}\n{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix \"-\" -}}\n{{- end -}}\n\n{{/*\nCreate a default fully qualified app name.\nWe truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).\n*/}}\n{{- define \"modelserverchart.fullname\" -}}\n{{- $name := default .Chart.Name .Values.nameOverride -}}\n{{- printf \"%s-%s\" .Release.Name $name | trunc 63 | trimSuffix \"-\" -}}\n{{- end -}}\n"
  },
  {
    "path": "ch06/install/mlflowchart/templates/mlflow.yaml",
    "content": "apiVersion: apps/v1\nkind: Deployment\nmetadata:\n  namespace: kubeflow\n  name: mlflowserver\n  labels:\n    app: mlflowserver\nspec:\n  replicas: 1\n  selector:\n    matchLabels:\n      app: mlflowserver\n  strategy:\n    type: RollingUpdate\n  template:\n    metadata:\n      labels:\n        app: mlflowserver\n    spec:\n      containers:\n        - name: server\n          image: \"{{ .Values.image.server }}:{{ .Values.image.version }}\"\n          imagePullPolicy: \"{{ .Values.image.pullPolicy }}\"\n          ports:\n            - containerPort: 5000\n              name: serving\n              protocol: TCP\n          env:\n            - name: \"MLFLOW_S3_ENDPOINT_URL\"\n              value: \"http://minio-service.kubeflow.svc.cluster.local:9000\"\n            - name: \"AWS_ACCESS_KEY_ID\"\n              valueFrom: { secretKeyRef: { name: \"minioaccess\", key: \"AWS_ACCESS_KEY_ID\" } }\n            - name: \"AWS_SECRET_ACCESS_KEY\"\n              valueFrom: { secretKeyRef: { name: \"minioaccess\", key: \"AWS_SECRET_ACCESS_KEY\" } }\n            - name: \"AWS_BUCKET\"\n              value: \"mlflow\"\n          volumes:\n            - name: secret-volume\n              secret:\n                secretName: minioaccess\n---\napiVersion: v1\nkind: Service\nmetadata:\n  namespace: kubeflow\n  name: mlflowserver\nspec:\n  selector:\n    app: mlflowserver\n  ports:\n  - protocol: TCP\n    port: 5000\n    targetPort: 5000\n---\napiVersion: networking.istio.io/v1alpha3\nkind: VirtualService\nmetadata:\n  name: mlflow-server\n  namespace: kubeflow\nspec:\n  gateways:\n    - kubeflow-gateway\n  hosts:\n    - '*'\n  http:\n    - match:\n        - uri:\n            prefix: /mlflow/\n      rewrite:\n        uri: /\n      route:\n        - destination:\n            host: mlflowserver.kubeflow.svc.cluster.local\n            port:\n              number: 5000"
  },
  {
    "path": "ch06/install/mlflowchart/values.yaml",
    "content": "# application name is a namespace\n# docker images\nimage:\n  server: lightbend/mlflow\n  pullPolicy: Always\n  version: 0.1\n"
  },
  {
    "path": "ch10/experiment.yaml",
    "content": "Name:         random-example\nNamespace:    kubeflow\nLabels:       controller-tools.k8s.io=1.0\nAnnotations:  <none>\nAPI Version:  kubeflow.org/v1alpha3\nKind:         Experiment\nMetadata:\n  Creation Timestamp:  2019-12-22T22:53:25Z\n  Finalizers:\n    update-prometheus-metrics\n  Generation:        2\n  Resource Version:  720692\n  Self Link:         /apis/kubeflow.org/v1alpha3/namespaces/kubeflow/experiments/random-example\n  UID:               dc6bc15a-250d-11ea-8cae-42010a80010f\nSpec:\n  Algorithm:\n    Algorithm Name:        random\n    Algorithm Settings:    <nil>\n  Max Failed Trial Count:  3\n  Max Trial Count:         12\n  Metrics Collector Spec:\n    Collector:\n      Kind:  StdOut\n  Objective:\n    Additional Metric Names:\n      accuracy\n    Goal:                   0.99\n    Objective Metric Name:  Validation-accuracy\n    Type:                   maximize\n  Parallel Trial Count:     3\n  Parameters:\n    Feasible Space:\n      Max:           0.03\n      Min:           0.01\n    Name:            --lr\n    Parameter Type:  double\n    Feasible Space:\n      Max:           5\n      Min:           2\n    Name:            --num-layers\n    Parameter Type:  int\n    Feasible Space:\n      List:\n        sgd\n        adam\n        ftrl\n    Name:            --optimizer\n    Parameter Type:  categorical\n  Trial Template:\n    Go Template:\n      Raw Template:  apiVersion: batch/v1\nkind: Job\nmetadata:\n  name: {{.Trial}}\n  namespace: {{.NameSpace}}\nspec:\n  template:\n    spec:\n      containers:\n      - name: {{.Trial}}\n        image: docker.io/kubeflowkatib/mxnet-mnist-example\n        command:\n        - \"python\"\n        - \"/mxnet/example/image-classification/train_mnist.py\"\n        - \"--batch-size=64\"\n        {{- with .HyperParameters}}\n        {{- range .}}\n        - \"{{.Name}}={{.Value}}\"\n        {{- end}}\n        {{- end}}\n      restartPolicy: Never\nStatus:\n  Conditions:\n    Last Transition Time:  2019-12-22T22:53:25Z\n    Last Update Time:      2019-12-22T22:53:25Z\n    Message:               Experiment is created\n    Reason:                ExperimentCreated\n    Status:                True\n    Type:                  Created\n    Last Transition Time:  2019-12-22T22:55:10Z\n    Last Update Time:      2019-12-22T22:55:10Z\n    Message:               Experiment is running\n    Reason:                ExperimentRunning\n    Status:                True\n    Type:                  Running\n  Current Optimal Trial:\n    Observation:\n      Metrics:\n        Name:   Validation-accuracy\n        Value:  0.981091\n    Parameter Assignments:\n      Name:          --lr\n      Value:         0.025139701133432946\n      Name:          --num-layers\n      Value:         4\n      Name:          --optimizer\n      Value:         sgd\n  Start Time:        2019-12-22T22:53:25Z\n  Trials:            12\n  Trials Running:    2\n  Trials Succeeded:  10\nEvents:              <none>Type something here!\n"
  },
  {
    "path": "ch10/hptuning.py",
    "content": "# Initialize search space\n# Initialize model\nwhile not objective_reached and not bugdget_exhausted:\n    # Obtain new hyperparameters\n    suggestion = GetSuggestions()\n\n    # Run trial with new hyperparameters; collect metrics\n    metrics = RunTrial(suggestion)\n\n    # Report metrics\n    Report(metrics)\n"
  },
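  {
    "path": "ch10/hptuning_random_sketch.py",
    "content": "# NOTE: illustrative sketch, not part of the original example; the file name\n# and the toy objective are hypothetical. It makes the pseudocode in\n# hptuning.py concrete: random sampling plays the role of GetSuggestions(),\n# evaluating a stand-in objective plays the role of RunTrial(suggestion), and\n# printing plays the role of Report(metrics).\n\nimport random\n\n# Search space mirroring the --lr and --num-layers parameters in random.yaml\nSEARCH_SPACE = {\"lr\": (0.01, 0.03), \"num_layers\": (2, 5)}\n\n\ndef get_suggestion():\n    # Katib's random algorithm samples each parameter independently.\n    return {\n        \"lr\": random.uniform(*SEARCH_SPACE[\"lr\"]),\n        \"num_layers\": random.randint(*SEARCH_SPACE[\"num_layers\"]),\n    }\n\n\ndef run_trial(suggestion):\n    # Stand-in for training a model; returns a fake validation accuracy.\n    return 1.0 - abs(suggestion[\"lr\"] - 0.02) - 0.01 * suggestion[\"num_layers\"]\n\n\nbest = None\nbudget = 12  # mirrors maxTrialCount in random.yaml\nfor trial in range(budget):\n    suggestion = get_suggestion()\n    metric = run_trial(suggestion)\n    print(\"trial=%d params=%s metric=%.4f\" % (trial, suggestion, metric))\n    if best is None or metric > best[0]:\n        best = (metric, suggestion)\n    if best[0] >= 0.99:  # the objective goal in random.yaml\n        break\n\nprint(\"best:\", best)\n"
  },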
  {
    "path": "ch10/random.yaml",
    "content": "apiVersion: \"kubeflow.org/v1alpha3\"\nkind: Experiment\nmetadata:\n  namespace: kubeflow\n  labels:\n    controller-tools.k8s.io: \"1.0\"\n  name: random-example\nspec:\n  objective:\n    type: maximize\n    goal: 0.99\n    objectiveMetricName: Validation-accuracy\n    additionalMetricNames:\n      - Train-accuracy\n  algorithm:\n    algorithmName: random\n  parallelTrialCount: 3\n  maxTrialCount: 12\n  maxFailedTrialCount: 3\n  parameters:\n    - name: --lr\n      parameterType: double\n      feasibleSpace:\n        min: \"0.01\"\n        max: \"0.03\"\n    - name: --num-layers\n      parameterType: int\n      feasibleSpace:\n        min: \"2\"\n        max: \"5\"\n    - name: --optimizer\n      parameterType: categorical\n      feasibleSpace:\n        list:\n        - sgd\n        - adam\n        - ftrl\n  trialTemplate:\n    goTemplate:\n        rawTemplate: |-\n          apiVersion: batch/v1\n          kind: Job\n          metadata:\n            name: {{.Trial}}\n            namespace: {{.NameSpace}}\n          spec:\n            template:\n              spec:\n                containers:\n                - name: {{.Trial}}\n                  image: docker.io/kubeflowkatib/mxnet-mnist\n                  command:\n                  - \"python3\"\n                  - \"/opt/mxnet-mnist/mnist.py\"\n                  - \"--batch-size=64\"\n                  {{- with .HyperParameters}}\n                  {{- range .}}\n                  - \"{{.Name}}={{.Value}}\"\n                  {{- end}}\n                  {{- end}}\n                restartPolicy: NeverType something here!\n"
  },
  {
    "path": "ch2/Dockerfile",
    "content": "FROM gcr.io/kubeflow-images-public/tensorflow-2.1.0-notebook-cpu:1.0.0"
  },
  {
    "path": "ch2/build-and-push.sh",
    "content": "#!/bin/bash\n#tag::buildandpush[]\nIMAGE=\"${CONTAINER_REGISTRY}/kubeflow/test:v1\"\ndocker build  -t \"${IMAGE}\" -f Dockerfile .\ndocker push \"${IMAGE}\"\n#end::buildandpush[]\n"
  },
  {
    "path": "ch2/query-endpoint.py",
    "content": "# Licensed to the Apache Software Foundation (ASF) under one\n# or more contributor license agreements.  See the NOTICE file\n# distributed with this work for additional information\n# regarding copyright ownership.  The ASF licenses this file\n# to you under the Apache License, Version 2.0 (the\n# \"License\"); you may not use this file except in compliance\n# with the License.  You may obtain a copy of the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing,\n# software distributed under the License is distributed on an\n# \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY\n# KIND, either express or implied.  See the License for the\n#     specific language governing permissions and limitations\n# under the License.\n\n#tag::scriptSetup[]\nimport requests\nimport numpy as np\n\nfrom tensorflow.examples.tutorials.mnist import input_data\nfrom matplotlib import pyplot as plt\n\ndef download_mnist():\n    return input_data.read_data_sets(\"MNIST_data/\", one_hot=True)\n\n\ndef gen_image(arr):\n    two_d = (np.reshape(arr, (28, 28)) * 255).astype(np.uint8)\n    plt.imshow(two_d, cmap=plt.cm.gray_r, interpolation='nearest')\n    return plt\n#end::scriptSetup[]\n\nAMBASSADOR_API_IP = \"10.53.148.167:30134\"\n\n#tag::scriptGuts[]\nmnist = download_mnist()\nbatch_xs, batch_ys = mnist.train.next_batch(1)\nchosen = 0\ngen_image(batch_xs[chosen]).show()\ndata = batch_xs[chosen].reshape((1, 784))\nfeatures = [\"X\" + str(i + 1) for i in range(0, 784)]\nrequest = {\"data\": {\"names\": features, \"ndarray\": data.tolist()}}\ndeploymentName = \"mnist-classifier\"\nuri = \"http://\" + AMBASSADOR_API_IP + \"/seldon/\" + \\\n    deploymentName + \"/api/v0.1/predictions\"\n\nresponse = requests.post(uri, json=request)\n#end::scriptGuts[]\nprint(response.status_code)\n"
  },
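  {
    "path": "ch2/parse-response.py",
    "content": "# NOTE: illustrative sketch, not part of the original example; the file name\n# is hypothetical. query-endpoint.py only prints the HTTP status code; this\n# shows how the prediction could be pulled out of the reply, assuming the\n# usual Seldon v0.1 response shape {\"data\": {\"names\": [...], \"ndarray\": [...]}}.\n\nimport numpy as np\n\n\ndef parse_prediction(response):\n    \"\"\"Extract the predicted digit from the requests.Response returned\n    at the end of query-endpoint.py.\"\"\"\n    body = response.json()\n    # One probability per digit class; the argmax is the predicted digit.\n    probabilities = np.array(body[\"data\"][\"ndarray\"][0])\n    return int(np.argmax(probabilities)), probabilities\n"
  },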
  {
    "path": "ch2_seldon_examples/pipeline_role.yaml",
    "content": "apiVersion: rbac.authorization.k8s.io/v1\nkind: Role\nmetadata:\n  namespace: kubeflow\n  name: pipeline-runner\nrules:\n- apiGroups: [\"machinelearning.seldon.io\"]\n  resources: [\"seldondeployments\"]\n  verbs: [\"*\"]\n"
  },
  {
    "path": "ch2_seldon_examples/pipeline_rolebinding.yaml",
    "content": "apiVersion: rbac.authorization.k8s.io/v1\nkind: RoleBinding\nmetadata:\n  name: pipeline-runner\n  namespace: kubeflow\nsubjects:\n- kind: ServiceAccount\n  name: pipeline-runner\n  namespace: kubeflow\nroleRef:\n  kind: Role\n  name: pipeline-runner\n  apiGroup: rbac.authorization.k8s.io\n"
  },
  {
    "path": "ch2_seldon_examples/pv-claim.yaml",
    "content": "kind: PersistentVolumeClaim\r\napiVersion: v1\r\nmetadata:\r\n  name: \"nfs-1\"\r\nspec:\r\n  storageClassName: manual\r\n  accessModes:\r\n    - ReadWriteOnce\r\n  resources:\r\n    requests:\r\n      storage: 3Gi\r\n"
  },
  {
    "path": "ch2_seldon_examples/pv-volume.yaml",
    "content": "kind: PersistentVolume\r\napiVersion: v1\r\nmetadata:\r\n  name: task-pv-volume\r\n  labels:\r\n    type: local\r\nspec:\r\n  storageClassName: manual\r\n  capacity:\r\n    storage: 10Gi\r\n  accessModes:\r\n    - ReadWriteOnce\r\n  hostPath:\r\n    path: \"/mnt/data\""
  },
  {
    "path": "ch2_seldon_examples/request_example.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 2,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Collecting matplotlib\\n\",\n      \"\\u001b[?25l  Downloading https://files.pythonhosted.org/packages/57/4f/dd381ecf6c6ab9bcdaa8ea912e866dedc6e696756156d8ecc087e20817e2/matplotlib-3.1.1-cp36-cp36m-manylinux1_x86_64.whl (13.1MB)\\n\",\n      \"\\u001b[K    100% |████████████████████████████████| 13.1MB 2.7MB/s eta 0:00:01\\n\",\n      \"\\u001b[?25hRequirement already satisfied: python-dateutil>=2.1 in /opt/conda/lib/python3.6/site-packages (from matplotlib) (2.8.0)\\n\",\n      \"Collecting cycler>=0.10 (from matplotlib)\\n\",\n      \"  Downloading https://files.pythonhosted.org/packages/f7/d2/e07d3ebb2bd7af696440ce7e754c59dd546ffe1bbe732c8ab68b9c834e61/cycler-0.10.0-py2.py3-none-any.whl\\n\",\n      \"Collecting kiwisolver>=1.0.1 (from matplotlib)\\n\",\n      \"\\u001b[?25l  Downloading https://files.pythonhosted.org/packages/f8/a1/5742b56282449b1c0968197f63eae486eca2c35dcd334bab75ad524e0de1/kiwisolver-1.1.0-cp36-cp36m-manylinux1_x86_64.whl (90kB)\\n\",\n      \"\\u001b[K    100% |████████████████████████████████| 92kB 32.5MB/s ta 0:00:01\\n\",\n      \"\\u001b[?25hCollecting pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 (from matplotlib)\\n\",\n      \"\\u001b[?25l  Downloading https://files.pythonhosted.org/packages/11/fa/0160cd525c62d7abd076a070ff02b2b94de589f1a9789774f17d7c54058e/pyparsing-2.4.2-py2.py3-none-any.whl (65kB)\\n\",\n      \"\\u001b[K    100% |████████████████████████████████| 71kB 25.6MB/s ta 0:00:01\\n\",\n      \"\\u001b[?25hRequirement already satisfied: numpy>=1.11 in /opt/conda/lib/python3.6/site-packages (from matplotlib) (1.16.2)\\n\",\n      \"Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.6/site-packages (from python-dateutil>=2.1->matplotlib) (1.12.0)\\n\",\n      \"Requirement already satisfied: setuptools in /opt/conda/lib/python3.6/site-packages (from kiwisolver>=1.0.1->matplotlib) (40.9.0)\\n\",\n      \"Installing collected packages: cycler, kiwisolver, pyparsing, matplotlib\\n\",\n      \"Successfully installed cycler-0.10.0 kiwisolver-1.1.0 matplotlib-3.1.1 pyparsing-2.4.2\\n\",\n      \"\\u001b[33mYou are using pip version 19.0.1, however version 19.2.3 is available.\\n\",\n      \"You should consider upgrading via the 'pip install --upgrade pip' command.\\u001b[0m\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"!pip install matplotlib\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 3,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import requests\\n\",\n    \"import numpy as np\\n\",\n    \"\\n\",\n    \"from tensorflow.examples.tutorials.mnist import input_data\\n\",\n    \"from matplotlib import pyplot as plt\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"def download_mnist():\\n\",\n    \"    return input_data.read_data_sets(\\\"MNIST_data/\\\", one_hot = True)\\n\",\n    \"\\n\",\n    \"def gen_image(arr):\\n\",\n    \"    two_d = (np.reshape(arr, (28, 28)) * 255).astype(np.uint8)\\n\",\n    \"    plt.imshow(two_d,cmap=plt.cm.gray_r, interpolation='nearest')\\n\",\n    \"    return plt\\n\",\n    \"\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 4,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"WARNING:tensorflow:From 
<ipython-input-3-0613226129c0>:9: read_data_sets (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.\\n\",\n      \"Instructions for updating:\\n\",\n      \"Please use alternatives such as official/mnist/dataset.py from tensorflow/models.\\n\",\n      \"WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/mnist.py:260: maybe_download (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.\\n\",\n      \"Instructions for updating:\\n\",\n      \"Please write your own downloading logic.\\n\",\n      \"WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:252: _internal_retry.<locals>.wrap.<locals>.wrapped_fn (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.\\n\",\n      \"Instructions for updating:\\n\",\n      \"Please use urllib or similar directly.\\n\",\n      \"Successfully downloaded train-images-idx3-ubyte.gz 9912422 bytes.\\n\",\n      \"WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/mnist.py:262: extract_images (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.\\n\",\n      \"Instructions for updating:\\n\",\n      \"Please use tf.data to implement this functionality.\\n\",\n      \"Extracting MNIST_data/train-images-idx3-ubyte.gz\\n\",\n      \"Successfully downloaded train-labels-idx1-ubyte.gz 28881 bytes.\\n\",\n      \"WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/mnist.py:267: extract_labels (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.\\n\",\n      \"Instructions for updating:\\n\",\n      \"Please use tf.data to implement this functionality.\\n\",\n      \"Extracting MNIST_data/train-labels-idx1-ubyte.gz\\n\",\n      \"WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/mnist.py:110: dense_to_one_hot (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.\\n\",\n      \"Instructions for updating:\\n\",\n      \"Please use tf.one_hot on tensors.\\n\",\n      \"Successfully downloaded t10k-images-idx3-ubyte.gz 1648877 bytes.\\n\",\n      \"Extracting MNIST_data/t10k-images-idx3-ubyte.gz\\n\",\n      \"Successfully downloaded t10k-labels-idx1-ubyte.gz 4542 bytes.\\n\",\n      \"Extracting MNIST_data/t10k-labels-idx1-ubyte.gz\\n\",\n      \"WARNING:tensorflow:From /opt/conda/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/mnist.py:290: DataSet.__init__ (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.\\n\",\n      \"Instructions for updating:\\n\",\n      \"Please use alternatives such as official/mnist/dataset.py from tensorflow/models.\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"mnist = download_mnist()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 7,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"image/png\": 
\"iVBORw0KGgoAAAANSUhEUgAAAPsAAAD4CAYAAAAq5pAIAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAANMklEQVR4nO3dXaxV9ZnH8d9Ppr0REmE4ORDAgakYo2Ok5EhMahonZIgvIdgYTblATMxQXyCtaeIYJ1ovvMAJ0BQzklAlpaRDbWwVYohTB5uY3hCPBoUjaX0JBghyDhqiqFiVZy7Osjni2Wsf9lr7RZ7vJznZe69nrb2erPBj7b3+e++/I0IAzn3ndbsBAJ1B2IEkCDuQBGEHkiDsQBL/0MmdTZ8+PebOndvJXQKpHDx4UMePH/d4tUpht32tpF9ImiTp8YhYW7b+3LlzNTg4WGWXAEoMDAw0rLX8Mt72JEn/Lek6SZdKWm770lafD0B7VXnPvkjSmxHxdkT8TdJvJS2rpy0AdasS9lmSDo15fLhY9hW2V9ketD04MjJSYXcAqmj71fiI2BwRAxEx0NfX1+7dAWigStiPSJoz5vHsYhmAHlQl7C9Jmm97nu1vS/qhpJ31tAWgbi0PvUXE57ZXS/pfjQ69bYmIodo6A1CrSuPsEbFL0q6aegHQRnxcFkiCsANJEHYgCcIOJEHYgSQIO5AEYQeSIOxAEoQdSIKwA0kQdiAJwg4kQdiBJAg7kARhB5Ig7EAShB1IgrADSRB2IAnCDiRB2IEkCDuQBGEHkiDsQBKEHUiCsANJEHYgCcIOJEHYgSQIO5BEpSmbbR+U9KGkLyR9HhEDdTQFoH6Vwl7414g4XsPzAGgjXsYDSVQNe0j6o+2Xba8abwXbq2wP2h4cGRmpuDsAraoa9qsjYqGk6yTdbfv7Z64QEZsjYiAiBvr6+iruDkCrKoU9Io4Ut8OSnpa0qI6mANSv5bDbPt/2lC/vS1oiaX9djQGoV5Wr8f2Snrb95fP8T0Q8V0tXAGrXctgj4m1JV9TYC4A2YugNSIKwA0kQdiAJwg4kQdiBJOr4Igy+wbZt21Za/+STTzrUydl74IEHSuvr1q1rWFuxYkXd7fQ8zuxAEoQdSIKwA0kQdiAJwg4kQdiBJAg7kATj7OeANWvWNKzt2bOndNu9e/eW1j/77LPSen9/f8vbnz59unTbEydOlNabKb5+jQJndiAJwg4kQdiBJAg7kARhB5Ig7EAShB1IgnH2Djh27Fhp/eGHH670/Dt27GhYO3ToUKXnvuOOO0rrt912W2n9o48+aljbtGlT6bZPPfVUaX3+/Pml9SuvvLK0ng1ndiAJwg4kQdiBJAg7kARhB5Ig7EAShB1IgnH2Grz11lul9WXLlpXWh4aGKu1/8uTJDWu33npr6bbr168vrU+bNq20ft555eeLxx9/vGFtcHCwdNvLLrustP7cc+UzhM+ePbu0nk3TM7vtLbaHbe8fs2ya7edtv1HcTm1vmwCqmsjL+F9JuvaMZfdJ2h0R8yXtLh4D6GFNwx4RL0p6/4zFyyRtLe5vlXRjzX0BqFmrF+j6I+Jocf9dSQ1/iMz2KtuDtgdHRkZa3B2AqipfjY+IkBQl9c0RMRARA319fVV3B6BFrYb9mO2ZklTcDtfXEoB2aDXsOyWtLO6vlNT4O5YAekLTcXbb2yVdI2m67cOSfiZpraTf2b5d0juSbmlnk72ubJxbkmbNmlVarzrO/sgjjzSs3XXXXZWeu5n33nuvtL5hw4aGtZMnT5Zue/PNN5fWGUc/O03DHhHLG5QW19wLgDbi47JAEoQdSIKwA0kQdiAJwg4kwVdca9Bs2uKNGzeW1i+55JJK+2/2k8rt9Nhjj5XWDxw40LDW7Ou3N9xwQ0s9YXyc2YEkCDuQBGEHkiDsQBKEHUiCsANJEHYgCcbZO+Ciiy4qra9Zs6a0/uijj9bZzlk5depUab3Z13NnzJjRsHbnnXeWbnvVVVeV1nF2OLMDSRB2IAnCDiRB2IEkCDuQBGEHkiDsQBKMs3fApEmTSuurV68urS9durS0vnDhwrPuaaJOnDhRWn/yySdL60uWLGlYYxy9szizA0kQdiAJwg4kQdiBJAg7kARhB5Ig7EASjLP3gIsvvrhSvZ2eeeaZru0b9Wp6Zre9xfaw7f1jlj1k+4jtvcXf9e1tE0BVE3kZ/ytJ146z/OcRsaD421VvWwDq1jTsEfGipPc70AuANqpygW617deKl/lTG61ke5XtQduDIyMjFXYHoIpWw75J0nckLZB0VNL6RitGxOaIGIiIgb6+vhZ3B6CqlsIeEcci4ouIOC3pl5IW1dsWgLq1FHbbM8c8/IGk/Y3WBdAbmo6z294u6RpJ020flvQzSdfYXiApJB2U9KM29ogu2rWrfKDlnnvuKa0/+OCDdbaDCpqGPSKWj7P4iTb0AqCN+LgskARhB5Ig7EAShB1IgrADSfAV1+SGh4dL659++mlpffLkyaX1Cy644Kx7QntwZgeSIOxAEoQdSIKwA0kQdiAJwg4kQdiBJBhnT+7yyy8vrX/88cel9XvvvbfOdtBGnNmBJAg7kARhB5Ig7EAShB1IgrADSRB2IAnG2VGq2ffVFy9e3KFOUBVndiAJwg4kQdiBJAg7kARhB5Ig7EAShB1IgnH2c9z27dtL6x988EFpfcaMGXW2gy5qema3Pcf2n2y/bnvI9o+L5dNsP2/7jeJ2avvbBdCqibyM/1zSTyPiUklXSbrb9qWS7pO0OyLmS9pdPAbQo5qGPSKORsQrxf0PJR2QNEvSMklbi9W2SrqxXU0CqO6sLtDZnivpu5L2SOqPiKNF6V1J/Q22WWV70PbgyMhIhVYBVDHhsNueLOn3kn4SEV+5qhMRISnG2y4iNkfEQEQM9PX1VWoWQOsmFHbb39Jo0H8TEX8oFh+zPbOoz5RUPh0ogK5qOvRm25KekHQgIjaMKe2UtFLS2uJ2R1s6RCVDQ0Ol9VOnTpXWN27cWGc76KKJjLN/T9IKSfts7y2W3a/RkP/O9u2S3pF0S3taBFCHpmGPiD9LcoMyv1wAfEPwcVkgCcIOJEHYgSQIO5AEYQeSIOxAEoQdSIKwA0kQdiAJwg4kQdiBJAg7kARhB5Lgp6TPAWvXrm1Y27lzZ+m2CxYsKK1fccUVLfWE3sOZHUiCsANJEHYgCcIOJEHYgSQIO5AEYQeSYJz9HPDCCy80rO3bt6/Sc7/66qul9QsvvLDS86NzOLMDSRB2IAnCDiRB2IEkCDuQBGEHkiDsQBITmZ99jqRfS+qXFJI2R8QvbD8k6d8ljRSr3h8Ru9rVKBqbN29e255727ZtpfWlS5e2bd+o10Q+VPO5pJ9GxCu2p0h62fbzRe3nEbGufe0BqMtE5mc/Kulocf9D2wckzWp3YwDqdVbv2W3PlfRdSXuKRattv2Z7i+2pDbZZZXvQ9uDIyMh4qwDogAmH3fZkSb+X9JOI+EDSJknfkbRAo2f+9eNtFxGbI2IgIgb6+vpqaBlAKyYUdtvf0mjQf
xMRf5CkiDgWEV9ExGlJv5S0qH1tAqiqadhtW9ITkg5ExIYxy2eOWe0HkvbX3x6Aukzkavz3JK2QtM/23mLZ/ZKW216g0eG4g5J+1JYO0dS6dY0HRI4cOVK67eLFi0vrN910U0s9ofdM5Gr8nyV5nBJj6sA3CJ+gA5Ig7EAShB1IgrADSRB2IAnCDiTBT0mfA6ZMmdKw9uyzz3awE/QyzuxAEoQdSIKwA0kQdiAJwg4kQdiBJAg7kIQjonM7s0ckvTNm0XRJxzvWwNnp1d56tS+J3lpVZ2//FBHj/v5bR8P+tZ3bgxEx0LUGSvRqb73al0RvrepUb7yMB5Ig7EAS3Q775i7vv0yv9tarfUn01qqO9NbV9+wAOqfbZ3YAHULYgSS6Enbb19r+i+03bd/XjR4asX3Q9j7be20PdrmXLbaHbe8fs2ya7edtv1HcjjvHXpd6e8j2keLY7bV9fZd6m2P7T7Zftz1k+8fF8q4eu5K+OnLcOv6e3fYkSX+V9G+SDkt6SdLyiHi9o400YPugpIGI6PoHMGx/X9JJSb+OiH8plv2XpPcjYm3xH+XUiPiPHuntIUknuz2NdzFb0cyx04xLulHSberisSvp6xZ14Lh148y+SNKbEfF2RPxN0m8lLetCHz0vIl6U9P4Zi5dJ2lrc36rRfywd16C3nhARRyPileL+h5K+nGa8q8eupK+O6EbYZ0k6NObxYfXWfO8h6Y+2X7a9qtvNjKM/Io4W99+V1N/NZsbRdBrvTjpjmvGeOXatTH9eFRfovu7qiFgo6TpJdxcvV3tSjL4H66Wx0wlN490p40wz/nfdPHatTn9eVTfCfkTSnDGPZxfLekJEHCluhyU9rd6bivrYlzPoFrfDXe7n73ppGu/xphlXDxy7bk5/3o2wvyRpvu15tr8t6YeSdnahj6+xfX5x4US2z5e0RL03FfVOSSuL+ysl7ehiL1/RK9N4N5pmXF0+dl2f/jwiOv4n6XqNXpF/S9J/dqOHBn39s6RXi7+hbvcmabtGX9Z9ptFrG7dL+kdJuyW9Ien/JE3rod62Sdon6TWNBmtml3q7WqMv0V+TtLf4u77bx66kr44cNz4uCyTBBTogCcIOJEHYgSQIO5AEYQeSIOxAEoQdSOL/AQe88PwDu2A0AAAAAElFTkSuQmCC\\n\",\n      \"text/plain\": [\n       \"<Figure size 432x288 with 1 Axes>\"\n      ]\n     },\n     \"metadata\": {\n      \"needs_background\": \"light\"\n     },\n     \"output_type\": \"display_data\"\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"401\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"batch_xs, batch_ys = mnist.train.next_batch(1)\\n\",\n    \"chosen=0\\n\",\n    \"gen_image(batch_xs[chosen]).show()\\n\",\n    \"data = batch_xs[chosen].reshape((1,784))\\n\",\n    \"features = [\\\"X\\\"+str(i+1) for i in range (0,784)]\\n\",\n    \"request = {\\\"data\\\":{\\\"names\\\":features,\\\"ndarray\\\":data.tolist()}}\\n\",\n    \"deploymentName = \\\"mnist-classifier\\\"\\n\",\n    \"uri = \\\"http://istio-ingressgateway.istio-system.svc.cluster.local/seldon/\\\"+deploymentName+\\\"/api/v0.1/predictions\\\"\\n\",\n    \"\\n\",\n    \"response = requests.post(\\n\",\n    \"    uri,\\n\",\n    \"    json=request)\\n\",\n    \"\\n\",\n    \"print(response.status_code)\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 9,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Origin authentication failed.\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"print(response.text)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": []\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"Python 3\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.7\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "ch2_seldon_examples/run_example.sh",
    "content": "#!/bin/bash\n#tag::buildPipeline[]\ndsl-compile --py train_pipeline.py --output job.yaml\n#end::buildPipeline[]\n#tag::connectToWebUI[]\n# If you're on minikube and not using a loadbalancer:\nminikube service --url -n istio-system istio-ingressgateway\n# If your on GCP https://<kf_app_name>.endpoints.<gcp_project_name>.cloud.goog/\n# If you're on vanilla K8s\nINGRESS_HOST=$(kubectl -n istio-system get service istio-ingressgateway \\\n\t\t       -o jsonpath='{.status.loadBalancer.ingress[0].ip}')\nexport INGRESS_HOST\nINGRESS_PORT=$(kubectl -n istio-system get service istio-ingressgateway \\\n\t       -o jsonpath='{.spec.ports[?(@.name==\"http2\")].port}')\nexport INGRESS_PORT\nSECURE_INGRESS_PORT=$(kubectl -n istio-system get service istio-ingressgateway \\\n\t\t      -o jsonpath='{.spec.ports[?(@.name==\"https\")].port}')\nexport SECURE_INGRESS_PORT\n\nkubectl get svc istio-ingressgateway -n istio-system\n#end::connectToWebUI[]\n"
  },
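  {
    "path": "ch2_seldon_examples/upload_pipeline_example.py",
    "content": "# A minimal sketch of submitting the compiled job.yaml programmatically\n# instead of uploading it through the web UI. Assumes the KFP SDK is\n# installed and the pipelines API is reachable (in-cluster or through a\n# port-forward); the experiment and run names here are illustrative.\nimport kfp\n\nclient = kfp.Client()\nexperiment = client.create_experiment(\"seldon-example\")\nrun = client.run_pipeline(experiment.id, \"mnist-classifier run\",\n                          pipeline_package_path=\"job.yaml\")\nprint(run.id)\n"
  },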
  {
    "path": "ch2_seldon_examples/setup_example.sh",
    "content": "#!/bin/bash\n\nset -ex\n\necho \"Setting up example\"\n\nunset ch2_example_path\nch2_example_path=\"$( cd \"$( dirname \"${BASH_SOURCE[0]}\" )\" >/dev/null 2>&1 && pwd )\"\necho \"Using path ${ch2_example_path} for our example path\"\nexample_path=$(dirname \"${ch2_example_path}\")\n#tag::generate_kf_app_p1[]\n# Pick the correct config file for your platform from\n# https://github.com/kubeflow/manifests/tree/[version]/kfdef\n# You can download & edit the configuration at this point if you need to.\n# For generic k8s with istio:\nMANIFEST_BRANCH=${MANIFEST_BRANCH:-v1.0-branch}\nexport MANIFEST_BRANCH\nMANIFEST_VERSION=${MANIFEST_VERSION:-v1.0.1}\nexport MANIFEST_VERSION\n\nKF_PROJECT_NAME=${KF_PROJECT_NAME:-hello-kf-${PLATFORM}}\nexport KF_PROJECT_NAME\nmkdir \"${KF_PROJECT_NAME}\"\npushd \"${KF_PROJECT_NAME}\"\n\nmanifest_root=https://raw.githubusercontent.com/kubeflow/manifests/\n# On most enviroments this will create a \"vanilla\" kubeflow install using istio.\nKFDEF=${manifest_root}${MANIFEST_BRANCH}/kfdef/kfctl_k8s_istio.${MANIFEST_VERSION}.yaml\n#end::generate_kf_app_p1[]\n# On GCP this will create a cluster with basic authentication\nif [ \"$PLATFORM\" == \"gcp\" ]; then\n  KFDEF=${manifest_root}${MANIFEST_BRANCH}/kfdef/kfctl_gcp_iap.${MANIFEST_VERSION}.yaml\n  # Temp hack\n  cp \"${example_path}/kfctl_gcp_iap.v1.0.1.yaml\" ./\n  KFDEF=./kfctl_gcp_iap.v1.0.1.yaml\n  # Set up IAP\n  # TODO(holden)\n  # Set up environment variables for GCP\n  export PROJECT=${PROJECT:-\"<your GCP project name>\"}\n  gcloud config set project \"${PROJECT}\"\n  export ZONE=${ZONE:-\"<your GCP zone>\"}\n  gcloud config set compute/zone \"${ZONE}\"\nfi\npwd\n#tag::generate_kf_app_p2[]\nkfctl apply -f $KFDEF -V\necho $?\n\npopd\n#end::generate_kf_app_p2[]\n\n\n# TODO(trevor): what version/tag?\n#tag::cloneSeldonExample[]\n# Clone the base seldon example\ngit clone https://github.com/kubeflow/example-seldon\n#end::cloneSeldonExample[]\n"
  },
  {
    "path": "ch2_seldon_examples/tf_mnist_no_seldon_pipeline.py",
    "content": "# Copyright 2019 Google LLC\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\");\n# you may not use this file except in compliance with the License.\n# You may obtain a copy of the License at\n#\n#      http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n\"\"\"\nKubeflow Pipelines MNIST example\n\nRun this script to compile pipeline\n\"\"\"\n\nimport kfp.dsl as dsl\nimport kfp.gcp as gcp\nimport kfp.onprem as onprem\n\ngcs_or_pvc = 'PVC'\n\n\n@dsl.pipeline(name='MNIST',\n              description='A pipeline to train and serve the MNIST example.')\ndef mnist_pipeline(gcs_bucket=None,\n                   train_steps='200',\n                   learning_rate='0.01',\n                   batch_size='100'):\n    \"\"\"\n    Pipeline with three stages:\n      1. train an MNIST classifier\n      2. deploy a tf-serving instance to the cluster\n      3. deploy a web-ui to interact with it\n    \"\"\"\n\n    vop = None\n    volume = None\n    if gcs_or_pvc == \"PVC\":\n        vop = dsl.VolumeOp(name=\"create_pvc\",\n                           resource_name=\"nfs-1\",\n                           modes=dsl.VOLUME_MODE_RWO,\n                           size=\"10G\")\n        volume = vop.volume\n\n    train = dsl.ContainerOp(\n        name='train',\n        image=\n        'gcr.io/kubeflow-examples/mnist/model:v20190304-v0.2-176-g15d997b',\n        arguments=[\n            \"/opt/model.py\", \"--tf-export-dir\", gcs_bucket or \"/mnt\",\n            \"--tf-train-steps\", train_steps, \"--tf-batch-size\", batch_size,\n            \"--tf-learning-rate\", learning_rate\n        ])\n\n    serve_args = [\n        '--model-export-path', gcs_bucket or \"/mnt\", '--server-name',\n        \"mnist-service\"\n    ]\n    if gcs_or_pvc != 'GCS':\n        serve_args.extend(\n            ['--cluster-name', \"mnist-pipeline\", '--pvc-name', volume])\n\n    serve = dsl.ContainerOp(\n        name='serve',\n        image='gcr.io/ml-pipeline/ml-pipeline-kubeflow-deployer:'\n        '7775692adf28d6f79098e76e839986c9ee55dd61',\n        arguments=serve_args)\n    serve.after(train)\n\n    webui_args = [\n        '--image', 'gcr.io/kubeflow-examples/mnist/web-ui:'\n        'v20190304-v0.2-176-g15d997b-pipelines', '--name', 'web-ui',\n        '--container-port', '5000', '--service-port', '80', '--service-type',\n        \"LoadBalancer\"\n    ]\n\n    web_ui = dsl.ContainerOp(\n        name='web-ui',\n        image='gcr.io/kubeflow-examples/mnist/deploy-service:latest',\n        arguments=webui_args)\n    web_ui.after(serve)\n\n    steps = [train, serve, web_ui]\n    for step in steps:\n        if gcs_or_pvc == 'GCS':\n            step.apply(gcp.use_gcp_secret('user-gcp-sa'))\n        else:\n            step.after(vop)\n            step.add_pvolumes({\"/mnt\": volume})\n\n\nif __name__ == '__main__':\n    import kfp.compiler as compiler\n    compiler.Compiler().compile(mnist_pipeline, __file__ + '.tar.gz')\n"
  },
  {
    "path": "ch2_seldon_examples/tiller_rbac.yaml",
    "content": "apiVersion: v1\nkind: ServiceAccount\nmetadata:\n  name: tiller\n  namespace: kube-system\n---\napiVersion: rbac.authorization.k8s.io/v1\nkind: ClusterRoleBinding\nmetadata:\n  name: tiller\nroleRef:\n  apiGroup: rbac.authorization.k8s.io\n  kind: ClusterRole\n  name: cluster-admin\nsubjects:\n  - kind: ServiceAccount\n    name: tiller\n    namespace: kube-system"
  },
  {
    "path": "ch2_seldon_examples/train_pipeline.py",
    "content": "import kfp.dsl as dsl\nimport kfp.gcp as gcp\nimport kfp.onprem as onprem\n\nfrom string import Template\nimport json\n\n\n@dsl.pipeline(name='Simple sci-kit KF Pipeline',\n              description='A simple end to end sci-kit seldon kf pipeline')\ndef mnist_train_pipeline(docker_org=\"index.docker.io/seldonio\",\n                         train_container_version=\"0.2\",\n                         serve_container_version=\"0.1\"):\n\n    vop = dsl.VolumeOp(name=\"create_pvc\",\n                       resource_name=\"nfs-1\",\n                       modes=dsl.VOLUME_MODE_RWO,\n                       size=\"10G\")\n    volume = vop.volume\n    train = dsl.ContainerOp(\n        name='sk-train',\n        image=\n        f\"{docker_org}/skmnistclassifier_trainer:{train_container_version}\",\n        pvolumes={\"/data\": volume})\n\n    seldon_serving_json_template = Template(\"\"\"\n{\n\t\"apiVersion\": \"machinelearning.seldon.io/v1alpha2\",\n\t\"kind\": \"SeldonDeployment\",\n\t\"metadata\": {\n\t\t\"labels\": {\n\t\t\t\"app\": \"seldon\"\n\t\t},\n\t\t\"name\": \"mnist-classifier\"\n\t},\n\t\"spec\": {\n\t\t\"annotations\": {\n\t\t\t\"deployment_version\": \"v1\",\n\t\t\t\"project_name\": \"MNIST Example\"\n\t\t},\n\t\t\"name\": \"mnist-classifier\",\n\t\t\"predictors\": [\n\t\t\t{\n\t\t\t\t\"annotations\": {\n\t\t\t\t\t\"predictor_version\": \"v1\"\n\t\t\t\t},\n\t\t\t\t\"componentSpecs\": [\n\t\t\t\t\t{\n\t\t\t\t\t\t\"spec\": {\n\t\t\t\t\t\t\t\"containers\": [\n\t\t\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\t\t\"image\": \"$dockerreposerving:$dockertagserving\",\n\t\t\t\t\t\t\t\t\t\"imagePullPolicy\": \"Always\",\n\t\t\t\t\t\t\t\t\t\"name\": \"mnist-classifier\",\n\t\t\t\t\t\t\t\t\t\"volumeMounts\": [\n\t\t\t\t\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\t\t\t\t\"mountPath\": \"/data\",\n\t\t\t\t\t\t\t\t\t\t\t\"name\": \"persistent-storage\"\n\t\t\t\t\t\t\t\t\t\t}\n\t\t\t\t\t\t\t\t\t]\n\t\t\t\t\t\t\t\t}\n\t\t\t\t\t\t\t],\n\t\t\t\t\t\t\t\"terminationGracePeriodSeconds\": 1,\n\t\t\t\t\t\t\t\"volumes\": [\n\t\t\t\t\t\t\t\t{\n\t\t\t\t\t\t\t\t\t\"name\": \"persistent-storage\",\n\t\t\t\t\t\t\t\t\t\"persistentVolumeClaim\": {\n\t\t\t\t\t\t\t\t\t\t\t\"claimName\": \"$modelpvc\"\n\t\t\t\t\t\t\t\t\t}\n\t\t\t\t\t\t\t\t}\n\t\t\t\t\t\t\t]\n\t\t\t\t\t\t}\n\t\t\t\t\t}\n\t\t\t\t],\n\t\t\t\t\"graph\": {\n\t\t\t\t\t\"children\": [],\n\t\t\t\t\t\"endpoint\": {\n\t\t\t\t\t\t\"type\": \"REST\"\n\t\t\t\t\t},\n\t\t\t\t\t\"name\": \"mnist-classifier\",\n\t\t\t\t\t\"type\": \"MODEL\"\n\t\t\t\t},\n\t\t\t\t\"name\": \"mnist-classifier\",\n\t\t\t\t\"replicas\": 1\n\t\t\t}\n\t\t]\n\t}\n}    \n\"\"\")\n    seldon_serving_json = seldon_serving_json_template.substitute({\n        'dockerreposerving':\n        f\"{docker_org}/skmnistclassifier_runtime\",\n        'dockertagserving':\n        str(serve_container_version),\n        'modelpvc':\n        vop.outputs[\"name\"]\n    })\n\n    seldon_deployment = json.loads(seldon_serving_json)\n\n    serve = dsl.ResourceOp(\n        name='serve',\n        k8s_resource=seldon_deployment,\n        success_condition='status.state == Available').after(train)\n\n\n# If we're called directly create an expirement and run\nif __name__ == '__main__':\n    pipeline_func = mnist_train_pipeline\n    pipeline_filename = pipeline_func.__name__ + '.pipeline.zip'\n    import kfp.compiler as compiler\n    compiler.Compiler().compile(pipeline_func, pipeline_filename)\n    expirement_name = \"cheese\"\n    experiment = client.create_experiment(expirement_name)\n    run_name = pipeline_func.__name__ + ' run'\n 
   run_result = client.run_pipeline(experiment.id, run_name,\n                                     pipeline_filename, arguments)\n    print(run_result)\n"
  },
  {
    "path": "ch9/ctscans/DICOM Denoising Pipeline.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Defaulting to user installation because normal site-packages is not writeable\\n\",\n      \"Collecting kfp\\n\",\n      \"  Downloading kfp-0.5.1.tar.gz (119 kB)\\n\",\n      \"\\u001b[K     |████████████████████████████████| 119 kB 3.5 MB/s eta 0:00:01\\n\",\n      \"\\u001b[?25hRequirement already satisfied: PyYAML in /usr/local/lib/python3.6/dist-packages (from kfp) (5.3)\\n\",\n      \"Requirement already satisfied: google-cloud-storage>=1.13.0 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.25.0)\\n\",\n      \"Requirement already satisfied: kubernetes<12.0.0,>=8.0.0 in /usr/local/lib/python3.6/dist-packages (from kfp) (10.0.1)\\n\",\n      \"Requirement already satisfied: google-auth>=1.6.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.11.0)\\n\",\n      \"Collecting requests_toolbelt>=0.8.0\\n\",\n      \"  Downloading requests_toolbelt-0.9.1-py2.py3-none-any.whl (54 kB)\\n\",\n      \"\\u001b[K     |████████████████████████████████| 54 kB 4.0 MB/s  eta 0:00:01\\n\",\n      \"\\u001b[?25hRequirement already satisfied: cloudpickle in /usr/local/lib/python3.6/dist-packages (from kfp) (1.2.2)\\n\",\n      \"Collecting kfp-server-api<0.6.0,>=0.2.5\\n\",\n      \"  Downloading kfp-server-api-0.5.0.tar.gz (39 kB)\\n\",\n      \"Requirement already satisfied: jsonschema>=3.0.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (3.2.0)\\n\",\n      \"Collecting tabulate\\n\",\n      \"  Downloading tabulate-0.8.7-py3-none-any.whl (24 kB)\\n\",\n      \"Collecting click\\n\",\n      \"  Downloading click-7.1.2-py2.py3-none-any.whl (82 kB)\\n\",\n      \"\\u001b[K     |████████████████████████████████| 82 kB 1.5 MB/s  eta 0:00:01\\n\",\n      \"\\u001b[?25hCollecting Deprecated\\n\",\n      \"  Downloading Deprecated-1.2.9-py2.py3-none-any.whl (8.6 kB)\\n\",\n      \"Collecting strip-hints\\n\",\n      \"  Downloading strip-hints-0.1.9.tar.gz (30 kB)\\n\",\n      \"Requirement already satisfied: google-cloud-core<2.0dev,>=1.2.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (1.3.0)\\n\",\n      \"Requirement already satisfied: google-resumable-media<0.6dev,>=0.5.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (0.5.0)\\n\",\n      \"Requirement already satisfied: six>=1.9.0 in /usr/lib/python3/dist-packages (from kubernetes<12.0.0,>=8.0.0->kfp) (1.11.0)\\n\",\n      \"Requirement already satisfied: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes<12.0.0,>=8.0.0->kfp) (0.57.0)\\n\",\n      \"Requirement already satisfied: python-dateutil>=2.5.3 in /usr/local/lib/python3.6/dist-packages (from kubernetes<12.0.0,>=8.0.0->kfp) (2.8.1)\\n\",\n      \"Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from kubernetes<12.0.0,>=8.0.0->kfp) (2.22.0)\\n\",\n      \"Requirement already satisfied: requests-oauthlib in /usr/local/lib/python3.6/dist-packages (from kubernetes<12.0.0,>=8.0.0->kfp) (1.3.0)\\n\",\n      \"Requirement already satisfied: setuptools>=21.0.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes<12.0.0,>=8.0.0->kfp) (45.1.0)\\n\",\n      \"Requirement already satisfied: urllib3>=1.24.2 in /usr/local/lib/python3.6/dist-packages (from kubernetes<12.0.0,>=8.0.0->kfp) 
(1.25.8)\\n\",\n      \"Requirement already satisfied: certifi>=14.05.14 in /usr/local/lib/python3.6/dist-packages (from kubernetes<12.0.0,>=8.0.0->kfp) (2019.11.28)\\n\",\n      \"Requirement already satisfied: rsa<4.1,>=3.1.4 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0)\\n\",\n      \"Requirement already satisfied: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0.0)\\n\",\n      \"Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (0.2.8)\\n\",\n      \"Requirement already satisfied: attrs>=17.4.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (19.3.0)\\n\",\n      \"Requirement already satisfied: importlib-metadata; python_version < \\\"3.8\\\" in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (1.4.0)\\n\",\n      \"Requirement already satisfied: pyrsistent>=0.14.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (0.15.7)\\n\",\n      \"Requirement already satisfied: wrapt<2,>=1.10 in /usr/local/lib/python3.6/dist-packages (from Deprecated->kfp) (1.11.2)\\n\",\n      \"Requirement already satisfied: wheel in /usr/lib/python3/dist-packages (from strip-hints->kfp) (0.30.0)\\n\",\n      \"Requirement already satisfied: google-api-core<2.0.0dev,>=1.16.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.16.0)\\n\",\n      \"Requirement already satisfied: idna<2.9,>=2.5 in /usr/lib/python3/dist-packages (from requests->kubernetes<12.0.0,>=8.0.0->kfp) (2.6)\\n\",\n      \"Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->kubernetes<12.0.0,>=8.0.0->kfp) (3.0.4)\\n\",\n      \"Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib->kubernetes<12.0.0,>=8.0.0->kfp) (3.1.0)\\n\",\n      \"Requirement already satisfied: pyasn1>=0.1.3 in /usr/local/lib/python3.6/dist-packages (from rsa<4.1,>=3.1.4->google-auth>=1.6.1->kfp) (0.4.8)\\n\",\n      \"Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata; python_version < \\\"3.8\\\"->jsonschema>=3.0.1->kfp) (2.1.0)\\n\",\n      \"Requirement already satisfied: protobuf>=3.4.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (3.11.2)\\n\",\n      \"Requirement already satisfied: pytz in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (2019.3)\\n\",\n      \"Requirement already satisfied: googleapis-common-protos<2.0dev,>=1.6.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.51.0)\\n\",\n      \"Building wheels for collected packages: kfp, kfp-server-api, strip-hints\\n\",\n      \"  Building wheel for kfp (setup.py) ... 
\\u001b[?25ldone\\n\",\n      \"\\u001b[?25h  Created wheel for kfp: filename=kfp-0.5.1-py3-none-any.whl size=163151 sha256=da5b540ae9834d37659146f0576997ffd8f7a7e2b305e1eb7b2a99dd4745930b\\n\",\n      \"  Stored in directory: /home/jovyan/.cache/pip/wheels/2f/26/f9/e3836cb6e6cabd63ef912304e18a852ac29cb870a4a0b85f98\\n\",\n      \"  Building wheel for kfp-server-api (setup.py) ... \\u001b[?25ldone\\n\",\n      \"\\u001b[?25h  Created wheel for kfp-server-api: filename=kfp_server_api-0.5.0-py3-none-any.whl size=106319 sha256=84f55948cc254c0f836dffdfd51574a828ae8a503a2ca9198acf7a27ca2aaea7\\n\",\n      \"  Stored in directory: /home/jovyan/.cache/pip/wheels/73/36/4e/bfe2efeeea4f74f04984ebe1d44136202b72191302f4760951\\n\",\n      \"  Building wheel for strip-hints (setup.py) ... \\u001b[?25ldone\\n\",\n      \"\\u001b[?25h  Created wheel for strip-hints: filename=strip_hints-0.1.9-py2.py3-none-any.whl size=24671 sha256=3bcfd573a91f5f4c46d23509ac3fee9a0cf351b414e00ed505a8f71d0e6a1141\\n\",\n      \"  Stored in directory: /home/jovyan/.cache/pip/wheels/21/6d/fa/7ed7c0560e1ef39ebabd5cc0241e7fca711660bae1ad752e2b\\n\",\n      \"Successfully built kfp kfp-server-api strip-hints\\n\",\n      \"Installing collected packages: requests-toolbelt, kfp-server-api, tabulate, click, Deprecated, strip-hints, kfp\\n\",\n      \"\\u001b[33m  WARNING: The script tabulate is installed in '/home/jovyan/.local/bin' which is not on PATH.\\n\",\n      \"  Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\\u001b[0m\\n\",\n      \"\\u001b[33m  WARNING: The script strip-hints is installed in '/home/jovyan/.local/bin' which is not on PATH.\\n\",\n      \"  Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\\u001b[0m\\n\",\n      \"\\u001b[33m  WARNING: The scripts dsl-compile and kfp are installed in '/home/jovyan/.local/bin' which is not on PATH.\\n\",\n      \"  Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\\u001b[0m\\n\",\n      \"Successfully installed Deprecated-1.2.9 click-7.1.2 kfp-0.5.1 kfp-server-api-0.5.0 requests-toolbelt-0.9.1 strip-hints-0.1.9 tabulate-0.8.7\\n\",\n      \"\\u001b[33mWARNING: You are using pip version 20.0.2; however, version 20.1 is available.\\n\",\n      \"You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.\\u001b[0m\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"!pip3 install kfp\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import kfp\\n\",\n    \"import kubernetes\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 2,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"\\n\",\n    \"container_manifest = {\\n\",\n    \"    \\\"apiVersion\\\": \\\"sparkoperator.k8s.io/v1beta2\\\",\\n\",\n    \"    \\\"kind\\\": \\\"SparkApplication\\\",\\n\",\n    \"    \\\"metadata\\\": {\\n\",\n    \"        \\\"name\\\": \\\"spark-app\\\",\\n\",\n    \"        \\\"namespace\\\": \\\"kubeflow\\\"\\n\",\n    \"    },\\n\",\n    \"    \\\"spec\\\": {\\n\",\n    \"        \\\"type\\\": \\\"Scala\\\",\\n\",\n    \"        \\\"mode\\\": \\\"cluster\\\",\\n\",\n    \"        \\\"image\\\": \\\"docker.io/rawkintrevo/covid-basis-vectors:0.2.0\\\",\\n\",\n    \"        \\\"imagePullPolicy\\\": \\\"Always\\\",\\n\",\n    \"        
\\\"hadoopConf\\\": {\\n\",\n    \"            \\\"fs.gs.project.id\\\": \\\"kubeflow-hacky-hacky\\\",\\n\",\n    \"            \\\"fs.gs.system.bucket\\\": \\\"covid-dicoms\\\",\\n\",\n    \"            \\\"fs.gs.impl\\\" : \\\"com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem\\\",\\n\",\n    \"            \\\"google.cloud.auth.service.account.enable\\\": \\\"true\\\",\\n\",\n    \"            \\\"google.cloud.auth.service.account.json.keyfile\\\": \\\"/mnt/secrets/user-gcp-sa.json\\\",\\n\",\n    \"        },\\n\",\n    \"        \\\"mainClass\\\": \\\"org.rawkintrevo.covid.App\\\",\\n\",\n    \"        \\\"mainApplicationFile\\\": \\\"local:///covid-0.1-jar-with-dependencies.jar\\\", # See the Dockerfile\\n\",\n    \"        \\\"arguments\\\": [\\\"245\\\", \\\"15\\\", \\\"1\\\"],\\n\",\n    \"        \\\"sparkVersion\\\": \\\"2.4.5\\\",\\n\",\n    \"        \\\"restartPolicy\\\": {\\n\",\n    \"            \\\"type\\\": \\\"Never\\\"\\n\",\n    \"        },\\n\",\n    \"        \\\"driver\\\": {\\n\",\n    \"            \\\"cores\\\": 1,\\n\",\n    \"            \\\"secrets\\\": [\\n\",\n    \"                {\\\"name\\\": \\\"user-gcp-sa\\\",\\n\",\n    \"                 \\\"path\\\": \\\"/mnt/secrets\\\",\\n\",\n    \"                 \\\"secretType\\\": \\\"GCPServiceAccount\\\"\\n\",\n    \"                 }\\n\",\n    \"            ],\\n\",\n    \"\\n\",\n    \"            \\\"coreLimit\\\": \\\"1200m\\\",\\n\",\n    \"            \\\"memory\\\": \\\"512m\\\",\\n\",\n    \"            \\\"labels\\\": {\\n\",\n    \"                \\\"version\\\": \\\"2.4.5\\\",\\n\",\n    \"            },\\n\",\n    \"            \\\"serviceAccount\\\": \\\"spark-operatoroperator-sa\\\", # also try spark-operatoroperator-sa\\n\",\n    \"        },\\n\",\n    \"        \\\"executor\\\": {\\n\",\n    \"            \\\"cores\\\": 1,\\n\",\n    \"            \\\"secrets\\\": [\\n\",\n    \"                {\\\"name\\\": \\\"user-gcp-sa\\\",\\n\",\n    \"                 \\\"path\\\": \\\"/mnt/secrets\\\",\\n\",\n    \"                 \\\"secretType\\\": \\\"GCPServiceAccount\\\"\\n\",\n    \"                 }\\n\",\n    \"            ],\\n\",\n    \"            \\\"instances\\\": 4,\\n\",\n    \"            \\\"memory\\\": \\\"4084m\\\"\\n\",\n    \"        },\\n\",\n    \"        \\\"labels\\\": {\\n\",\n    \"            \\\"version\\\": \\\"2.4.5\\\"\\n\",\n    \"        },\\n\",\n    \"\\n\",\n    \"    }\\n\",\n    \"}\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 16,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"from kfp.gcp import use_gcp_secret\\n\",\n    \"@kfp.dsl.pipeline(\\n\",\n    \"    name=\\\"Covid DICOM Pipe v2\\\",\\n\",\n    \"    description=\\\"Create Basis Vectors for Lung Images\\\"\\n\",\n    \")\\n\",\n    \"def covid_dicom_pipeline():\\n\",\n    \"    vop = kfp.dsl.VolumeOp(\\n\",\n    \"        name=\\\"requisition-PVC\\\",\\n\",\n    \"        resource_name=\\\"datapvc\\\",\\n\",\n    \"        size=\\\"20Gi\\\", #10 Gi blows up...\\n\",\n    \"        modes=kfp.dsl.VOLUME_MODE_RWO\\n\",\n    \"    )\\n\",\n    \"    step1 = kfp.dsl.ContainerOp(\\n\",\n    \"        name=\\\"download-dicom\\\",\\n\",\n    \"        image=\\\"rawkintrevo/download-dicom:0.0.0.4\\\",\\n\",\n    \"        command=[\\\"/run.sh\\\"],\\n\",\n    \"        pvolumes={\\\"/data\\\": vop.volume}\\n\",\n    \"    )\\n\",\n    \"    step2 = kfp.dsl.ContainerOp(\\n\",\n    \"        name=\\\"convert-dicoms-to-vectors\\\",\\n\",\n    \"        
image=\\\"rawkintrevo/covid-prep-dicom:0.9.5\\\",\\n\",\n    \"        arguments=[\\n\",\n    \"            '--bucket_name', \\\"covid-dicoms\\\",\\n\",\n    \"        ],\\n\",\n    \"        command=[\\\"python\\\", \\\"/program.py\\\"],\\n\",\n    \"        pvolumes={\\\"/mnt/data\\\": step1.pvolume}\\n\",\n    \"    ).apply(kfp.gcp.use_gcp_secret(secret_name='user-gcp-sa'))\\n\",\n    \"    rop = kfp.dsl.ResourceOp(\\n\",\n    \"        name=\\\"calculate-basis-vectors\\\",\\n\",\n    \"        k8s_resource=container_manifest,\\n\",\n    \"        action=\\\"create\\\",\\n\",\n    \"        success_condition=\\\"status.applicationState.state == COMPLETED\\\"\\n\",\n    \"    ).after(step2)\\n\",\n    \"    pyviz = kfp.dsl.ContainerOp(\\n\",\n    \"        name=\\\"visualize-slice-of-dicom\\\",\\n\",\n    \"        image=\\\"rawkintrevo/visualize-dicom-output:0.0.11\\\",\\n\",\n    \"        command=[\\\"python\\\", \\\"/program.py\\\"],\\n\",\n    \"        arguments=[\\n\",\n    \"            '--bucket_name', \\\"covid-dicoms\\\",\\n\",\n    \"        ],\\n\",\n    \"    ).apply(kfp.gcp.use_gcp_secret(secret_name='user-gcp-sa')).after(rop)\\n\",\n    \"    \\n\",\n    \"\\n\",\n    \"kfp.compiler.Compiler().compile(covid_dicom_pipeline,\\\"dicom-pipeline-2.zip\\\")\\n\",\n    \"client = kfp.Client()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 17,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/html\": [\n       \"Experiment link <a href=\\\"/pipeline/#/experiments/details/a7292089-5186-4e53-b0bb-9264dfbb9775\\\" target=\\\"_blank\\\" >here</a>\"\n      ],\n      \"text/plain\": [\n       \"<IPython.core.display.HTML object>\"\n      ]\n     },\n     \"metadata\": {},\n     \"output_type\": \"display_data\"\n    },\n    {\n     \"data\": {\n      \"text/html\": [\n       \"Run link <a href=\\\"/pipeline/#/runs/details/0f3f3d01-f6c4-4216-8e03-396c49fa040f\\\" target=\\\"_blank\\\" >here</a>\"\n      ],\n      \"text/plain\": [\n       \"<IPython.core.display.HTML object>\"\n      ]\n     },\n     \"metadata\": {},\n     \"output_type\": \"display_data\"\n    }\n   ],\n   \"source\": [\n    \"\\n\",\n    \"my_experiment = client.create_experiment(name='my-experiments')\\n\",\n    \"my_run = client.run_pipeline(my_experiment.id, 'my-run1', 'dicom-pipeline-2.zip')\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": []\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"Python 3\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.9\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 4\n}\n"
  },
  {
    "path": "ch9/ctscans/calculate-basis-vectors/Dockerfile",
    "content": "FROM gcr.io/spark-operator/spark:v2.4.5-gcs-prometheus\n\nCOPY target/covid-0.1-jar-with-dependencies.jar /\n\n## Someday soon we'll live in a world where this hack is unnessecary\n# https://github.com/GoogleCloudDataproc/hadoop-connectors/issues/323\nCMD rm /opt/spark/jars/gcs-connector-latest-hadoop2.jar\nADD https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop2-2.0.1.jar $SPARK_HOME/jars\n\nENTRYPOINT [\"/opt/entrypoint.sh\"]\n"
  },
  {
    "path": "ch9/ctscans/calculate-basis-vectors/build-component.sh",
    "content": "#!/usr/bin/env bash\n\nimage_name=rawkintrevo/covid-basis-vectors # Specify the image name here\nimage_tag=0.2.0\nfull_image_name=${image_name}:${image_tag}\n\ncd \"$(dirname \"$0\")\"\ndocker build -t \"${full_image_name}\" .\ndocker push \"$full_image_name\"\n"
  },
  {
    "path": "ch9/ctscans/calculate-basis-vectors/pom.xml",
    "content": "<project xmlns=\"http://maven.apache.org/POM/4.0.0\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd\">\n  <modelVersion>4.0.0</modelVersion>\n  <groupId>org.rawkintrevo</groupId>\n  <artifactId>covid</artifactId>\n  <version>0.1</version>\n  <inceptionYear>2020</inceptionYear>\n  <properties>\n    <scala.version>2.11.12</scala.version>\n  </properties>\n\n  <repositories>\n    <repository>\n      <id>scala-tools.org</id>\n      <name>Scala-Tools Maven2 Repository</name>\n      <url>http://scala-tools.org/repo-releases</url>\n    </repository>\n  </repositories>\n\n  <pluginRepositories>\n    <pluginRepository>\n      <id>scala-tools.org</id>\n      <name>Scala-Tools Maven2 Repository</name>\n      <url>http://scala-tools.org/repo-releases</url>\n    </pluginRepository>\n  </pluginRepositories>\n\n  <dependencies>\n    <dependency>\n      <groupId>org.scala-lang</groupId>\n      <artifactId>scala-library</artifactId>\n      <version>${scala.version}</version>\n    </dependency>\n    <dependency>\n      <groupId>junit</groupId>\n      <artifactId>junit</artifactId>\n      <version>4.4</version>\n      <scope>test</scope>\n    </dependency>\n    <dependency>\n      <groupId>org.specs</groupId>\n      <artifactId>specs</artifactId>\n      <version>1.2.5</version>\n      <scope>test</scope>\n    </dependency>\n\n    <dependency>\n      <groupId>org.apache.mahout</groupId>\n      <artifactId>mahout-core_2.11</artifactId>\n      <version>14.1-SNAPSHOT</version>\n    </dependency>\n\n    <dependency>\n      <groupId>org.apache.mahout</groupId>\n      <artifactId>mahout-hdfs_2.11</artifactId>\n      <version>14.1-SNAPSHOT</version>\n    </dependency>\n\n    <dependency>\n      <groupId>org.apache.mahout</groupId>\n      <artifactId>mahout-spark_2.11</artifactId>\n      <version>14.1-SNAPSHOT</version>\n    </dependency>\n\n\n  </dependencies>\n\n  <build>\n    <sourceDirectory>src/main/scala</sourceDirectory>\n    <testSourceDirectory>src/test/scala</testSourceDirectory>\n    <plugins>\n      <plugin>\n        <groupId>org.scala-tools</groupId>\n        <artifactId>maven-scala-plugin</artifactId>\n        <executions>\n          <execution>\n            <goals>\n              <goal>compile</goal>\n              <goal>testCompile</goal>\n            </goals>\n          </execution>\n        </executions>\n        <configuration>\n          <scalaVersion>${scala.version}</scalaVersion>\n          <args>\n            <arg>-target:jvm-1.5</arg>\n          </args>\n        </configuration>\n      </plugin>\n      <plugin>\n        <groupId>org.apache.maven.plugins</groupId>\n        <artifactId>maven-eclipse-plugin</artifactId>\n        <configuration>\n          <downloadSources>true</downloadSources>\n          <buildcommands>\n            <buildcommand>ch.epfl.lamp.sdt.core.scalabuilder</buildcommand>\n          </buildcommands>\n          <additionalProjectnatures>\n            <projectnature>ch.epfl.lamp.sdt.core.scalanature</projectnature>\n          </additionalProjectnatures>\n          <classpathContainers>\n            <classpathContainer>org.eclipse.jdt.launching.JRE_CONTAINER</classpathContainer>\n            <classpathContainer>ch.epfl.lamp.sdt.launching.SCALA_CONTAINER</classpathContainer>\n          </classpathContainers>\n        </configuration>\n      </plugin>\n      <!-- This builds the fat JAR -->\n      <plugin>\n        
<artifactId>maven-assembly-plugin</artifactId>\n        <configuration>\n          <archive>\n            <manifest>\n              <mainClass>org.rawkintrevo.covid.App</mainClass>\n            </manifest>\n          </archive>\n          <descriptorRefs>\n            <descriptorRef>jar-with-dependencies</descriptorRef>\n          </descriptorRefs>\n        </configuration>\n        <executions>\n          <execution>\n            <id>make-assembly</id>\n            <phase>package</phase>\n            <goals>\n              <goal>single</goal>\n            </goals>\n          </execution>\n        </executions>\n      </plugin>\n    </plugins>\n  </build>\n  <reporting>\n    <plugins>\n      <plugin>\n        <groupId>org.scala-tools</groupId>\n        <artifactId>maven-scala-plugin</artifactId>\n        <configuration>\n          <scalaVersion>${scala.version}</scalaVersion>\n        </configuration>\n      </plugin>\n    </plugins>\n  </reporting>\n</project>\n"
  },
  {
    "path": "ch9/ctscans/calculate-basis-vectors/src/main/scala/org/rawkintrevo/covid/App.scala",
    "content": "package org.rawkintrevo.covid\n\nimport org.apache.mahout.math._\nimport org.apache.mahout.math.scalabindings._\nimport org.apache.mahout.math.drm._\nimport org.apache.mahout.math.scalabindings.RLikeOps._\nimport org.apache.mahout.math.drm.RLikeDrmOps._\nimport org.apache.mahout.sparkbindings._\nimport org.apache.mahout.math.decompositions._\nimport org.apache.mahout.math.scalabindings.MahoutCollections._\n\nimport org.apache.spark.SparkContext\nimport org.apache.spark.SparkConf\n\nimport org.apache.spark.SparkFiles\n\nobject App {\n  def main(args: Array[String]) {\n\n    val conf:SparkConf = new SparkConf()\n      .setAppName(\"Calculate CT Scan Basis Vectors\")\n      .set(\"spark.kryo.referenceTracking\", \"false\")\n      .set(\"spark.kryo.registrator\", \"org.apache.mahout.sparkbindings.io.MahoutKryoRegistrator\")\n      .set(\"spark.kryoserializer.buffer\", \"32\")\n      .set(\"spark.kryoserializer.buffer.max\" , \"600m\")\n      .set(\"spark.serializer\",\t\"org.apache.spark.serializer.KryoSerializer\")\n\n    //create spark context object\n    val sc = new SparkContext(conf)\n    implicit val sdc: org.apache.mahout.sparkbindings.SparkDistributedContext = sc2sdc(sc)\n\n\n    val pathToMatrix = \"gs://covid-dicoms/s.csv\"  // todo make this an arg.\n\n    val voxelRDD:DrmRdd[Int]  = sc.textFile(pathToMatrix)\n      .map(s => dvec( s.split(\",\")\n      .map(f => f.toDouble)))\n      .zipWithIndex\n      .map(o => (o._2.toInt, o._1))\n\n    val voxelDRM = drmWrap(voxelRDD)\n\n    // k, p, q should all be cli parameters\n    // k is rank of the output e.g. the number of eigenfaces we want out.\n    // p is oversampling parameter,\n    // and q is the number of additional power iterations\n    // Read https://mahout.apache.org/users/dim-reduction/ssvd.html\n    val k = args(0).toInt\n    val p = args(1).toInt\n    val q = args(2).toInt\n\n    val(drmU, drmV, s) = dssvd(voxelDRM.t, k, p, q)\n\n    val V = drmV.checkpoint().rdd.saveAsTextFile(\"gs://covid-dicoms/drmV\")\n    val U = drmU.t.checkpoint().rdd.saveAsTextFile(\"gs://covid-dicoms/drmU\")\n\n    sc.parallelize(s.toArray,1).saveAsTextFile(\"gs://covid-dicoms/s\")\n    println(\"The job is done!\")\n  }\n}\n\n// $SPARK_HOME/bin/spark-submit --driver-memory 4G --executor-memory 4G --class org.rawkintrevo.book.App *jar"
  },
  {
    "path": "ch9/ctscans/download-dicom/Dockerfile",
    "content": "FROM gcr.io/google.com/cloudsdktool/cloud-sdk:latest\n#\n## install gsutil lightly\n#RUN  apt update \\\n#  && apt install -y wget\n#RUN wget https://storage.googleapis.com/pub/gsutil.tar.gz\n#RUN tar xfz gsutil.tar.gz -C $HOME\n#ENV PATH=\"${PATH}:$HOME/gsutil\"\n\nCOPY ./run.sh /run.sh\n"
  },
  {
    "path": "ch9/ctscans/download-dicom/build-component.sh",
    "content": "#!/usr/bin/env bash\n\nimage_name=rawkintrevo/download-dicom # Specify the image name here\nimage_tag=0.0.0.4\nfull_image_name=${image_name}:${image_tag}\n\ncd \"$(dirname \"$0\")\"\ndocker build -t \"${full_image_name}\" .\ndocker push \"$full_image_name\"\n"
  },
  {
    "path": "ch9/ctscans/download-dicom/run.sh",
    "content": "#!/usr/bin/env bash\nset -e\n\n# 1st arg- case number (leading zero required if < 10), defaults to case1\n\nif [ -z \"${1}\" ]\nthen\n      CASE=\"01\"\nelse\n      CASE=\"${1}\"\nfi\n\n\n\necho \"Downloading DICOMs\"\n# If not on GCP need to download this\ngsutil cp gs://covid-dicoms/covid-dicoms.tar.gz /tmp/covid-dicoms.tar.gz\ntar -xzf /tmp/covid-dicoms.tar.gz -C /tmp\n\nmv \"/tmp/case0${CASE}/axial\" /data/dicom\n\n\n\n"
  },
  {
    "path": "ch9/ctscans/process-dicoms-into-vectors/Dockerfile",
    "content": "FROM pydicom/dicom:v3.6.5\n\n# From https://github.com/HealthplusAI/python3-gdcm\nRUN apt update && apt install -y python-vtk6 libvtk6-dev cmake-curses-gui swig python3-dev libpython3.7-dev\n## checkinstall missing...\nRUN ln -s /opt/conda/bin/* /usr/local/bin\nRUN git clone --branch release git://git.code.sf.net/p/gdcm/gdcm\nRUN mkdir build\nRUN cd build && cmake -D CMAKE_BUILD_TYPE=Release -D CMAKE_C_FLAGS=-fPIC -D CMAKE_CXX_FLAGS=-fPIC -D GDCM_BUILD_SHARED_LIBS:BOOL=ON \\\n    -D GDCM_WRAP_PYTHON=ON -D PYTHON_EXECUTABLE=/usr/local/bin/python3.7 \\\n    -D PYTHON_INCLUDE_DIR=/usr/include/python3.7m/ -D GDCM_BUILD_SHARED_LIBS=ON -D GDCM_USE_VTK=ON ../gdcm\n## They forgot this line in instuctions, but is important...\nRUN cd build && make install\n# checkinstall -D -y --pkgversion --pkgname=python3-gdcm --pkgversion=1  # checkinstall doesn't exist in debian?\nRUN cp /usr/local/lib/gdcm.py /opt/conda/lib/python3.7/site-packages/\nRUN cp /usr/local/lib/gdcmswig.py /opt/conda/lib/python3.7/site-packages/\nRUN cp /usr/local/lib/_gdcmswig.so /opt/conda/lib/python3.7/site-packages/\nRUN cp /usr/local/lib/libgdcm* /opt/conda/lib/python3.7/site-packages/\nRUN ldconfig\n\n# RUN mkdir /data already exists in base\n# todo move these to requirements.txt\nRUN pip install numpy\nRUN pip install scipy\nRUN pip install google-cloud-storage\n\nENV GOOGLE_APPLICATION_CREDENTIALS=\"/secret/gcp-credentials/user-gcp-sa.json\"\nCOPY src/program.py /program.py\n"
  },
  {
    "path": "ch9/ctscans/process-dicoms-into-vectors/build-component.sh",
    "content": "#!/usr/bin/env bash\n\nimage_name=rawkintrevo/covid-prep-dicom # Specify the image name here\nimage_tag=0.9.5\nfull_image_name=${image_name}:${image_tag}\n\ncd \"$(dirname \"$0\")\"\ndocker build -t \"${full_image_name}\" .\ndocker push \"$full_image_name\"\n"
  },
  {
    "path": "ch9/ctscans/process-dicoms-into-vectors/data/s.150.csv",
    "content": ""
  },
  {
    "path": "ch9/ctscans/process-dicoms-into-vectors/process-dicoms-into-vectors.yaml",
    "content": "name: Process DICOMs into Vectors\ndescription: Take a number of COVID DICOMs - output a list of vectors for DS-SVD.\ninputs:\n  - {name: in, type: String, description='Input file name.'}\n  - {name: out, type: String, description='Output file name.'}\nimplementation:\n  container:\n    image: rawkintrevo/covid-prep-docim\n    command: [\n      python, /program.py,\n      {inputValue:  in},\n      inputValue:  out}\n    ]"
  },
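  {
    "path": "ch9/ctscans/process-dicoms-into-vectors/use_component_example.py",
    "content": "# A minimal sketch of consuming process-dicoms-into-vectors.yaml with the\n# KFP SDK; the pipeline name and the default argument values here are\n# illustrative, not part of the component definition.\nimport kfp\nimport kfp.dsl as dsl\nimport kfp.components as components\n\nprep_op = components.load_component_from_file(\n    \"process-dicoms-into-vectors.yaml\")\n\n\n@dsl.pipeline(name=\"DICOM prep example\",\n              description=\"Run the DICOM-to-vectors component once.\")\ndef prep_pipeline(in_file=\"/mnt/data/dicom\", out_file=\"s.csv\"):\n    # The loaded component acts as a factory for ContainerOps; arguments\n    # are passed positionally because \"in\" is a Python keyword.\n    prep_op(in_file, out_file)\n\n\nif __name__ == \"__main__\":\n    kfp.compiler.Compiler().compile(prep_pipeline, \"prep_pipeline.zip\")\n"
  },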
  {
    "path": "ch9/ctscans/process-dicoms-into-vectors/src/program.py",
    "content": "from os import listdir\nimport numpy as np\nimport pydicom\n\nimport argparse\nfrom google.cloud import storage\n\nparser = argparse.ArgumentParser(\n    description='Process DICOM Images into Vectors.')\nparser.add_argument('--input_dir',\n                    type=str,\n                    default=\"/mnt/data/dicom\",\n                    help='Directory containing DICOM Images')\nparser.add_argument('--bucket_name',\n                    type=str,\n                    help='name of bucket to write output to.')\nparser.add_argument('--output_file',\n                    type=str,\n                    default=\"s.csv\",\n                    help='file name of dcm converted to 2d numerical matrix')\n\nargs = parser.parse_args()\n\n\ndef create_3d_matrix(path):\n    dicoms = [pydicom.dcmread(f\"{path}/{f}\") for f in listdir(path)]\n    slices = [d for d in dicoms if hasattr(d, \"SliceLocation\")]\n    slices = sorted(slices, key=lambda s: s.SliceLocation)\n    ps = slices[0].PixelSpacing\n    ss = slices[0].SliceThickness\n    ax_aspect = ps[1] / ps[0]\n    sag_aspect = ps[1] / ss\n    cor_aspect = ss / ps[0]\n\n    # create 3D array\n    img_shape = list(slices[0].pixel_array.shape)\n    img_shape.append(len(slices))\n    img3d = np.zeros(img_shape)\n\n    for i, s in enumerate(slices):\n        img2d = s.pixel_array\n        img3d[:, :, i] = img2d\n\n    return {\n        \"img3d\": img3d,\n        \"img_shape\": img_shape,\n        \"ax_aspect\": ax_aspect,\n        \"sag_aspect\": sag_aspect,\n        \"cor_aspect\": cor_aspect\n    }\n\n\ndef upload_blob(bucket_name, source_file_name, destination_blob_name):\n    \"\"\"Uploads a file to the bucket.\"\"\"\n    # bucket_name = \"your-bucket-name\"\n    # source_file_name = \"local/path/to/file\"\n    # destination_blob_name = \"storage-object-name\"\n\n    storage_client = storage.Client()\n    bucket = storage_client.bucket(bucket_name)\n    blob = bucket.blob(destination_blob_name)\n\n    blob.upload_from_filename(source_file_name)\n\n    print(\"File {} uploaded to {}.\".format(source_file_name,\n                                           destination_blob_name))\n\n\ninput_dir = args.input_dir\noutput_file = args.output_file\n\nm = create_3d_matrix(f\"{input_dir}\")\nnp.savetxt(\"/tmp/s.csv\",\n           m['img3d'].reshape((-1, m['img_shape'][2])),\n           delimiter=\",\")\n\nupload_blob(args.bucket_name, \"/tmp/s.csv\", output_file)\n"
  },
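  {
    "path": "ch9/ctscans/process-dicoms-into-vectors/README.md",
    "content": "## Process DICOMs into Vectors\n\n`src/program.py` reads a directory of DICOM slices, sorts them by\n`SliceLocation`, stacks them into a 3D array, flattens that array into a\n2D matrix (one column per slice), and uploads the result as a CSV to a\nGCS bucket for the Spark/Mahout DS-SVD job.\n\nA sketch of a local invocation, assuming `GOOGLE_APPLICATION_CREDENTIALS`\npoints at a service account with write access to the bucket:\n\n```\npython program.py \\\n  --input_dir /mnt/data/dicom \\\n  --bucket_name covid-dicoms \\\n  --output_file s.csv\n```\n"
  },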
  {
    "path": "ch9/ctscans/visualize-basis-vectors/Dockerfile",
    "content": "FROM python:3-buster\n\nRUN pip install numpy\nRUN pip install matplotlib\nRUN pip install google-cloud-storage\nCOPY src/program.py /program.py\n\nCMD [\"python\" , \"/program.py\"]"
  },
  {
    "path": "ch9/ctscans/visualize-basis-vectors/build-component.sh",
    "content": "#!/usr/bin/env bash\n\nimage_name=rawkintrevo/visualize-dicom-output # Specify the image name here\nimage_tag=0.0.11\nfull_image_name=${image_name}:${image_tag}\n\ncd \"$(dirname \"$0\")\"\ndocker build -t \"${full_image_name}\" .\ndocker push \"$full_image_name\"\n"
  },
  {
    "path": "ch9/ctscans/visualize-basis-vectors/src/program.py",
    "content": "from ast import literal_eval\n\nfrom os import listdir\n\nimport matplotlib.pyplot as plt\nimport numpy as np\n\nimport argparse\nfrom google.cloud import storage\n\nparser = argparse.ArgumentParser(\n    description='Convert DRMs into DICOMs and Images')\nparser.add_argument('--bucket_name',\n                    type=str,\n                    help='name of bucket to write output to.')\nargs = parser.parse_args()\n\n\ndef read_mahout_drm(path):\n    data = {}\n    counter = 0\n    parts = [p for p in listdir(path) if \"part\"]\n    for p in parts:\n        with open(f\"{path}/{p}\", 'r') as f:\n            lines = f.read().split(\"\\n\")\n            for l in lines[:-1]:\n                counter += 1\n                t = literal_eval(l)\n                arr = np.array([t[1][i] for i in range(len(t[1].keys()))])\n                data[t[0]] = arr\n    print(f\"read {counter} lines from {path}\")\n    return data\n\n\ndef plot_3d_matrix(img3d, img_shape, ax_aspect, sag_aspect, cor_aspect):\n    # plot 3 orthogonal slices\n    a1 = plt.subplot(2, 2, 1)\n    plt.imshow(img3d[:, :, img_shape[2] // 2])\n    a1.set_aspect(ax_aspect)\n\n    a2 = plt.subplot(2, 2, 2)\n    plt.imshow(img3d[:, img_shape[1] // 2, :])\n    a2.set_aspect(sag_aspect)\n\n    a3 = plt.subplot(2, 2, 3)\n    plt.imshow(img3d[img_shape[0] // 2, :, :].T)\n    a3.set_aspect(cor_aspect)\n    plt.show(cmap=plt.cm.bone)\n\n\ndef plot_2_3d_matrices(img1, img2, aspect, slice, cmap):\n    a1 = plt.subplot(1, 2, 1)\n    plt.imshow(img1[:, slice, :], cmap=cmap)\n    a1.set_aspect(aspect)\n\n    a2 = plt.subplot(1, 2, 2)\n    plt.imshow(img2[:, slice, :], cmap=cmap)\n    a2.set_aspect(aspect)\n\n\ndef upload_blob(bucket_name, source_file_name, destination_blob_name):\n    \"\"\"Uploads a file to the bucket.\"\"\"\n    # bucket_name = \"your-bucket-name\"\n    # source_file_name = \"local/path/to/file\"\n    # destination_blob_name = \"storage-object-name\"\n\n    storage_client = storage.Client()\n    bucket = storage_client.bucket(bucket_name)\n    blob = bucket.blob(destination_blob_name)\n\n    blob.upload_from_filename(source_file_name)\n\n    print(\"File {} uploaded to {}.\".format(source_file_name,\n                                           destination_blob_name))\n\n\ndef download_folder(bucket_name='your-bucket-name',\n                    bucket_dir='your-bucket-directory/',\n                    dl_dir=\"local-dir/\"):\n    storage_client = storage.Client()\n    bucket = storage_client.get_bucket(bucket_name)\n    blobs = bucket.list_blobs(prefix=bucket_dir)  # Get list of files\n    for blob in blobs:\n        filename = blob.name.replace('/', '_')\n        blob.download_to_filename(dl_dir + filename)  # Download\n\n\nimport os\n\nbucket_name = args.bucket_name\nos.mkdir('/tmp/drmU')\nos.mkdir('/tmp/drmV')\nos.mkdir('/tmp/s')\n\ndownload_folder(bucket_name, \"drmU/\", \"/tmp/drmU/\")\ndownload_folder(bucket_name, \"drmV/\", \"/tmp/drmV/\")\ndownload_folder(bucket_name, \"s/\", \"/tmp/s/\")\n\ndrmU = read_mahout_drm(\"/tmp/drmU\")\ndrmV = read_mahout_drm(\"/tmp/drmV\")\n\nprint(os.listdir(\"/tmp\"))\nprint(os.listdir(\"/tmp/s\"))\n\ndrmU_p5 = np.transpose(np.array([drmU[i] for i in range(len(drmU.keys()))]))\ndrmV_p5 = np.array([drmV[i] for i in range(len(drmV.keys()))])\n\nwith open(f\"/tmp/s/s_part-00000\", 'r') as f:\n    diags = [float(d) for d in f.read().split('\\n') if d != '']\n\nrecon = drmU_p5 @ np.diag(diags) @ drmV_p5.transpose()\n# plot_3d_matrix(recon.transpose().reshape((512,512,301)), 
(512,512,301), 1.0, 0.810547, 1.2337347494963278)\ncomposite_img = recon.transpose().reshape((512, 512, 301))\n\ndiags_orig = diags\npercs = [0.001, 0.01, 0.05, 0.1, 0.3]\n\nfor p in range(len(percs)):\n    perc = percs[p]\n    diags = [\n        diags_orig[i] if i < round(len(diags) - (len(diags) * perc)) else 0\n        for i in range(len(diags))\n    ]\n    recon = drmU_p5 @ np.diag(diags) @ drmV_p5.transpose()\n    # plot_3d_matrix(recon.transpose().reshape((512,512,301)), (512,512,301), 1.0, 0.810547, 1.2337347494963278)\n    composite_img = recon.transpose().reshape((512, 512, 301))\n    a1 = plt.subplot(1, 1, 1)\n    plt.imshow(composite_img[:, :, 150], cmap=plt.cm.bone)\n    plt.title(\n        f\"{perc*100}% denoised.  (k={len(diags)}, oversample=15, power_iters=2)\"\n    )\n    a1.set_aspect(1.0)\n    plt.axis('off')\n    fname = f\"{100-(perc*100)}%-denoised-img.png\"\n    plt.savefig(f\"/tmp/{fname}\")\n    upload_blob(bucket_name, f\"/tmp/{fname}\", f\"/output/{fname}\")\n"
  },
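  {
    "path": "ch9/ctscans/visualize-basis-vectors/svd_denoise_example.py",
    "content": "# A small, self-contained sketch of the truncated-SVD denoising idea that\n# src/program.py applies to the CT reconstruction: keep the largest\n# singular values, zero the rest, and rebuild U @ diag(s) @ V^T.\n# The random matrix and the 30% truncation (perc=0.3) are illustrative.\nimport numpy as np\n\nnp.random.seed(42)\nA = np.random.normal(size=(8, 6))\nU, s, Vt = np.linalg.svd(A, full_matrices=False)\n\n# Keep the leading 70% of singular values and zero the trailing 30%,\n# the same indexing rule program.py uses to build its diags list.\nkeep = round(len(s) * (1 - 0.3))\ns_trunc = np.array([s[i] if i < keep else 0.0 for i in range(len(s))])\n\nA_denoised = U @ np.diag(s_trunc) @ Vt\nprint(\"reconstruction error:\", np.linalg.norm(A - A_denoised))\n"
  },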
  {
    "path": "ci.sh",
    "content": "#!/bin/bash\n\nset -ex\n\n# Check all the shell scripts\nfind ./ -iregex '^.+\\.sh$' -type f -print0 | \\\n  xargs -0 shellcheck -e SC1091 -e SC2164 -e SC1090\n# Check for cases where I use tags rather than tag\nbad_tags=$(grep -r \"tags::\" ./ | grep -v \"ci.sh:\" || true)\n# Look for long lines\nlong_lines=$(grep --include '*.sh' --exclude '*venv*' -Hnr '.\\{90\\}' ./ || true)\nif [[ -n \"$bad_tags\" ]]; then\n  echo \"Found bad tags $bad_tags replace tags with tag\"\nfi\nif [[ -n \"$long_lines\" ]]; then\n  print \"Found long lines:\\n$long_lines\"\nfi\nif [[ -n \"$bad_tags\" ]] || [[ -n \"$long_lines\" ]]; then\n  exit 1\nfi\n./runthrough.sh\n"
  },
  {
    "path": "convert_notebooks.sh",
    "content": "#!/bin/bash\nfind . -name \"*ipynb\" |grep -v venv | xargs -d '\\n' ipython3 nbconvert --to script\n"
  },
  {
    "path": "data-extraction/README.md",
    "content": "## Data Extraction\n\nTo successfully construct a machine learning pipeline we need to collect the data we are going to train on.\nThe data extraction is organized here by the different use case.\n\n\nIn many introduction to machine learning examples the data is pre-extracted, and sometimes even pre-cleaned.\nHere we will show some ways to collect the initial data.\nOnce the initial training data has been extracted, we will continue on downstream with data cleaning, and\nmay later do some data augmentation."
  },
  {
    "path": "data-extraction/github_comments_query.bsql",
    "content": "SELECT pull_request_url,\n ANY_VALUE(pull_patch_url) as pull_patch_url,\n ARRAY_AGG(comment_position) as comments_positions,\n ARRAY_AGG(diff_hunk) as diff_hunks,\n ARRAY_AGG(comment_original_position) as comments_original_positions,\n ARRAY_AGG(comment_commit_id IGNORE NULLS) as comment_commit_ids,\n ARRAY_AGG(comment_file_path IGNORE NULLS) as comment_file_paths  FROM (\n   SELECT *, JSON_EXTRACT(payload, '$.action') AS action,\n   JSON_EXTRACT(payload, '$.pull_request.url') AS pull_request_url,\n   JSON_EXTRACT(payload, '$.pull_request.patch_url') AS pull_patch_url,\n   IFNULL(JSON_EXTRACT(payload, '$.comment.original_position'), \"-1\") AS comment_original_position,\n   IFNULL(JSON_EXTRACT(payload, '$.comment.position'), \"-1\") AS comment_position,\n   JSON_EXTRACT(payload, '$.comment.commit_id') AS comment_commit_id,\n   JSON_EXTRACT(payload, '$.comment.path') AS comment_file_path\n   FROM \"githubarchive.day.*\"\n   WHERE type = \"PullRequestReviewCommentEvent\")\n GROUP BY pull_request_url"
  },
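  {
    "path": "data-extraction/run_github_comments_query.py",
    "content": "# A minimal sketch of running github_comments_query.bsql with the\n# google-cloud-bigquery client. Assumes GOOGLE_APPLICATION_CREDENTIALS is\n# set for a project with BigQuery enabled and that pandas is installed;\n# the output file name is illustrative. Depending on the SQL dialect you\n# run with, the quoting of the githubarchive table wildcard may need\n# adjusting.\nfrom google.cloud import bigquery\n\nwith open(\"github_comments_query.bsql\") as f:\n    sql = f.read()\n\nclient = bigquery.Client()\ndf = client.query(sql).to_dataframe()\nprint(\"Fetched {} pull requests with review comments\".format(len(df)))\ndf.to_json(\"github_comments.json\", orient=\"records\")\n"
  },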
  {
    "path": "data-extraction/github_issues_query.bsql",
    "content": "SELECT repo.name, JSON_EXTRACT(payload, '$.issue.url') \nAS url FROM (\n  SELECT *, JSON_EXTRACT(payload, '$.action') AS action\n  FROM \"githubarchive.day.*\" WHERE type = \"IssuesEvent\")\nWHERE type = \"IssuesEvent\"  AND action = \"\\\"opened\\\"\""
  },
  {
    "path": "data-extraction/iot/basic.yaml",
    "content": "apiVersion: batch/v1\nkind: Job\nmetadata:\n  name: iot-data-extraction\n  namespace: kubeflow\nspec:\n  template:\n    spec:\n      containers:\n      - env:\n        - name: GOOGLE_APPLICATION_CREDENTIALS\n          value: /secret/gcp-credentials/user-gcp-sa.json\n        image: IMAGE_NAME\n        name: gh-data-extract-gh-job\n        volumeMounts:\n        - mountPath: /secret/gcp-credentials\n          name: secret-volume\n          readOnly: true\n      restartPolicy: OnFailure\n      volumes:\n      - name: secret-volume\n        secret:\n          secretName: user-gcp-sa\n"
  },
  {
    "path": "data-extraction/iot/build.sh",
    "content": "#!/bin/bash\n\nCONTAINER_REGISTRY=\"gcr.io/${PROJECT_NAME}\"\n#tag::buildandpush[]\nTARGET=\"${CONTAINER_REGISTRY}/kf-steps/iot-extract:v2\"\ndocker build . -t \"${TARGET}\"\ndocker push \"${TARGET}\"\n#end::buildandpush[]\n#tag::run[]\nkubectl apply -f iot_extract_job.yaml\n#end::run[]\n#tag::verify[]\nkubectl get jobs |grep gh-data\n#end::verify[]\n"
  },
  {
    "path": "data-extraction/python-notebook/AddSpamassassinDockerfile",
    "content": "ARG base\nFROM $base\n# Run as root for updates\nUSER root\n# Install Spamassassin\nRUN apt-get update && \\\n    apt-get install -yq spamassassin spamc && \\\n    apt-get clean && \\\n    rm -rf /var/lib/apt/lists/* && \\\n    rm -rf /var/cache/apt\n# Switch back to the expected user\nUSER jovyan"
  },
  {
    "path": "data-extraction/python-notebook/MailingListDataPrep.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Here we can install some packages our notebook needs. We can also install them in our container to speed things up & make it more reliable. But for prototyping this works great!\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {\n    \"jupyter\": {\n     \"outputs_hidden\": true\n    }\n   },\n   \"outputs\": [],\n   \"source\": [\n    \"!pip3 install --upgrade lxml\\n\",\n    \"!pip3 install --upgrade pandas\\n\",\n    \"!pip3 install --upgrade scikit-learn\\n\",\n    \"!pip3 install --upgrade scipy\\n\",\n    \"!pip3 install --upgrade tables\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"We can use Jupyter notebooks just like normal inside of Kubeflow\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"from datetime import datetime\\n\",\n    \"from requests import get\\n\",\n    \"from lxml import etree\\n\",\n    \"from time import sleep\\n\",\n    \"\\n\",\n    \"import re\\n\",\n    \"\\n\",\n    \"import pandas as pd\\n\",\n    \"\\n\",\n    \"import os\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"container_registry = \\\"\\\" # Wherever you put your containers\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"def scrapeMailArchives(mailingList: str, year: int, month: int):\\n\",\n    \"    baseUrl = \\\"http://mail-archives.apache.org/mod_mbox/%s/%s.mbox/ajax/\\\" % (mailingList, datetime(year,month,1).strftime(\\\"%Y%m\\\"))\\n\",\n    \"    r = get(baseUrl + \\\"thread?0\\\")\\n\",\n    \"    utf8_parser = etree.XMLParser(encoding='utf-8')\\n\",\n    \"    root = etree.fromstring(r.text.replace('encoding=\\\"UTF-8\\\"', \\\"\\\"),  parser=utf8_parser)\\n\",\n    \"    output = []\\n\",\n    \"    for message in root.xpath(\\\"//message\\\"):\\n\",\n    \"        _id = message.get(\\\"id\\\")\\n\",\n    \"        linked = message.get(\\\"linked\\\")\\n\",\n    \"        depth = message.get(\\\"depth\\\")\\n\",\n    \"        fr = message.xpath(\\\"from\\\")[0].text\\n\",\n    \"        dt = message.xpath(\\\"date\\\")[0].text ## todo convert to date\\n\",\n    \"        subject = message.xpath(\\\"subject\\\")[0].text\\n\",\n    \"        r2 = get(baseUrl + _id)\\n\",\n    \"        bodyRoot = etree.fromstring(r2.text.replace('encoding=\\\"UTF-8\\\"', \\\"\\\"),  parser=utf8_parser)\\n\",\n    \"        body = bodyRoot.xpath(\\\"//contents\\\")[0].text\\n\",\n    \"        record = {\\n\",\n    \"            \\\"id\\\"        : _id,\\n\",\n    \"            \\\"linked\\\"    : linked,\\n\",\n    \"            \\\"depth\\\"     : depth,\\n\",\n    \"            \\\"from\\\"      : fr,\\n\",\n    \"            \\\"dt\\\"        : dt,\\n\",\n    \"            \\\"subject\\\"   : subject,\\n\",\n    \"            \\\"body\\\"      : body\\n\",\n    \"        }\\n\",\n    \"        output.append(record)\\n\",\n    \"        sleep(0.1)\\n\",\n    \"    return output\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"def extract_links(body):\\n\",\n    \"    link_regex_str = r'(http(|s)://(.*?))([\\\\s\\\\n]|$)'\\n\",\n    \"    itr = re.finditer(link_regex_str, body, re.MULTILINE)\\n\",\n    \" 
   return list(map(lambda elem: elem.group(1), itr))\\n\",\n    \"\\n\",\n    \"def extract_domains(links):\\n\",\n    \"    from urllib.parse import urlparse\\n\",\n    \"    def extract_domain(link):\\n\",\n    \"        try:\\n\",\n    \"            nloc = urlparse(link).netloc\\n\",\n    \"            # We want to drop www and any extra spaces wtf nloc on the spaces.\\n\",\n    \"            regex_str = r'^(www\\\\.|)(.*?)\\\\s*$'\\n\",\n    \"            match = re.search(regex_str, nloc)\\n\",\n    \"            return match.group(2)\\n\",\n    \"        except:\\n\",\n    \"            return None\\n\",\n    \"    return list(map(extract_domain, links))\\n\",\n    \"\\n\",\n    \"def contains_python_stack_trace(body):\\n\",\n    \"    return \\\"Traceback (most recent call last)\\\" in body\\n\",\n    \"\\n\",\n    \"def contains_probably_java_stack_trace(body):\\n\",\n    \"    # Look for something based on regex\\n\",\n    \"    # Tried https://stackoverflow.com/questions/20609134/regular-expression-optional-multiline-java-stacktrace - more msg looking\\n\",\n    \"    # Tried https://stackoverflow.com/questions/3814327/regular-expression-to-parse-a-log-file-and-find-stacktraces\\n\",\n    \"    # Yes the compile is per call, but it's cached so w/e\\n\",\n    \"    import re\\n\",\n    \"    stack_regex_str = r'^\\\\s*(.+Exception.*):\\\\n(.*\\\\n){0,3}?(\\\\s+at\\\\s+.*\\\\(.*\\\\))+'\\n\",\n    \"    match = re.search(stack_regex_str, body, re.MULTILINE)\\n\",\n    \"    return match is not None\\n\",\n    \"\\n\",\n    \"def contains_exception_in_task(body):\\n\",\n    \"    # Look for a line along the lines of ERROR Executor: Exception in task\\n\",\n    \"    return \\\"ERROR Executor: Exception in task\\\" in body\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"datesToScrape =  [(2019, i) for i in range(1,13)]\\n\",\n    \"\\n\",\n    \"records = []\\n\",\n    \"for y,m in datesToScrape:\\n\",\n    \"    print(m,\\\"-\\\",y)\\n\",\n    \"    records += scrapeMailArchives(\\\"spark-dev\\\", y, m)\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"df = pd.DataFrame(records)\\n\",\n    \"df['links'] = df['body'].apply(extract_links)\\n\",\n    \"df['containsPythonStackTrace'] = df['body'].apply(contains_python_stack_trace)\\n\",\n    \"df['containsJavaStackTrace'] = df['body'].apply(contains_probably_java_stack_trace)\\n\",\n    \"df['containsExceptionInTaskBody'] = df['body'].apply(contains_exception_in_task)\\n\",\n    \"\\n\",\n    \"df['domains'] = df['links'].apply(extract_domains)\\n\",\n    \"df['isThreadStart'] = df['depth'] == '0'\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"from sklearn.feature_extraction.text import TfidfVectorizer\\n\",\n    \"\\n\",\n    \"bodyV = TfidfVectorizer()\\n\",\n    \"# bodyV = TfidfVectorizer(max_features=10000) #if we cared about making this 1:1 w holden's code.\\n\",\n    \"bodyFeatures = bodyV.fit_transform(df['body'])\\n\",\n    \"\\n\",\n    \"domainV = TfidfVectorizer()\\n\",\n    \"# domainV = TfidfVectorizer(max_features=100)\\n\",\n    \"\\n\",\n    \"## A couple of \\\"None\\\" domains really screwed the pooch on this one. 
Also, no lists, just space-separated domains.\\n\",\n    \"def makeDomainsAList(d):\\n\",\n    \"    return ' '.join([a for a in d if not a is None])\\n\",\n    \"\\n\",\n    \"domainFeatures = domainV.fit_transform(df['domains'].apply(makeDomainsAList))\\n\",\n    \"\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": []\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"\\n\",\n    \"from scipy.sparse import csr_matrix, hstack\\n\",\n    \"\\n\",\n    \"data = hstack([csr_matrix(df[['containsPythonStackTrace', 'containsJavaStackTrace', 'containsExceptionInTaskBody', 'isThreadStart']].to_numpy()),\\n\",\n    \"                             bodyFeatures,\\n\",\n    \"                            domainFeatures])\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"from sklearn.cluster import KMeans\\n\",\n    \"from sklearn.model_selection import train_test_split\\n\",\n    \"\\n\",\n    \"train, test = train_test_split(data, test_size=0.1)\\n\",\n    \"\\n\",\n    \"kmeans = KMeans(n_clusters=2, random_state=42).fit(train)\\n\",\n    \"train_pred = kmeans.predict(train)\\n\",\n    \"test_pred = kmeans.predict(test)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Alternatively, by structuring our code correctly we can take advantage of pipelines\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!pip3 install --upgrade kfp\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import kfp\\n\",\n    \"import kfp.dsl as dsl\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"def download_data(year: int) -> str:\\n\",\n    \"    \\n\",\n    \"    from datetime import datetime\\n\",\n    \"    from lxml import etree\\n\",\n    \"    from requests import get\\n\",\n    \"    from time import sleep\\n\",\n    \"    \\n\",\n    \"    import json\\n\",\n    \"    \\n\",\n    \"    def scrapeMailArchives(mailingList: str, year: int, month: int):\\n\",\n    \"        baseUrl = \\\"http://mail-archives.apache.org/mod_mbox/%s/%s.mbox/ajax/\\\" % (mailingList, datetime(year,month,1).strftime(\\\"%Y%m\\\"))\\n\",\n    \"        r = get(baseUrl + \\\"thread?0\\\")\\n\",\n    \"        utf8_parser = etree.XMLParser(encoding='utf-8')\\n\",\n    \"        root = etree.fromstring(r.text.replace('encoding=\\\"UTF-8\\\"', \\\"\\\"),  parser=utf8_parser)\\n\",\n    \"        output = []\\n\",\n    \"        for message in root.xpath(\\\"//message\\\"):\\n\",\n    \"            _id = message.get(\\\"id\\\")\\n\",\n    \"            linked = message.get(\\\"linked\\\")\\n\",\n    \"            depth = message.get(\\\"depth\\\")\\n\",\n    \"            fr = message.xpath(\\\"from\\\")[0].text\\n\",\n    \"            dt = message.xpath(\\\"date\\\")[0].text ## todo convert to date\\n\",\n    \"            subject = message.xpath(\\\"subject\\\")[0].text\\n\",\n    \"            r2 = get(baseUrl + _id)\\n\",\n    \"            bodyRoot = etree.fromstring(r2.text.replace('encoding=\\\"UTF-8\\\"', \\\"\\\"),  parser=utf8_parser)\\n\",\n    \"            body = bodyRoot.xpath(\\\"//contents\\\")[0].text\\n\",\n    \"            
record = {\\n\",\n    \"                \\\"id\\\"        : _id,\\n\",\n    \"                \\\"linked\\\"    : linked,\\n\",\n    \"                \\\"depth\\\"     : depth,\\n\",\n    \"                \\\"from\\\"      : fr,\\n\",\n    \"                \\\"dt\\\"        : dt,\\n\",\n    \"                \\\"subject\\\"   : subject,\\n\",\n    \"                \\\"body\\\"      : body\\n\",\n    \"            }\\n\",\n    \"            output.append(record)\\n\",\n    \"            sleep(0.1)\\n\",\n    \"            \\n\",\n    \"        return output\\n\",\n    \"\\n\",\n    \"    datesToScrape =  [(year, i) for i in range(1,2)]\\n\",\n    \"\\n\",\n    \"    records = []\\n\",\n    \"    ## todo, go back further\\n\",\n    \"    for y,m in datesToScrape:\\n\",\n    \"        print(m,\\\"-\\\",y)\\n\",\n    \"        records += scrapeMailArchives(\\\"spark-dev\\\", y, m)\\n\",\n    \"    import os\\n\",\n    \"    output_path = '/data_processing/data.json'\\n\",\n    \"    with open(output_path, 'w') as f:\\n\",\n    \"        json.dump(records, f)\\n\",\n    \"    \\n\",\n    \"    return output_path\\n\",\n    \"    \"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": []\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"def download_tld_data() -> str:\\n\",\n    \"    from requests import get\\n\",\n    \"    import pandas as pd\\n\",\n    \"    print(\\\"importing io....\\\")\\n\",\n    \"    import io\\n\",\n    \"\\n\",\n    \"    url = \\\"https://pkgstore.datahub.io/core/country-list/data_csv/data/d7c9d7cfb42cb69f4422dec222dbbaa8/data_csv.csv\\\"\\n\",\n    \"    print(\\\"Getting the url\\\")\\n\",\n    \"    s = get(url).content\\n\",\n    \"    print(\\\"Converting content\\\")\\n\",\n    \"    df = pd.read_csv(io.StringIO(s.decode('utf-8')))\\n\",\n    \"    print(\\\"Writing output\\\")\\n\",\n    \"    output_path_hdf = '/tld_info/clean_data.hdf'\\n\",\n    \"    df.to_hdf(output_path_hdf, key=\\\"tld\\\")\\n\",\n    \"    \\n\",\n    \"    return output_path_hdf\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": []\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Now that we have some data, we want to get rid of any \\\"bad\\\" records\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"#tag::clean_data_fun[]\\n\",\n    \"def clean_data(input_path: str) -> str:\\n\",\n    \"    import json\\n\",\n    \"    import pandas as pd\\n\",\n    \"    \\n\",\n    \"    print(\\\"loading records...\\\")\\n\",\n    \"    with open(input_path, 'r') as f:\\n\",\n    \"        records = json.load(f)\\n\",\n    \"    print(\\\"records loaded\\\")\\n\",\n    \"    \\n\",\n    \"    df = pd.DataFrame(records)\\n\",\n    \"    # Drop records without a subject, body, or sender\\n\",\n    \"    cleaned = df.dropna(subset=[\\\"subject\\\", \\\"body\\\", \\\"from\\\"])\\n\",\n    \"    \\n\",\n    \"    output_path_hdf = '/data_processing/clean_data.hdf'\\n\",\n    \"    cleaned.to_hdf(output_path_hdf, key=\\\"clean\\\")\\n\",\n    \"    \\n\",\n    \"    return output_path_hdf\\n\",\n    \"#end::clean_data_fun[]\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   
\"source\": [\n    \"### Preparing the data\\n\",\n    \"\\n\",\n    \"Remember earlier when we did that big (and arguably pointless) classification of emails from the Apache Spark mailing list? OK, now we're going to do it again, as a \\\"lightweight\\\" Python function in a Kubeflow Pipeline.  I hope the irony of the term \\\"lightweight\\\" isn't lost on anyone, because this is pretty blatent abuse of something that was originally presented for conveinience. \\n\",\n    \"\\n\",\n    \"First note, all of the imports and declarations of helper functions MUST be with in the \\\"ligthweight\\\" function. One could argue (and they would probably be correct) that I have two steps here- feature prep and ML, and as such I should split them. I would say that's fair, but I choose not to do so at this time.  Perhaps in some scripts later on?\\n\",\n    \"\\n\",\n    \"As has been pointed out so many times before, we assume the reader either arleady understands what is going on with the KMeans clustering, or better yet, doesn't even care. I won't be digging into that right now. What I will point out- and maybe as a note to the editor, the model that is finally saved really ought to be persisted somewhere.  If the model isn't saved, then this basically pointless pipeline, is truly pointless. \\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Now let's make sure we can read that data in the next step (before we write a big complicated model to do whatever torture to it).\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"def prepare_features(input_path: str, tld_info_path: str):\\n\",\n    \"   \\n\",\n    \"    import re\\n\",\n    \"    import pandas as pd\\n\",\n    \"    \\n\",\n    \"    print(\\\"loading records...\\\")\\n\",\n    \"    df = pd.read_hdf(input_path, key=\\\"clean\\\")\\n\",\n    \"    print(\\\"records loaded\\\")\\n\",\n    \"    \\n\",\n    \"    print(\\\"Loading tld info....\\\")\\n\",\n    \"    tld_df = pd.read_hdf(tld_info_path, key=\\\"tld\\\")\\n\",\n    \"    print(\\\"Loaded tld info\\\")\\n\",\n    \"    \\n\",\n    \"    \\n\",\n    \"    ## Note: \\\"Lightweight\\\" Python Fns mean helper code must be inside the fn. 
(Bad Form)\\n\",\n    \"    def extract_links(body):\\n\",\n    \"        link_regex_str = r'(http(|s)://(.*?))([\\\\s\\\\n]|$)'\\n\",\n    \"        itr = re.finditer(link_regex_str, body, re.MULTILINE)\\n\",\n    \"        return list(map(lambda elem: elem.group(1), itr))\\n\",\n    \"\\n\",\n    \"    def extract_domains(links):\\n\",\n    \"        from urllib.parse import urlparse\\n\",\n    \"        def extract_domain(link):\\n\",\n    \"            try:\\n\",\n    \"                nloc = urlparse(link).netloc\\n\",\n    \"                # We want to drop www and any extra spaces wtf nloc on the spaces.\\n\",\n    \"                regex_str = r'^(www\\\\.|)(.*?)\\\\s*$'\\n\",\n    \"                match = re.search(regex_str, nloc)\\n\",\n    \"                return match.group(2)\\n\",\n    \"            except:\\n\",\n    \"                return None\\n\",\n    \"        return list(map(extract_domain, links))\\n\",\n    \"\\n\",\n    \"    def contains_python_stack_trace(body):\\n\",\n    \"        return \\\"Traceback (most recent call last)\\\" in body\\n\",\n    \"\\n\",\n    \"    def contains_probably_java_stack_trace(body):\\n\",\n    \"        # Look for something based on regex\\n\",\n    \"        # Tried https://stackoverflow.com/questions/20609134/regular-expression-optional-multiline-java-stacktrace - more msg looking\\n\",\n    \"        # Tried https://stackoverflow.com/questions/3814327/regular-expression-to-parse-a-log-file-and-find-stacktraces\\n\",\n    \"        # Yes the compile is per call, but it's cached so w/e\\n\",\n    \"        import re\\n\",\n    \"        stack_regex_str = r'^\\\\s*(.+Exception.*):\\\\n(.*\\\\n){0,3}?(\\\\s+at\\\\s+.*\\\\(.*\\\\))+'\\n\",\n    \"        match = re.search(stack_regex_str, body, re.MULTILINE)\\n\",\n    \"        return match is not None\\n\",\n    \"\\n\",\n    \"    def contains_exception_in_task(body):\\n\",\n    \"        # Look for a line along the lines of ERROR Executor: Exception in task\\n\",\n    \"        return \\\"ERROR Executor: Exception in task\\\" in body\\n\",\n    \"\\n\",\n    \"    print(df.shape)\\n\",\n    \"    df['links'] = df['body'].apply(extract_links)\\n\",\n    \"    df['containsPythonStackTrace'] = df['body'].apply(contains_python_stack_trace)\\n\",\n    \"    df['containsJavaStackTrace'] = df['body'].apply(contains_probably_java_stack_trace)\\n\",\n    \"    df['containsExceptionInTaskBody'] = df['body'].apply(contains_exception_in_task)\\n\",\n    \"\\n\",\n    \"    #tag::local_mailing_list_feature_prep_fun[]\\n\",\n    \"    df['domains'] = df['links'].apply(extract_domains)\\n\",\n    \"    df['isThreadStart'] = df['depth'] == '0'\\n\",\n    \"    \\n\",\n    \"    # Arguably, you could split building the dataset away from the actual witchcraft.\\n\",\n    \"    from sklearn.feature_extraction.text import TfidfVectorizer\\n\",\n    \"\\n\",\n    \"    bodyV = TfidfVectorizer()\\n\",\n    \"    bodyFeatures = bodyV.fit_transform(df['body'])\\n\",\n    \"\\n\",\n    \"    domainV = TfidfVectorizer()\\n\",\n    \"\\n\",\n    \"    ## A couple of \\\"None\\\" domains really screwed the pooch on this one.Also, no lists just space seperated domains.\\n\",\n    \"    def makeDomainsAList(d):\\n\",\n    \"        return ' '.join([a for a in d if not a is None])\\n\",\n    \"\\n\",\n    \"    domainFeatures = domainV.fit_transform(df['domains'].apply(makeDomainsAList))\\n\",\n    \"\\n\",\n    \"    from scipy.sparse import csr_matrix, hstack\\n\",\n    \"\\n\",\n    \"    data = 
hstack([csr_matrix(df[['containsPythonStackTrace',\\n\",\n    \"                                  'containsJavaStackTrace',\\n\",\n    \"                                  'containsExceptionInTaskBody', \\n\",\n    \"                                  'isThreadStart']].to_numpy()),\\n\",\n    \"                                 bodyFeatures,\\n\",\n    \"                                domainFeatures])\\n\",\n    \"    #end::local_mailing_list_feature_prep_fun[]\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"\\n\",\n    \"### The Kubeflow Bit.\\n\",\n    \"\\n\",\n    \"Now we can put these two pieces together into a pipeline. Since the data is relatively small we will use a persistent volume to put them together. Later on we can add training to this pipeline as well.\\n\",\n    \"\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Make a volume example. We redo it inside of the pipeline definition because it needs to be inside the pipeline's scope.\\n\",\n    \"#tag::makeVolume[]\\n\",\n    \"dvop = dsl.VolumeOp(\\n\",\n    \"    name=\\\"create_pvc\\\",\\n\",\n    \"    resource_name=\\\"my-pvc-2\\\",\\n\",\n    \"    size=\\\"5Gi\\\",\\n\",\n    \"    modes=dsl.VOLUME_MODE_RWO)\\n\",\n    \"#end::makeVolume[]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!rm local-data-prep-2.zip\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"#tag::makePipeline[]\\n\",\n    \"@kfp.dsl.pipeline(\\n\",\n    \"  name='Simple1',\\n\",\n    \"  description='Simple1'\\n\",\n    \")\\n\",\n    \"def my_pipeline_mini(year: int):\\n\",\n    \"    dvop = dsl.VolumeOp(\\n\",\n    \"        name=\\\"create_pvc\\\",\\n\",\n    \"        resource_name=\\\"my-pvc-2\\\",\\n\",\n    \"        size=\\\"5Gi\\\",\\n\",\n    \"        modes=dsl.VOLUME_MODE_RWO)\\n\",\n    \"    tldvop = dsl.VolumeOp(\\n\",\n    \"        name=\\\"create_pvc\\\",\\n\",\n    \"        resource_name=\\\"tld-volume-2\\\",\\n\",\n    \"        size=\\\"100Mi\\\",\\n\",\n    \"        modes=dsl.VOLUME_MODE_RWO)\\n\",\n    \"    download_data_op = kfp.components.func_to_container_op(\\n\",\n    \"        download_data,\\n\",\n    \"        packages_to_install=['lxml', 'requests'])\\n\",\n    \"    download_tld_info_op = kfp.components.func_to_container_op(\\n\",\n    \"        download_tld_data,\\n\",\n    \"        packages_to_install=['requests', 'pandas>=0.24', 'tables'])\\n\",\n    \"    clean_data_op = kfp.components.func_to_container_op(\\n\",\n    \"        clean_data,\\n\",\n    \"        packages_to_install=['pandas>=0.24', 'tables'])\\n\",\n    \"\\n\",\n    \"    step1 = download_data_op(year).add_pvolumes({\\\"/data_processing\\\": dvop.volume})\\n\",\n    \"    step2 = clean_data_op(input_path=step1.output).add_pvolumes({\\\"/data_processing\\\": dvop.volume})\\n\",\n    \"    step3 = download_tld_info_op().add_pvolumes({\\\"/tld_info\\\": tldvop.volume})\\n\",\n    \"\\n\",\n    \"kfp.compiler.Compiler().compile(my_pipeline_mini, 'local-data-prep-2.zip')\\n\",\n    \"#end::makePipeline[]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!rm *.zip\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 
null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"#tag::longPipeline[]\\n\",\n    \"@kfp.dsl.pipeline(\\n\",\n    \"  name='Simple1',\\n\",\n    \"  description='Simple1'\\n\",\n    \")\\n\",\n    \"def my_pipeline2(year: int):\\n\",\n    \"    dvop = dsl.VolumeOp(\\n\",\n    \"        name=\\\"create_pvc\\\",\\n\",\n    \"        resource_name=\\\"my-pvc-2\\\",\\n\",\n    \"        size=\\\"5Gi\\\",\\n\",\n    \"        modes=dsl.VOLUME_MODE_RWO)\\n\",\n    \"    tldvop = dsl.VolumeOp(\\n\",\n    \"        name=\\\"create_pvc\\\",\\n\",\n    \"        resource_name=\\\"tld-volume-2\\\",\\n\",\n    \"        size=\\\"100Mi\\\",\\n\",\n    \"        modes=dsl.VOLUME_MODE_RWO)\\n\",\n    \"\\n\",\n    \"    download_data_op = kfp.components.func_to_container_op(\\n\",\n    \"        download_data,\\n\",\n    \"        packages_to_install=['lxml', 'requests'])\\n\",\n    \"    download_tld_info_op = kfp.components.func_to_container_op(\\n\",\n    \"        download_tld_data,\\n\",\n    \"        packages_to_install=['requests', 'pandas>=0.24', 'tables'])\\n\",\n    \"    clean_data_op = kfp.components.func_to_container_op(\\n\",\n    \"        clean_data,\\n\",\n    \"        packages_to_install=['pandas>=0.24', 'tables'])\\n\",\n    \"#tag::add_feature_step[]\\n\",\n    \"    prepare_features_op = kfp.components.func_to_container_op(\\n\",\n    \"        prepare_features,\\n\",\n    \"        packages_to_install=['pandas>=0.24', 'tables', 'scikit-learn'])\\n\",\n    \"#end::add_feature_step[]\\n\",\n    \"\\n\",\n    \"    step1 = download_data_op(year).add_pvolumes({\\\"/data_processing\\\": dvop.volume})\\n\",\n    \"    step2 = clean_data_op(input_path=step1.output).add_pvolumes({\\\"/data_processing\\\": dvop.volume})\\n\",\n    \"    step3 = download_tld_info_op().add_pvolumes({\\\"/tld_info\\\": tldvop.volume})\\n\",\n    \"    step4 = prepare_features_op(input_path=step2.output, tld_info_path=step3.output).add_pvolumes({\\n\",\n    \"        \\\"/data_processing\\\": dvop.volume,\\n\",\n    \"        \\\"/tld_info\\\": tldvop.volume})\\n\",\n    \"#end::longPipeline[]\\n\",\n    \"\\n\",\n    \"kfp.compiler.Compiler().compile(my_pipeline2, 'local-data-and-feature-prep-2.zip')\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"client = kfp.Client()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"my_experiment = client.create_experiment(name='local-data-prep-test-2')\\n\",\n    \"my_run = client.run_pipeline(my_experiment.id, 'local-data-prep', \\n\",\n    \"  'local-data-and-feature-prep-2.zip', params={'year': '2019'})\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"If we were using SpamAssassin or some other library installed in a different base container we would:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Clean data with custom container\\n\",\n    \"#tag::cleanDataWithContainer[]\\n\",\n    \"clean_data_op = kfp.components.func_to_container_op(\\n\",\n    \"    clean_data,\\n\",\n    \"    base_image=\\\"{0}/kubeflow/spammassisan\\\".format(container_registry),\\n\",\n    \"    packages_to_install=['pandas>=0.24', 'tables'])\\n\",\n    \"#end::cleanDataWithContainer[]\"\n   ]\n  },\n  {\n   \"cell_type\": 
\"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"def train_func(input_path: String):\\n\",\n    \"    from sklearn.cluster import KMeans\\n\",\n    \"    from sklearn.model_selection import train_test_split\\n\",\n    \"\\n\",\n    \"    train, test = train_test_split(data, test_size=0.1)\\n\",\n    \"\\n\",\n    \"    kmeans = KMeans(n_clusters=2, random_state=42).fit(train)\\n\",\n    \"    train_pred = kmeans.predict(train)\\n\",\n    \"    test_pred = kmeans.predict(test)\\n\",\n    \"    print(test_pred)\\n\",\n    \"    # TODO: Dump the model somewhere you can use it later. \"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"And just like that, we've done it. We've created a Kubeflow Pipeline.\\n\",\n    \"\\n\",\n    \"So let's take a moment to step back and think, \\\"what in the crazy-town-heck is going on here?!\\\".  A valid question, and well spotted.  Each \\\"Step\\\" is going to be creating a container.  Maybe I should have noted that earlier when talking about attatching volumes, beacuse if you thougth I was doing that to a function, you'd probably think me quite insane. \\n\",\n    \"\\n\",\n    \"But, if you follow this code, and create this pipeline, download it and run it, you will see each \\\"step\\\" as a seperate container, downloading data, saving it to a `PVC` then passing some parameters to a next container, which also will load the `PVC`, etc. etc.  \\n\",\n    \"\\n\",\n    \"### Using Python to Create Containers, but not like a crazy person\\n\",\n    \"\\n\",\n    \"For completeness, let's last explore how to do all of these things using annotations. \\n\",\n    \"\\n\",\n    \"The trick for the most part is to create a function that returns a `kfp.dsl.ContainerOp`.  This will point to an image, note the volumes that need to be mounted, and a number of other things. I've heard told people don't always just like creating absurdly large and fat functions to do everything in real life, so I leave this hear as an aside in case the reader is interested in it.  It's alsow worth noting that adding the `@kfp.dsl.component` annotation instructs teh Kubeflow compiler to turn on static typce checking. \\n\",\n    \"\\n\",\n    \"```\\n\",\n    \"@kfp.dsl.component\\n\",\n    \"def my_component(my_param):\\n\",\n    \"  ...\\n\",\n    \"  return kfp.dsl.ContainerOp(\\n\",\n    \"    name='My component name',\\n\",\n    \"    image='gcr.io/path/to/container/image'\\n\",\n    \"  )\\n\",\n    \"```\\n\",\n    \"\\n\",\n    \"Finally, when it comes to incorporating these components into pipelines, you would do something like this:\\n\",\n    \"\\n\",\n    \"```\\n\",\n    \"@kfp.dsl.pipeline(\\n\",\n    \"  name='My pipeline',\\n\",\n    \"  description='My machine learning pipeline'\\n\",\n    \")\\n\",\n    \"def my_pipeline(param_1: PipelineParam, param_2: PipelineParam):\\n\",\n    \"  my_step = my_component(my_param='a')\\n\",\n    \"```\\n\",\n    \"\\n\",\n    \"Which should look exceedingly familiar as we did something very similar with our `download_data_fn` and `witchcraft_fn`.  
\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": []\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": []\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"Python 3\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.9\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 4\n}\n"
  },
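  {
    "path": "data-extraction/python-notebook/SaveModelSketch.py",
    "content": "# A minimal sketch, not part of the original notebook, of the persistence step its\n# markdown asks for (\"the model that is finally saved really ought to be persisted\n# somewhere\"). The PVC paths and the choice of joblib are assumptions.\n\n\ndef train_and_save_func(input_path: str) -> str:\n    # Lightweight-function style: imports live inside the function\n    import joblib\n    from scipy.sparse import load_npz\n    from sklearn.cluster import KMeans\n    from sklearn.model_selection import train_test_split\n\n    # Assumes the feature prep step saved its sparse matrix with scipy.sparse.save_npz\n    data = load_npz(input_path)\n    train, test = train_test_split(data, test_size=0.1)\n\n    kmeans = KMeans(n_clusters=2, random_state=42).fit(train)\n    print(kmeans.predict(test))\n\n    # Write the fitted model to the mounted volume so it outlives the container\n    output_path = '/data_processing/kmeans_model.joblib'\n    joblib.dump(kmeans, output_path)\n    return output_path\n"
  },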
  {
    "path": "data-extraction/python-notebook/MailingListDataPrep.py",
    "content": "#!/usr/bin/env python\n# coding: utf-8\n\n# Here we can install some packages our notebook needs. We can also install them in our container to speed things up & make it more reliable. But for prototyping this works great!\n\n# In[ ]:\n\nget_ipython().system('pip3 install --upgrade lxml')\nget_ipython().system('pip3 install --upgrade pandas')\nget_ipython().system('pip3 install --upgrade scikit-learn')\nget_ipython().system('pip3 install --upgrade scipy')\nget_ipython().system('pip3 install --upgrade tables')\n\n# We can use Jupyter notebooks just like normal inside of Kubeflow\n\n# In[ ]:\n\nfrom datetime import datetime\nfrom requests import get\nfrom lxml import etree\nfrom time import sleep\n\nimport re\n\nimport pandas as pd\n\nimport os\n\n# In[ ]:\n\ncontainer_registry = \"\"  # Wherever you put your containers\n\n# In[ ]:\n\n\ndef scrapeMailArchives(mailingList: str, year: int, month: int):\n    baseUrl = \"http://mail-archives.apache.org/mod_mbox/%s/%s.mbox/ajax/\" % (\n        mailingList, datetime(year, month, 1).strftime(\"%Y%m\"))\n    r = get(baseUrl + \"thread?0\")\n    utf8_parser = etree.XMLParser(encoding='utf-8')\n    root = etree.fromstring(r.text.replace('encoding=\"UTF-8\"', \"\"),\n                            parser=utf8_parser)\n    output = []\n    for message in root.xpath(\"//message\"):\n        _id = message.get(\"id\")\n        linked = message.get(\"linked\")\n        depth = message.get(\"depth\")\n        fr = message.xpath(\"from\")[0].text\n        dt = message.xpath(\"date\")[0].text  # todo convert to date\n        subject = message.xpath(\"subject\")[0].text\n        r2 = get(baseUrl + _id)\n        bodyRoot = etree.fromstring(r2.text.replace('encoding=\"UTF-8\"', \"\"),\n                                    parser=utf8_parser)\n        body = bodyRoot.xpath(\"//contents\")[0].text\n        record = {\n            \"id\": _id,\n            \"linked\": linked,\n            \"depth\": depth,\n            \"from\": fr,\n            \"dt\": dt,\n            \"subject\": subject,\n            \"body\": body\n        }\n        output.append(record)\n        sleep(0.1)\n    return output\n\n\ndef extract_links(body):\n    link_regex_str = r'(http(|s)://(.*?))([\\s\\n]|$)'\n    itr = re.finditer(link_regex_str, body, re.MULTILINE)\n    return list(map(lambda elem: elem.group(1), itr))\n\n\ndef extract_domains(links):\n    from urllib.parse import urlparse\n\n    def extract_domain(link):\n        try:\n            nloc = urlparse(link).netloc\n            # We want to drop www and any extra spaces wtf nloc on the spaces.\n            regex_str = r'^(www\\.|)(.*?)\\s*$'\n            match = re.search(regex_str, nloc)\n            return match.group(2)\n        except:\n            return None\n\n    return list(map(extract_domain, links))\n\n\ndef contains_python_stack_trace(body):\n    return \"Traceback (most recent call last)\" in body\n\n\ndef contains_probably_java_stack_trace(body):\n    # Look for something based on regex\n    # Tried https://stackoverflow.com/questions/20609134/regular-expression-optional-multiline-java-stacktrace - more msg looking\n    # Tried https://stackoverflow.com/questions/3814327/regular-expression-to-parse-a-log-file-and-find-stacktraces\n    # Yes the compile is per call, but it's cached so w/e\n    import re\n    stack_regex_str = r'^\\s*(.+Exception.*):\\n(.*\\n){0,3}?(\\s+at\\s+.*\\(.*\\))+'\n    match = re.search(stack_regex_str, body, re.MULTILINE)\n    return match is not None\n\n\ndef 
contains_exception_in_task(body):\n    # Look for a line along the lines of ERROR Executor: Exception in task\n    return \"ERROR Executor: Exception in task\" in body\n\n\n# In[ ]:\n\ndatesToScrape = [(2019, i) for i in range(1, 13)]\n\nrecords = []\nfor y, m in datesToScrape:\n    print(m, \"-\", y)\n    records += scrapeMailArchives(\"spark-dev\", y, m)\n\n# In[ ]:\n\ndf = pd.DataFrame(records)\ndf['links'] = df['body'].apply(extract_links)\ndf['containsPythonStackTrace'] = df['body'].apply(contains_python_stack_trace)\ndf['containsJavaStackTrace'] = df['body'].apply(\n    contains_probably_java_stack_trace)\ndf['containsExceptionInTaskBody'] = df['body'].apply(\n    contains_exception_in_task)\n\ndf['domains'] = df['links'].apply(extract_domains)\ndf['isThreadStart'] = df['depth'] == '0'\n\n# In[ ]:\n\nfrom sklearn.feature_extraction.text import TfidfVectorizer\n\nbodyV = TfidfVectorizer()\n# bodyV = TfidfVectorizer(max_features=10000) #if we cared about making this 1:1 w holden's code.\nbodyFeatures = bodyV.fit_transform(df['body'])\n\ndomainV = TfidfVectorizer()\n# domainV = TfidfVectorizer(max_features=100)\n\n## A couple of \"None\" domains really screwed the pooch on this one. Also, no lists, just space-separated domains.\n\n\ndef makeDomainsAList(d):\n    return ' '.join([a for a in d if not a is None])\n\n\ndomainFeatures = domainV.fit_transform(df['domains'].apply(makeDomainsAList))\n\n# In[ ]:\n\n# In[ ]:\n\nfrom scipy.sparse import csr_matrix, hstack\n\ndata = hstack([\n    csr_matrix(df[[\n        'containsPythonStackTrace', 'containsJavaStackTrace',\n        'containsExceptionInTaskBody', 'isThreadStart'\n    ]].to_numpy()), bodyFeatures, domainFeatures\n])\n\nfrom sklearn.cluster import KMeans\nfrom sklearn.model_selection import train_test_split\n\ntrain, test = train_test_split(data, test_size=0.1)\n\nkmeans = KMeans(n_clusters=2, random_state=42).fit(train)\ntrain_pred = kmeans.predict(train)\ntest_pred = kmeans.predict(test)\n\n# Alternatively, by structuring our code correctly we can take advantage of pipelines\n\n# In[ ]:\n\nget_ipython().system('pip3 install --upgrade kfp')\n\n# In[ ]:\n\nimport kfp\nimport kfp.dsl as dsl\n\n# In[ ]:\n\n\ndef download_data(year: int) -> str:\n\n    from datetime import datetime\n    from lxml import etree\n    from requests import get\n    from time import sleep\n\n    import json\n\n    def scrapeMailArchives(mailingList: str, year: int, month: int):\n        baseUrl = \"http://mail-archives.apache.org/mod_mbox/%s/%s.mbox/ajax/\" % (\n            mailingList, datetime(year, month, 1).strftime(\"%Y%m\"))\n        r = get(baseUrl + \"thread?0\")\n        utf8_parser = etree.XMLParser(encoding='utf-8')\n        root = etree.fromstring(r.text.replace('encoding=\"UTF-8\"', \"\"),\n                                parser=utf8_parser)\n        output = []\n        for message in root.xpath(\"//message\"):\n            _id = message.get(\"id\")\n            linked = message.get(\"linked\")\n            depth = message.get(\"depth\")\n            fr = message.xpath(\"from\")[0].text\n            dt = message.xpath(\"date\")[0].text  # todo convert to date\n            subject = message.xpath(\"subject\")[0].text\n            r2 = get(baseUrl + _id)\n            bodyRoot = etree.fromstring(r2.text.replace(\n                'encoding=\"UTF-8\"', \"\"),\n                                        parser=utf8_parser)\n            body = bodyRoot.xpath(\"//contents\")[0].text\n            record = {\n                \"id\": _id,\n                
\"linked\": linked,\n                \"depth\": depth,\n                \"from\": fr,\n                \"dt\": dt,\n                \"subject\": subject,\n                \"body\": body\n            }\n            output.append(record)\n            sleep(0.1)\n\n        return output\n\n    datesToScrape = [(year, i) for i in range(1, 2)]\n\n    records = []\n    ## todo, go back further\n    for y, m in datesToScrape:\n        print(m, \"-\", y)\n        records += scrapeMailArchives(\"spark-dev\", y, m)\n    import os\n    output_path = '/data_processing/data.json'\n    with open(output_path, 'w') as f:\n        json.dump(records, f)\n\n    return output_path\n\n\n# In[ ]:\n\n# In[ ]:\n\n\ndef download_tld_data() -> str:\n    from requests import get\n    import pandas as pd\n    print(\"importing io....\")\n    import io\n\n    url = \"https://pkgstore.datahub.io/core/country-list/data_csv/data/d7c9d7cfb42cb69f4422dec222dbbaa8/data_csv.csv\"\n    print(\"Getting the url\")\n    s = get(url).content\n    print(\"Converting content\")\n    df = pd.read_csv(io.StringIO(s.decode('utf-8')))\n    print(\"Writing output\")\n    output_path_hdf = '/tld_info/clean_data.hdf'\n    df.to_hdf(output_path_hdf, key=\"tld\")\n\n    return output_path_hdf\n\n\n# In[ ]:\n\n# Now that we have some data, we want to get rid of any \"bad\" records\n\n# In[ ]:\n\n\n#tag::clean_data_fun[]\ndef clean_data(input_path: str) -> str:\n    import json\n    import pandas as pd\n\n    print(\"loading records...\")\n    with open(input_path, 'r') as f:\n        records = json.load(f)\n    print(\"records loaded\")\n\n    df = pd.DataFrame(records)\n    # Drop records without a subject, body, or sender\n    cleaned = df.dropna(subset=[\"subject\", \"body\", \"from\"])\n\n    output_path_hdf = '/data_processing/clean_data.hdf'\n    cleaned.to_hdf(output_path_hdf, key=\"clean\")\n\n    return output_path_hdf\n\n\n#end::clean_data_fun[]\n\n# ### Preparing the data\n#\n# Remember earlier when we did that big (and arguably pointless) classification of emails from the Apache Spark mailing list? OK, now we're going to do it again, as a \"lightweight\" Python function in a Kubeflow Pipeline.  I hope the irony of the term \"lightweight\" isn't lost on anyone, because this is pretty blatent abuse of something that was originally presented for conveinience.\n#\n# First note, all of the imports and declarations of helper functions MUST be with in the \"ligthweight\" function. One could argue (and they would probably be correct) that I have two steps here- feature prep and ML, and as such I should split them. I would say that's fair, but I choose not to do so at this time.  Perhaps in some scripts later on?\n#\n# As has been pointed out so many times before, we assume the reader either arleady understands what is going on with the KMeans clustering, or better yet, doesn't even care. I won't be digging into that right now. What I will point out- and maybe as a note to the editor, the model that is finally saved really ought to be persisted somewhere.  
If the model isn't saved, then this basically pointless pipeline is truly pointless.\n#\n\n# Now let's make sure we can read that data in the next step (before we write a big complicated model to do whatever torture to it).\n\n# In[ ]:\n\n\ndef prepare_features(input_path: str, tld_info_path: str):\n\n    import re\n    import pandas as pd\n\n    print(\"loading records...\")\n    df = pd.read_hdf(input_path, key=\"clean\")\n    print(\"records loaded\")\n\n    print(\"Loading tld info....\")\n    tld_df = pd.read_hdf(tld_info_path, key=\"tld\")\n    print(\"Loaded tld info\")\n\n    ## Note: \"Lightweight\" Python Fns mean helper code must be inside the fn. (Bad Form)\n\n    def extract_links(body):\n        link_regex_str = r'(http(|s)://(.*?))([\\s\\n]|$)'\n        itr = re.finditer(link_regex_str, body, re.MULTILINE)\n        return list(map(lambda elem: elem.group(1), itr))\n\n    def extract_domains(links):\n        from urllib.parse import urlparse\n\n        def extract_domain(link):\n            try:\n                nloc = urlparse(link).netloc\n                # We want to drop www and any extra spaces wtf nloc on the spaces.\n                regex_str = r'^(www\\.|)(.*?)\\s*$'\n                match = re.search(regex_str, nloc)\n                return match.group(2)\n            except:\n                return None\n\n        return list(map(extract_domain, links))\n\n    def contains_python_stack_trace(body):\n        return \"Traceback (most recent call last)\" in body\n\n    def contains_probably_java_stack_trace(body):\n        # Look for something based on regex\n        # Tried https://stackoverflow.com/questions/20609134/regular-expression-optional-multiline-java-stacktrace - more msg looking\n        # Tried https://stackoverflow.com/questions/3814327/regular-expression-to-parse-a-log-file-and-find-stacktraces\n        # Yes the compile is per call, but it's cached so w/e\n        import re\n        stack_regex_str = r'^\\s*(.+Exception.*):\\n(.*\\n){0,3}?(\\s+at\\s+.*\\(.*\\))+'\n        match = re.search(stack_regex_str, body, re.MULTILINE)\n        return match is not None\n\n    def contains_exception_in_task(body):\n        # Look for a line along the lines of ERROR Executor: Exception in task\n        return \"ERROR Executor: Exception in task\" in body\n\n    print(df.shape)\n    df['links'] = df['body'].apply(extract_links)\n    df['containsPythonStackTrace'] = df['body'].apply(\n        contains_python_stack_trace)\n    df['containsJavaStackTrace'] = df['body'].apply(\n        contains_probably_java_stack_trace)\n    df['containsExceptionInTaskBody'] = df['body'].apply(\n        contains_exception_in_task)\n\n    #tag::local_mailing_list_feature_prep_fun[]\n    df['domains'] = df['links'].apply(extract_domains)\n    df['isThreadStart'] = df['depth'] == '0'\n\n    # Arguably, you could split building the dataset away from the actual witchcraft.\n    from sklearn.feature_extraction.text import TfidfVectorizer\n\n    bodyV = TfidfVectorizer()\n    bodyFeatures = bodyV.fit_transform(df['body'])\n\n    domainV = TfidfVectorizer()\n\n    ## A couple of \"None\" domains really screwed the pooch on this one. Also, no lists, just space-separated domains.\n    def makeDomainsAList(d):\n        return ' '.join([a for a in d if not a is None])\n\n    domainFeatures = domainV.fit_transform(\n        df['domains'].apply(makeDomainsAList))\n\n    from scipy.sparse import csr_matrix, hstack\n\n    data = hstack([\n        csr_matrix(df[[\n            'containsPythonStackTrace', 
'containsJavaStackTrace',\n            'containsExceptionInTaskBody', 'isThreadStart'\n        ]].to_numpy()), bodyFeatures, domainFeatures\n    ])\n    #end::local_mailing_list_feature_prep_fun[]\n\n\n#\n# ### The Kubeflow Bit.\n#\n# Now we can put these two pieces together into a pipeline. Since the data is relatively small we will use a persistent volume to put them together. Later on we can add training to this pipeline as well.\n#\n#\n\n# In[ ]:\n\n# Make a volume example. We redo it inside of the pipeline definition because it needs to be inside the pipeline's scope.\n#tag::makeVolume[]\ndvop = dsl.VolumeOp(name=\"create_pvc\",\n                    resource_name=\"my-pvc-2\",\n                    size=\"5Gi\",\n                    modes=dsl.VOLUME_MODE_RWO)\n#end::makeVolume[]\n\n# In[ ]:\n\nget_ipython().system('rm local-data-prep-2.zip')\n\n# In[ ]:\n\n\n#tag::makePipeline[]\n@kfp.dsl.pipeline(name='Simple1', description='Simple1')\ndef my_pipeline_mini(year: int):\n    dvop = dsl.VolumeOp(name=\"create_pvc\",\n                        resource_name=\"my-pvc-2\",\n                        size=\"5Gi\",\n                        modes=dsl.VOLUME_MODE_RWO)\n    tldvop = dsl.VolumeOp(name=\"create_pvc\",\n                          resource_name=\"tld-volume-2\",\n                          size=\"100Mi\",\n                          modes=dsl.VOLUME_MODE_RWO)\n    download_data_op = kfp.components.func_to_container_op(\n        download_data, packages_to_install=['lxml', 'requests'])\n    download_tld_info_op = kfp.components.func_to_container_op(\n        download_tld_data,\n        packages_to_install=['requests', 'pandas>=0.24', 'tables'])\n    clean_data_op = kfp.components.func_to_container_op(\n        clean_data, packages_to_install=['pandas>=0.24', 'tables'])\n\n    step1 = download_data_op(year).add_pvolumes(\n        {\"/data_processing\": dvop.volume})\n    step2 = clean_data_op(input_path=step1.output).add_pvolumes(\n        {\"/data_processing\": dvop.volume})\n    step3 = download_tld_info_op().add_pvolumes({\"/tld_info\": tldvop.volume})\n\n\nkfp.compiler.Compiler().compile(my_pipeline_mini, 'local-data-prep-2.zip')\n#end::makePipeline[]\n\n# In[ ]:\n\nget_ipython().system('rm *.zip')\n\n# In[ ]:\n\n\n#tag::longPipeline[]\n@kfp.dsl.pipeline(name='Simple1', description='Simple1')\ndef my_pipeline2(year: int):\n    dvop = dsl.VolumeOp(name=\"create_pvc\",\n                        resource_name=\"my-pvc-2\",\n                        size=\"5Gi\",\n                        modes=dsl.VOLUME_MODE_RWO)\n    tldvop = dsl.VolumeOp(name=\"create_pvc\",\n                          resource_name=\"tld-volume-2\",\n                          size=\"100Mi\",\n                          modes=dsl.VOLUME_MODE_RWO)\n\n    download_data_op = kfp.components.func_to_container_op(\n        download_data, packages_to_install=['lxml', 'requests'])\n    download_tld_info_op = kfp.components.func_to_container_op(\n        download_tld_data,\n        packages_to_install=['requests', 'pandas>=0.24', 'tables'])\n    clean_data_op = kfp.components.func_to_container_op(\n        clean_data, packages_to_install=['pandas>=0.24', 'tables'])\n#tag::add_feature_step[]\n    prepare_features_op = kfp.components.func_to_container_op(\n        prepare_features,\n        packages_to_install=['pandas>=0.24', 'tables', 'scikit-learn'])\n#end::add_feature_step[]\n\n    step1 = download_data_op(year).add_pvolumes(\n        {\"/data_processing\": dvop.volume})\n    step2 = clean_data_op(input_path=step1.output).add_pvolumes(\n        
{\"/data_processing\": dvop.volume})\n    step3 = download_tld_info_op().add_pvolumes({\"/tld_info\": tldvop.volume})\n    step4 = prepare_features_op(input_path=step2.output,\n                                tld_info_path=step3.output).add_pvolumes({\n                                    \"/data_processing\":\n                                    dvop.volume,\n                                    \"/tld_info\":\n                                    tldvop.volume\n                                })\n\n\n#end::longPipeline[]\n\nkfp.compiler.Compiler().compile(my_pipeline2,\n                                'local-data-and-feature-prep-2.zip')\n\n# In[ ]:\n\nclient = kfp.Client()\n\n# In[ ]:\n\nmy_experiment = client.create_experiment(name='local-data-prep-test-2')\nmy_run = client.run_pipeline(my_experiment.id,\n                             'local-data-prep',\n                             'local-data-and-feature-prep-2.zip',\n                             params={'year': '2019'})\n\n# If we were using Spamassasin or some other library installed in a different base container we would:\n\n# In[ ]:\n\n# Clean data with custom container\n#tag::cleanDataWithContainer[]\nclean_data_op = kfp.components.func_to_container_op(\n    clean_data,\n    base_image=\"{0}/kubeflow/spammassisan\".format(container_registry),\n    packages_to_install=['pandas>=0.24', 'tables'])\n#end::cleanDataWithContainer[]\n\n# In[ ]:\n\n\ndef train_func(input_path: String):\n    from sklearn.cluster import KMeans\n    from sklearn.model_selection import train_test_split\n\n    train, test = train_test_split(data, test_size=0.1)\n\n    kmeans = KMeans(n_clusters=2, random_state=42).fit(train)\n    train_pred = kmeans.predict(train)\n    test_pred = kmeans.predict(test)\n    print(test_pred)\n    # TODO: Dump the model somewhere you can use it later.\n\n\n# And just like that, we've done it. We've created a Kubeflow Pipeline.\n#\n# So let's take a moment to step back and think, \"what in the crazy-town-heck is going on here?!\".  A valid question, and well spotted.  Each \"Step\" is going to be creating a container.  Maybe I should have noted that earlier when talking about attatching volumes, beacuse if you thougth I was doing that to a function, you'd probably think me quite insane.\n#\n# But, if you follow this code, and create this pipeline, download it and run it, you will see each \"step\" as a seperate container, downloading data, saving it to a `PVC` then passing some parameters to a next container, which also will load the `PVC`, etc. etc.\n#\n# ### Using Python to Create Containers, but not like a crazy person\n#\n# For completeness, let's last explore how to do all of these things using annotations.\n#\n# The trick for the most part is to create a function that returns a `kfp.dsl.ContainerOp`.  This will point to an image, note the volumes that need to be mounted, and a number of other things. I've heard told people don't always just like creating absurdly large and fat functions to do everything in real life, so I leave this hear as an aside in case the reader is interested in it.  
It's also worth noting that adding the `@kfp.dsl.component` annotation instructs the Kubeflow compiler to turn on static type checking.\n#\n# ```\n# @kfp.dsl.component\n# def my_component(my_param):\n#   ...\n#   return kfp.dsl.ContainerOp(\n#     name='My component name',\n#     image='gcr.io/path/to/container/image'\n#   )\n# ```\n#\n# Finally, when it comes to incorporating these components into pipelines, you would do something like this:\n#\n# ```\n# @kfp.dsl.pipeline(\n#   name='My pipeline',\n#   description='My machine learning pipeline'\n# )\n# def my_pipeline(param_1: PipelineParam, param_2: PipelineParam):\n#   my_step = my_component(my_param='a')\n# ```\n#\n# Which should look exceedingly familiar as we did something very similar with our `download_data_fn` and `witchcraft_fn`.\n\n# In[ ]:\n\n# In[ ]:\n"
  },
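  {
    "path": "data-extraction/python-notebook/SaveFeaturesSketch.py",
    "content": "# A minimal sketch, and an assumption rather than the book's code: prepare_features\n# builds `data` but never writes it anywhere, and train_func reads a `data` it never\n# loads. Saving the sparse matrix on the shared PVC is one way to bridge the two steps.\n\n\ndef save_features(data) -> str:\n    from scipy.sparse import csr_matrix, save_npz\n\n    # hstack returns a COO matrix; CSR round-trips cleanly through save_npz/load_npz\n    output_path = '/data_processing/features.npz'\n    save_npz(output_path, csr_matrix(data))\n    return output_path\n"
  },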
  {
    "path": "data-extraction/python-notebook/RunNBDockerfile",
    "content": "# Since we used Jupyter notebooks to do the first pass extraction, we can try directly use that notebook with\n# Kubeflow's pre-baked \"tensorflow-notebook-image\" (based on the Jupyter image) that automatically\n# launches the notebooks included in the docker file. If you have multiple notebooks\n# Give them names like:\n# 01-mything.ipynb\n# 02-step2.ipynb\n# as they will be executed in lexiographical order.\n#tag::spec[]\nFROM gcr.io/kubeflow-images-public/tensorflow-1.6.0-notebook-cpu\n\nCOPY ./ /workdir /\n#end::spec[]\n#tag::deps[]\nRUN pip3 install --upgrade lxml pandas\n#end::deps[]\n"
  },
  {
    "path": "data-extraction/python-spark/Dockerfile",
    "content": "# Use the spark operator image as base\nFROM gcr.io/spark-operator/spark-py:v2.4.5\n# Install Python requirements\nCOPY requirements.txt /\nRUN pip3 install -r /requirements.txt\n# Now you can reference local:///job/my_file.py\nRUN mkdir -p /job\nCOPY *.py /job\n\nENTRYPOINT [\"/opt/entrypoint.sh\"]"
  },
  {
    "path": "data-extraction/python-spark/LaunchSparkJobs.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!pip3 install --upgrade --user kfp\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import kfp\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import kfp.dsl as dsl\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Use Kubeflow's built in Spark operator\\n\",\n    \"#tag::launch_operator[]\\n\",\n    \"resource = {\\n\",\n    \"    \\\"apiVersion\\\": \\\"sparkoperator.k8s.io/v1beta2\\\",\\n\",\n    \"    \\\"kind\\\": \\\"SparkApplication\\\",\\n\",\n    \"    \\\"metadata\\\": {\\n\",\n    \"        \\\"name\\\": \\\"boop\\\",\\n\",\n    \"        \\\"namespace\\\": \\\"kubeflow\\\"\\n\",\n    \"    },\\n\",\n    \"  \\\"spec\\\": {\\n\",\n    \"      \\\"type\\\": \\\"Python\\\",\\n\",\n    \"      \\\"mode\\\": \\\"cluster\\\",\\n\",\n    \"      \\\"image\\\": \\\"gcr.io/boos-demo-projects-are-rad/kf-steps/kubeflow/myspark\\\",\\n\",\n    \"      \\\"imagePullPolicy\\\": \\\"Always\\\",\\n\",\n    \"      \\\"mainApplicationFile\\\": \\\"local:///job/job.py\\\", # See the Dockerfile OR use GCS/S3/...\\n\",\n    \"      \\\"sparkVersion\\\": \\\"2.4.5\\\",\\n\",\n    \"      \\\"restartPolicy\\\": {\\n\",\n    \"        \\\"type\\\": \\\"Never\\\"\\n\",\n    \"      },\\n\",\n    \"  \\\"driver\\\": {\\n\",\n    \"    \\\"cores\\\": 1,  \\n\",\n    \"    \\\"coreLimit\\\": \\\"1200m\\\",  \\n\",\n    \"    \\\"memory\\\": \\\"512m\\\",  \\n\",\n    \"    \\\"labels\\\": {\\n\",\n    \"      \\\"version\\\": \\\"2.4.5\\\",  \\n\",\n    \"    },      \\n\",\n    \"    \\\"serviceAccount\\\": \\\"spark-operatoroperator-sa\\\", # also try spark-operatoroperator-sa\\n\",\n    \" },\\n\",\n    \"  \\\"executor\\\": {\\n\",\n    \"    \\\"cores\\\": 1,\\n\",\n    \"    \\\"instances\\\": 2,\\n\",\n    \"    \\\"memory\\\": \\\"512m\\\"  \\n\",\n    \"  },    \\n\",\n    \"  \\\"labels\\\": {\\n\",\n    \"    \\\"version\\\": \\\"2.4.5\\\"\\n\",\n    \"  },      \\n\",\n    \"  }\\n\",\n    \"}\\n\",\n    \"\\n\",\n    \"@dsl.pipeline(\\n\",\n    \"    name=\\\"local Pipeline\\\",\\n\",\n    \"    description=\\\"No need to ask why.\\\"\\n\",\n    \")\\n\",\n    \"def local_pipeline():\\n\",\n    \"\\n\",\n    \"    rop = dsl.ResourceOp(\\n\",\n    \"        name=\\\"boop\\\",\\n\",\n    \"        k8s_resource=resource,\\n\",\n    \"        action=\\\"create\\\",\\n\",\n    \"        success_condition=\\\"status.applicationState.state == COMPLETED\\\"\\n\",\n    \"    )\\n\",\n    \"#end::launch_operator[]\\n\",\n    \"\\n\",\n    \"import kfp.compiler as compiler\\n\",\n    \"\\n\",\n    \"compiler.Compiler().compile(local_pipeline,\\\"boop.zip\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"client = kfp.Client()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"my_experiment = client.create_experiment(name='boop-test-2')\\n\",\n    \"my_run = client.run_pipeline(my_experiment.id, 'boop-test', \\n\",\n    \"  'boop.zip')\"\n   ]\n  },\n  {\n  
 \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": []\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"Python 3\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.9\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "data-extraction/python-spark/LaunchSparkJobs.py",
    "content": "#!/usr/bin/env python\n# coding: utf-8\n\n# In[ ]:\n\nget_ipython().system('pip3 install --upgrade --user kfp')\n\n# In[ ]:\n\nimport kfp\n\n# In[ ]:\n\nimport kfp.dsl as dsl\n\n# In[ ]:\n\n# Use Kubeflow's built in Spark operator\n#tag::launch_operator[]\nresource = {\n    \"apiVersion\": \"sparkoperator.k8s.io/v1beta2\",\n    \"kind\": \"SparkApplication\",\n    \"metadata\": {\n        \"name\": \"boop\",\n        \"namespace\": \"kubeflow\"\n    },\n    \"spec\": {\n        \"type\": \"Python\",\n        \"mode\": \"cluster\",\n        \"image\": \"gcr.io/boos-demo-projects-are-rad/kf-steps/kubeflow/myspark\",\n        \"imagePullPolicy\": \"Always\",\n        # See the Dockerfile OR use GCS/S3/...\n        \"mainApplicationFile\": \"local:///job/job.py\",\n        \"sparkVersion\": \"2.4.5\",\n        \"restartPolicy\": {\n            \"type\": \"Never\"\n        },\n        \"driver\": {\n            \"cores\": 1,\n            \"coreLimit\": \"1200m\",\n            \"memory\": \"512m\",\n            \"labels\": {\n                \"version\": \"2.4.5\",\n            },\n            # also try spark-operatoroperator-sa\n            \"serviceAccount\": \"spark-operatoroperator-sa\",\n        },\n        \"executor\": {\n            \"cores\": 1,\n            \"instances\": 2,\n            \"memory\": \"512m\"\n        },\n        \"labels\": {\n            \"version\": \"2.4.5\"\n        },\n    }\n}\n\n\n@dsl.pipeline(name=\"local Pipeline\", description=\"No need to ask why.\")\ndef local_pipeline():\n\n    rop = dsl.ResourceOp(\n        name=\"boop\",\n        k8s_resource=resource,\n        action=\"create\",\n        success_condition=\"status.applicationState.state == COMPLETED\")\n\n\n#end::launch_operator[]\n\nimport kfp.compiler as compiler\n\ncompiler.Compiler().compile(local_pipeline, \"boop.zip\")\n\n# In[ ]:\n\nclient = kfp.Client()\n\n# In[ ]:\n\nmy_experiment = client.create_experiment(name='boop-test-2')\nmy_run = client.run_pipeline(my_experiment.id, 'boop-test', 'boop.zip')\n\n# In[ ]:\n"
  },
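  {
    "path": "data-extraction/python-spark/WaitForRunSketch.py",
    "content": "# A small sketch, not in the original notebook, of blocking on the launched run\n# instead of watching the UI. kfp.Client.wait_for_run_completion is part of the kfp\n# SDK; the 20-minute timeout is an arbitrary assumption.\n\nimport kfp\n\nclient = kfp.Client()\nmy_experiment = client.create_experiment(name='boop-test-2')\nmy_run = client.run_pipeline(my_experiment.id, 'boop-test', 'boop.zip')\n\n# Returns the run details once it finishes (or raises on timeout)\nresult = client.wait_for_run_completion(my_run.id, timeout=1200)\nprint(result.run.status)\n"
  },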
  {
    "path": "data-extraction/python-spark/fake_job.py",
    "content": "# Yes we need both these imports\n#tag::imports[]\nfrom pyspark.sql import SparkSession\nfrom pyspark.sql.functions import col, to_date\nfrom pyspark.sql.types import *\n#end::imports[]\nfrom pyspark.sql.catalog import UserDefinedFunction\nimport os\n\n#tag::basic_session[]\nsession = SparkSession.builder.getOrCreate()\n#end::basic_session[]\n"
  },
  {
    "path": "data-extraction/python-spark/requirements.txt",
    "content": "pandas\n"
  },
  {
    "path": "data-extraction/python-spark-notebook/AddGCSDockerfile",
    "content": "ARG base\nFROM $base\n\n# Set an enviroment variable for where we are going to put spark\nENV SPARK_HOME /opt/spark\n\n# Run as root for updates\nUSER root\n\n# Add access to GCS\nRUN rm $SPARK_HOME/jars/guava-1*.jar\nADD http://maven-central.storage.googleapis.com/maven2/com/google/guava/guava/23.0/guava-23.0.jar $SPARK_HOME/jars\n# Add the connector jar needed to access Google Cloud Storage using the Hadoop FileSystem API.\nADD https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-latest-hadoop3.jar $SPARK_HOME/jars\n\n# Add the S3A connector\nADD https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.2.0/hadoop-aws-3.2.0.jar $SPARK_HOME/jars\nADD https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.732/aws-java-sdk-bundle-1.11.732.jar $SPARK_HOME/jars\n\nRUN chmod -R 777 $SPARK_HOME/jars\n\nUSER 185"
  },
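  {
    "path": "data-extraction/python-spark-notebook/ReadFromGCSSketch.py",
    "content": "# A minimal sketch of reading from GCS once the connector jar from AddGCSDockerfile\n# is on the classpath. The bucket and keyfile path are placeholders, and the\n# google.cloud.auth.* settings are standard gcs-connector options, not this repo's code.\n\nfrom pyspark.sql import SparkSession\n\nsession = (\n    SparkSession.builder\n    .appName(\"readFromGCS\")\n    # Authenticate with a service account key mounted into the container (placeholder path)\n    .config(\"spark.hadoop.google.cloud.auth.service.account.enable\", \"true\")\n    .config(\"spark.hadoop.google.cloud.auth.service.account.json.keyfile\",\n            \"/var/secrets/key.json\")\n    ).getOrCreate()\n\n# gs:// URIs resolve through the gcs-connector added above\ndf = session.read.json(\"gs://some-example-bucket/mailing-lists/data.json\")\ndf.show()\n"
  },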
  {
    "path": "data-extraction/python-spark-notebook/AddPython3.6Dockerfile",
    "content": "ARG base\nFROM $base\n\nUSER root\n\n# Install libraries we need to build Python 3.6\nRUN apt-get update && \\\n    DEBIAN_FRONTEND=noninteractive apt-get install -y -q \\\n    make build-essential libssl-dev zlib1g-dev libbz2-dev \\\n    libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev \\\n    libncursesw5-dev xz-utils tk-dev libffi-dev liblzma-dev && \\\n    rm -rf /var/cache/apt\n\n# Install python3.6 to match the notebook\nRUN cd /tmp && \\\n    wget https://www.python.org/ftp/python/3.6.10/Python-3.6.10.tgz && \\\n    tar -xvf Python-3.6.10.tgz && \\\n    cd Python-3.6.10 && \\\n    ./configure && \\\n    make -j 8 && \\\n    make altinstall\n    \n    \nRUN python3.6 -m pip install pandas pyarrow==0.11.0 spacy\n# We depend on Spark being on the PYTHONPATH so no pip install\nUSER 185"
  },
  {
    "path": "data-extraction/python-spark-notebook/Dockerfile",
    "content": "#tag::include[]\n# See https://www.kubeflow.org/docs/notebooks/custom-notebook/\nARG base\nFROM $base\nARG sparkversion\nARG sparkrelease\nARG sparkserver https://www-us.apache.org/dist/spark\n# We need to run as root for updates\nUSER root\n\n# Set an enviroment variable for where we are going to put spark\nENV SPARK_HOME /opt/spark\n\n# Install java because Spark needs it\nRUN apt-get update && \\\n    apt-get install -yq openjdk-8-jre openjdk-8-jre-headless && \\\n    apt-get clean && \\\n    rm -rf /var/lib/apt/lists/*\n\n# Install Spark\nRUN set -ex && \\\n    rm /bin/sh && \\\n    ln -sv /bin/bash /bin/sh\n\nRUN  echo \"Setting up $sparkversion\"\nRUN  cd /tmp && \\\n     (wget ${sparkserver}/spark-${sparkversion}/${sparkrelease}.tgz) && \\\n     cd /opt && tar -xvf /tmp/${sparkrelease}.tgz && \\\n     rm /tmp/${sparkrelease}.tgz && mv ${sparkrelease} spark && \\\n     cd spark/python && pip install -e .\n#end::include[]\n\n# Add access to GCS\nRUN rm $SPARK_HOME/jars/guava-1*.jar\nADD https://maven-central.storage.googleapis.com/maven2/com/google/guava/guava/23.0/guava-23.0.jar $SPARK_HOME/jars\n# Add the connector jar needed to access Google Cloud Storage using the Hadoop FileSystem API.\nADD https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-latest-hadoop3.jar $SPARK_HOME/jars\n\n# Add the S3A connector\nADD https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.2.0/hadoop-aws-3.2.0.jar $SPARK_HOME/jars\nADD https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.732/aws-java-sdk-bundle-1.11.732.jar $SPARK_HOME/jars\n\n#tag::include[]\n# Fix permissions\nWORKDIR /opt/spark/work-dir\nRUN chmod -R 777 /opt/spark/\n\n\n# Switch the user back, using jovyan as a user is bad but the base image\n# depends on it.\nUSER jovyan\n# Install some common tools\npip install pandas numpy scipy pyarrow\n#end::include[]"
  },
  {
    "path": "data-extraction/python-spark-notebook/SparkMailingListForKF.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Yes we need both these imports\\n\",\n    \"from pyspark.sql import SparkSession\\n\",\n    \"from pyspark.sql.functions import col, to_date\\n\",\n    \"from pyspark.sql.types import *\\n\",\n    \"from pyspark.sql.types import StructField, StructType\\n\",\n    \"from pyspark.sql.catalog import UserDefinedFunction\\n\",\n    \"import os\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": []\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"fs_prefix = \\\"s3a://kf-book-examples/mailing-lists\\\" # Create with mc as in ch1\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# See https://medium.com/@szinck/setting-up-pyspark-jupyter-and-minio-on-kubeflow-kubernetes-aab98874794f\\n\",\n    \"#tag::configurePythonVersion[]\\n\",\n    \"os.environ[\\\"PYSPARK_PYTHON\\\"] = \\\"python3.6\\\"\\n\",\n    \"#end::configurePythonVersion[]\\n\",\n    \"session = (\\n\",\n    \"    SparkSession.builder\\n\",\n    \"    .appName(\\\"fetchMailingListData\\\")\\n\",\n    \"    .config(\\\"spark.executor.instances\\\", \\\"8\\\")\\n\",\n    \"    .config(\\\"spark.driver.memoryOverhead\\\", \\\"0.25\\\")\\n\",\n    \"    .config(\\\"spark.executor.memory\\\", \\\"6g\\\")\\n\",\n    \"    .config(\\\"spark.dynamicAllocation.enabled\\\", \\\"false\\\")\\n\",\n    \"    .config(\\\"spark.ui.enabled\\\", \\\"true\\\")\\n\",\n    \"    .config(\\\"spark.kubernetes.container.image\\\",\\n\",\n    \"           \\\"gcr.io/boos-demo-projects-are-rad/kubeflow/spark-worker/spark-py-36:v3.0.0-preview2-23\\\")\\n\",\n    \"    #tag::notebookSession[]\\n\",\n    \"    .config(\\\"spark.driver.bindAddress\\\", \\\"0.0.0.0\\\")\\n\",\n    \"    .config(\\\"spark.kubernetes.namespace\\\", \\\"kubeflow-programmerboo\\\")\\n\",\n    \"    .config(\\\"spark.master\\\", \\\"k8s://https://kubernetes.default\\\")\\n\",\n    \"    .config(\\\"spark.driver.host\\\", \\n\",\n    \"            \\\"spark-driver.kubeflow-programmerboo.svc.cluster.local\\\")\\n\",\n    \"    .config(\\\"spark.kubernetes.executor.annotation.sidecar.istio.io/inject\\\",\\n\",\n    \"            \\\"false\\\")\\n\",\n    \"    .config(\\\"spark.driver.port\\\", \\\"39235\\\")\\n\",\n    \"    .config(\\\"spark.blockManager.port\\\", \\\"39236\\\")\\n\",\n    \"    #end::notebookSession[]\\n\",\n    \"    # If using minio - see https://github.com/minio/cookbook/blob/master/docs/apache-spark-with-minio.md\\n\",\n    \"    #tag::minio[]\\n\",\n    \"    .config(\\\"spark.hadoop.fs.s3a.endpoint\\\",\\n\",\n    \"            \\\"minio-service.kubeflow.svc.cluster.local:9000\\\")\\n\",\n    \"    .config(\\\"fs.s3a.connection.ssl.enabled\\\", \\\"false\\\")\\n\",\n    \"    .config(\\\"fs.s3a.path.style.access\\\", \\\"true\\\")\\n\",\n    \"    # You can also add an account using the minio command as described in chapter 1\\n\",\n    \"    .config(\\\"spark.hadoop.fs.s3a.access.key\\\", \\\"minio\\\")\\n\",\n    \"    .config(\\\"spark.hadoop.fs.s3a.secret.key\\\", \\\"minio123\\\")\\n\",\n    \"    #end::minio[]\\n\",\n    \"    ).getOrCreate()\\n\",\n    \"sc = session.sparkContext\"\n   ]\n  },\n  {\n   \"cell_type\": 
\"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Data fetch pipeline: Download mailing list data\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"list_name=\\\"spark-user\\\"\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"mailing_list_template=\\\"http://mail-archives.apache.org/mod_mbox/{list_name}/{date}.mbox\\\"\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Generate the possible dates\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"start_year=2019 # Change to 2002 once you've verified\\n\",\n    \"end_year=2021\\n\",\n    \"dates = [\\\"{:d}{:02d}\\\".format(year, month) for year in range(start_year, end_year) for month in range (1,12)]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"def download_emails(date):\\n\",\n    \"    import subprocess\\n\",\n    \"    from mailbox import mbox\\n\",\n    \"    import os\\n\",\n    \"    mbox_filename = \\\"{date}.mbox\\\".format(date=date)\\n\",\n    \"    url=mailing_list_template.format(list_name=list_name,date=date)\\n\",\n    \"    subprocess.call([\\\"wget\\\", url])\\n\",\n    \"    # Skip years that don't exist\\n\",\n    \"    if not os.path.exists(mbox_filename):\\n\",\n    \"        return []\\n\",\n    \"    mail = mbox(mbox_filename.format(date=date), create=False)\\n\",\n    \"    # LC the keys since the casing is non-consistent\\n\",\n    \"    def get_body(message):\\n\",\n    \"        content_type = message.get_content_type()\\n\",\n    \"        # Multi-part messages\\n\",\n    \"        if message.is_multipart():\\n\",\n    \"            return \\\"\\\".join(map(get_body, message.get_payload()))\\n\",\n    \"        elif \\\"text\\\" in content_type or \\\"html\\\" in content_type:\\n\",\n    \"            return message.get_payload()\\n\",\n    \"        else:\\n\",\n    \"            return \\\"\\\"\\n\",\n    \"    def message_to_dict(message):\\n\",\n    \"        ret = dict((k.lower(), v) for k, v in message.items())\\n\",\n    \"        ret[\\\"multipart\\\"] = message.is_multipart()\\n\",\n    \"        ret[\\\"body\\\"] = get_body(message)\\n\",\n    \"        return ret\\n\",\n    \"    emails = list(map(message_to_dict, mail.itervalues()))\\n\",\n    \"    os.remove(mbox_filename)\\n\",\n    \"    return emails\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Optional: test that it works locally\\n\",\n    \"# download_emails(\\\"202001\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"emails_rdd = sc.parallelize(dates).flatMap(download_emails).cache()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"emails_rdd.count()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    
\"mailing_list_posts_mbox_df = emails_rdd.toDF(sampleRatio=1.0)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"cached = mailing_list_posts_mbox_df.cache()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"mailing_list_posts_mbox_df.select(\\\"list-id\\\", \\\"In-Reply-To\\\").take(5)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"spark_mailing_list_data = mailing_list_posts_mbox_df.filter(\\n\",\n    \"    mailing_list_posts_mbox_df[\\\"list-id\\\"].contains(\\\"spark\\\")).repartition(60).cache()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"spark_mailing_list_data.show()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"spark_mailing_list_data.printSchema()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"def extract_date_from_email_datefield(datefield):\\n\",\n    \"    if datefield is None:\\n\",\n    \"        return None\\n\",\n    \"    from datetime import datetime\\n\",\n    \"    import time\\n\",\n    \"    import email.utils\\n\",\n    \"    parsed_date = email.utils.parsedate(datefield)\\n\",\n    \"    return datetime.fromtimestamp(time.mktime((parsed_date)))\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"extract_date_from_email_datefield_udf = UserDefinedFunction(\\n\",\n    \"    extract_date_from_email_datefield, StringType(), \\\"extract_date_from_email_datefield\\\")\\n\",\n    \"\\n\",\n    \"session.catalog._jsparkSession.udf().registerPython(\\n\",\n    \"    \\\"extract_date_from_email_datefield\\\",\\n\",\n    \"    extract_date_from_email_datefield_udf._judf)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"spark_mailing_list_data_with_date = spark_mailing_list_data.select(\\n\",\n    \"    \\\"*\\\",\\n\",\n    \"    extract_date_from_email_datefield_udf(spark_mailing_list_data[\\\"Date\\\"]).alias(\\\"email_date\\\"))\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Manually verify that our date parser is looking ok\\n\",\n    \"spark_mailing_list_data.select(spark_mailing_list_data[\\\"Date\\\"],\\n\",\n    \"                               extract_date_from_email_datefield_udf(spark_mailing_list_data[\\\"Date\\\"]).alias(\\\"email_date\\\")\\n\",\n    \"                              ).take(2)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"#tag::filter_junk[]\\n\",\n    \"def is_ok(post):\\n\",\n    \"    # Your special business logic goes here\\n\",\n    \"    return True\\n\",\n    \"spark_mailing_list_data_cleaned = spark_mailing_list_data_with_date.filter(is_ok)\\n\",\n    \"#end::filter_junk[]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"mailing_list_posts_in_reply_to = 
spark_mailing_list_data_cleaned.filter(\\n\",\n    \"    spark_mailing_list_data[\\\"In-Reply-To\\\"].isNotNull()).alias(\\\"mailing_list_posts_in_reply_to\\\")\\n\",\n    \"initial_posts = spark_mailing_list_data_cleaned.filter(\\n\",\n    \"    spark_mailing_list_data[\\\"In-Reply-To\\\"].isNull()).alias(\\\"initial_posts\\\").cache()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# See how many start-of-thread posts we have\\n\",\n    \"initial_posts.count()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"ids_in_reply = mailing_list_posts_in_reply_to.select(\\\"In-Reply-To\\\", \\\"message-id\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"ids_in_reply.schema\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Ok now it's time to save these\\n\",\n    \"#tag::write_big_data[]\\n\",\n    \"initial_posts.write.format(\\\"parquet\\\").mode('overwrite').save(fs_prefix + \\\"/initial_posts\\\")\\n\",\n    \"ids_in_reply.write.format(\\\"parquet\\\").mode('overwrite').save(fs_prefix + \\\"/ids_in_reply\\\")\\n\",\n    \"#end::write_big_data[]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"#tag::small_data[]\\n\",\n    \"initial_posts.toPandas()\\n\",\n    \"#end::small_data[]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"session.stop()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": []\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"Python 3\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.9\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "data-extraction/python-spark-notebook/SparkMailingListForKF.py",
    "content": "#!/usr/bin/env python\n# coding: utf-8\n\n# In[ ]:\n\n# Yes we need both these imports\nfrom pyspark.sql import SparkSession\nfrom pyspark.sql.functions import col, to_date\nfrom pyspark.sql.types import *\nfrom pyspark.sql.types import StructField, StructType\nfrom pyspark.sql.catalog import UserDefinedFunction\nimport os\n\n# In[ ]:\n\n# In[ ]:\n\nfs_prefix = \"s3a://kf-book-examples/mailing-lists\"  # Create with mc as in ch1\n\n# In[ ]:\n\n# See https://medium.com/@szinck/setting-up-pyspark-jupyter-and-minio-on-kubeflow-kubernetes-aab98874794f\n#tag::configurePythonVersion[]\nos.environ[\"PYSPARK_PYTHON\"] = \"python3.6\"\n#end::configurePythonVersion[]\nsession = (\n    SparkSession.builder.appName(\"fetchMailingListData\").config(\n        \"spark.executor.instances\",\n        \"8\").config(\"spark.driver.memoryOverhead\",\n                    \"0.25\").config(\"spark.executor.memory\", \"6g\").config(\n                        \"spark.dynamicAllocation.enabled\", \"false\").\n    config(\"spark.ui.enabled\", \"true\").config(\n        \"spark.kubernetes.container.image\",\n        \"gcr.io/boos-demo-projects-are-rad/kubeflow/spark-worker/spark-py-36:v3.0.0-preview2-23\"\n    )\n    #tag::notebookSession[]\n    .config(\"spark.driver.bindAddress\",\n            \"0.0.0.0\").config(\"spark.kubernetes.namespace\",\n                              \"kubeflow-programmerboo\").\n    config(\"spark.master\", \"k8s://https://kubernetes.default\").config(\n        \"spark.driver.host\",\n        \"spark-driver.kubeflow-programmerboo.svc.cluster.local\").config(\n            \"spark.kubernetes.executor.annotation.sidecar.istio.io/inject\",\n            \"false\").config(\"spark.driver.port\",\n                            \"39235\").config(\"spark.blockManager.port\", \"39236\")\n    #end::notebookSession[]\n    # If using minio - see https://github.com/minio/cookbook/blob/master/docs/apache-spark-with-minio.md\n    #tag::minio[]\n    .config(\"spark.hadoop.fs.s3a.endpoint\",\n            \"minio-service.kubeflow.svc.cluster.local:9000\").config(\n                \"fs.s3a.connection.ssl.enabled\",\n                \"false\").config(\"fs.s3a.path.style.access\", \"true\")\n    # You can also add an account using the minio command as described in chapter 1\n    .config(\"spark.hadoop.fs.s3a.access.key\",\n            \"minio\").config(\"spark.hadoop.fs.s3a.secret.key\", \"minio123\")\n    #end::minio[]\n).getOrCreate()\nsc = session.sparkContext\n\n# In[ ]:\n\n# Data fetch pipeline: Download mailing list data\n\n# In[ ]:\n\nlist_name = \"spark-user\"\n\n# In[ ]:\n\nmailing_list_template = \"http://mail-archives.apache.org/mod_mbox/{list_name}/{date}.mbox\"\n\n# In[ ]:\n\n# Generate the possible dates\n\n# In[ ]:\n\nstart_year = 2019  # Change to 2002 once you've verified\nend_year = 2021\ndates = [\n    \"{:d}{:02d}\".format(year, month) for year in range(start_year, end_year)\n    for month in range(1, 12)\n]\n\n# In[ ]:\n\n\ndef download_emails(date):\n    import subprocess\n    from mailbox import mbox\n    import os\n    mbox_filename = \"{date}.mbox\".format(date=date)\n    url = mailing_list_template.format(list_name=list_name, date=date)\n    subprocess.call([\"wget\", url])\n    # Skip years that don't exist\n    if not os.path.exists(mbox_filename):\n        return []\n    mail = mbox(mbox_filename.format(date=date), create=False)\n\n    # LC the keys since the casing is non-consistent\n\n    def get_body(message):\n        content_type = message.get_content_type()\n       
 # Multi-part messages\n        if message.is_multipart():\n            return \"\".join(map(get_body, message.get_payload()))\n        elif \"text\" in content_type or \"html\" in content_type:\n            return message.get_payload()\n        else:\n            return \"\"\n\n    def message_to_dict(message):\n        ret = dict((k.lower(), v) for k, v in message.items())\n        ret[\"multipart\"] = message.is_multipart()\n        ret[\"body\"] = get_body(message)\n        return ret\n\n    emails = list(map(message_to_dict, mail.itervalues()))\n    os.remove(mbox_filename)\n    return emails\n\n\n# In[ ]:\n\n# Optional: test that it works locally\n# download_emails(\"202001\")\n\n# In[ ]:\n\nemails_rdd = sc.parallelize(dates).flatMap(download_emails).cache()\n\n# In[ ]:\n\nemails_rdd.count()\n\n# In[ ]:\n\nmailing_list_posts_mbox_df = emails_rdd.toDF(sampleRatio=1.0)\n\n# In[ ]:\n\ncached = mailing_list_posts_mbox_df.cache()\n\n# In[ ]:\n\nmailing_list_posts_mbox_df.select(\"list-id\", \"In-Reply-To\").take(5)\n\n# In[ ]:\n\nspark_mailing_list_data = mailing_list_posts_mbox_df.filter(\n    mailing_list_posts_mbox_df[\"list-id\"].contains(\"spark\")).repartition(\n        60).cache()\n\n# In[ ]:\n\nspark_mailing_list_data.show()\n\n# In[ ]:\n\nspark_mailing_list_data.printSchema()\n\n# In[ ]:\n\n\ndef extract_date_from_email_datefield(datefield):\n    if datefield is None:\n        return None\n    from datetime import datetime\n    import time\n    import email.utils\n    parsed_date = email.utils.parsedate(datefield)\n    return datetime.fromtimestamp(time.mktime((parsed_date)))\n\n\nextract_date_from_email_datefield_udf = UserDefinedFunction(\n    extract_date_from_email_datefield, StringType(),\n    \"extract_date_from_email_datefield\")\n\nsession.catalog._jsparkSession.udf().registerPython(\n    \"extract_date_from_email_datefield\",\n    extract_date_from_email_datefield_udf._judf)\n\n# In[ ]:\n\nspark_mailing_list_data_with_date = spark_mailing_list_data.select(\n    \"*\",\n    extract_date_from_email_datefield_udf(\n        spark_mailing_list_data[\"Date\"]).alias(\"email_date\"))\n\n# In[ ]:\n\n# Manually verify that our date parser is looking ok\nspark_mailing_list_data.select(\n    spark_mailing_list_data[\"Date\"],\n    extract_date_from_email_datefield_udf(\n        spark_mailing_list_data[\"Date\"]).alias(\"email_date\")).take(2)\n\n# In[ ]:\n\n\n#tag::filter_junk[]\ndef is_ok(post):\n    # Your special business logic goes here\n    return True\n\n\nspark_mailing_list_data_cleaned = spark_mailing_list_data_with_date.filter(\n    is_ok)\n#end::filter_junk[]\n\n# In[ ]:\n\nmailing_list_posts_in_reply_to = spark_mailing_list_data_cleaned.filter(\n    spark_mailing_list_data[\"In-Reply-To\"].isNotNull()).alias(\n        \"mailing_list_posts_in_reply_to\")\ninitial_posts = spark_mailing_list_data_cleaned.filter(\n    spark_mailing_list_data[\"In-Reply-To\"].isNull()).alias(\n        \"initial_posts\").cache()\n\n# In[ ]:\n\n# See how many start-of-thread posts we have\ninitial_posts.count()\n\n# In[ ]:\n\nids_in_reply = mailing_list_posts_in_reply_to.select(\"In-Reply-To\",\n                                                     \"message-id\")\n\n# In[ ]:\n\nids_in_reply.schema\n\n# In[ ]:\n\n# Ok now it's time to save these\n#tag::write_big_data[]\ninitial_posts.write.format(\"parquet\").mode('overwrite').save(fs_prefix +\n                                                             \"/initial_posts\")\nids_in_reply.write.format(\"parquet\").mode('overwrite').save(fs_prefix 
+\n                                                            \"/ids_in_reply\")\n#end::write_big_data[]\n\n# In[ ]:\n\n#tag::small_data[]\ninitial_posts.toPandas()\n#end::small_data[]\n\n# In[ ]:\n\nsession.stop()\n\n# In[ ]:\n"
  },
  {
    "path": "data-extraction/python-spark-notebook/build.sh",
    "content": "#!/bin/bash\n# Build a notebook with Spark 3\n# Note when Spark 3 is fully released we can use gcr.io/spark-operator/spark-py:v3.0.0\nset -ex\nV=${V:-\"23\"}\nREPO=${REPO:-\"gcr.io/$PROJECT\"}\nTARGET=${TARGET:-\"$REPO/kubeflow/spark-notebook:v$V\"}\nKF_BASE=${KF_BASE:-\"gcr.io/kubeflow-images-public\"}\nBASE=${BASE:-\"$KF_BASE/tensorflow-1.15.2-notebook-cpu:1.0.0\"}\nSPARK_VERSION=\"3.0.0-preview2\"\nSPARK_RELEASE=\"spark-3.0.0-preview2-bin-hadoop3.2\"\nSPARK_ARTIFACT=\"${SPARK_RELEASE}.tgz\"\ndocker build . -t \"${TARGET}\" --build-arg sparkversion=\"${SPARK_VERSION}\" \\\n       --build-arg sparkrelease=\"${SPARK_RELEASE}\" --build-arg base=\"${BASE}\"\ndocker push \"${TARGET}\"\n# Build Spark worker image\nSPARK_TARGET=${SPARK_TARGET:-\"$REPO/kubeflow/spark-worker\"}\nif [ ! -f /tmp/${SPARK_ARTIFACT} ]; then\n  pushd /tmp/\n  wget \"https://www-us.apache.org/dists/spark/spark-${SPARK_VERSION}/${SPARK_ARTIFACT}\"\n  popd\nfi\n\ntmp_dir=$(mktemp -d -t spark-build-XXXXXXXXXX)\npushd \"${tmp_dir}\"\ntar -xvf \"/tmp/${SPARK_ARTIFACT}\"\n\npushd \"${SPARK_RELEASE}\"\n./bin/docker-image-tool.sh -r \"${SPARK_TARGET}\" -t \"v${SPARK_VERSION}-${V}\" build\n./bin/docker-image-tool.sh -r \"${SPARK_TARGET}\" -t \"v${SPARK_VERSION}-${V}\" \\\n\t\t\t   -p kubernetes/dockerfiles/spark/bindings/python/Dockerfile \\\n\t\t\t   build\n./bin/docker-image-tool.sh -r \"${SPARK_TARGET}\" -t \"v${SPARK_VERSION}-${V}\" push\npopd\n\npopd\n# Add GCS to Spark images\ndocker build --build-arg base=\"${SPARK_TARGET}/spark:v${SPARK_VERSION}-${V}\" \\\n       -t \"${SPARK_TARGET}/spark-with-gcs:v${SPARK_VERSION}-$V\" -f AddGCSDockerfile .\nPYSPARK_WITH_GCS=\"${SPARK_TARGET}/spark-py-with-gcs:v${SPARK_VERSION}-$V\"\ndocker build --build-arg base=\"${SPARK_TARGET}/spark-py:v${SPARK_VERSION}-${V}\" \\\n       -t \"${PYSPARK_WITH_GCS}\" -f AddGCSDockerfile .\n# Add Python 3.6 to PySpark images for notebook compat\nSPARK_PY36_WORKER=\"${SPARK_TARGET}/spark-py-36:v${SPARK_VERSION}-$V\"\ndocker build --build-arg base=\"${PYSPARK_WITH_GCS}\" \\\n       -t \"${SPARK_PY36_WORKER}\" -f AddPython3.6Dockerfile .\n\ndocker push \"${SPARK_TARGET}/spark-with-gcs:v${SPARK_VERSION}-$V\"\ndocker push \"${SPARK_TARGET}/spark-py-with-gcs:v${SPARK_VERSION}-$V\"\ndocker push \"${SPARK_PY36_WORKER}\"\nrm -rf \"${tmp_dir}\"\n\necho \"Spark notebook pushed to ${TARGET}\"\necho \"Spark py worker pushed to ${SPARK_PY36_WORKER}\"\n"
  },
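  {
    "path": "data-extraction/python-spark-notebook/build-example-usage.sh",
    "content": "#!/bin/bash\n# A minimal sketch of how build.sh above is meant to be invoked; this helper file\n# and the project name below are assumptions, not referenced elsewhere in the\n# examples. build.sh reads PROJECT (used for the default gcr.io repo), V (image\n# version suffix), REPO, TARGET, KF_BASE, and BASE from the environment, so\n# overriding any of them changes which images get built and where they get pushed.\nset -ex\n# PROJECT is a placeholder GCP project id; substitute your own.\nPROJECT=my-gcp-project V=23 ./build.sh\n# Or push somewhere other than gcr.io/$PROJECT by setting REPO directly:\n# REPO=gcr.io/some-other-repo V=24 ./build.sh\n"
  },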
  {
    "path": "data-extraction/python-spark-notebook/dr.yaml",
    "content": "apiVersion: networking.istio.io/v1alpha3\nkind: DestinationRule\nmetadata:\n  name: default\n  namespace: kubeflow-programmerboo\nspec:\n  host: '*.svc.cluster.local'\n  trafficPolicy:\n    tls:\n      mode: DISABLE"
  },
  {
    "path": "data-extraction/python-spark-notebook/no-saprk-tls.yaml",
    "content": " apiVersion: \"authentication.istio.io/v1alpha1\"\n kind: \"Policy\"\n metadata:\n   name: spark-no-tls\n spec:\n   targets:\n   - name: spark-notebook-0"
  },
  {
    "path": "data-extraction/python-spark-notebook/spark-driver-service.yaml",
    "content": "apiVersion: v1\nkind: Service\nmetadata:\n  name: spark-driver\n  namespace: kubeflow-programmerboo\nspec:\n  selector:\n    notebook-name: spark-test-2\n  ports:\n    - port: 39235\n      targetPort: 39235\n      name: spark-driver-port\n    - port: 39236\n      targetPort: 39236\n      name: spark-block-port\n"
  },
  {
    "path": "data-extraction/python-spark-notebook/virt_service.yaml",
    "content": "apiVersion: networking.istio.io/v1alpha3\nkind: VirtualService\nmetadata:\n  creationTimestamp: \"2019-10-14T20:09:50Z\"\n  generation: 1\n  name: notebook-programmerboo-spark-notebook\n  namespace: programmerboo\n  ownerReferences:\n  - apiVersion: kubeflow.org/v1beta1\n    blockOwnerDeletion: true\n    controller: true\n    kind: Notebook\n    name: spark-notebook\n    uid: 93fb0c0e-eebe-11e9-a454-42010a8e0119\n  resourceVersion: \"3616573\"\n  selfLink: /apis/networking.istio.io/v1alpha3/namespaces/programmerboo/virtualservices/notebook-programmerboo-spark-notebook\n  uid: 9404145c-eebe-11e9-a454-42010a8e0119\nspec:\n  gateways:\n  - kubeflow/kubeflow-gateway\n  hosts:\n  - '*'\n  http:\n  - match:\n    - uri:\n        prefix: /notebook/programmerboo/spark-notebook\n    rewrite:\n      uri: /notebook/programmerboo/spark-notebook\n    route:\n    - destination:\n        host: spark-notebook.programmerboo.svc.cluster.local\n        port:\n          number: 80\n    timeout: 300s\n"
  },
  {
    "path": "data-extraction/spark-hello-world/Dockerfile",
    "content": ""
  },
  {
    "path": "data-extraction/spark-hello-world/README.md",
    "content": "This directory will walk you through running a Spark Hello world example with kubeflow.\nIt (currently) uses the master branch of Kubeflow unlike the rest of the examples\nsince Spark support is not yet in a released version.\n"
  },
  {
    "path": "data-extraction/spark-hello-world/hello_world_pipeline.py",
    "content": "import kfp.dsl as dsl\nimport kfp.gcp as gcp\nimport kfp.onprem as onprem\n\nfrom string import Template\nimport json\n\n\n@dsl.pipeline(name='Simple spark pipeline demo',\n              description='Shows how to use Spark operator inside KF')\ndef spark_hello_world_pipeline(jar_location=\"gcs://....\", tf_job_image=\"...\"):\n    spark_json_template = Template(\"\"\"\n{\n    \"apiVersion\": \"sparkoperator.k8s.io/v1beta2\",\n    \"kind\": \"SparkApplication\",\n    \"metadata\": {\n      \"name\": \"spark-frank\",\n      \"namespace\": \"kubeflow\"},\n    \"spec\": {\n      \"type\": \"Scala\",\n      \"mode\": \"cluster\",\n      \"mainApplicationFile\": \"$jar_location\"\n    }\"\"\")\n    spark_json = spark_json_template.substitute({'jar_location': jar_location})\n    spark_job = json.loads(spark_json)\n    spark_resource = dsl.ResourceOp(\n        name='spark-job',\n        k8s_resource=spark_job,\n        success_condition='status.state == Succeeded')\n    train = dsl.ContainerOp(\n        name='train',\n        image=tf_job_image,\n    ).after(spark_resoure)\n"
  },
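  {
    "path": "data-extraction/spark-hello-world/compile_pipeline.sh",
    "content": "#!/bin/bash\n# A minimal sketch of compiling the pipeline above; this helper file and the\n# output archive name are assumptions, not referenced elsewhere in the examples.\n# dsl-compile ships with the Kubeflow Pipelines SDK installed by\n# dev-setup/install-kf-pipeline-sdk.sh.\nset -ex\ndsl-compile --py hello_world_pipeline.py --output hello_world_pipeline.tar.gz\n# The compiled archive can then be uploaded through the Pipelines UI, or run\n# programmatically with kfp.Client() as in data-extraction/tfx/TFDV.py.\n"
  },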
  {
    "path": "data-extraction/spark-hello-world/lr_demo/.gitignore",
    "content": "*.class\n*.log\nbuild.sbt_back\n\n# sbt specific\ndist/*\ntarget/\nlib_managed/\nsrc_managed/\nproject/boot/\nproject/plugins/project/\nsbt/*.jar\nmini-complete-example/sbt/*.jar\n\n# Scala-IDE specific\n.scala_dependencies\n\n#Emacs\n*~\n\n#ignore the metastore\nmetastore_db/*\n\n# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.env\n.Python\nenv/\nbin/\nbuild/*.jar\ndevelop-eggs/\ndist/\neggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\n*.egg-info/\n.installed.cfg\n*.egg\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.coverage\n.cache\nnosetests.xml\ncoverage.xml\n\n# Translations\n*.mo\n\n# Mr Developer\n.mr.developer.cfg\n.project\n.pydevproject\n\n# Rope\n.ropeproject\n\n# Django stuff:\n*.log\n*.pot\n\n# Sphinx documentation\ndocs/_build/\n\n# PyCharm files\n*.idea\n\n# emacs stuff\n\n# Autoenv\n.env\n*~\n# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.env\n.Python\nenv/\nbin/\nbuild/\ndevelop-eggs/\ndist/\neggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\n*.egg-info/\n.installed.cfg\n*.egg\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.coverage\n.cache\nnosetests.xml\ncoverage.xml\n\n# Translations\n*.mo\n\n# Mr Developer\n.mr.developer.cfg\n.project\n.pydevproject\n\n# Rope\n.ropeproject\n\n# Django stuff:\n*.log\n*.pot\n\n# Sphinx documentation\ndocs/_build/\n\n# PyCharm files\n*.idea\n\n# emacs stuff\n\\#*\\#\n\\.\\#*\n\n# Autoenv\n.env\n*~\n\n"
  },
  {
    "path": "data-extraction/spark-hello-world/lr_demo/.travis.yml",
    "content": "language: scala\n\n# These directories are cached to S3 at the end of the build\ncache:\n  directories:\n    - $HOME/.ivy2/cache\n    - $HOME/.sbt/boot/\n    - $HOME/.sbt/launchers\n    - $HOME/build\n\njdk:\n  - oraclejdk8\nscala:\n  - 2.11.8\nafter_success:\n  - bash <(curl -s https://codecov.io/bash)\nsudo: false"
  },
  {
    "path": "data-extraction/spark-hello-world/lr_demo/README.md",
    "content": "A simple, bad, LR example with Spark.\n"
  },
  {
    "path": "data-extraction/spark-hello-world/lr_demo/build.sbt",
    "content": "val sparkVersion = \"2.3.1\"\n\nlazy val root = (project in file(\".\")).\n\n  settings(\n    inThisBuild(List(\n      organization := \"com.introtomlwithkubeflow.spark.demo\",\n      scalaVersion := \"2.11.12\"\n    )),\n    name := \"basic.lr\",\n    version := \"0.0.1\",\n\n    javacOptions ++= Seq(\"-source\", \"1.8\", \"-target\", \"1.8\"),\n    javaOptions ++= Seq(\"-Xms512M\", \"-Xmx2048M\", \"-XX:MaxPermSize=2048M\", \"-XX:+CMSClassUnloadingEnabled\"),\n    scalacOptions ++= Seq(\"-deprecation\", \"-unchecked\"),\n    parallelExecution in Test := false,\n    fork := true,\n\n    coverageHighlighting := true,\n\n    libraryDependencies ++= Seq(\n      \"org.apache.spark\" %% \"spark-streaming\" % sparkVersion % \"provided\",\n      \"org.apache.spark\" %% \"spark-sql\" % sparkVersion % \"provided\",\n      \"org.apache.spark\" %% \"spark-mllib\" % sparkVersion % \"provided\",\n      \"ml.combust.mleap\" %% \"mleap-spark\" % \"0.13.0\",\n\n      \"org.scalatest\" %% \"scalatest\" % \"3.0.1\" % \"test\",\n      \"org.scalacheck\" %% \"scalacheck\" % \"1.13.4\" % \"test\",\n      \"com.holdenkarau\" %% \"spark-testing-base\" % \"2.3.1_0.11.0\" % \"test\"\n    ),\n\n    // uses compile classpath for the run task, including \"provided\" jar (cf http://stackoverflow.com/a/21803413/3827)\n    run in Compile := Defaults.runTask(fullClasspath in Compile, mainClass in (Compile, run), runner in (Compile, run)).evaluated,\n\n    scalacOptions ++= Seq(\"-deprecation\", \"-unchecked\"),\n    pomIncludeRepository := { x => false },\n\n   resolvers ++= Seq(\n      \"sonatype-releases\" at \"https://oss.sonatype.org/content/repositories/releases/\",\n      \"Typesafe repository\" at \"http://repo.typesafe.com/typesafe/releases/\",\n      \"Second Typesafe repo\" at \"http://repo.typesafe.com/typesafe/maven-releases/\",\n      Resolver.sonatypeRepo(\"public\")\n    ),\n\n    pomIncludeRepository := { x => false },\n        mergeStrategy in assembly := {\n      case m if m.toLowerCase.endsWith(\"manifest.mf\") => MergeStrategy.discard\n      case m if m.toLowerCase.endsWith(\"io.netty.versions.properties\") => MergeStrategy.concat\n      case m if m.toLowerCase.endsWith(\"services\") => MergeStrategy.filterDistinctLines\n      case m if m.toLowerCase.endsWith(\"git.properties\") => MergeStrategy.discard\n      case m if m.toLowerCase.endsWith(\"reference.conf\") => MergeStrategy.filterDistinctLines\n        // Travis is giving a weird error on netty I don't see locally :(\n      case PathList(\"META-INF\", \"io.netty.versions.properties\") => MergeStrategy.first\n      case PathList(\"META-INF\", \"native\", xs @ _*) => MergeStrategy.deduplicate\n      case PathList(\"META-INF\", \"services\", xs @ _ *) => MergeStrategy.filterDistinctLines\n      case PathList(\"META-INF\", xs @ _ *) => MergeStrategy.discard\n      case PathList(\"javax\", \"servlet\", xs @ _*) => MergeStrategy.last\n      case PathList(\"org\", \"apache\", xs @ _*) => MergeStrategy.last\n      case PathList(\"org\", \"jboss\", xs @ _*) => MergeStrategy.last\n        // Start http://queirozf.com/entries/creating-scala-fat-jars-for-spark-on-sbt-with-sbt-assembly-plugin\n      case PathList(\"org\",\"aopalliance\", xs @ _*) => MergeStrategy.last\n      case PathList(\"javax\", \"inject\", xs @ _*) => MergeStrategy.last\n      case PathList(\"javax\", \"servlet\", xs @ _*) => MergeStrategy.last\n      case PathList(\"javax\", \"activation\", xs @ _*) => MergeStrategy.last\n      case PathList(\"org\", \"apache\", xs @ _*) 
=> MergeStrategy.last\n      case PathList(\"com\", \"google\", xs @ _*) => MergeStrategy.last\n      case PathList(\"com\", \"esotericsoftware\", xs @ _*) => MergeStrategy.last\n      case PathList(\"com\", \"codahale\", xs @ _*) => MergeStrategy.last\n      case PathList(\"com\", \"yammer\", xs @ _*) => MergeStrategy.last\n        // End http://queirozf.com/entries/creating-scala-fat-jars-for-spark-on-sbt-with-sbt-assembly-plugin\n      case PathList(\"com\", \"sun\", \"activation\", \"registries\", xs @ _*) => MergeStrategy.last\n      case PathList(\"com\", \"sun\", \"activation\", \"viewers\", xs @ _*) => MergeStrategy.last\n      case \"about.html\"  => MergeStrategy.rename\n      case \"reference.conf\" => MergeStrategy.concat\n      case m =>\n        val oldStrategy = (assemblyMergeStrategy in assembly).value\n        oldStrategy(m)\n    },\n    assemblyShadeRules in assembly := Seq(\n      ShadeRule.rename(\"com.google.protobuf.**\" -> \"shadeproto.@1\").inAll\n    ),\n\n\n    // publish settings\n    publishTo := {\n      val nexus = \"https://oss.sonatype.org/\"\n      if (isSnapshot.value)\n        Some(\"snapshots\" at nexus + \"content/repositories/snapshots\")\n      else\n        Some(\"releases\"  at nexus + \"service/local/staging/deploy/maven2\")\n    }\n  )\n"
  },
  {
    "path": "data-extraction/spark-hello-world/lr_demo/project/build.properties",
    "content": "sbt.version=1.2.8\n"
  },
  {
    "path": "data-extraction/spark-hello-world/lr_demo/project/plugins.sbt",
    "content": "addSbtPlugin(\"org.scalastyle\" %% \"scalastyle-sbt-plugin\" % \"1.0.0\")\n\nresolvers += \"sonatype-releases\" at \"https://oss.sonatype.org/content/repositories/releases/\"\n\nresolvers += \"Spark Package Main Repo\" at \"https://dl.bintray.com/spark-packages/maven\"\n\naddSbtPlugin(\"org.scoverage\" % \"sbt-scoverage\" % \"1.5.1\")\n\naddSbtPlugin(\"com.eed3si9n\" % \"sbt-assembly\" % \"0.14.5\")\n"
  },
  {
    "path": "data-extraction/spark-hello-world/lr_demo/sample.csv",
    "content": "e1,e2,label\n1.0, 0.0, 1.0\n2.0, 2.1, 2.0\n"
  },
  {
    "path": "data-extraction/spark-hello-world/lr_demo/sbt/sbt",
    "content": "#!/bin/bash\n\n#\n# Licensed to the Apache Software Foundation (ASF) under one or more\n# contributor license agreements.  See the NOTICE file distributed with\n# this work for additional information regarding copyright ownership.\n# The ASF licenses this file to You under the Apache License, Version 2.0\n# (the \"License\"); you may not use this file except in compliance with\n# the License.  You may obtain a copy of the License at\n#\n#    http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS,\n# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n# See the License for the specific language governing permissions and\n# limitations under the License.\n#\n\n# This script launches sbt for this project. If present it uses the system \n# version of sbt. If there is no system version of sbt it attempts to download\n# sbt locally.\nSBT_VERSION=0.13.15\nURL1=http://typesafe.artifactoryonline.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar\nURL2=http://repo.typesafe.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar\nJAR=sbt/sbt-launch-${SBT_VERSION}.jar\n\n# Download sbt launch jar if it hasn't been downloaded yet\nif [ ! -f ${JAR} ]; then\n  # Download\n  printf \"Attempting to fetch sbt\\n\"\n  set -x\n  JAR_DL=${JAR}.part\n  if hash wget 2>/dev/null; then\n    (wget --progress=bar ${URL1} -O ${JAR_DL} || wget --progress=bar ${URL2} -O ${JAR_DL}) && mv ${JAR_DL} ${JAR}\n  elif hash axel 2>/dev/null; then\n    (axel  ${URL1} -o ${JAR_DL} || axel  ${URL2} -o ${JAR_DL}) && mv ${JAR_DL} ${JAR}\n  else\n    printf \"You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\\n\"\n    exit -1\n  fi\nfi\nif [ ! -f ${JAR} ]; then\n  # We failed to download\n  printf \"Our attempt to download sbt locally to ${JAR} failed. Please install sbt manually from http://www.scala-sbt.org/\\n\"\n  exit -1\nfi\nprintf \"Launching sbt from ${JAR}\\n\"\njava \\\n  -Xmx1200m -XX:MaxPermSize=350m -XX:ReservedCodeCacheSize=256m \\\n  -jar ${JAR} \\\n  \"$@\"\n"
  },
  {
    "path": "data-extraction/spark-hello-world/lr_demo/src/main/scala/com/introtomlwithkubeflow/spark/demo/lr/TrainingApp.scala",
    "content": "package com.introtomlwithkubeflow.spark.demo.lr\n\nimport org.apache.spark.{SparkConf, SparkContext}\n\n\n/**\n  * Use this when submitting the app to a cluster with spark-submit\n  * */\nobject TrainingApp extends App{\n  val (inputFile, outputFile) = (args(0), args(1))\n\n  // spark-submit command should supply all necessary config elements\n  Runner.run(new SparkConf(), inputFile, outputFile)\n}\n\nobject Runner {\n  def run(conf: SparkConf, inputFile: String, outputFile: String): Unit = {\n    val sc = new SparkContext(conf)\n    val trainer = new TrainingPipeline(sc)\n    trainer.train(inputFile, outputFile)\n  }\n}\n"
  },
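  {
    "path": "data-extraction/spark-hello-world/lr_demo/run_local.sh",
    "content": "#!/bin/bash\n# A minimal sketch of running TrainingApp locally instead of through the operator;\n# this helper file and the /tmp paths are assumptions. The jar name matches what\n# sbt assembly produces and what setup.sh uploads, and a local Spark 2.3.x\n# spark-submit is assumed to be on the PATH. TrainingApp takes two args: the\n# input CSV location and the output model file.\nset -ex\nsbt assembly\n# TrainingPipeline reads CSVs from a path, so stage the sample data first\nmkdir -p /tmp/lr_input\ncp sample.csv /tmp/lr_input/\nspark-submit \\\n  --class com.introtomlwithkubeflow.spark.demo.lr.TrainingApp \\\n  --master 'local[*]' \\\n  target/scala-2.11/basic.lr-assembly-0.0.1.jar \\\n  /tmp/lr_input /tmp/lr_model.zip\n"
  },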
  {
    "path": "data-extraction/spark-hello-world/lr_demo/src/main/scala/com/introtomlwithkubeflow/spark/demo/lr/TrainingPipeline.scala",
    "content": "package com.introtomlwithkubeflow.spark.demo.lr\n\nimport java.nio.file.{Files, Paths}\n\n\nimport ml.combust.bundle.BundleFile\nimport ml.combust.mleap.spark.SparkSupport._\nimport org.apache.hadoop.fs.{FileSystem, Path}\nimport org.apache.spark.ml.bundle.SparkBundleContext // Actually an mleap import\nimport org.apache.spark.{SparkConf, SparkContext}\nimport org.apache.spark.sql._\nimport org.apache.spark.sql.functions._\nimport org.apache.spark.sql.catalyst.ScalaReflection\nimport org.apache.spark.ml.Transformer\nimport org.apache.spark.ml.{Pipeline, PipelineModel}\nimport org.apache.spark.ml.feature._\nimport org.apache.spark.ml.regression._\nimport resource._\n\n\nclass TrainingPipeline(sc: SparkContext) {\n  val session = SparkSession.builder().getOrCreate()\n  import session.implicits._\n\n  def train(input: String, outputFile: String) = {\n    val trainingData = session.read.format(\"csv\")\n      .option(\"inferSchema\", \"true\").option(\"header\", \"true\").load(input)\n    val vectorizer = new VectorAssembler().setInputCols(Array(\"e1\", \"e2\")).setOutputCol(\"features\")\n    val lr = new GeneralizedLinearRegression()\n      .setFamily(\"gaussian\")\n      .setLink(\"identity\")\n      .setMaxIter(10)\n      .setRegParam(0.3)\n    val pipeline = new Pipeline().setStages(Array(\n      vectorizer,\n      lr))\n    val fit = pipeline.fit(trainingData)\n    // Serialize the fit pipeline\n    val resultData = fit.transform(trainingData)\n    val localFile = \"/tmp/mleap.zip\"\n    val localOutput = s\"jar:file:${localFile}\"\n    val sbc = SparkBundleContext().withDataset(resultData)\n    for(bf <- managed(BundleFile(localOutput))) {\n      fit.writeBundle.save(bf)(sbc).get\n    }\n    // We only have one file so its k\n    val modelBinary = Files.readAllBytes(Paths.get(localFile))\n    val fs = FileSystem.get(sc.hadoopConfiguration)\n    val out = fs.create(new Path(outputFile))\n    out.write(modelBinary);\n    out.close();\n  }\n}\n"
  },
  {
    "path": "data-extraction/spark-hello-world/lr_demo/src/test/scala/com/introtomlwithkubeflow/spark/demo/lr/TrainingPipelineTest.scala",
    "content": "package com.introtomlwithkubeflow.spark.demo.lr\n\n/**\n * A simple test for the training pipeline\n */\n\nimport com.holdenkarau.spark.testing.{SharedSparkContext, Utils}\n\nimport org.apache.spark.sql._\n\nimport org.scalatest.FunSuite\n\nimport java.io.File\n\ncase class MyData(e1: Double, e2: Double, label: Double)\n\nclass TrainingPipelineTest extends FunSuite with SharedSparkContext {\n  test(\"smok test\"){\n    val session = SparkSession.builder().getOrCreate()\n    import session.implicits._\n\n    val tempDir = Utils.createTempDir()\n\n    val sampleDataRDD = sc.parallelize(Seq(\n      MyData(1.0, 0.0, 1.0),\n      MyData(2.0, 2.1, 2.0)))\n    val sampleDataDS = session.createDataset(sampleDataRDD)\n    val inputDataLocation = tempDir + \"/input\"\n    val outputFile = tempDir + \"/output.zip\"\n    sampleDataDS.write.format(\"csv\").option(\"header\", \"true\").save(inputDataLocation)\n\n    val trainingPipeline = new TrainingPipeline(sc)\n    trainingPipeline.train(inputDataLocation, outputFile)\n    assert(new File(outputFile).exists())\n  }\n}\n"
  },
  {
    "path": "data-extraction/spark-hello-world/setup.sh",
    "content": "#!/bin/bash\nset -ex\n\nSPARK_DEMO_DIR=${SPARK_DEMO_DIR:=~/spark_demo_3}\nSPARK_DEMO_GCS=${SPARK_DEMO_GCS:=gs://boo-spark-kf-demo}\n\n# Set up kubeflow\nmkdir \"$SPARK_DEMO_DIR\"\npushd \"$SPARK_DEMO_DIR\"\npwd\n\nwget https://raw.githubusercontent.com/kubeflow/kubeflow/master/scripts/download.sh\nchmod a+x download.sh\nKUBEFLOW_VERSION=0.5.0\nexport KUBEFLOW_VERSION\n./download.sh\n\nPATH=\"$(pwd)/scripts\":$PATH\nkfctl.sh init mydemoapp --platform none\npushd mydemoapp\nsource env.sh\n#kfctl.sh generate platform\n#kfctl.sh apply platform\nkfctl.sh generate k8s\nkfctl.sh apply k8s\npushd ks_app\n# Set up the Spark operator\nks pkg install kubeflow/spark\nks generate spark-operator spark-operator --name=spark-operator\nks apply default -c spark-operator\n\n# Create a Spark job with the operator (Pi)\nlocal_jar_path=\"local:///opt/spark/examples/jars/spark-examples_2.11-2.3.1.jar\"\nks generate spark-job spark-pi --name=spark-operator \\\n   --applicationResource=\"$local_jar_path\" \\\n   --mainClass=org.apache.spark.examples.SparkPi\nks apply default -c spark-pi\n\n# Create a Spark job with the operator to train an LR model\n\npushd \"$SPARK_MNIST_DIR/lr_demo\"\nsbt assembly\ngsutil cp target/scala-2.11/basic.lr-assembly-0.0.1.jar \"$SPARK_DEMO_GCS/jars\"\ngsutil cp sample.csv \"$SPARK_DEMO_GCS/input/part0.csv\"\npopd\n\nks generate spark-job spark-lr --name=spark-operator \\\n   --applicationResource=\"$SPARK_DEMO_GCS/jars/basic.lr-assembly-0.0.1.jar\" \\\n   --mainClass=com.introtomlwithkubeflow.spark.demo.lr.TrainingApp\n   \"$SPARK_DEMO_GCS/input\" \"$SPARK_DEMO_GCS/output\"\nks apply default -c spark-lr\n\n\n# Create a Spark job with the operator for data prep on the GitHub data\n\npopd\n"
  },
  {
    "path": "data-extraction/spark-hello-world/spark-pi-min.yaml",
    "content": "apiVersion: \"sparkoperator.k8s.io/v1beta2\"\nkind: SparkApplication\nmetadata:\n  name: spark-pi\n  namespace: kubeflow\nspec:\n  type: Scala\n  mode: cluster\n  image: \"gcr.io/spark-operator/spark:v2.4.4\"\n  imagePullPolicy: Always\n  mainClass: org.apache.spark.examples.SparkPi\n  mainApplicationFile: \"local:///opt/spark/examples/jars/spark-examples_2.11-2.4.4.jar\"\n  sparkVersion: \"2.4.4\"\n  restartPolicy:\n    type: Never\n  volumes:\n    - name: \"test-volume\"\n      hostPath:\n        path: \"/tmp\"\n        type: Directory\n  driver:\n    cores: 1\n    coreLimit: \"1200m\"\n    memory: \"512m\"\n    labels:\n      version: 2.4.4\n    volumeMounts:\n      - name: \"test-volume\"\n        mountPath: \"/tmp\"\n  executor:\n    cores: 1\n    instances: 1\n    memory: \"512m\"\n    labels:\n      version: 2.4.4\n    volumeMounts:\n      - name: \"test-volume\"\n        mountPath: \"/tmp\"\n"
  },
  {
    "path": "data-extraction/spark-hello-world/spark-pi.yaml",
    "content": "apiVersion: \"sparkoperator.k8s.io/v1beta2\"\nkind: SparkApplication\nmetadata:\n  name: spark-pi\n  namespace: kubeflow\nspec:\n  type: Scala\n  mode: cluster\n  image: \"gcr.io/spark-operator/spark:v2.4.4\"\n  imagePullPolicy: Always\n  mainClass: org.apache.spark.examples.SparkPi\n  mainApplicationFile: \"local:///opt/spark/examples/jars/spark-examples_2.11-2.4.4.jar\"\n  sparkVersion: \"2.4.4\"\n  restartPolicy:\n    type: Never\n  volumes:\n    - name: \"test-volume\"\n      hostPath:\n        path: \"/tmp\"\n        type: Directory\n  driver:\n    cores: 1\n    coreLimit: \"1200m\"\n    memory: \"512m\"\n    labels:\n      version: 2.4.4\n    serviceAccount: spark-operatoroperator-sa\n    volumeMounts:\n      - name: \"test-volume\"\n        mountPath: \"/tmp\"\n  executor:\n    cores: 1\n    instances: 1\n    memory: \"512m\"\n    labels:\n      version: 2.4.4\n    volumeMounts:\n      - name: \"test-volume\"\n        mountPath: \"/tmp\"\n"
  },
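  {
    "path": "data-extraction/spark-hello-world/submit-spark-pi.sh",
    "content": "#!/bin/bash\n# A minimal sketch of submitting the SparkApplication above directly with kubectl\n# (rather than through ksonnet as in setup.sh); this helper file name is an\n# assumption, and it assumes the Spark operator from setup.sh is already running\n# so the spark-operatoroperator-sa service account referenced by spark-pi.yaml exists.\nset -ex\nkubectl apply -f spark-pi.yaml\n# The operator tracks the job as a SparkApplication custom resource\nkubectl get sparkapplications -n kubeflow spark-pi -o yaml\n# Driver logs once the job is running (the operator names the driver pod <app>-driver)\nkubectl logs -n kubeflow spark-pi-driver\n"
  },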
  {
    "path": "data-extraction/stack_overflow_questions.bsql",
    "content": "SELECT "
  },
  {
    "path": "data-extraction/tfx/TFDV.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"We start by downloading a specific release of the components because running from master is not a good way to buid \\\"repetable\\\" systems\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!wget https://github.com/kubeflow/pipelines/archive/0.2.5.tar.gz\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!tar -xvf 0.2.5.tar.gz\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import kfp\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": []\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"#tag::loadGCSDLComponent[]\\n\",\n    \"gcs_download_component = kfp.components.load_component_from_file(\\n\",\n    \"    \\\"pipelines-0.2.5/components/google-cloud/storage/download/component.yaml\\\")\\n\",\n    \"#end::loadGCSDLComponent[]\\n\",\n    \"#tag::loadTFDVAndFriendsComponents[]\\n\",\n    \"tfx_csv_gen = kfp.components.load_component_from_file(\\n\",\n    \"    \\\"pipelines-0.2.5/components/tfx/ExampleGen/CsvExampleGen/component.yaml\\\")\\n\",\n    \"tfx_statistic_gen = kfp.components.load_component_from_file(\\n\",\n    \"    \\\"pipelines-0.2.5/components/tfx/StatisticsGen/component.yaml\\\")\\n\",\n    \"tfx_schema_gen = kfp.components.load_component_from_file(\\n\",\n    \"    \\\"pipelines-0.2.5/components/tfx/SchemaGen/component.yaml\\\")\\n\",\n    \"tfx_example_validator = kfp.components.load_component_from_file(\\n\",\n    \"    \\\"pipelines-0.2.5/components/tfx/ExampleValidator/component.yaml\\\")\\n\",\n    \"#end::loadTFDVAndFriendsComponents[]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"@kfp.dsl.pipeline(\\n\",\n    \"  name='DL',\\n\",\n    \"  description='Sample DL pipeline'\\n\",\n    \")\\n\",\n    \"def pipeline_with_dl():\\n\",\n    \"    #tag::dlOp[]\\n\",\n    \"    dl_op = gcs_download_component(\\n\",\n    \"        gcs_path=\\\"gs://ml-pipeline-playground/tensorflow-tfx-repo/tfx/components/testdata/external/csv\\\") # Your path goes here\\n\",\n    \"    #end::dlOp[]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"kfp.compiler.Compiler().compile(pipeline_with_dl, 'dl_pipeline.zip')\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"client = kfp.Client()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"my_experiment = client.create_experiment(name='dl')\\n\",\n    \"my_run = client.run_pipeline(my_experiment.id, 'dl', \\n\",\n    \"  'dl_pipeline.zip')\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"#tag::standaloneTFDVPipeline[]\\n\",\n    \"@kfp.dsl.pipeline(\\n\",\n    \"  name='TFDV',\\n\",\n    \"  
description='TF DV Pipeline'\\n\",\n    \")\\n\",\n    \"def tfdv_pipeline():\\n\",\n    \"    # DL with wget, can use gcs instead as well\\n\",\n    \"    data_url = \\\"https://raw.githubusercontent.com/moorissa/medium/master/items-recommender/data/trx_data.csv\\\"\\n\",\n    \"    #tag::wget[]\\n\",\n    \"    fetch = kfp.dsl.ContainerOp(\\n\",\n    \"      name='download',\\n\",\n    \"      image='busybox',\\n\",\n    \"      command=['sh', '-c'],\\n\",\n    \"      arguments=[\\n\",\n    \"          'sleep 1;'\\n\",\n    \"          'mkdir -p /tmp/data;'\\n\",\n    \"          'wget '+ data_url +' -O /tmp/data/results.csv'],\\n\",\n    \"      file_outputs={'downloaded': '/tmp/data'})\\n\",\n    \"    # This expects a directory of inputs not just a single file\\n\",\n    \"    #end::wget[]\\n\",\n    \"    #tag::csv[]\\n\",\n    \"    records_example = tfx_csv_gen(input_base=fetch.output)\\n\",\n    \"    #end::csv[]\\n\",\n    \"    #tag::stats[]\\n\",\n    \"    stats = tfx_statistic_gen(input_data=records_example.output)\\n\",\n    \"    #end::stats[]\\n\",\n    \"    #tag::schema[]\\n\",\n    \"    schema_op = tfx_schema_gen(stats.output)\\n\",\n    \"    #end::schema[]\\n\",\n    \"    #tag::validate[]\\n\",\n    \"    tfx_example_validator(stats=stats.outputs['output'], schema=schema_op.outputs['output'])\\n\",\n    \"    #end::validate[]\\n\",\n    \"#end::standaloneTFDVPipeline[]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"kfp.compiler.Compiler().compile(tfdv_pipeline, 'tfdv_pipeline.zip')\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"my_experiment = client.create_experiment(name='tfdv_pipeline')\\n\",\n    \"my_run = client.run_pipeline(my_experiment.id, 'tfdv', \\n\",\n    \"  'tfdv_pipeline.zip')\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"!pip3 install tfx tensorflow-data-validation\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"#tag::importTFDV[]\\n\",\n    \"import tensorflow_data_validation as tfdv\\n\",\n    \"#end::importTFDV[]\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"You can download your schema by looking at the inputs/outputs in your pipeline run for the schema gen stage\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"#tag::displaySchema[]\\n\",\n    \"schema = tfdv.load_schema_text(\\\"schema_info_2\\\")\\n\",\n    \"tfdv.display_schema(schema)\\n\",\n    \"#end::displaySchema[]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"#tag::loadTFT[]\\n\",\n    \"tfx_transform = kfp.components.load_component_from_file(\\n\",\n    \"    \\\"pipelines-0.2.5/components/tfx/Transform/component.yaml\\\")\\n\",\n    \"#end::loadTFT[]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"module_file=\\\"gcs://\\\"\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   
\"source\": [\n    \"@kfp.dsl.pipeline(\\n\",\n    \"  name='TFX',\\n\",\n    \"  description='TFX pipeline'\\n\",\n    \")\\n\",\n    \"def tfx_pipeline():\\n\",\n    \"    # DL with wget, can use gcs instead as well\\n\",\n    \"    fetch = kfp.dsl.ContainerOp(\\n\",\n    \"      name='download',\\n\",\n    \"      image='busybox',\\n\",\n    \"      command=['sh', '-c'],\\n\",\n    \"      arguments=[\\n\",\n    \"          'sleep 1;'\\n\",\n    \"          'mkdir -p /tmp/data;'\\n\",\n    \"          'wget https://raw.githubusercontent.com/moorissa/medium/master/items-recommender/data/trx_data.csv -O /tmp/data/results.csv'],\\n\",\n    \"      file_outputs={'downloaded': '/tmp/data'})\\n\",\n    \"    records_example = tfx_csv_gen(input_base=fetch.output)\\n\",\n    \"    stats = tfx_statistic_gen(input_data=records_example.output)\\n\",\n    \"    schema_op = tfx_schema_gen(stats.output)\\n\",\n    \"    tfx_example_validator(stats=stats.outputs['output'], schema=schema_op.outputs['output'])\\n\",\n    \"    #tag::tft[]\\n\",\n    \"    transformed_output = tfx_transform(\\n\",\n    \"        input_data=records_example.output,\\n\",\n    \"        schema=schema_op.outputs['output'],\\n\",\n    \"        module_file=module_file) # Path to your TFT code on GCS/S3\\n\",\n    \"    #end::tft[]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"kfp.compiler.Compiler().compile(tfx_pipeline, 'tfx_pipeline.zip')\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"my_experiment = client.create_experiment(name='tfx_pipeline')\\n\",\n    \"my_run = client.run_pipeline(my_experiment.id, 'tfx', \\n\",\n    \"  'tfx_pipeline.zip')\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": []\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"Python 3\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.9\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 4\n}\n"
  },
  {
    "path": "data-extraction/tfx/TFDV.py",
    "content": "#!/usr/bin/env python\n# coding: utf-8\n\n# We start by downloading a specific release of the components because running from master is not a good way to buid \"repetable\" systems\n\n# In[ ]:\n\nget_ipython().system(\n    'wget https://github.com/kubeflow/pipelines/archive/0.2.5.tar.gz')\n\n# In[ ]:\n\nget_ipython().system('tar -xvf 0.2.5.tar.gz')\n\n# In[ ]:\n\nimport kfp\n\n# In[ ]:\n\n# In[ ]:\n\n#tag::loadGCSDLComponent[]\ngcs_download_component = kfp.components.load_component_from_file(\n    \"pipelines-0.2.5/components/google-cloud/storage/download/component.yaml\")\n#end::loadGCSDLComponent[]\n#tag::loadTFDVAndFriendsComponents[]\ntfx_csv_gen = kfp.components.load_component_from_file(\n    \"pipelines-0.2.5/components/tfx/ExampleGen/CsvExampleGen/component.yaml\")\ntfx_statistic_gen = kfp.components.load_component_from_file(\n    \"pipelines-0.2.5/components/tfx/StatisticsGen/component.yaml\")\ntfx_schema_gen = kfp.components.load_component_from_file(\n    \"pipelines-0.2.5/components/tfx/SchemaGen/component.yaml\")\ntfx_example_validator = kfp.components.load_component_from_file(\n    \"pipelines-0.2.5/components/tfx/ExampleValidator/component.yaml\")\n#end::loadTFDVAndFriendsComponents[]\n\n# In[ ]:\n\n\n@kfp.dsl.pipeline(name='DL', description='Sample DL pipeline')\ndef pipeline_with_dl():\n    #tag::dlOp[]\n    dl_op = gcs_download_component(\n        gcs_path=\n        \"gs://ml-pipeline-playground/tensorflow-tfx-repo/tfx/components/testdata/external/csv\"\n    )  # Your path goes here\n    #end::dlOp[]\n\n\n# In[ ]:\n\nkfp.compiler.Compiler().compile(pipeline_with_dl, 'dl_pipeline.zip')\n\n# In[ ]:\n\nclient = kfp.Client()\n\n# In[ ]:\n\nmy_experiment = client.create_experiment(name='dl')\nmy_run = client.run_pipeline(my_experiment.id, 'dl', 'dl_pipeline.zip')\n\n# In[ ]:\n\n\n#tag::standaloneTFDVPipeline[]\n@kfp.dsl.pipeline(name='TFDV', description='TF DV Pipeline')\ndef tfdv_pipeline():\n    # DL with wget, can use gcs instead as well\n    data_url = \"https://raw.githubusercontent.com/moorissa/medium/master/items-recommender/data/trx_data.csv\"\n    #tag::wget[]\n    fetch = kfp.dsl.ContainerOp(name='download',\n                                image='busybox',\n                                command=['sh', '-c'],\n                                arguments=[\n                                    'sleep 1;'\n                                    'mkdir -p /tmp/data;'\n                                    'wget ' + data_url +\n                                    ' -O /tmp/data/results.csv'\n                                ],\n                                file_outputs={'downloaded': '/tmp/data'})\n    # This expects a directory of inputs not just a single file\n    #end::wget[]\n    #tag::csv[]\n    records_example = tfx_csv_gen(input_base=fetch.output)\n    #end::csv[]\n    #tag::stats[]\n    stats = tfx_statistic_gen(input_data=records_example.output)\n    #end::stats[]\n    #tag::schema[]\n    schema_op = tfx_schema_gen(stats.output)\n    #end::schema[]\n    #tag::validate[]\n    tfx_example_validator(stats=stats.outputs['output'],\n                          schema=schema_op.outputs['output'])\n    #end::validate[]\n\n\n#end::standaloneTFDVPipeline[]\n\n# In[ ]:\n\nkfp.compiler.Compiler().compile(tfdv_pipeline, 'tfdv_pipeline.zip')\n\n# In[ ]:\n\nmy_experiment = client.create_experiment(name='tfdv_pipeline')\nmy_run = client.run_pipeline(my_experiment.id, 'tfdv', 'tfdv_pipeline.zip')\n\n# In[ ]:\n\nget_ipython().system('pip3 install tfx 
tensorflow-data-validation')\n\n# In[ ]:\n\n#tag::importTFDV[]\nimport tensorflow_data_validation as tfdv\n#end::importTFDV[]\n\n# You can download your schema by looking at the inputs/outputs in your pipeline run for the schema gen stage\n\n# In[ ]:\n\n#tag::displaySchema{}\nschema = tfdv.load_schema_text(\"schema_info_2\")\ntfdv.display_schema(schema)\n#end::displaySchema[]\n\n# In[ ]:\n\n#tag::loadTFT[]\ntfx_transform = kfp.components.load_component_from_file(\n    \"pipelines-0.2.5/components/tfx/Transform/component.yaml\")\n#end::loadTFT[]\n\n# In[ ]:\n\nmodule_file = \"gcs://\"\n\n# In[ ]:\n\n\n@kfp.dsl.pipeline(name='TFX', description='TFX pipeline')\ndef tfx_pipeline():\n    # DL with wget, can use gcs instead as well\n    fetch = kfp.dsl.ContainerOp(\n        name='download',\n        image='busybox',\n        command=['sh', '-c'],\n        arguments=[\n            'sleep 1;'\n            'mkdir -p /tmp/data;'\n            'wget https://raw.githubusercontent.com/moorissa/medium/master/items-recommender/data/trx_data.csv -O /tmp/data/results.csv'\n        ],\n        file_outputs={'downloaded': '/tmp/data'})\n    records_example = tfx_csv_gen(input_base=fetch.output)\n    stats = tfx_statistic_gen(input_data=records_example.output)\n    schema_op = tfx_schema_gen(stats.output)\n    tfx_example_validator(stats=stats.outputs['output'],\n                          schema=schema_op.outputs['output'])\n    #tag::tft[]\n    transformed_output = tfx_transform(\n        input_data=records_example.output,\n        schema=schema_op.outputs['output'],\n        module_file=module_file)  # Path to your TFT code on GCS/S3\n    #end::tft[]\n\n\n# In[ ]:\n\nkfp.compiler.Compiler().compile(tfx_pipeline, 'tfx_pipeline.zip')\n\n# In[ ]:\n\nmy_experiment = client.create_experiment(name='tfx_pipeline')\nmy_run = client.run_pipeline(my_experiment.id, 'tfx', 'tfx_pipeline.zip')\n\n# In[ ]:\n"
  },
  {
    "path": "data-extraction/tfx/install_tfx.sh",
    "content": "#!/bin/bash\n#tag::install[]\npip3 install tfx tensorflow-data-validation\n#end::install[]\n"
  },
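  {
    "path": "data-extraction/tfx/load_component_from_url_ex.py",
    "content": "# A minimal sketch (not used by TFDV.py) of an alternative to downloading the\n# release tarball: kfp can load a pinned component definition straight from a\n# URL. The raw.githubusercontent.com URL below mirrors the 0.2.5 release used\n# in TFDV.py; the variable names here are illustrative.\nimport kfp\n\ncomponent_url = (\n    \"https://raw.githubusercontent.com/kubeflow/pipelines/0.2.5/\"\n    \"components/google-cloud/storage/download/component.yaml\")\ngcs_download_component = kfp.components.load_component_from_url(component_url)\n"
  },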
  {
    "path": "data-extraction/tfx/requirements.txt",
    "content": "tfx\n"
  },
  {
    "path": "data-extraction/tfx/run_on_dataflow_ex.py",
    "content": "#tag::example[]\ngenerated_output_uri = root_output_uri + kfp.dsl.EXECUTION_ID_PLACEHOLDER\nbeam_pipeline_args = [\n    '--runner=DataflowRunner',\n    '--project=' + project_id,\n    '--temp_location=' + root_output_uri + '/tmp'),\n    '--region=' + gcp_region,\n    '--disk_size_gb=50', # Adjust as needed\n]\n\nrecords_example = tfx_csv_gen(\n    input_uri=fetch.output, # Must be on distributed storage\n    beam_pipeline_args=beam_pipeline_args,\n    output_examples_uri=generated_output_uri)\n#end::example[]\n"
  },
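  {
    "path": "data-extraction/tfx/run_on_dataflow_ex_context.py",
    "content": "# A minimal sketch of the context the excerpt in run_on_dataflow_ex.py\n# assumes. Every value below is a hypothetical placeholder, not a working\n# configuration -- substitute your own project, region, and bucket.\nproject_id = \"my-gcp-project\"  # hypothetical GCP project ID\ngcp_region = \"us-central1\"  # hypothetical region for the Dataflow job\n# Dataflow workers cannot see local files, so outputs must live on\n# distributed storage such as GCS.\nroot_output_uri = \"gs://my-bucket/tfx-output/\"  # hypothetical bucket path\n"
  },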
  {
    "path": "dev-setup/install-argo.sh",
    "content": "#!/bin/bash\n# Download the binary\ncurl -sLO https://github.com/argoproj/argo/releases/download/v2.8.1/argo-linux-amd64\n\n# Make binary executable\nchmod +x argo-linux-amd64\n\n# Move binary to path\nmv ./argo-linux-amd64 ~/bin/argo\n"
  },
  {
    "path": "dev-setup/install-kf-pipeline-sdk.sh",
    "content": "#!/bin/bash\n# Put as inside a venv\npushd /tmp\n#tag::venv[]\nvirtualenv kfvenv --python python3\nsource kfvenv/bin/activate\n#end::venv[]\npopd\n#tag::install[]\nURL=https://storage.googleapis.com/ml-pipeline/release/latest/kfp.tar.gz\npip install \"${URL}\" --upgrade\n#end::install[]\nmkdir -p ~/repos\npushd ~/repos\nif [[ ! -d pipelines ]]; then\n#tag::checkout_sdk[]\n  git clone --single-branch --branch 0.3.0 https://github.com/kubeflow/pipelines.git\n#end::checkout_sdk[]\nfi\npopd\n"
  },
  {
    "path": "dev-setup/install-kf.sh",
    "content": "#!/bin/bash\nset -ex\n#tag::install[]\nPLATFORM=$(uname) # Either Linux or Darwin\nexport PLATFORM\nmkdir -p ~/bin\n#Configuration\nexport KUBEFLOW_TAG=1.0.1\n# ^ You can also point this to a different version if you want to try\nKUBEFLOW_BASE=\"https://api.github.com/repos/kubeflow/kfctl/releases\"\n# Or just go to https://github.com/kubeflow/kfctl/releases\nKFCTL_URL=$(curl -s ${KUBEFLOW_BASE} |\\\n\t      grep http |\\\n\t      grep \"${KUBEFLOW_TAG}\" |\\\n\t      grep -i \"${PLATFORM}\" |\\\n\t      cut -d : -f 2,3 |\\\n\t      tr -d '\\\" ' )\nwget \"${KFCTL_URL}\"\nKFCTL_FILE=${KFCTL_URL##*/}\ntar -xvf \"${KFCTL_FILE}\"\nmv ./kfctl ~/bin/\nrm \"${KFCTL_FILE}\"\n# Recommended add the scripts directory to your path\nexport PATH=$PATH:~/bin\n#end::install[]\n"
  },
  {
    "path": "dev-setup/install-kubectl.sh",
    "content": "#!/bin/bash\n#tag::ubuntu-kubectl[]\nsudo snap install kubectl --classic\n#end::ubuntu-kubectl[]\n#tag::debian-kubectl[]\nsudo apt-get update && sudo apt-get install -y apt-transport-https\ncurl -s https://packages.cloud.google.com/apt/doc/apt-key.gpg |\\\n  sudo apt-key add -\necho \"deb https://apt.kubernetes.io/ kubernetes-xenial main\" |\\\n  sudo tee -a /etc/apt/sources.list.d/kubernetes.list\nsudo apt-get update\nsudo apt-get install -y kubectl\n#end::debian-kubectl[]\n#tag::redhat-kubectl[]\ncat <<EOF > /etc/yum.repos.d/kubernetes.repo\n[kubernetes]\nname=Kubernetes\nbaseurl=https://packages.cloud.google.com/yum/repos/kubernetes-el7-x86_64\nenabled=1\ngpgcheck=1\nrepo_gpgcheck=0\ngpgkey=https://packages.cloud.google.com/yum/doc/rpm-package-key.gpg\nEOF\nyum install -y kubectl\n#end::redhat-kubectl[]\n#tag::osx-kubectl[]\nbrew install kubernetes-cli\n#end::osx-kubectl[]\n#tag::no-pkg-manager-kubectl[]\nkubectl_release_base=\"https://storage.googleapis.com/kubernetes-release\"\nstable_url=\"$kubectl_release_base/release/stable.txt\"\nKUBECTL_VERSION=$(curl -s \"$stable_url\")\nexport KUBECTL_VERSION\ncurl -LO \"$kubectl_release_base/$KUBECTL_VERSION/bin/$PLATFORM/amd64/kubectl\"\n# Now either move kubectl to /usr/bin or add it to your PATH\n#end::no-pkg-manager-kubectl[]\n"
  },
  {
    "path": "dev-setup/install-kustomize.sh",
    "content": "#!/bin/bash\n#tag::kustomize[]\nPLATFORM=$(uname) # Either Linux or Darwin\nexport PLATFORM\nmkdir -p ~/bin\nKUSTOMIZE_URL=$(curl -s \\\n  https://api.github.com/repos/kubernetes-sigs/kustomize/releases/latest |\\\n  grep browser_download |\\\n  grep -i \"${PLATFORM}\" |\\\n  cut -d '\"' -f 4)\nwget \"${KUSTOMIZE_URL}\"\nKUSTOMIZE_FILE=${KUSTOMIZE_URL##*/}\ntar -xvf \"${KUSTOMIZE_FILE}\"\nrm \"${KUSTOMIZE_FILE}\"\nmv kustomize ~/bin/kustomize\nchmod u+x ~/bin/kustomize\n# Add this + platform/version exports to your bashrc or move the ks bin into /usr/bin\nexport PATH=$PATH:\"~/bin\"\n#end::kustomize[]\n"
  },
  {
    "path": "dev-setup/install-microk8s.sh",
    "content": "#!/bin/bash\n#tag::installmicrok8s[]\nsudo snap install microk8s --classic\n#end::installmicrok8s[]\n#tag::setupmicrok8s[]\n# Alias the microk8s versions of kubectl and docker so kubeflow uses them\n# You will want to add this to your bashrc if you intend to use microk8s\n# generally.\nalias kubectl=\"microk8s.kubectl\"\nalias docker=\"microk8s.docker\"\n### Faking Docker registry, skip for production docker registry\nmicrok8s.enable registry\nexport DOCKER_HOST=\"unix:///var/snap/microk8s/current/docker.sock\"\nsudo ln -s /var/snap/microk8s/current/docker.sock /var/run/docker.sock\nsudo ln -s /var/snap/microk8s/common/var/lib/docker /var/lib/docker\n#end::setupmicrok8s[]\n#tag::bootstrapwithcanonicallabs[]\ngit clone https://github.com/canonical-labs/kubeflow-tools\npushd kubeflow-tools\nKUBEFLOW_VERSION=0.4.1 ./install-kubeflow.sh\n#end::bootstrapwithcanonicallabs[]\n#tag::unaliasmicrok8s[]\nunalias kubectl\nunalias docker\n#end::unaliasmicrok8s[]\n"
  },
  {
    "path": "dev-setup/jsonnet.sh",
    "content": "#!/bin/bash\nset -e\nset -x\n#tag::snap[]\nsudo snap install jsonnet\n#end::snap[]\n#tag::manual[]\nexport JSONNET_VERSION=0.12.1\nwget https://github.com/google/jsonnet/archive/v$JSONNET_VERSION.tar.gz\n# You will need to add this to your path if it is not already\ntar -xvf v$JSONNET_VERSION.tar.gz\ncd jsonnet-$JSONNET_VERSION\nmake\n# Or otherwise add to your path\nsudo cp jsonent /usr/bin/\n#end::manual[]\n"
  },
  {
    "path": "feature-prep/README.md",
    "content": "Feature preparation is the task of converting the data into features\nsuitable for our machine algorithms. What makes a \"feature\" suitable\ndepends on the algorithm used.\n\nIn the `tft` directory we show feature prep using Tensorflow Transform. At the time of writing this only supports Python 2 and has limited support on non-GCP platforms, but it is rapidly improving in both areas."
  },
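  {
    "path": "feature-prep/feature_prep_sketch.py",
    "content": "# A tiny, library-free sketch of the point made in the README: what counts as\n# a \"suitable\" feature depends on the algorithm. Linear models usually want\n# one-hot vectors for categorical data, while tree-based models are typically\n# fine with plain integer codes. The category list is hypothetical.\ncategories = [\"spark\", \"tfx\", \"beam\"]\n\n\ndef one_hot(value):\n    # Suitable for linear models: one indicator column per category.\n    return [1.0 if value == c else 0.0 for c in categories]\n\n\ndef integer_code(value):\n    # Often sufficient for tree-based models: a single ordinal column.\n    return categories.index(value)\n\n\nprint(one_hot(\"tfx\"))  # [0.0, 1.0, 0.0]\nprint(integer_code(\"tfx\"))  # 1\n"
  },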
  {
    "path": "feature-prep/spark/SparkMailingListFeaturePrep.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Yes we need both these imports\\n\",\n    \"from pyspark.sql import SparkSession\\n\",\n    \"from pyspark.sql.functions import col, to_date, lit, isnull\\n\",\n    \"from pyspark.sql.types import *\\n\",\n    \"from pyspark.sql.types import StructField, StructType\\n\",\n    \"from pyspark.sql.catalog import UserDefinedFunction\\n\",\n    \"from pyspark.ml.feature import *\\n\",\n    \"from pyspark.ml.pipeline import Pipeline\\n\",\n    \"import os\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": []\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"fs_prefix = \\\"s3a://kf-book-examples/mailing-lists\\\" # Create with mc as in ch1\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"os.environ[\\\"PYSPARK_PYTHON\\\"] = \\\"python3.6\\\"\\n\",\n    \"# See https://medium.com/@szinck/setting-up-pyspark-jupyter-and-minio-on-kubeflow-kubernetes-aab98874794f\\n\",\n    \"session = (SparkSession.builder\\n\",\n    \"           .appName(\\\"processMailingListData\\\")\\n\",\n    \"           .config(\\\"spark.executor.instances\\\", \\\"8\\\")\\n\",\n    \"           .config(\\\"spark.driver.memoryOverhead\\\", \\\"0.25\\\")\\n\",\n    \"           .config(\\\"spark.executor.memory\\\", \\\"10g\\\")\\n\",\n    \"           .config(\\\"spark.dynamicAllocation.enabled\\\", \\\"false\\\")\\n\",\n    \"           .config(\\\"spark.ui.enabled\\\", \\\"true\\\")\\n\",\n    \"           .config(\\\"spark.kubernetes.container.image\\\",\\n\",\n    \"                   \\\"gcr.io/boos-demo-projects-are-rad/kubeflow/spark-worker/spark-py-36:v3.0.0-preview2-23\\\")\\n\",\n    \"           .config(\\\"spark.driver.bindAddress\\\", \\\"0.0.0.0\\\")\\n\",\n    \"           .config(\\\"spark.kubernetes.namespace\\\", \\\"kubeflow-programmerboo\\\")\\n\",\n    \"           .config(\\\"spark.master\\\", \\\"k8s://https://kubernetes.default\\\")\\n\",\n    \"           .config(\\\"spark.driver.host\\\", \\\"spark-driver.kubeflow-programmerboo.svc.cluster.local\\\")\\n\",\n    \"           .config(\\\"spark.kubernetes.executor.annotation.sidecar.istio.io/inject\\\", \\\"false\\\")\\n\",\n    \"           .config(\\\"spark.driver.port\\\", \\\"39235\\\")\\n\",\n    \"           .config(\\\"spark.blockManager.port\\\", \\\"39236\\\")\\n\",\n    \"            # If using minio - see https://github.com/minio/cookbook/blob/master/docs/apache-spark-with-minio.md\\n\",\n    \"           .config(\\\"spark.hadoop.fs.s3a.endpoint\\\", \\\"minio-service.kubeflow.svc.cluster.local:9000\\\")\\n\",\n    \"           .config(\\\"fs.s3a.connection.ssl.enabled\\\", \\\"false\\\")\\n\",\n    \"           .config(\\\"fs.s3a.path.style.access\\\", \\\"true\\\")\\n\",\n    \"           # You can also add an account using the minio command as described in chapter 1\\n\",\n    \"           .config(\\\"spark.hadoop.fs.s3a.access.key\\\", \\\"minio\\\")\\n\",\n    \"           .config(\\\"spark.hadoop.fs.s3a.secret.key\\\", \\\"minio123\\\")\\n\",\n    \"          ).getOrCreate()\\n\",\n    \"sc = session.sparkContext\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   
\"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"#Load data from the previous stage\\n\",\n    \"#tag::load_data[]\\n\",\n    \"initial_posts = session.read.format(\\\"parquet\\\").load(fs_prefix + \\\"/initial_posts\\\")\\n\",\n    \"ids_in_reply = session.read.format(\\\"parquet\\\").load(fs_prefix + \\\"/ids_in_reply\\\")\\n\",\n    \"#end::load_data[]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Load data from the previous stage while checking the schema\\n\",\n    \"#tag::load_with_schema[]\\n\",\n    \"ids_schema = StructType([\\n\",\n    \"    StructField(\\\"In-Reply-To\\\", StringType(), nullable=True),\\n\",\n    \"    StructField(\\\"message-id\\\", StringType(),nullable=True)])\\n\",\n    \"ids_in_reply = session.read.format(\\\"parquet\\\").schema(ids_schema).load(fs_prefix + \\\"/ids_in_reply\\\")\\n\",\n    \"#end::load_with_schema[]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Cache the data\\n\",\n    \"initial_posts = initial_posts.alias(\\\"initial_posts\\\").cache()\\n\",\n    \"ids_in_reply = ids_in_reply.alias(\\\"ids_in_reply\\\").cache()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# We can write random SQL -- although we need to wait for preview 3 cause it was taken out in preview1\\n\",\n    \"#tag::direct_sql[]\\n\",\n    \"#ids_in_reply.registerTempTable(\\\"cheese\\\")\\n\",\n    \"#no_text = session.sql(\\\"select * from cheese where body = '' AND subject = ''\\\")\\n\",\n    \"#end::direct_sql[]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Drop bad data\\n\",\n    \"#tag::drop_bad_fields[]\\n\",\n    \"initial_posts_count = initial_posts.count()\\n\",\n    \"initial_posts_cleaned = initial_posts.na.drop(how='any', subset=['body', 'from'])\\n\",\n    \"initial_posts_cleaned_count = initial_posts_cleaned.count()\\n\",\n    \"#end::drop_bad_fields[]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"initial_posts.show()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# Start with computing the labels\\n\",\n    \"# Find the initial posts where no one replied\\n\",\n    \"posts_with_replies = (initial_posts.join(\\n\",\n    \"        ids_in_reply,\\n\",\n    \"        col(\\\"ids_in_reply.In-Reply-To\\\") == col(\\\"initial_posts.Message-Id\\\"),\\n\",\n    \"        \\\"left_outer\\\")\\n\",\n    \"       .filter(col(\\\"ids_in_reply.In-Reply-To\\\").isNotNull())).cache()\\n\",\n    \"posts_with_replies.count()\\n\",\n    \"post_ids_with_replies = (posts_with_replies\\n\",\n    \"                            .select(col(\\\"initial_posts.Message-Id\\\").alias(\\\"id\\\"))\\n\",\n    \"                            .withColumn(\\\"has_reply\\\", lit(1.0))).alias(\\\"post_with_replies\\\")\\n\",\n    \"\\n\",\n    \"joined_posts = initial_posts.join(\\n\",\n    \"    post_ids_with_replies,\\n\",\n    \"    col(\\\"initial_posts.Message-Id\\\") == col(\\\"post_with_replies.id\\\"))\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 
null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"joined_posts.show()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"posts_with_labels = joined_posts.na.fill({\\\"has_reply\\\": 0.0}).cache()\\n\",\n    \"posts_with_labels.count()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"def extract_links(body):\\n\",\n    \"    import re\\n\",\n    \"    link_regex_str = r'(http(|s)://(.*?))([\\\\s\\\\n]|$)'\\n\",\n    \"    itr = re.finditer(link_regex_str, body, re.MULTILINE)\\n\",\n    \"    return list(map(lambda elem: elem.group(1), itr))\\n\",\n    \"\\n\",\n    \"def extract_domains(links):\\n\",\n    \"    from urllib.parse import urlparse\\n\",\n    \"    def extract_domain(link):\\n\",\n    \"        try:\\n\",\n    \"            nloc = urlparse(link).netloc\\n\",\n    \"            # We want to drop www and any extra spaces wtf nloc on the spaces.\\n\",\n    \"            regex_str = r'^(www\\\\.|)(.*?)\\\\s*$'\\n\",\n    \"            match = re.search(regex_str, nloc)\\n\",\n    \"            return match.group(2)\\n\",\n    \"        except:\\n\",\n    \"            return None\\n\",\n    \"    return list(map(extract_domain, links))\\n\",\n    \"\\n\",\n    \"def contains_python_stack_trace(body):\\n\",\n    \"    return \\\"Traceback (most recent call last)\\\" in body\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"def contains_probably_java_stack_trace(body):\\n\",\n    \"    # Look for something based on regex\\n\",\n    \"    # Tried https://stackoverflow.com/questions/20609134/regular-expression-optional-multiline-java-stacktrace - more msg looking\\n\",\n    \"    # Tried https://stackoverflow.com/questions/3814327/regular-expression-to-parse-a-log-file-and-find-stacktraces\\n\",\n    \"    # Yes the compile is per call, but it's cached so w/e\\n\",\n    \"    import re\\n\",\n    \"    stack_regex_str = r'^\\\\s*(.+Exception.*):\\\\n(.*\\\\n){0,3}?(\\\\s+at\\\\s+.*\\\\(.*\\\\))+'\\n\",\n    \"    match = re.search(stack_regex_str, body, re.MULTILINE)\\n\",\n    \"    return match is not None\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"def contains_exception_in_task(body):\\n\",\n    \"    # Look for a line along the lines of ERROR Executor: Exception in task \\n\",\n    \"    return \\\"ERROR Executor: Exception in task\\\" in body\\n\",\n    \"    \"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"extract_links_udf = UserDefinedFunction(\\n\",\n    \"    extract_links, ArrayType(StringType()), \\\"extract_links\\\")\\n\",\n    \"\\n\",\n    \"session.catalog._jsparkSession.udf().registerPython(\\n\",\n    \"    \\\"extract_links\\\",\\n\",\n    \"    extract_links_udf._judf)\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"extract_domains_udf = UserDefinedFunction(\\n\",\n    \"    extract_domains, ArrayType(StringType()), \\\"extract_domains\\\")\\n\",\n    \"\\n\",\n    \"session.catalog._jsparkSession.udf().registerPython(\\n\",\n    \"    \\\"extract_domains\\\",\\n\",\n    \"    extract_domains_udf._judf)\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"contains_python_stack_trace_udf = UserDefinedFunction(\\n\",\n    \"    contains_python_stack_trace, BooleanType(), \\\"contains_python_stack_trace\\\")\\n\",\n    \"\\n\",\n    
\"session.catalog._jsparkSession.udf().registerPython(\\n\",\n    \"    \\\"contains_python_stack_trace\\\",\\n\",\n    \"    contains_python_stack_trace_udf._judf)\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"contains_probably_java_stack_trace_udf = UserDefinedFunction(\\n\",\n    \"    contains_probably_java_stack_trace, BooleanType(), \\\"contains_probably_java_stack_trace\\\")\\n\",\n    \"\\n\",\n    \"session.catalog._jsparkSession.udf().registerPython(\\n\",\n    \"    \\\"contains_probably_java_stack_trace\\\",\\n\",\n    \"    contains_probably_java_stack_trace_udf._judf)\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"contains_exception_in_task_udf = UserDefinedFunction(\\n\",\n    \"    contains_exception_in_task, BooleanType(), \\\"contains_exception_in_task\\\")\\n\",\n    \"\\n\",\n    \"session.catalog._jsparkSession.udf().registerPython(\\n\",\n    \"    \\\"contains_exception_in_task\\\",\\n\",\n    \"    contains_exception_in_task_udf._judf)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"We could make this a transformer stage, but I'm lazy so we'll just use a UDF directly.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"annotated_spark_mailing_list_data = posts_with_labels.select(\\n\",\n    \"    \\\"*\\\",\\n\",\n    \"    extract_links_udf(posts_with_labels[\\\"body\\\"]).alias(\\\"links_in_email\\\"),\\n\",\n    \"    contains_python_stack_trace_udf(posts_with_labels.body).alias(\\\"contains_python_stack_trace\\\").cast(\\\"double\\\"),\\n\",\n    \"    contains_probably_java_stack_trace_udf(posts_with_labels.body).alias(\\\"contains_java_stack_trace\\\").cast(\\\"double\\\"),\\n\",\n    \"    contains_exception_in_task_udf(posts_with_labels.body).alias(\\\"contains_exception_in_task\\\").cast(\\\"double\\\"))\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"annotated_spark_mailing_list_data.cache()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"annotated_spark_mailing_list_data.show()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"further_annotated = annotated_spark_mailing_list_data.withColumn(\\n\",\n    \"    \\\"domain_links\\\",\\n\",\n    \"    extract_domains_udf(annotated_spark_mailing_list_data.links_in_email))\\n\",\n    \"# Long story, allow mixed UDF types\\n\",\n    \"further_annotated.cache()\\n\",\n    \"further_annotated.count()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"#tag::make_features[]\\n\",\n    \"tokenizer = Tokenizer(inputCol=\\\"body\\\", outputCol=\\\"body_tokens\\\")\\n\",\n    \"body_hashing = HashingTF(\\n\",\n    \"    inputCol=\\\"body_tokens\\\", outputCol=\\\"raw_body_features\\\",\\n\",\n    \"    numFeatures=10000)\\n\",\n    \"body_idf = IDF(\\n\",\n    \"    inputCol=\\\"raw_body_features\\\", outputCol=\\\"body_features\\\")\\n\",\n    \"body_word2Vec = Word2Vec(\\n\",\n    \"    vectorSize=5, minCount=0, numPartitions=10,\\n\",\n    \"    inputCol=\\\"body_tokens\\\", outputCol=\\\"body_vecs\\\")\\n\",\n    \"assembler = VectorAssembler(\\n\",\n    \"    inputCols=[\\n\",\n    \"        
\\\"body_features\\\", \\\"body_vecs\\\", \\\"contains_python_stack_trace\\\",\\n\",\n    \"        \\\"contains_java_stack_trace\\\", \\\"contains_exception_in_task\\\"],\\n\",\n    \"    outputCol=\\\"features\\\")\\n\",\n    \"#end::make_features[]\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"featureprep_pipeline = Pipeline(\\n\",\n    \"    stages=[tokenizer, body_hashing, body_idf, body_word2Vec, assembler])\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"featureprep_pipeline_transformer = featureprep_pipeline.fit(further_annotated)\\n\",\n    \"preped_data = featureprep_pipeline_transformer.transform(further_annotated)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"featureprep_pipeline_transformer.write().save(fs_prefix+\\\"/feature_prep-2\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"preped_data.write.format(\\\"parquet\\\").mode(\\\"overwrite\\\").save(fs_prefix+\\\"/prepared_data\\\")\"\n   ]\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"Python 3\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.9\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "feature-prep/spark/SparkMailingListFeaturePrep.py",
    "content": "#!/usr/bin/env python\n# coding: utf-8\n\n# In[ ]:\n\n# Yes we need both these imports\nfrom pyspark.sql import SparkSession\nfrom pyspark.sql.functions import col, to_date, lit, isnull\nfrom pyspark.sql.types import *\nfrom pyspark.sql.types import StructField, StructType\nfrom pyspark.sql.catalog import UserDefinedFunction\nfrom pyspark.ml.feature import *\nfrom pyspark.ml.pipeline import Pipeline\nimport os\n\n# In[ ]:\n\n# In[ ]:\n\nfs_prefix = \"s3a://kf-book-examples/mailing-lists\"  # Create with mc as in ch1\n\n# In[ ]:\n\nos.environ[\"PYSPARK_PYTHON\"] = \"python3.6\"\n# See https://medium.com/@szinck/setting-up-pyspark-jupyter-and-minio-on-kubeflow-kubernetes-aab98874794f\nsession = (\n    SparkSession.builder.appName(\"processMailingListData\").config(\n        \"spark.executor.instances\",\n        \"8\").config(\"spark.driver.memoryOverhead\",\n                    \"0.25\").config(\"spark.executor.memory\", \"10g\").config(\n                        \"spark.dynamicAllocation.enabled\", \"false\").\n    config(\"spark.ui.enabled\", \"true\").config(\n        \"spark.kubernetes.container.image\",\n        \"gcr.io/boos-demo-projects-are-rad/kubeflow/spark-worker/spark-py-36:v3.0.0-preview2-23\"\n    ).config(\"spark.driver.bindAddress\",\n             \"0.0.0.0\").config(\"spark.kubernetes.namespace\",\n                               \"kubeflow-programmerboo\").\n    config(\"spark.master\", \"k8s://https://kubernetes.default\").config(\n        \"spark.driver.host\",\n        \"spark-driver.kubeflow-programmerboo.svc.cluster.local\").config(\n            \"spark.kubernetes.executor.annotation.sidecar.istio.io/inject\",\n            \"false\").config(\"spark.driver.port\",\n                            \"39235\").config(\"spark.blockManager.port\", \"39236\")\n    # If using minio - see https://github.com/minio/cookbook/blob/master/docs/apache-spark-with-minio.md\n    .config(\"spark.hadoop.fs.s3a.endpoint\",\n            \"minio-service.kubeflow.svc.cluster.local:9000\").config(\n                \"fs.s3a.connection.ssl.enabled\",\n                \"false\").config(\"fs.s3a.path.style.access\", \"true\")\n    # You can also add an account using the minio command as described in chapter 1\n    .config(\"spark.hadoop.fs.s3a.access.key\",\n            \"minio\").config(\"spark.hadoop.fs.s3a.secret.key\",\n                            \"minio123\")).getOrCreate()\nsc = session.sparkContext\n\n# In[ ]:\n\n#Load data from the previous stage\n#tag::load_data[]\ninitial_posts = session.read.format(\"parquet\").load(fs_prefix +\n                                                    \"/initial_posts\")\nids_in_reply = session.read.format(\"parquet\").load(fs_prefix + \"/ids_in_reply\")\n#end::load_data[]\n\n# In[ ]:\n\n# Load data from the previous stage while checking the schema\n#tag::load_with_schema[]\nids_schema = StructType([\n    StructField(\"In-Reply-To\", StringType(), nullable=True),\n    StructField(\"message-id\", StringType(), nullable=True)\n])\nids_in_reply = session.read.format(\"parquet\").schema(ids_schema).load(\n    fs_prefix + \"/ids_in_reply\")\n#end::load_with_schema[]\n\n# In[ ]:\n\n# Cache the data\ninitial_posts = initial_posts.alias(\"initial_posts\").cache()\nids_in_reply = ids_in_reply.alias(\"ids_in_reply\").cache()\n\n# In[ ]:\n\n# We can write random SQL -- although we need to wait for preview 3 cause it was taken out in preview1\n#tag::direct_sql[]\n#ids_in_reply.registerTempTable(\"cheese\")\n#no_text = session.sql(\"select * from cheese where 
body = '' AND subject = ''\")\n#end::direct_sql[]\n\n# In[ ]:\n\n# Drop bad data\n#tag::drop_bad_fields[]\ninitial_posts_count = initial_posts.count()\ninitial_posts_cleaned = initial_posts.na.drop(how='any',\n                                              subset=['body', 'from'])\ninitial_posts_cleaned_count = initial_posts_cleaned.count()\n#end::drop_bad_fields[]\n\n# In[ ]:\n\ninitial_posts.show()\n\n# In[ ]:\n\n# Start with computing the labels\n# Find the initial posts where no one replied\nposts_with_replies = (initial_posts.join(\n    ids_in_reply,\n    col(\"ids_in_reply.In-Reply-To\") == col(\"initial_posts.Message-Id\"),\n    \"left_outer\").filter(col(\"ids_in_reply.In-Reply-To\").isNotNull())).cache()\nposts_with_replies.count()\npost_ids_with_replies = (posts_with_replies.select(\n    col(\"initial_posts.Message-Id\").alias(\"id\")).withColumn(\n        \"has_reply\", lit(1.0))).alias(\"post_with_replies\")\n\njoined_posts = initial_posts.join(\n    post_ids_with_replies,\n    col(\"initial_posts.Message-Id\") == col(\"post_with_replies.id\"))\n\n# In[ ]:\n\njoined_posts.show()\n\n# In[ ]:\n\nposts_with_labels = joined_posts.na.fill({\"has_reply\": 0.0}).cache()\nposts_with_labels.count()\n\n# In[ ]:\n\n\ndef extract_links(body):\n    import re\n    link_regex_str = r'(http(|s)://(.*?))([\\s\\n]|$)'\n    itr = re.finditer(link_regex_str, body, re.MULTILINE)\n    return list(map(lambda elem: elem.group(1), itr))\n\n\ndef extract_domains(links):\n    import re\n    from urllib.parse import urlparse\n\n    def extract_domain(link):\n        try:\n            nloc = urlparse(link).netloc\n            # Drop the leading www. and any stray whitespace in the netloc.\n            regex_str = r'^(www\\.|)(.*?)\\s*$'\n            match = re.search(regex_str, nloc)\n            return match.group(2)\n        except:\n            return None\n\n    return list(map(extract_domain, links))\n\n\ndef contains_python_stack_trace(body):\n    return \"Traceback (most recent call last)\" in body\n\n\ndef contains_probably_java_stack_trace(body):\n    # Look for something based on regex\n    # Tried https://stackoverflow.com/questions/20609134/regular-expression-optional-multiline-java-stacktrace - more msg looking\n    # Tried https://stackoverflow.com/questions/3814327/regular-expression-to-parse-a-log-file-and-find-stacktraces\n    # Yes the compile is per call, but it's cached so w/e\n    import re\n    stack_regex_str = r'^\\s*(.+Exception.*):\\n(.*\\n){0,3}?(\\s+at\\s+.*\\(.*\\))+'\n    match = re.search(stack_regex_str, body, re.MULTILINE)\n    return match is not None\n\n\ndef contains_exception_in_task(body):\n    # Look for a line along the lines of ERROR Executor: Exception in task\n    return \"ERROR Executor: Exception in task\" in body\n\n\n# In[ ]:\n\nextract_links_udf = UserDefinedFunction(extract_links, ArrayType(StringType()),\n                                        \"extract_links\")\n\nsession.catalog._jsparkSession.udf().registerPython(\"extract_links\",\n                                                    extract_links_udf._judf)\n\nextract_domains_udf = UserDefinedFunction(extract_domains,\n                                          ArrayType(StringType()),\n                                          \"extract_domains\")\n\nsession.catalog._jsparkSession.udf().registerPython(\"extract_domains\",\n                                                    extract_domains_udf._judf)\n\ncontains_python_stack_trace_udf = UserDefinedFunction(\n    contains_python_stack_trace, BooleanType(), 
\"contains_python_stack_trace\")\n\nsession.catalog._jsparkSession.udf().registerPython(\n    \"contains_python_stack_trace\", contains_python_stack_trace_udf._judf)\n\ncontains_probably_java_stack_trace_udf = UserDefinedFunction(\n    contains_probably_java_stack_trace, BooleanType(),\n    \"contains_probably_java_stack_trace\")\n\nsession.catalog._jsparkSession.udf().registerPython(\n    \"contains_probably_java_stack_trace\",\n    contains_probably_java_stack_trace_udf._judf)\n\ncontains_exception_in_task_udf = UserDefinedFunction(\n    contains_exception_in_task, BooleanType(), \"contains_exception_in_task\")\n\nsession.catalog._jsparkSession.udf().registerPython(\n    \"contains_exception_in_task\", contains_exception_in_task_udf._judf)\n\n# We could make this a transformer stage, but I'm lazy so we'll just use a UDF directly.\n\n# In[ ]:\n\nannotated_spark_mailing_list_data = posts_with_labels.select(\n    \"*\",\n    extract_links_udf(posts_with_labels[\"body\"]).alias(\"links_in_email\"),\n    contains_python_stack_trace_udf(posts_with_labels.body).alias(\n        \"contains_python_stack_trace\").cast(\"double\"),\n    contains_probably_java_stack_trace_udf(posts_with_labels.body).alias(\n        \"contains_java_stack_trace\").cast(\"double\"),\n    contains_exception_in_task_udf(posts_with_labels.body).alias(\n        \"contains_exception_in_task\").cast(\"double\"))\n\n# In[ ]:\n\nannotated_spark_mailing_list_data.cache()\n\n# In[ ]:\n\nannotated_spark_mailing_list_data.show()\n\n# In[ ]:\n\nfurther_annotated = annotated_spark_mailing_list_data.withColumn(\n    \"domain_links\",\n    extract_domains_udf(annotated_spark_mailing_list_data.links_in_email))\n# Long story, allow mixed UDF types\nfurther_annotated.cache()\nfurther_annotated.count()\n\n# In[ ]:\n\n#tag::make_features[]\ntokenizer = Tokenizer(inputCol=\"body\", outputCol=\"body_tokens\")\nbody_hashing = HashingTF(inputCol=\"body_tokens\",\n                         outputCol=\"raw_body_features\",\n                         numFeatures=10000)\nbody_idf = IDF(inputCol=\"raw_body_features\", outputCol=\"body_features\")\nbody_word2Vec = Word2Vec(vectorSize=5,\n                         minCount=0,\n                         numPartitions=10,\n                         inputCol=\"body_tokens\",\n                         outputCol=\"body_vecs\")\nassembler = VectorAssembler(inputCols=[\n    \"body_features\", \"body_vecs\", \"contains_python_stack_trace\",\n    \"contains_java_stack_trace\", \"contains_exception_in_task\"\n],\n                            outputCol=\"features\")\n#end::make_features[]\n\n# In[ ]:\n\nfeatureprep_pipeline = Pipeline(\n    stages=[tokenizer, body_hashing, body_idf, body_word2Vec, assembler])\n\n# In[ ]:\n\nfeatureprep_pipeline_transformer = featureprep_pipeline.fit(further_annotated)\npreped_data = featureprep_pipeline_transformer.transform(further_annotated)\n\n# In[ ]:\n\nfeatureprep_pipeline_transformer.write().save(fs_prefix + \"/feature_prep-2\")\n\n# In[ ]:\n\npreped_data.write.format(\"parquet\").mode(\"overwrite\").save(fs_prefix +\n                                                           \"/prepared_data\")\n"
  },
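  {
    "path": "feature-prep/spark/custom_transformer_sketch.py",
    "content": "# A minimal sketch of the \"transformer stage\" alternative mentioned in\n# SparkMailingListFeaturePrep: the same stack-trace check wrapped as a custom\n# pyspark.ml Transformer so it could sit in the Pipeline with the other\n# stages. Illustrative only; the notebook applies the UDF directly instead.\nfrom pyspark.ml import Transformer\nfrom pyspark.sql.functions import udf\nfrom pyspark.sql.types import DoubleType\n\n\nclass PythonStackTraceFlagger(Transformer):\n    \"\"\"Adds a 0.0/1.0 column flagging bodies containing a Python traceback.\"\"\"\n\n    def __init__(self, inputCol=\"body\", outputCol=\"contains_python_stack_trace\"):\n        super(PythonStackTraceFlagger, self).__init__()\n        self.inputCol = inputCol\n        self.outputCol = outputCol\n\n    def _transform(self, dataset):\n        flag = udf(\n            lambda body: 1.0\n            if \"Traceback (most recent call last)\" in (body or \"\") else 0.0,\n            DoubleType())\n        return dataset.withColumn(self.outputCol, flag(dataset[self.inputCol]))\n\n\n# Usage sketch: Pipeline(stages=[PythonStackTraceFlagger(), tokenizer, ...])\n"
  },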
  {
    "path": "feature-prep/tft/requirements.txt",
    "content": "tfx\ntensorflow\napache-beam\n"
  },
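  {
    "path": "feature-prep/tft/run_transform_locally_ex.py",
    "content": "# A minimal sketch of exercising preprocessing_fn from transform.py locally\n# with the Beam direct runner, following the standard tensorflow_transform\n# getting-started pattern. The in-memory dataset here is hypothetical; in the\n# pipelines in this repo the TFX Transform component drives preprocessing_fn.\n# Run from this directory so transform.py is importable.\nimport tempfile\n\nimport tensorflow as tf\nimport tensorflow_transform.beam as tft_beam\nfrom tensorflow_transform.tf_metadata import dataset_metadata, schema_utils\n\nfrom transform import preprocessing_fn\n\nraw_data = [{\"body\": \"hello kubeflow\"}, {\"body\": \"hello tft\"}]\nraw_metadata = dataset_metadata.DatasetMetadata(\n    schema_utils.schema_from_feature_spec(\n        {\"body\": tf.io.FixedLenFeature([], tf.string)}))\n\nwith tft_beam.Context(temp_dir=tempfile.mkdtemp()):\n    transformed_dataset, transform_fn = (\n        (raw_data, raw_metadata)\n        | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))\ntransformed_data, transformed_metadata = transformed_dataset\nprint(transformed_data)\n"
  },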
  {
    "path": "feature-prep/tft/transform.py",
    "content": "#tag::imports[]\nimport tensorflow as tf\nimport tensorflow_transform as tft\nfrom tensorflow_transform.tf_metadata import schema_utils\n#end::imports[]\n\n#tag::entry_point[]\n\n\ndef preprocessing_fn(inputs):\n    #end::entry_point[]\n    #tag::logic[]\n    outputs = {}\n    # TFT business logic goes here\n    outputs[\"body_stuff\"] = tft.compute_and_apply_vocabulary(inputs[\"body\"],\n                                                             top_k=1000)\n    return outputs\n\n\n#end::logic[]\n"
  },
  {
    "path": "gcp-setup/cloudshell_scrip.sh",
    "content": "#!/bin/bash\n# Note: this only works inside of cloudshell!\n#tag::cloudshell_script[]\nG_SOURCES=\"https://source.developers.google.com/p\"\ncloudshell_open \\\n  --repo_url \"$G_SOURCES/$PROJECTID/r/$PROJECTID-$DEPLOYMENTNAME-config\"\\\n  --dir\"v$KUBEFLOWVERSION/kubeflow/kf_util\" \\\n  --page \"editor\" \\\n  --tutorial \"conn.md\"\n#end::cloudshell_script[]\n"
  },
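  {
    "path": "gcp-setup/get-credentials-ex.sh",
    "content": "#!/bin/bash\n# A short follow-up sketch to setup-gcp.sh: once the cluster exists, point\n# kubectl at it. The name and zone mirror the values used in setup-gcp.sh.\nZONE=\"us-central1-a\"\nCLUSTER_NAME=\"ci-cluster\"\ngcloud container clusters get-credentials \"$CLUSTER_NAME\" --zone \"$ZONE\"\n# Sanity check that kubectl now talks to the new cluster\nkubectl get nodes\n"
  },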
  {
    "path": "gcp-setup/setup-gcp.sh",
    "content": "#!/bin/bash\n#tag::ubuntu[]\napt-get install google-cloud-sdk\n#end::ubuntu[]\napt-get remove google-cloud-sdk\n#tag::general[]\ncurl https://sdk.cloud.google.com | bash\n#end::general[]\n#tag::enable_container_apis[]\ngcloud services enable container.googleapis.com\n#end::enable_container_apis[]\nPROJECT_ID=\"boos-demo-projects-are-rad\"\n#tag::configure_cloud_sdk[]\ngcloud auth login # Launches a web browser to login with\ngcloud config set project \"$PROJECT_ID\" #Project ID is your Google project ID\n#end::configure_cloud_sdk[]\nZONE=\"us-central1-a\" # For TPU access\nCLUSTER_NAME=\"ci-cluster\"\n#tag::launch_cluster[]\ngcloud beta container clusters create $CLUSTER_NAME \\\n       --zone $ZONE \\\n       --machine-type \"n1-standard-4\" \\\n       --disk-type \"pd-standard\" \\\n       --disk-size \"100\" \\\n       --scopes \"https://www.googleapis.com/auth/cloud-platform\" \\\n       --addons HorizontalPodAutoscaling,HttpLoadBalancing \\\n       --enable-autoupgrade \\\n       --enable-autorepair \\\n       --enable-autoscaling --min-nodes 1 --max-nodes 10 --num-nodes 2\n#end::launch_cluster[]\n#tag::delete_cluster[]\ngcloud beta container clusters delete $CLUSTER_NAME --zone $ZONE\n#end::delete_cluster[]\n"
  },
  {
    "path": "kfctl_gcp_iap.v1.0.1.yaml",
    "content": "apiVersion: kfdef.apps.kubeflow.org/v1\nkind: KfDef\nmetadata:\n  namespace: kubeflow\nspec:\n  applications:\n  - kustomizeConfig:\n      parameters:\n      - name: namespace\n        value: istio-system\n      repoRef:\n        name: manifests\n        path: istio/istio-crds\n    name: istio-crds\n  - kustomizeConfig:\n      parameters:\n      - name: namespace\n        value: istio-system\n      repoRef:\n        name: manifests\n        path: istio/istio-install\n    name: istio-install\n  - kustomizeConfig:\n      parameters:\n      - name: namespace\n        value: istio-system\n      repoRef:\n        name: manifests\n        path: istio/cluster-local-gateway\n    name: cluster-local-gateway\n  - kustomizeConfig:\n      parameters:\n      - name: namespace\n        value: istio-system\n      repoRef:\n        name: manifests\n        path: istio/kfserving-gateway\n    name: kfserving-gateway\n  - kustomizeConfig:\n      parameters:\n      - name: clusterRbacConfig\n        value: 'ON'\n      repoRef:\n        name: manifests\n        path: istio/istio\n    name: istio\n  - kustomizeConfig:\n      repoRef:\n        name: manifests\n        path: application/application-crds\n    name: application-crds\n  - kustomizeConfig:\n      overlays:\n      - application\n      repoRef:\n        name: manifests\n        path: application/application\n    name: application\n  - kustomizeConfig:\n      parameters:\n      - name: namespace\n        value: cert-manager\n      repoRef:\n        name: manifests\n        path: cert-manager/cert-manager-crds\n    name: cert-manager-crds\n  - kustomizeConfig:\n      parameters:\n      - name: namespace\n        value: kube-system\n      repoRef:\n        name: manifests\n        path: cert-manager/cert-manager-kube-system-resources\n    name: cert-manager-kube-system-resources\n  - kustomizeConfig:\n      overlays:\n      - self-signed\n      - application\n      parameters:\n      - name: namespace\n        value: cert-manager\n      repoRef:\n        name: manifests\n        path: cert-manager/cert-manager\n    name: cert-manager\n  - kustomizeConfig:\n      repoRef:\n        name: manifests\n        path: kubeflow-roles\n    name: kubeflow-roles\n  - kustomizeConfig:\n      repoRef:\n        name: manifests\n        path: metacontroller\n    name: metacontroller\n  - kustomizeConfig:\n      overlays:\n      - istio\n      - application\n      repoRef:\n        name: manifests\n        path: argo\n    name: argo\n  - kustomizeConfig:\n      overlays:\n      - istio\n      - application\n      parameters:\n      - name: userid-header\n        value: X-Goog-Authenticated-User-Email\n      - name: userid-prefix\n        value: 'accounts.google.com:'\n      repoRef:\n        name: manifests\n        path: common/centraldashboard\n    name: centraldashboard\n  - kustomizeConfig:\n      overlays:\n      - application\n      repoRef:\n        name: manifests\n        path: admission-webhook/webhook\n    name: webhook\n  - kustomizeConfig:\n      overlays:\n      - application\n      parameters:\n      - name: webhookNamePrefix\n        value: admission-webhook-\n      repoRef:\n        name: manifests\n        path: admission-webhook/bootstrap\n    name: bootstrap\n  - kustomizeConfig:\n      overlays:\n      - istio\n      - application\n      parameters:\n      - name: userid-header\n        value: X-Goog-Authenticated-User-Email\n      - name: userid-prefix\n        value: 'accounts.google.com:'\n      repoRef:\n        name: manifests\n   
     path: jupyter/jupyter-web-app\n    name: jupyter-web-app\n  - kustomizeConfig:\n      overlays:\n      - application\n      repoRef:\n        name: manifests\n        path: spark/spark-operator\n    name: spark-operator\n  - kustomizeConfig:\n      overlays:\n      - istio\n      - application\n      - db\n      repoRef:\n        name: manifests\n        path: metadata\n    name: metadata\n  - kustomizeConfig:\n      overlays:\n      - istio\n      - application\n      parameters:\n      - name: injectGcpCredentials\n        value: 'true'\n      repoRef:\n        name: manifests\n        path: jupyter/notebook-controller\n    name: notebook-controller\n  - kustomizeConfig:\n      overlays:\n      - application\n      repoRef:\n        name: manifests\n        path: pytorch-job/pytorch-job-crds\n    name: pytorch-job-crds\n  - kustomizeConfig:\n      overlays:\n      - application\n      repoRef:\n        name: manifests\n        path: pytorch-job/pytorch-operator\n    name: pytorch-operator\n  - kustomizeConfig:\n      overlays:\n      - application\n      parameters:\n      - name: namespace\n        value: knative-serving\n      repoRef:\n        name: manifests\n        path: knative/knative-serving-crds\n    name: knative-crds\n  - kustomizeConfig:\n      overlays:\n      - application\n      parameters:\n      - name: namespace\n        value: knative-serving\n      repoRef:\n        name: manifests\n        path: knative/knative-serving-install\n    name: knative-install\n  - kustomizeConfig:\n      overlays:\n      - application\n      repoRef:\n        name: manifests\n        path: kfserving/kfserving-crds\n    name: kfserving-crds\n  - kustomizeConfig:\n      overlays:\n      - application\n      repoRef:\n        name: manifests\n        path: kfserving/kfserving-install\n    name: kfserving-install\n  - kustomizeConfig:\n      overlays:\n      - application\n      parameters:\n      - name: usageId\n        value: '7439583937720421527'\n      - name: reportUsage\n        value: 'true'\n      repoRef:\n        name: manifests\n        path: common/spartakus\n    name: spartakus\n  - kustomizeConfig:\n      overlays:\n      - istio\n      repoRef:\n        name: manifests\n        path: tensorboard\n    name: tensorboard\n  - kustomizeConfig:\n      overlays:\n      - application\n      repoRef:\n        name: manifests\n        path: tf-training/tf-job-crds\n    name: tf-job-crds\n  - kustomizeConfig:\n      overlays:\n      - application\n      repoRef:\n        name: manifests\n        path: tf-training/tf-job-operator\n    name: tf-job-operator\n  - kustomizeConfig:\n      overlays:\n      - application\n      repoRef:\n        name: manifests\n        path: katib/katib-crds\n    name: katib-crds\n  - kustomizeConfig:\n      overlays:\n      - application\n      - istio\n      repoRef:\n        name: manifests\n        path: katib/katib-controller\n    name: katib-controller\n  - kustomizeConfig:\n      overlays:\n      - application\n      repoRef:\n        name: manifests\n        path: pipeline/api-service\n    name: api-service\n  - kustomizeConfig:\n      overlays:\n      - minioPd\n      - application\n      parameters:\n      - name: minioPd\n        value: test1-storage-artifact-store\n      - name: minioPvName\n        value: minio-pv\n      - name: minioPvcName\n        value: minio-pv-claim\n      repoRef:\n        name: manifests\n        path: pipeline/minio\n    name: minio\n  - kustomizeConfig:\n      overlays:\n      - mysqlPd\n      - application\n      
parameters:\n      - name: mysqlPd\n        value: test1-storage-metadata-store\n      - name: mysqlPvName\n        value: mysql-pv\n      - name: mysqlPvcName\n        value: mysql-pv-claim\n      repoRef:\n        name: manifests\n        path: pipeline/mysql\n    name: mysql\n  - kustomizeConfig:\n      overlays:\n      - application\n      repoRef:\n        name: manifests\n        path: pipeline/persistent-agent\n    name: persistent-agent\n  - kustomizeConfig:\n      overlays:\n      - application\n      repoRef:\n        name: manifests\n        path: pipeline/pipelines-runner\n    name: pipelines-runner\n  - kustomizeConfig:\n      overlays:\n      - gcp\n      - istio\n      - application\n      repoRef:\n        name: manifests\n        path: pipeline/pipelines-ui\n    name: pipelines-ui\n  - kustomizeConfig:\n      overlays:\n      - application\n      repoRef:\n        name: manifests\n        path: pipeline/pipelines-viewer\n    name: pipelines-viewer\n  - kustomizeConfig:\n      overlays:\n      - application\n      repoRef:\n        name: manifests\n        path: pipeline/scheduledworkflow\n    name: scheduledworkflow\n  - kustomizeConfig:\n      overlays:\n      - application\n      repoRef:\n        name: manifests\n        path: pipeline/pipeline-visualization-service\n    name: pipeline-visualization-service\n  - kustomizeConfig:\n      overlays:\n      - application\n      repoRef:\n        name: manifests\n        path: gcp/cloud-endpoints\n    name: cloud-endpoints\n  - kustomizeConfig:\n      overlays:\n      - application\n      - istio\n      parameters:\n      - name: admin\n      - name: userid-header\n        value: X-Goog-Authenticated-User-Email\n      - name: userid-prefix\n        value: 'accounts.google.com:'\n      repoRef:\n        name: manifests\n        path: profiles\n    name: profiles\n  - kustomizeConfig:\n      overlays:\n      - application\n      repoRef:\n        name: manifests\n        path: gcp/gpu-driver\n    name: gpu-driver\n  - kustomizeConfig:\n      overlays:\n      - managed-cert\n      - application\n      parameters:\n      - name: namespace\n        value: istio-system\n      - name: ipName\n        value: test1-ip\n      - name: hostname\n      repoRef:\n        name: manifests\n        path: gcp/iap-ingress\n    name: iap-ingress\n  - kustomizeConfig:\n      overlays:\n      - application\n      repoRef:\n        name: manifests\n        path: seldon/seldon-core-operator\n    name: seldon-core-operator\n  - kustomizeConfig:\n      parameters:\n      - name: user\n      - name: profile-name\n        value: anonymous\n      repoRef:\n        name: manifests\n        path: default-install\n    name: default-install\n  plugins:\n  - kind: KfGcpPlugin\n    metadata:\n      creationTimestamp: null\n      name: gcp\n    spec:\n      createPipelinePersistentStorage: true\n      deploymentManagerConfig:\n        repoRef:\n          name: manifests\n          path: gcp/deployment_manager_configs\n      enableWorkloadIdentity: true\n      skipInitProject: true\n      useBasicAuth: false\n  repos:\n  - name: manifests\n    uri: https://github.com/holdenk/manifests/archive/fix-spark-crd.tar.gz\n  version: v1.0.1\n"
  },
  {
    "path": "pipelines/ControlStructures.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Simple Control structure\\n\",\n    \"\\n\",\n    \"Shows how to use conditional execution\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Requirement already up-to-date: kfp in ./.local/lib/python3.6/site-packages (0.2.2.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: jsonschema>=3.0.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (3.2.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: google-cloud-storage>=1.13.0 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.25.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: python-dateutil in /usr/local/lib/python3.6/dist-packages (from kfp) (2.8.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: google-auth>=1.6.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.11.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: argo-models==2.2.1a in ./.local/lib/python3.6/site-packages (from kfp) (2.2.1a0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: urllib3<1.25,>=1.15 in ./.local/lib/python3.6/site-packages (from kfp) (1.24.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: cloudpickle==1.1.1 in ./.local/lib/python3.6/site-packages (from kfp) (1.1.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: kubernetes<=10.0.0,>=8.0.0 in ./.local/lib/python3.6/site-packages (from kfp) (10.0.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: click==7.0 in ./.local/lib/python3.6/site-packages (from kfp) (7.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: certifi in /usr/local/lib/python3.6/dist-packages (from kfp) (2019.11.28)\\n\",\n      \"Requirement already satisfied, skipping upgrade: PyYAML in /usr/local/lib/python3.6/dist-packages (from kfp) (5.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: kfp-server-api<=0.1.40,>=0.1.18 in ./.local/lib/python3.6/site-packages (from kfp) (0.1.40)\\n\",\n      \"Requirement already satisfied, skipping upgrade: six>=1.10 in /usr/lib/python3/dist-packages (from kfp) (1.11.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: PyJWT>=1.6.4 in ./.local/lib/python3.6/site-packages (from kfp) (1.7.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: Deprecated in ./.local/lib/python3.6/site-packages (from kfp) (1.2.7)\\n\",\n      \"Requirement already satisfied, skipping upgrade: requests-toolbelt>=0.8.0 in ./.local/lib/python3.6/site-packages (from kfp) (0.9.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: cryptography>=2.4.2 in ./.local/lib/python3.6/site-packages (from kfp) (2.8)\\n\",\n      \"Requirement already satisfied, skipping upgrade: tabulate==0.8.3 in ./.local/lib/python3.6/site-packages (from kfp) (0.8.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: importlib-metadata; python_version < \\\"3.8\\\" in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (1.4.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: attrs>=17.4.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (19.3.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pyrsistent>=0.14.0 in /usr/local/lib/python3.6/dist-packages (from 
jsonschema>=3.0.1->kfp) (0.15.7)\\n\",\n      \"Requirement already satisfied, skipping upgrade: setuptools in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (45.1.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: google-resumable-media<0.6dev,>=0.5.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (0.5.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: google-cloud-core<2.0dev,>=1.2.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (1.3.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (0.2.8)\\n\",\n      \"Requirement already satisfied, skipping upgrade: rsa<4.1,>=3.1.4 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: requests-oauthlib in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (1.3.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: requests in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (2.22.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (0.57.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: wrapt<2,>=1.10 in /usr/local/lib/python3.6/dist-packages (from Deprecated->kfp) (1.11.2)\\n\",\n      \"Requirement already satisfied, skipping upgrade: cffi!=1.11.3,>=1.8 in ./.local/lib/python3.6/site-packages (from cryptography>=2.4.2->kfp) (1.14.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata; python_version < \\\"3.8\\\"->jsonschema>=3.0.1->kfp) (2.1.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: google-api-core<2.0.0dev,>=1.16.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.16.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.6/dist-packages (from pyasn1-modules>=0.2.1->google-auth>=1.6.1->kfp) (0.4.8)\\n\",\n      \"Requirement already satisfied, skipping upgrade: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib->kubernetes<=10.0.0,>=8.0.0->kfp) (3.1.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->kubernetes<=10.0.0,>=8.0.0->kfp) (3.0.4)\\n\",\n      \"Requirement already satisfied, skipping upgrade: idna<2.9,>=2.5 in /usr/lib/python3/dist-packages (from requests->kubernetes<=10.0.0,>=8.0.0->kfp) (2.6)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pycparser in ./.local/lib/python3.6/site-packages (from cffi!=1.11.3,>=1.8->cryptography>=2.4.2->kfp) (2.19)\\n\",\n      \"Requirement already satisfied, skipping upgrade: protobuf>=3.4.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (3.11.2)\\n\",\n      \"Requirement already satisfied, skipping 
upgrade: pytz in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (2019.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: googleapis-common-protos<2.0dev,>=1.6.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.51.0)\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"!pip install kfp --upgrade --user\\n\",\n    \"\\n\",\n    \"import kfp\\n\",\n    \"from kfp import dsl\\n\",\n    \"from kfp.components import func_to_container_op, InputPath, OutputPath\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Functions\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 2,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"@func_to_container_op\\n\",\n    \"def get_random_int_op(minimum: int, maximum: int) -> int:\\n\",\n    \"    \\\"\\\"\\\"Generate a random number between minimum and maximum (inclusive).\\\"\\\"\\\"\\n\",\n    \"    import random\\n\",\n    \"    result = random.randint(minimum, maximum)\\n\",\n    \"    print(result)\\n\",\n    \"    return result\\n\",\n    \"\\n\",\n    \"@func_to_container_op\\n\",\n    \"def process_small_op(data: int):\\n\",\n    \"    \\\"\\\"\\\"Process small numbers.\\\"\\\"\\\"\\n\",\n    \"    print(\\\"Processing small result\\\", data)\\n\",\n    \"    return\\n\",\n    \"\\n\",\n    \"@func_to_container_op\\n\",\n    \"def process_medium_op(data: int):\\n\",\n    \"    \\\"\\\"\\\"Process medium numbers.\\\"\\\"\\\"\\n\",\n    \"    print(\\\"Processing medium result\\\", data)\\n\",\n    \"    return\\n\",\n    \"\\n\",\n    \"@func_to_container_op\\n\",\n    \"def process_large_op(data: int):\\n\",\n    \"    \\\"\\\"\\\"Process large numbers.\\\"\\\"\\\"\\n\",\n    \"    print(\\\"Processing large result\\\", data)\\n\",\n    \"    return\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Conditional pipeline\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 3,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"@dsl.pipeline(\\n\",\n    \"    name='Conditional execution pipeline',\\n\",\n    \"    description='Shows how to use dsl.Condition().'\\n\",\n    \")\\n\",\n    \"def conditional_pipeline():\\n\",\n    \"    number = get_random_int_op(0, 100).output\\n\",\n    \"    with dsl.Condition(number < 10):\\n\",\n    \"        process_small_op(number)\\n\",\n    \"    # Python's 'and' is invisible to dsl.Condition, so nest two conditions\\n\",\n    \"    with dsl.Condition(number > 10):\\n\",\n    \"        with dsl.Condition(number < 50):\\n\",\n    \"            process_medium_op(number)\\n\",\n    \"    with dsl.Condition(number > 50):\\n\",\n    \"        process_large_op(number)\\n\",\n    \"        \"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Submit the pipeline for execution:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 4,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/html\": [\n       \"Experiment link <a href=\\\"/pipeline/#/experiments/details/2abe16d1-fa2e-4f49-a3a5-acad8d36790d\\\" target=\\\"_blank\\\" >here</a>\"\n      ],\n      \"text/plain\": [\n       \"<IPython.core.display.HTML object>\"\n      ]\n     },\n     \"metadata\": {},\n     \"output_type\": \"display_data\"\n    },\n    {\n     \"data\": {\n      \"text/html\": [\n       \"Run 
link <a href=\\\"/pipeline/#/runs/details/293a92c5-50b2-4a96-bbd4-ebc85106f337\\\" target=\\\"_blank\\\" >here</a>\"\n      ],\n      \"text/plain\": [\n       \"<IPython.core.display.HTML object>\"\n      ]\n     },\n     \"metadata\": {},\n     \"output_type\": \"display_data\"\n    },\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"RunPipelineResult(run_id=293a92c5-50b2-4a96-bbd4-ebc85106f337)\"\n      ]\n     },\n     \"execution_count\": 4,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"kfp.Client().create_run_from_pipeline_func(conditional_pipeline, arguments={})\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": []\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"Python 3\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.9\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
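  {
    "path": "pipelines/condition_example.py",
    "content": "# Hedged sketch, not one of the book's notebooks: in KFP v1, dsl.Condition()\n# takes a single comparison, and chaining comparisons with Python's 'and'\n# (as in 'number > 10 and number < 50') silently keeps only the last one at\n# compile time. Nesting two Condition blocks expresses the same range check.\nimport kfp\nfrom kfp import dsl\nfrom kfp.components import func_to_container_op\n\n@func_to_container_op\ndef get_random_int_op(minimum: int, maximum: int) -> int:\n    '''Generate a random number between minimum and maximum (inclusive).'''\n    import random\n    result = random.randint(minimum, maximum)\n    print(result)\n    return result\n\n@func_to_container_op\ndef process_medium_op(data: int):\n    '''Process medium numbers.'''\n    print('Processing medium result', data)\n\n@dsl.pipeline(\n    name='Nested condition pipeline',\n    description='Range check with nested dsl.Condition() blocks.'\n)\ndef nested_conditional_pipeline():\n    number = get_random_int_op(0, 100).output\n    with dsl.Condition(number > 10):\n        with dsl.Condition(number < 50):  # both bounds must hold\n            process_medium_op(number)\n\nif __name__ == '__main__':\n    kfp.Client().create_run_from_pipeline_func(nested_conditional_pipeline, arguments={})\n"
  },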
  {
    "path": "pipelines/Lightweight Pipeline.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Setup\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Requirement already up-to-date: kfp in ./.local/lib/python3.6/site-packages (0.2.2.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: PyYAML in /usr/local/lib/python3.6/dist-packages (from kfp) (5.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: requests-toolbelt>=0.8.0 in ./.local/lib/python3.6/site-packages (from kfp) (0.9.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: click==7.0 in ./.local/lib/python3.6/site-packages (from kfp) (7.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: Deprecated in ./.local/lib/python3.6/site-packages (from kfp) (1.2.7)\\n\",\n      \"Requirement already satisfied, skipping upgrade: kubernetes<=10.0.0,>=8.0.0 in ./.local/lib/python3.6/site-packages (from kfp) (10.0.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: six>=1.10 in /usr/lib/python3/dist-packages (from kfp) (1.11.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: certifi in /usr/local/lib/python3.6/dist-packages (from kfp) (2019.11.28)\\n\",\n      \"Requirement already satisfied, skipping upgrade: google-cloud-storage>=1.13.0 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.25.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: PyJWT>=1.6.4 in ./.local/lib/python3.6/site-packages (from kfp) (1.7.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: cryptography>=2.4.2 in ./.local/lib/python3.6/site-packages (from kfp) (2.8)\\n\",\n      \"Requirement already satisfied, skipping upgrade: google-auth>=1.6.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.11.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: cloudpickle==1.1.1 in ./.local/lib/python3.6/site-packages (from kfp) (1.1.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: kfp-server-api<=0.1.40,>=0.1.18 in ./.local/lib/python3.6/site-packages (from kfp) (0.1.40)\\n\",\n      \"Requirement already satisfied, skipping upgrade: argo-models==2.2.1a in ./.local/lib/python3.6/site-packages (from kfp) (2.2.1a0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: tabulate==0.8.3 in ./.local/lib/python3.6/site-packages (from kfp) (0.8.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: python-dateutil in /usr/local/lib/python3.6/dist-packages (from kfp) (2.8.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: urllib3<1.25,>=1.15 in ./.local/lib/python3.6/site-packages (from kfp) (1.24.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: jsonschema>=3.0.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (3.2.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: requests<3.0.0,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from requests-toolbelt>=0.8.0->kfp) (2.22.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: wrapt<2,>=1.10 in /usr/local/lib/python3.6/dist-packages (from Deprecated->kfp) (1.11.2)\\n\",\n      \"Requirement already satisfied, skipping upgrade: setuptools>=21.0.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (45.1.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: 
websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (0.57.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: requests-oauthlib in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (1.3.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: google-cloud-core<2.0dev,>=1.2.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (1.3.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: google-resumable-media<0.6dev,>=0.5.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (0.5.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: cffi!=1.11.3,>=1.8 in ./.local/lib/python3.6/site-packages (from cryptography>=2.4.2->kfp) (1.14.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: rsa<4.1,>=3.1.4 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (0.2.8)\\n\",\n      \"Requirement already satisfied, skipping upgrade: attrs>=17.4.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (19.3.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pyrsistent>=0.14.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (0.15.7)\\n\",\n      \"Requirement already satisfied, skipping upgrade: importlib-metadata; python_version < \\\"3.8\\\" in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (1.4.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.0.1->requests-toolbelt>=0.8.0->kfp) (3.0.4)\\n\",\n      \"Requirement already satisfied, skipping upgrade: idna<2.9,>=2.5 in /usr/lib/python3/dist-packages (from requests<3.0.0,>=2.0.1->requests-toolbelt>=0.8.0->kfp) (2.6)\\n\",\n      \"Requirement already satisfied, skipping upgrade: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib->kubernetes<=10.0.0,>=8.0.0->kfp) (3.1.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: google-api-core<2.0.0dev,>=1.16.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.16.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pycparser in ./.local/lib/python3.6/site-packages (from cffi!=1.11.3,>=1.8->cryptography>=2.4.2->kfp) (2.19)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pyasn1>=0.1.3 in /usr/local/lib/python3.6/dist-packages (from rsa<4.1,>=3.1.4->google-auth>=1.6.1->kfp) (0.4.8)\\n\",\n      \"Requirement already satisfied, skipping upgrade: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata; python_version < \\\"3.8\\\"->jsonschema>=3.0.1->kfp) (2.1.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: googleapis-common-protos<2.0dev,>=1.6.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.51.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pytz in 
/usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (2019.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: protobuf>=3.4.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (3.11.2)\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"!pip install kfp --upgrade --user\\n\",\n    \"\\n\",\n    \"import kfp \\n\",\n    \"from kfp import compiler\\n\",\n    \"import kfp.dsl as dsl\\n\",\n    \"import kfp.notebook\\n\",\n    \"import kfp.components as comp\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Simple function that just add two numbers:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 2,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"#Define a Python function\\n\",\n    \"def add(a: float, b: float) -> float:\\n\",\n    \"   '''Calculates sum of two arguments'''\\n\",\n    \"   return a + b\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Convert the function to a pipeline operation\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 3,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"add_op = comp.func_to_container_op(add)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"A bit more advanced function which demonstrates how to use imports, helper functions and produce multiple outputs.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 4,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"from typing import NamedTuple\\n\",\n    \"def my_divmod(dividend: float, divisor:float) -> NamedTuple('MyDivmodOutput', [('quotient', float), ('remainder', float)]):\\n\",\n    \"    '''Divides two numbers and calculate  the quotient and remainder'''\\n\",\n    \"    #Imports inside a component function:\\n\",\n    \"    import numpy as np\\n\",\n    \"\\n\",\n    \"    #This function demonstrates how to use nested functions inside a component function:\\n\",\n    \"    def divmod_helper(dividend, divisor):\\n\",\n    \"        return np.divmod(dividend, divisor)\\n\",\n    \"\\n\",\n    \"    (quotient, remainder) = divmod_helper(dividend, divisor)\\n\",\n    \"\\n\",\n    \"    from collections import namedtuple\\n\",\n    \"    divmod_output = namedtuple('MyDivmodOutput', ['quotient', 'remainder'])\\n\",\n    \"    return divmod_output(quotient, remainder)\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Test running the python function directly\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 5,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"MyDivmodOutput(quotient=14, remainder=2)\"\n      ]\n     },\n     \"execution_count\": 5,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"my_divmod(100, 7)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Convert the function to a pipeline operation\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 6,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"divmod_op = comp.func_to_container_op(my_divmod, 
base_image='tensorflow/tensorflow:1.14.0-py3')\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Define the pipeline\\n\",\n    \"Pipeline function has to be decorated with the @dsl.pipeline decorator\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 7,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"@dsl.pipeline(\\n\",\n    \"   name='Calculation pipeline',\\n\",\n    \"   description='A toy pipeline that performs arithmetic calculations.'\\n\",\n    \")\\n\",\n    \"def calc_pipeline(\\n\",\n    \"   a='a',\\n\",\n    \"   b='7',\\n\",\n    \"   c='17',\\n\",\n    \"):\\n\",\n    \"    #Passing pipeline parameter and a constant value as operation arguments\\n\",\n    \"    add_task = add_op(a, 4) #Returns a dsl.ContainerOp class instance. \\n\",\n    \"    \\n\",\n    \"    #Passing a task output reference as operation arguments\\n\",\n    \"    #For an operation with a single return value, the output reference can be accessed using `task.output` or `task.outputs['output_name']` syntax\\n\",\n    \"    divmod_task = divmod_op(add_task.output, b)\\n\",\n    \"\\n\",\n    \"    #For an operation with a multiple return values, the output references can be accessed using `task.outputs['output_name']` syntax\\n\",\n    \"    result_task = add_op(divmod_task.outputs['quotient'], c)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Submit the pipeline for execution\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 8,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/html\": [\n       \"Experiment link <a href=\\\"/pipeline/#/experiments/details/2abe16d1-fa2e-4f49-a3a5-acad8d36790d\\\" target=\\\"_blank\\\" >here</a>\"\n      ],\n      \"text/plain\": [\n       \"<IPython.core.display.HTML object>\"\n      ]\n     },\n     \"metadata\": {},\n     \"output_type\": \"display_data\"\n    },\n    {\n     \"data\": {\n      \"text/html\": [\n       \"Run link <a href=\\\"/pipeline/#/runs/details/87276776-0c3a-4d4e-99d0-4563b7f42fa5\\\" target=\\\"_blank\\\" >here</a>\"\n      ],\n      \"text/plain\": [\n       \"<IPython.core.display.HTML object>\"\n      ]\n     },\n     \"metadata\": {},\n     \"output_type\": \"display_data\"\n    },\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"RunPipelineResult(run_id=87276776-0c3a-4d4e-99d0-4563b7f42fa5)\"\n      ]\n     },\n     \"execution_count\": 8,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"client = kfp.Client()\\n\",\n    \"\\n\",\n    \"#Specify pipeline argument values\\n\",\n    \"arguments = {'a': '7', 'b': '8'}\\n\",\n    \"\\n\",\n    \"#Submit a pipeline run\\n\",\n    \"client.create_run_from_pipeline_func(calc_pipeline, arguments=arguments)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": []\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"Python 3\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.9\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 
2\n}\n"
  },
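  {
    "path": "pipelines/lightweight_component_example.py",
    "content": "# Hedged sketch, assuming the same kfp SDK version as the notebooks: a\n# lightweight component that returns multiple values via NamedTuple, wired to a\n# consumer by output name and compiled locally (no cluster needed). The\n# split_name/echo components and the output archive name are ours, for\n# illustration only.\nfrom typing import NamedTuple\nfrom kfp import compiler, dsl\nimport kfp.components as comp\n\ndef split_name(full_name: str) -> NamedTuple('SplitOutput', [('first', str), ('last', str)]):\n    '''Splits a full name into first and last parts.'''\n    from collections import namedtuple\n    parts = full_name.split(' ', 1)\n    output = namedtuple('SplitOutput', ['first', 'last'])\n    return output(parts[0], parts[1] if len(parts) > 1 else '')\n\ndef echo(text: str):\n    '''Prints its argument.'''\n    print(text)\n\nsplit_op = comp.func_to_container_op(split_name)\necho_op = comp.func_to_container_op(echo)\n\n@dsl.pipeline(\n    name='Split pipeline',\n    description='Demonstrates addressing multiple outputs by name.'\n)\ndef split_pipeline(full_name='Ada Lovelace'):\n    task = split_op(full_name)\n    # Multiple return values are accessed via task.outputs['output_name']\n    echo_op(task.outputs['first'])\n    echo_op(task.outputs['last'])\n\nif __name__ == '__main__':\n    compiler.Compiler().compile(split_pipeline, 'split_pipeline.tar.gz')\n"
  },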
  {
    "path": "pipelines/RecommenderPipeline.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Kubeflow pipeline\\n\",\n    \"This is a fairly simple pipeline, containing sequential steps:\\n\",\n    \"\\n\",\n    \"1. Update data - implemented by lightbend/recommender-data-update-publisher:0.2 image\\n\",\n    \"2. Run model training. Ideally we would run TFJob, but due to the current limitations for pipelines, we will directly use an image implementing training lightbend/ml-tf-recommender:0.1\\n\",\n    \"3. Update serving model - implemented by lightbend/recommender-model-publisher:0.2\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Setup\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Requirement already up-to-date: kubernetes in ./.local/lib/python3.6/site-packages (10.0.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pyyaml>=3.12 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (5.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: six>=1.9.0 in /usr/lib/python3/dist-packages (from kubernetes) (1.11.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: urllib3>=1.24.2 in ./.local/lib/python3.6/site-packages (from kubernetes) (1.24.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: requests-oauthlib in /usr/local/lib/python3.6/dist-packages (from kubernetes) (1.3.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: certifi>=14.05.14 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (2019.11.28)\\n\",\n      \"Requirement already satisfied, skipping upgrade: python-dateutil>=2.5.3 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (2.8.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: setuptools>=21.0.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (45.1.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: requests in /usr/local/lib/python3.6/dist-packages (from kubernetes) (2.22.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (0.57.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: google-auth>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (1.11.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib->kubernetes) (3.1.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: idna<2.9,>=2.5 in /usr/lib/python3/dist-packages (from requests->kubernetes) (2.6)\\n\",\n      \"Requirement already satisfied, skipping upgrade: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->kubernetes) (3.0.4)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.0.1->kubernetes) (0.2.8)\\n\",\n      \"Requirement already satisfied, skipping upgrade: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.0.1->kubernetes) (4.0.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: rsa<4.1,>=3.1.4 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.0.1->kubernetes) (4.0)\\n\",\n    
  \"Requirement already satisfied, skipping upgrade: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.6/dist-packages (from pyasn1-modules>=0.2.1->google-auth>=1.0.1->kubernetes) (0.4.8)\\n\",\n      \"Requirement already up-to-date: kfp in ./.local/lib/python3.6/site-packages (0.2.2.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: PyJWT>=1.6.4 in ./.local/lib/python3.6/site-packages (from kfp) (1.7.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: requests-toolbelt>=0.8.0 in ./.local/lib/python3.6/site-packages (from kfp) (0.9.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: python-dateutil in /usr/local/lib/python3.6/dist-packages (from kfp) (2.8.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: PyYAML in /usr/local/lib/python3.6/dist-packages (from kfp) (5.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: kfp-server-api<=0.1.40,>=0.1.18 in ./.local/lib/python3.6/site-packages (from kfp) (0.1.40)\\n\",\n      \"Requirement already satisfied, skipping upgrade: google-cloud-storage>=1.13.0 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.25.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: jsonschema>=3.0.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (3.2.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: Deprecated in ./.local/lib/python3.6/site-packages (from kfp) (1.2.7)\\n\",\n      \"Requirement already satisfied, skipping upgrade: google-auth>=1.6.1 in /usr/local/lib/python3.6/dist-packages (from kfp) (1.11.0)\\n\",\n      \"Collecting kubernetes<=10.0.0,>=8.0.0\\n\",\n      \"  Using cached kubernetes-10.0.0-py2.py3-none-any.whl (1.5 MB)\\n\",\n      \"Requirement already satisfied, skipping upgrade: argo-models==2.2.1a in ./.local/lib/python3.6/site-packages (from kfp) (2.2.1a0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: urllib3<1.25,>=1.15 in ./.local/lib/python3.6/site-packages (from kfp) (1.24.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: certifi in /usr/local/lib/python3.6/dist-packages (from kfp) (2019.11.28)\\n\",\n      \"Requirement already satisfied, skipping upgrade: tabulate==0.8.3 in ./.local/lib/python3.6/site-packages (from kfp) (0.8.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: click==7.0 in ./.local/lib/python3.6/site-packages (from kfp) (7.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: cloudpickle==1.1.1 in ./.local/lib/python3.6/site-packages (from kfp) (1.1.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: six>=1.10 in /usr/lib/python3/dist-packages (from kfp) (1.11.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: cryptography>=2.4.2 in ./.local/lib/python3.6/site-packages (from kfp) (2.8)\\n\",\n      \"Requirement already satisfied, skipping upgrade: requests<3.0.0,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from requests-toolbelt>=0.8.0->kfp) (2.22.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: google-resumable-media<0.6dev,>=0.5.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (0.5.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: google-cloud-core<2.0dev,>=1.2.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-storage>=1.13.0->kfp) (1.3.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: setuptools in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (45.1.0)\\n\",\n      \"Requirement 
already satisfied, skipping upgrade: attrs>=17.4.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (19.3.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: importlib-metadata; python_version < \\\"3.8\\\" in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (1.4.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pyrsistent>=0.14.0 in /usr/local/lib/python3.6/dist-packages (from jsonschema>=3.0.1->kfp) (0.15.7)\\n\",\n      \"Requirement already satisfied, skipping upgrade: wrapt<2,>=1.10 in /usr/local/lib/python3.6/dist-packages (from Deprecated->kfp) (1.11.2)\\n\",\n      \"Requirement already satisfied, skipping upgrade: rsa<4.1,>=3.1.4 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (4.0.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.6.1->kfp) (0.2.8)\\n\",\n      \"Requirement already satisfied, skipping upgrade: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (0.57.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: requests-oauthlib in /usr/local/lib/python3.6/dist-packages (from kubernetes<=10.0.0,>=8.0.0->kfp) (1.3.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: cffi!=1.11.3,>=1.8 in ./.local/lib/python3.6/site-packages (from cryptography>=2.4.2->kfp) (1.14.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.0.1->requests-toolbelt>=0.8.0->kfp) (3.0.4)\\n\",\n      \"Requirement already satisfied, skipping upgrade: idna<2.9,>=2.5 in /usr/lib/python3/dist-packages (from requests<3.0.0,>=2.0.1->requests-toolbelt>=0.8.0->kfp) (2.6)\\n\",\n      \"Requirement already satisfied, skipping upgrade: google-api-core<2.0.0dev,>=1.16.0 in /usr/local/lib/python3.6/dist-packages (from google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.16.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata; python_version < \\\"3.8\\\"->jsonschema>=3.0.1->kfp) (2.1.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pyasn1>=0.1.3 in /usr/local/lib/python3.6/dist-packages (from rsa<4.1,>=3.1.4->google-auth>=1.6.1->kfp) (0.4.8)\\n\",\n      \"Requirement already satisfied, skipping upgrade: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib->kubernetes<=10.0.0,>=8.0.0->kfp) (3.1.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pycparser in ./.local/lib/python3.6/site-packages (from cffi!=1.11.3,>=1.8->cryptography>=2.4.2->kfp) (2.19)\\n\",\n      \"Requirement already satisfied, skipping upgrade: googleapis-common-protos<2.0dev,>=1.6.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (1.51.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: protobuf>=3.4.0 in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (3.11.2)\\n\",\n      \"Requirement 
already satisfied, skipping upgrade: pytz in /usr/local/lib/python3.6/dist-packages (from google-api-core<2.0.0dev,>=1.16.0->google-cloud-core<2.0dev,>=1.2.0->google-cloud-storage>=1.13.0->kfp) (2019.3)\\n\",\n      \"Installing collected packages: kubernetes\\n\",\n      \"  Attempting uninstall: kubernetes\\n\",\n      \"    Found existing installation: kubernetes 10.0.1\\n\",\n      \"    Uninstalling kubernetes-10.0.1:\\n\",\n      \"      Successfully uninstalled kubernetes-10.0.1\\n\",\n      \"Successfully installed kubernetes-10.0.0\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"!pip install kubernetes --upgrade --user\\n\",\n    \"!pip install kfp --upgrade --user\\n\",\n    \"\\n\",\n    \"\\n\",\n    \"import kfp  # the Pipelines SDK.  This library is included with the notebook image.\\n\",\n    \"from kfp import compiler\\n\",\n    \"import kfp.dsl as dsl\\n\",\n    \"import kfp.notebook\\n\",\n    \"from kubernetes import client as k8s_client\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Create/Get an Experiment in the Kubeflow Pipeline System\\n\",\n    \"The Kubeflow Pipeline system requires an \\\"Experiment\\\" to group pipeline runs. You can create a new experiment, or call client.list_experiments() to get existing ones.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 3,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"client = kfp.Client()\\n\",\n    \"client.list_experiments()\\n\",\n    \"#exp = client.create_experiment(name='mdupdate')\\n\",\n    \"exp = client.get_experiment(experiment_name ='mdupdate')\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Define a Pipeline\\n\",\n    \"Authoring a pipeline is like authoring a normal Python function. The pipeline function describes the topology of the pipeline.\\n\",\n    \"\\n\",\n    \"Each step in the pipeline is typically a ContainerOp --- a simple class or function describing how to interact with a docker container image. 
In the pipeline, all the container images referenced in the pipeline are already built.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 4,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"@dsl.pipeline(\\n\",\n    \"  name='Recommender model update',\\n\",\n    \"  description='Demonstrate usage of pipelines for multi-step model update'\\n\",\n    \")\\n\",\n    \"def recommender_pipeline():\\n\",\n    \"    # Load new data\\n\",\n    \"  data = dsl.ContainerOp(\\n\",\n    \"      name='updatedata',\\n\",\n    \"      image='lightbend/recommender-data-update-publisher:0.2') \\\\\\n\",\n    \"    .add_env_variable(k8s_client.V1EnvVar(name='MINIO_URL',value='http://minio-service.kubeflow.svc.cluster.local:9000')) \\\\\\n\",\n    \"    .add_env_variable(k8s_client.V1EnvVar(name='MINIO_KEY', value='minio')) \\\\\\n\",\n    \"    .add_env_variable(k8s_client.V1EnvVar(name='MINIO_SECRET', value='minio123'))\\n\",\n    \"    # Train the model\\n\",\n    \"  train = dsl.ContainerOp(\\n\",\n    \"      name='trainmodel',\\n\",\n    \"      image='lightbend/ml-tf-recommender:0.1') \\\\\\n\",\n    \"    .add_env_variable(k8s_client.V1EnvVar(name='MINIO_URL',value='minio-service.kubeflow.svc.cluster.local:9000')) \\\\\\n\",\n    \"    .add_env_variable(k8s_client.V1EnvVar(name='MINIO_KEY', value='minio')) \\\\\\n\",\n    \"    .add_env_variable(k8s_client.V1EnvVar(name='MINIO_SECRET', value='minio123'))\\n\",\n    \"  train.after(data)\\n\",\n    \"    # Publish new model model\\n\",\n    \"  publish = dsl.ContainerOp(\\n\",\n    \"      name='publishmodel',\\n\",\n    \"      image='lightbend/recommender-model-publisher:0.2') \\\\\\n\",\n    \"    .add_env_variable(k8s_client.V1EnvVar(name='MINIO_URL',value='http://minio-service.kubeflow.svc.cluster.local:9000')) \\\\\\n\",\n    \"    .add_env_variable(k8s_client.V1EnvVar(name='MINIO_KEY', value='minio')) \\\\\\n\",\n    \"    .add_env_variable(k8s_client.V1EnvVar(name='MINIO_SECRET', value='minio123')) \\\\\\n\",\n    \"    .add_env_variable(k8s_client.V1EnvVar(name='KAFKA_BROKERS', value='cloudflow-kafka-brokers.cloudflow.svc.cluster.local:9092')) \\\\\\n\",\n    \"    .add_env_variable(k8s_client.V1EnvVar(name='DEFAULT_RECOMMENDER_URL', value='http://recommendermodelserver.kubeflow.svc.cluster.local:8501')) \\\\\\n\",\n    \"    .add_env_variable(k8s_client.V1EnvVar(name='ALTERNATIVE_RECOMMENDER_URL', value='http://recommendermodelserver1.kubeflow.svc.cluster.local:8501'))\\n\",\n    \"  publish.after(train)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Compile pipeline\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 5,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"compiler.Compiler().compile(recommender_pipeline, 'pipeline.tar.gz')\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Submit an experiment run\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 6,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/html\": [\n       \"Run link <a href=\\\"/pipeline/#/runs/details/df24284c-c7a1-480e-91b6-398bd352f164\\\" target=\\\"_blank\\\" >here</a>\"\n      ],\n      \"text/plain\": [\n       \"<IPython.core.display.HTML object>\"\n      ]\n     },\n     \"metadata\": {},\n     \"output_type\": \"display_data\"\n    }\n   ],\n   \"source\": [\n    \"run = client.run_pipeline(exp.id, 'pipeline1', 
'pipeline.tar.gz')\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": []\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"Python 3\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.9\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
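  {
    "path": "pipelines/recommender_secret_env_example.py",
    "content": "# Hedged sketch, not from the book: the RecommenderPipeline notebook passes\n# MINIO_KEY/MINIO_SECRET as plaintext env vars; an alternative is to reference a\n# Kubernetes Secret. The secret name 'minio-credentials' and its keys\n# 'accesskey'/'secretkey' below are assumptions you would create yourself.\nimport kfp.dsl as dsl\nfrom kubernetes import client as k8s_client\n\ndef env_from_secret(env_name, secret_name, secret_key):\n    '''Builds a V1EnvVar whose value comes from a Kubernetes Secret key.'''\n    return k8s_client.V1EnvVar(\n        name=env_name,\n        value_from=k8s_client.V1EnvVarSource(\n            secret_key_ref=k8s_client.V1SecretKeySelector(\n                name=secret_name, key=secret_key)))\n\n@dsl.pipeline(\n    name='Recommender data update (secret credentials)',\n    description='Same update step, with credentials injected from a Secret.'\n)\ndef recommender_secret_pipeline():\n    data = dsl.ContainerOp(\n        name='updatedata',\n        image='lightbend/recommender-data-update-publisher:0.2')\n    data.add_env_variable(k8s_client.V1EnvVar(\n        name='MINIO_URL',\n        value='http://minio-service.kubeflow.svc.cluster.local:9000'))\n    data.add_env_variable(env_from_secret('MINIO_KEY', 'minio-credentials', 'accesskey'))\n    data.add_env_variable(env_from_secret('MINIO_SECRET', 'minio-credentials', 'secretkey'))\n"
  },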
  {
    "path": "pipelines/download_components.sh",
    "content": "#!/bin/bash\n#tag::dlPipelineRelease[]\nwget https://github.com/kubeflow/pipelines/archive/0.2.5.tar.gz\ntar -xvf 0.2.5.tar.gz\n#end::dlPipelineRelease[]\n"
  },
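  {
    "path": "pipelines/download_components.py",
    "content": "# Hedged sketch: a standard-library Python equivalent of download_components.sh,\n# for environments where wget is unavailable. Same release URL as the script.\nimport tarfile\nimport urllib.request\n\nRELEASE_URL = 'https://github.com/kubeflow/pipelines/archive/0.2.5.tar.gz'\nARCHIVE = '0.2.5.tar.gz'\n\nurllib.request.urlretrieve(RELEASE_URL, ARCHIVE)\nwith tarfile.open(ARCHIVE, 'r:gz') as tar:\n    tar.extractall()  # unpacks into pipelines-0.2.5/\n"
  },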
  {
    "path": "recommender/Dockerfile",
    "content": "FROM  tensorflow/tensorflow:1.12.0-devel-py3\nRUN pip3 install --upgrade pip\nRUN pip3 install pandas --upgrade\nRUN pip3 install keras --upgrade\nRUN pip3 install minio --upgrade\nRUN mkdir -p /opt/kubeflow\nCOPY Recommender_Kubeflow.py /opt/kubeflow/\nENTRYPOINT [\"python3\", \"/opt/kubeflow/Recommender_Kubeflow.py\"]\n"
  },
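  {
    "path": "recommender/build_and_run_local.py",
    "content": "# Hedged sketch, assuming the 'docker' Python SDK is installed and that the\n# image tag 'recommender:dev' is ours: build the recommender image from this\n# directory and run it once locally with the MINIO_* environment variables the\n# training script reads.\nimport docker\n\nclient = docker.from_env()\nclient.images.build(path='recommender', tag='recommender:dev')\nlogs = client.containers.run(\n    'recommender:dev',\n    environment={\n        'MINIO_URL': 'minio-service.kubeflow.svc.cluster.local:9000',\n        'MINIO_KEY': 'minio',\n        'MINIO_SECRET': 'minio123',\n    },\n    remove=True,\n)\nprint(logs.decode())\n"
  },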
  {
    "path": "recommender/Recommender_Kubeflow.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# This is implementation of the Recommender training\\n\",\n    \"\\n\",\n    \"This implementation takes a list of users and their purchasing history to calculate prediction\\n\",\n    \"on the probability that they would by a certain product.\\n\",\n    \"The implementation is structured in 2 parts:\\n\",\n    \"1. Build rating matrix based on the purchasing history. The implementation is based on this blog post\\n\",\n    \"https://medium.com/datadriveninvestor/how-to-build-a-recommendation-system-for-purchase-data-step-by-step-d6d7a78800b6\\n\",\n    \"2. Build collabarative filtering model based on the rating matrix. The implementation is based on this project https://github.com/Piyushdharkar/Collaborative-Filtering-Using-Keras \\n\",\n    \"\\n\",\n    \"Implementation is leveraging Minio for storing both source data and result models\\n\",\n    \"\\n\",\n    \"It also uses Python kubernetes client for re starting model server pod\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 1. Install libraries\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Collecting pandas\\n\",\n      \"  Downloading pandas-1.0.1-cp36-cp36m-manylinux1_x86_64.whl (10.1 MB)\\n\",\n      \"\\u001b[K     |████████████████████████████████| 10.1 MB 3.2 MB/s eta 0:00:01\\n\",\n      \"\\u001b[?25hRequirement already satisfied, skipping upgrade: numpy>=1.13.3 in /usr/local/lib/python3.6/dist-packages (from pandas) (1.18.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: python-dateutil>=2.6.1 in /usr/local/lib/python3.6/dist-packages (from pandas) (2.8.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas) (2019.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.6.1->pandas) (1.11.0)\\n\",\n      \"Installing collected packages: pandas\\n\",\n      \"Successfully installed pandas-1.0.1\\n\",\n      \"Collecting keras\\n\",\n      \"  Downloading Keras-2.3.1-py2.py3-none-any.whl (377 kB)\\n\",\n      \"\\u001b[K     |████████████████████████████████| 377 kB 3.2 MB/s eta 0:00:01\\n\",\n      \"\\u001b[?25hRequirement already satisfied, skipping upgrade: h5py in /usr/local/lib/python3.6/dist-packages (from keras) (2.10.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: numpy>=1.9.1 in /usr/local/lib/python3.6/dist-packages (from keras) (1.18.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: keras-applications>=1.0.6 in /usr/local/lib/python3.6/dist-packages (from keras) (1.0.8)\\n\",\n      \"Requirement already satisfied, skipping upgrade: scipy>=0.14 in /usr/local/lib/python3.6/dist-packages (from keras) (1.4.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: keras-preprocessing>=1.0.5 in /usr/local/lib/python3.6/dist-packages (from keras) (1.1.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: six>=1.9.0 in /usr/lib/python3/dist-packages (from keras) (1.11.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pyyaml in /usr/local/lib/python3.6/dist-packages (from keras) (5.3)\\n\",\n      \"Installing collected packages: 
keras\\n\",\n      \"Successfully installed keras-2.3.1\\n\",\n      \"Collecting minio\\n\",\n      \"  Downloading minio-5.0.7-py2.py3-none-any.whl (71 kB)\\n\",\n      \"\\u001b[K     |████████████████████████████████| 71 kB 1.9 MB/s eta 0:00:011\\n\",\n      \"\\u001b[?25hRequirement already satisfied, skipping upgrade: python-dateutil in /usr/local/lib/python3.6/dist-packages (from minio) (2.8.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: urllib3 in ./.local/lib/python3.6/site-packages (from minio) (1.24.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pytz in /usr/local/lib/python3.6/dist-packages (from minio) (2019.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: certifi in /usr/local/lib/python3.6/dist-packages (from minio) (2019.11.28)\\n\",\n      \"Collecting configparser\\n\",\n      \"  Downloading configparser-4.0.2-py2.py3-none-any.whl (22 kB)\\n\",\n      \"Requirement already satisfied, skipping upgrade: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil->minio) (1.11.0)\\n\",\n      \"Installing collected packages: configparser, minio\\n\",\n      \"Successfully installed configparser-4.0.2 minio-5.0.7\\n\",\n      \"Collecting kubernetes\\n\",\n      \"  Downloading kubernetes-10.0.1-py2.py3-none-any.whl (1.5 MB)\\n\",\n      \"\\u001b[K     |████████████████████████████████| 1.5 MB 3.4 MB/s eta 0:00:01\\n\",\n      \"\\u001b[?25hRequirement already satisfied, skipping upgrade: certifi>=14.05.14 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (2019.11.28)\\n\",\n      \"Requirement already satisfied, skipping upgrade: python-dateutil>=2.5.3 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (2.8.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pyyaml>=3.12 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (5.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: six>=1.9.0 in /usr/lib/python3/dist-packages (from kubernetes) (1.11.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: websocket-client!=0.40.0,!=0.41.*,!=0.42.*,>=0.32.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (0.57.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: urllib3>=1.24.2 in ./.local/lib/python3.6/site-packages (from kubernetes) (1.24.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: setuptools>=21.0.0 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (45.1.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: requests in /usr/local/lib/python3.6/dist-packages (from kubernetes) (2.22.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: google-auth>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from kubernetes) (1.11.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: requests-oauthlib in /usr/local/lib/python3.6/dist-packages (from kubernetes) (1.3.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: idna<2.9,>=2.5 in /usr/lib/python3/dist-packages (from requests->kubernetes) (2.6)\\n\",\n      \"Requirement already satisfied, skipping upgrade: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->kubernetes) (3.0.4)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.0.1->kubernetes) (0.2.8)\\n\",\n      \"Requirement already satisfied, skipping upgrade: rsa<4.1,>=3.1.4 in /usr/local/lib/python3.6/dist-packages 
(from google-auth>=1.0.1->kubernetes) (4.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth>=1.0.1->kubernetes) (4.0.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: oauthlib>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from requests-oauthlib->kubernetes) (3.1.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.6/dist-packages (from pyasn1-modules>=0.2.1->google-auth>=1.0.1->kubernetes) (0.4.8)\\n\",\n      \"\\u001b[31mERROR: kfp 0.2.2.1 has requirement kubernetes<=10.0.0,>=8.0.0, but you'll have kubernetes 10.0.1 which is incompatible.\\u001b[0m\\n\",\n      \"Installing collected packages: kubernetes\\n\",\n      \"  Attempting uninstall: kubernetes\\n\",\n      \"    Found existing installation: kubernetes 10.0.0\\n\",\n      \"    Uninstalling kubernetes-10.0.0:\\n\",\n      \"      Successfully uninstalled kubernetes-10.0.0\\n\",\n      \"Successfully installed kubernetes-10.0.1\\n\",\n      \"Collecting kfmd\\n\",\n      \"  Downloading kfmd-0.1.8.tar.gz (29 kB)\\n\",\n      \"Building wheels for collected packages: kfmd\\n\",\n      \"  Building wheel for kfmd (setup.py) ... \\u001b[?25ldone\\n\",\n      \"\\u001b[?25h  Created wheel for kfmd: filename=kfmd-0.1.8-py3-none-any.whl size=65919 sha256=c65ab8ff649134dbe6c8391743d5361546e5b29e6df9c0ff13915c99b67be1e7\\n\",\n      \"  Stored in directory: /home/jovyan/.cache/pip/wheels/54/6b/5c/f063f501d5c632c93566ed967f2f0c36bad3b384d68c83aa65\\n\",\n      \"Successfully built kfmd\\n\",\n      \"Installing collected packages: kfmd\\n\",\n      \"Successfully installed kfmd-0.1.8\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"!pip install pandas --upgrade --user\\n\",\n    \"!pip install keras --upgrade --user\\n\",\n    \"!pip install minio --upgrade --user\\n\",\n    \"!pip install kubernetes --upgrade --user\\n\",\n    \"!pip install kfmd --upgrade --user\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## imports\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 2,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Using TensorFlow backend.\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"import pandas as pd\\n\",\n    \"import numpy as np\\n\",\n    \"import time\\n\",\n    \"from minio import Minio\\n\",\n    \"from keras.models import Model\\n\",\n    \"from keras.layers import *\\n\",\n    \"from keras.losses import *\\n\",\n    \"import tensorflow as tf\\n\",\n    \"import os\\n\",\n    \"from kfmd import metadata\\n\",\n    \"from datetime import datetime\\n\",\n    \"from keras import backend as K\\n\",\n    \"from kubernetes import client as k8s_client, config as k8s_config\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Create a workspace, run and execution\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 3,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"execTime = datetime.utcnow().isoformat(\\\"T\\\")\\n\",\n    \"ws = metadata.Workspace(\\n\",\n    \"    # Connect to metadata-service in namesapce kubeflow in k8s cluster.\\n\",\n    \"    backend_url_prefix=\\\"metadata-service.kubeflow.svc.cluster.local:8080\\\",\\n\",\n    \"    name=\\\"recommender\\\",\\n\",\n    \"    
description=\\\"a workspace for saving recommender experiments\\\")\\n\",\n    \"r = metadata.Run(\\n\",\n    \"    workspace=ws,\\n\",\n    \"    name=\\\"run-\\\" + execTime ,\\n\",\n    \"    description=\\\"recommender run\\\",\\n\",\n    \")\\n\",\n    \"exec = metadata.Execution(\\n\",\n    \"    name = \\\"execution\\\" + execTime ,\\n\",\n    \"    workspace=ws,\\n\",\n    \"    run=r,\\n\",\n    \"    description=\\\"recommender ML execution\\\",\\n\",\n    \")\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 2. Read data\\n\",\n    \"\\n\",\n    \"For reading data we are using two diffierent approaches:\\n\",\n    \"1. We use Tensorflow build in support to write resulting model to Minio\\n\",\n    \"2. We use Minio APIs to read source data using Pandas. We could of use Boto APIs here instead.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 4,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Minio parameters : URL  minio-service.kubeflow.svc.cluster.local:9000  key  minio  secret  minio123\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"minio_endpoint = os.environ.get('MINIO_URL', 'minio-service.kubeflow.svc.cluster.local:9000')\\n\",\n    \"minio_key = os.environ.get('MINIO_KEY', 'minio')\\n\",\n    \"minio_secret = os.environ.get('MINIO_SECRET', 'minio123')\\n\",\n    \"\\n\",\n    \"print('Minio parameters : URL ', minio_endpoint, ' key ', minio_key, ' secret ', minio_secret)\\n\",\n    \"\\n\",\n    \"os.environ['AWS_ACCESS_KEY_ID'] = minio_key\\n\",\n    \"os.environ['AWS_SECRET_ACCESS_KEY'] = minio_secret\\n\",\n    \"os.environ['AWS_REGION'] = 'us-west-1'\\n\",\n    \"os.environ['S3_REGION'] = 'us-west-1'\\n\",\n    \"os.environ['S3_ENDPOINT'] = minio_endpoint\\n\",\n    \"os.environ['S3_USE_HTTPS'] = '0'\\n\",\n    \"os.environ['S3_VERIFY_SSL'] = '0'\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 5,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"minioClient = Minio(minio_endpoint,\\n\",\n    \"                    access_key=minio_key,\\n\",\n    \"                    secret_key=minio_secret,\\n\",\n    \"                    secure=False)\\n\",\n    \"\\n\",\n    \"minioClient.fget_object('data', 'recommender/users.csv', '/tmp/users.csv')\\n\",\n    \"customers = pd.read_csv('/tmp/users.csv')\\n\",\n    \"minioClient.fget_object('data', 'recommender/transactions.csv', '/tmp/transactions.csv')\\n\",\n    \"transactions = pd.read_csv('/tmp/transactions.csv')\\n\",\n    \"\\n\",\n    \"#Log experiment data set\\n\",\n    \"data_set = exec.log_input(\\n\",\n    \"        metadata.DataSet(\\n\",\n    \"            description=\\\"recommender current transactions and customers\\\",\\n\",\n    \"            name=\\\"Current transactions and customers\\\",\\n\",\n    \"            version=execTime,\\n\",\n    \"            uri=\\\"minio:/tmp/transactions.csv; minio:/tmp/users.csv\\\"))\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 6,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"(1000, 1)\\n\"\n     ]\n    },\n    {\n     \"data\": {\n      \"text/html\": [\n       \"<div>\\n\",\n       \"<style scoped>\\n\",\n       \"    .dataframe tbody tr th:only-of-type {\\n\",\n       \"        vertical-align: middle;\\n\",\n       \"    }\\n\",\n       
\"\\n\",\n       \"    .dataframe tbody tr th {\\n\",\n       \"        vertical-align: top;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe thead th {\\n\",\n       \"        text-align: right;\\n\",\n       \"    }\\n\",\n       \"</style>\\n\",\n       \"<table border=\\\"1\\\" class=\\\"dataframe\\\">\\n\",\n       \"  <thead>\\n\",\n       \"    <tr style=\\\"text-align: right;\\\">\\n\",\n       \"      <th></th>\\n\",\n       \"      <th>customerId</th>\\n\",\n       \"    </tr>\\n\",\n       \"  </thead>\\n\",\n       \"  <tbody>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>0</th>\\n\",\n       \"      <td>1553</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>1</th>\\n\",\n       \"      <td>20400</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>2</th>\\n\",\n       \"      <td>19750</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>3</th>\\n\",\n       \"      <td>6334</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>4</th>\\n\",\n       \"      <td>27773</td>\\n\",\n       \"    </tr>\\n\",\n       \"  </tbody>\\n\",\n       \"</table>\\n\",\n       \"</div>\"\n      ],\n      \"text/plain\": [\n       \"   customerId\\n\",\n       \"0        1553\\n\",\n       \"1       20400\\n\",\n       \"2       19750\\n\",\n       \"3        6334\\n\",\n       \"4       27773\"\n      ]\n     },\n     \"execution_count\": 6,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"print(customers.shape)\\n\",\n    \"customers.head()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 7,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"(62483, 2)\\n\"\n     ]\n    },\n    {\n     \"data\": {\n      \"text/html\": [\n       \"<div>\\n\",\n       \"<style scoped>\\n\",\n       \"    .dataframe tbody tr th:only-of-type {\\n\",\n       \"        vertical-align: middle;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe tbody tr th {\\n\",\n       \"        vertical-align: top;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe thead th {\\n\",\n       \"        text-align: right;\\n\",\n       \"    }\\n\",\n       \"</style>\\n\",\n       \"<table border=\\\"1\\\" class=\\\"dataframe\\\">\\n\",\n       \"  <thead>\\n\",\n       \"    <tr style=\\\"text-align: right;\\\">\\n\",\n       \"      <th></th>\\n\",\n       \"      <th>customerId</th>\\n\",\n       \"      <th>products</th>\\n\",\n       \"    </tr>\\n\",\n       \"  </thead>\\n\",\n       \"  <tbody>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>0</th>\\n\",\n       \"      <td>0</td>\\n\",\n       \"      <td>20</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>1</th>\\n\",\n       \"      <td>1</td>\\n\",\n       \"      <td>2|2|23|68|68|111|29|86|107|152</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>2</th>\\n\",\n       \"      <td>2</td>\\n\",\n       \"      <td>111|107|29|11|11|11|33|23</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>3</th>\\n\",\n       \"      <td>3</td>\\n\",\n       \"      <td>164|227</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>4</th>\\n\",\n       \"      <td>5</td>\\n\",\n       \"      <td>2|2</td>\\n\",\n       
\"    </tr>\\n\",\n       \"  </tbody>\\n\",\n       \"</table>\\n\",\n       \"</div>\"\n      ],\n      \"text/plain\": [\n       \"   customerId                        products\\n\",\n       \"0           0                              20\\n\",\n       \"1           1  2|2|23|68|68|111|29|86|107|152\\n\",\n       \"2           2       111|107|29|11|11|11|33|23\\n\",\n       \"3           3                         164|227\\n\",\n       \"4           5                             2|2\"\n      ]\n     },\n     \"execution_count\": 7,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"print(transactions.shape)\\n\",\n    \"transactions.head()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 3 Data preparation\\n\",\n    \"\\n\",\n    \"Our goal here is to break down each list of items in the products column into rows \\n\",\n    \"and count the number of products bought by a user\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 8,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/html\": [\n       \"<div>\\n\",\n       \"<style scoped>\\n\",\n       \"    .dataframe tbody tr th:only-of-type {\\n\",\n       \"        vertical-align: middle;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe tbody tr th {\\n\",\n       \"        vertical-align: top;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe thead th {\\n\",\n       \"        text-align: right;\\n\",\n       \"    }\\n\",\n       \"</style>\\n\",\n       \"<table border=\\\"1\\\" class=\\\"dataframe\\\">\\n\",\n       \"  <thead>\\n\",\n       \"    <tr style=\\\"text-align: right;\\\">\\n\",\n       \"      <th></th>\\n\",\n       \"      <th>customerId</th>\\n\",\n       \"      <th>0</th>\\n\",\n       \"      <th>1</th>\\n\",\n       \"      <th>2</th>\\n\",\n       \"      <th>3</th>\\n\",\n       \"      <th>4</th>\\n\",\n       \"      <th>5</th>\\n\",\n       \"      <th>6</th>\\n\",\n       \"      <th>7</th>\\n\",\n       \"      <th>8</th>\\n\",\n       \"      <th>9</th>\\n\",\n       \"    </tr>\\n\",\n       \"  </thead>\\n\",\n       \"  <tbody>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>0</th>\\n\",\n       \"      <td>0</td>\\n\",\n       \"      <td>20.0</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>1</th>\\n\",\n       \"      <td>1</td>\\n\",\n       \"      <td>2.0</td>\\n\",\n       \"      <td>2.0</td>\\n\",\n       \"      <td>23.0</td>\\n\",\n       \"      <td>68.0</td>\\n\",\n       \"      <td>68.0</td>\\n\",\n       \"      <td>111.0</td>\\n\",\n       \"      <td>29.0</td>\\n\",\n       \"      <td>86.0</td>\\n\",\n       \"      <td>107.0</td>\\n\",\n       \"      <td>152.0</td>\\n\",\n       \"    </tr>\\n\",\n       \"  </tbody>\\n\",\n       \"</table>\\n\",\n       \"</div>\"\n      ],\n      \"text/plain\": [\n       \"   customerId     0    1     2     3     4      5     6     7      8      9\\n\",\n       \"0           0  20.0  NaN   NaN   NaN   NaN    NaN   NaN   NaN    NaN    NaN\\n\",\n       \"1           1   2.0  2.0  23.0  68.0  68.0  111.0  
29.0  86.0  107.0  152.0\"\n      ]\n     },\n     \"execution_count\": 8,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"# 1: split product items\\n\",\n    \"transactions['products'] = transactions['products'].apply(lambda x: [int(i) for i in x.split('|')])\\n\",\n    \"transactions.head(2).set_index('customerId')['products'].apply(pd.Series).reset_index()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 9,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/html\": [\n       \"<div>\\n\",\n       \"<style scoped>\\n\",\n       \"    .dataframe tbody tr th:only-of-type {\\n\",\n       \"        vertical-align: middle;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe tbody tr th {\\n\",\n       \"        vertical-align: top;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe thead th {\\n\",\n       \"        text-align: right;\\n\",\n       \"    }\\n\",\n       \"</style>\\n\",\n       \"<table border=\\\"1\\\" class=\\\"dataframe\\\">\\n\",\n       \"  <thead>\\n\",\n       \"    <tr style=\\\"text-align: right;\\\">\\n\",\n       \"      <th></th>\\n\",\n       \"      <th>customerId</th>\\n\",\n       \"      <th>productId</th>\\n\",\n       \"      <th>purchase_count</th>\\n\",\n       \"    </tr>\\n\",\n       \"  </thead>\\n\",\n       \"  <tbody>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>0</th>\\n\",\n       \"      <td>0</td>\\n\",\n       \"      <td>20.0</td>\\n\",\n       \"      <td>1</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>1</th>\\n\",\n       \"      <td>1</td>\\n\",\n       \"      <td>2.0</td>\\n\",\n       \"      <td>2</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>2</th>\\n\",\n       \"      <td>1</td>\\n\",\n       \"      <td>23.0</td>\\n\",\n       \"      <td>1</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>3</th>\\n\",\n       \"      <td>1</td>\\n\",\n       \"      <td>29.0</td>\\n\",\n       \"      <td>1</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>4</th>\\n\",\n       \"      <td>1</td>\\n\",\n       \"      <td>68.0</td>\\n\",\n       \"      <td>2</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>5</th>\\n\",\n       \"      <td>1</td>\\n\",\n       \"      <td>86.0</td>\\n\",\n       \"      <td>1</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>6</th>\\n\",\n       \"      <td>1</td>\\n\",\n       \"      <td>107.0</td>\\n\",\n       \"      <td>1</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>7</th>\\n\",\n       \"      <td>1</td>\\n\",\n       \"      <td>111.0</td>\\n\",\n       \"      <td>1</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>8</th>\\n\",\n       \"      <td>1</td>\\n\",\n       \"      <td>152.0</td>\\n\",\n       \"      <td>1</td>\\n\",\n       \"    </tr>\\n\",\n       \"  </tbody>\\n\",\n       \"</table>\\n\",\n       \"</div>\"\n      ],\n      \"text/plain\": [\n       \"   customerId  productId  purchase_count\\n\",\n       \"0           0       20.0               1\\n\",\n       \"1           1        2.0               2\\n\",\n       \"2           1       23.0               1\\n\",\n       \"3           1       29.0               1\\n\",\n       \"4           1       68.0               
2\\n\",\n       \"5           1       86.0               1\\n\",\n       \"6           1      107.0               1\\n\",\n       \"7           1      111.0               1\\n\",\n       \"8           1      152.0               1\"\n      ]\n     },\n     \"execution_count\": 9,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"# 2: organize a given table into a dataframe with customerId, single productId, and purchase count\\n\",\n    \"pd.melt(transactions.head(2).set_index('customerId')['products'].apply(pd.Series).reset_index(), \\n\",\n    \"             id_vars=['customerId'],\\n\",\n    \"             value_name='products') \\\\\\n\",\n    \"    .dropna().drop(['variable'], axis=1) \\\\\\n\",\n    \"    .groupby(['customerId', 'products']) \\\\\\n\",\n    \"    .agg({'products': 'count'}) \\\\\\n\",\n    \"    .rename(columns={'products': 'purchase_count'}) \\\\\\n\",\n    \"    .reset_index() \\\\\\n\",\n    \"    .rename(columns={'products': 'productId'})\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 3.1 Create data with user, item, and target field\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 10,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"(133585, 3)\\n\"\n     ]\n    },\n    {\n     \"data\": {\n      \"text/html\": [\n       \"<div>\\n\",\n       \"<style scoped>\\n\",\n       \"    .dataframe tbody tr th:only-of-type {\\n\",\n       \"        vertical-align: middle;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe tbody tr th {\\n\",\n       \"        vertical-align: top;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe thead th {\\n\",\n       \"        text-align: right;\\n\",\n       \"    }\\n\",\n       \"</style>\\n\",\n       \"<table border=\\\"1\\\" class=\\\"dataframe\\\">\\n\",\n       \"  <thead>\\n\",\n       \"    <tr style=\\\"text-align: right;\\\">\\n\",\n       \"      <th></th>\\n\",\n       \"      <th>customerId</th>\\n\",\n       \"      <th>productId</th>\\n\",\n       \"      <th>purchase_count</th>\\n\",\n       \"    </tr>\\n\",\n       \"  </thead>\\n\",\n       \"  <tbody>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>0</th>\\n\",\n       \"      <td>0</td>\\n\",\n       \"      <td>1</td>\\n\",\n       \"      <td>2</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>1</th>\\n\",\n       \"      <td>0</td>\\n\",\n       \"      <td>13</td>\\n\",\n       \"      <td>1</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>2</th>\\n\",\n       \"      <td>0</td>\\n\",\n       \"      <td>19</td>\\n\",\n       \"      <td>3</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>3</th>\\n\",\n       \"      <td>0</td>\\n\",\n       \"      <td>20</td>\\n\",\n       \"      <td>1</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>4</th>\\n\",\n       \"      <td>0</td>\\n\",\n       \"      <td>31</td>\\n\",\n       \"      <td>2</td>\\n\",\n       \"    </tr>\\n\",\n       \"  </tbody>\\n\",\n       \"</table>\\n\",\n       \"</div>\"\n      ],\n      \"text/plain\": [\n       \"   customerId  productId  purchase_count\\n\",\n       \"0           0          1               2\\n\",\n       \"1           0         13               1\\n\",\n       \"2           0     
    19               3\\n\",\n       \"3           0         20               1\\n\",\n       \"4           0         31               2\"\n      ]\n     },\n     \"execution_count\": 10,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"data = pd.melt(transactions.set_index('customerId')['products'].apply(pd.Series).reset_index(), \\n\",\n    \"             id_vars=['customerId'],\\n\",\n    \"             value_name='products') \\\\\\n\",\n    \"    .dropna().drop(['variable'], axis=1) \\\\\\n\",\n    \"    .groupby(['customerId', 'products']) \\\\\\n\",\n    \"    .agg({'products': 'count'}) \\\\\\n\",\n    \"    .rename(columns={'products': 'purchase_count'}) \\\\\\n\",\n    \"    .reset_index() \\\\\\n\",\n    \"    .rename(columns={'products': 'productId'})\\n\",\n    \"data['productId'] = data['productId'].astype(np.int64)\\n\",\n    \"\\n\",\n    \"print(data.shape)\\n\",\n    \"data.head()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 3.2 Normalize item values across users\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 11,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/html\": [\n       \"<div>\\n\",\n       \"<style scoped>\\n\",\n       \"    .dataframe tbody tr th:only-of-type {\\n\",\n       \"        vertical-align: middle;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe tbody tr th {\\n\",\n       \"        vertical-align: top;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe thead th {\\n\",\n       \"        text-align: right;\\n\",\n       \"    }\\n\",\n       \"</style>\\n\",\n       \"<table border=\\\"1\\\" class=\\\"dataframe\\\">\\n\",\n       \"  <thead>\\n\",\n       \"    <tr style=\\\"text-align: right;\\\">\\n\",\n       \"      <th>productId</th>\\n\",\n       \"      <th>0</th>\\n\",\n       \"      <th>1</th>\\n\",\n       \"      <th>2</th>\\n\",\n       \"      <th>3</th>\\n\",\n       \"      <th>4</th>\\n\",\n       \"      <th>5</th>\\n\",\n       \"      <th>6</th>\\n\",\n       \"      <th>7</th>\\n\",\n       \"      <th>8</th>\\n\",\n       \"      <th>9</th>\\n\",\n       \"      <th>...</th>\\n\",\n       \"      <th>290</th>\\n\",\n       \"      <th>291</th>\\n\",\n       \"      <th>292</th>\\n\",\n       \"      <th>293</th>\\n\",\n       \"      <th>294</th>\\n\",\n       \"      <th>295</th>\\n\",\n       \"      <th>296</th>\\n\",\n       \"      <th>297</th>\\n\",\n       \"      <th>298</th>\\n\",\n       \"      <th>299</th>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>customerId</th>\\n\",\n       \"      <th></th>\\n\",\n       \"      <th></th>\\n\",\n       \"      <th></th>\\n\",\n       \"      <th></th>\\n\",\n       \"      <th></th>\\n\",\n       \"      <th></th>\\n\",\n       \"      <th></th>\\n\",\n       \"      <th></th>\\n\",\n       \"      <th></th>\\n\",\n       \"      <th></th>\\n\",\n       \"      <th></th>\\n\",\n       \"      <th></th>\\n\",\n       \"      <th></th>\\n\",\n       \"      <th></th>\\n\",\n       \"      <th></th>\\n\",\n       \"      <th></th>\\n\",\n       \"      <th></th>\\n\",\n       \"      <th></th>\\n\",\n       \"      <th></th>\\n\",\n       \"      <th></th>\\n\",\n       \"      <th></th>\\n\",\n       \"    </tr>\\n\",\n       \"  </thead>\\n\",\n       \"  <tbody>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>0</th>\\n\",\n       \"    
  <td>NaN</td>\\n\",\n       \"      <td>2.0</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>...</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>1</th>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>6.0</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>...</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>1.0</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>1.0</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>2</th>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>...</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>3</th>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>...</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>4</th>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"   
   <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>...</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"    </tr>\\n\",\n       \"  </tbody>\\n\",\n       \"</table>\\n\",\n       \"<p>5 rows × 300 columns</p>\\n\",\n       \"</div>\"\n      ],\n      \"text/plain\": [\n       \"productId   0    1    2    3    4    5    6    7    8    9    ...  290  291  \\\\\\n\",\n       \"customerId                                                    ...             \\n\",\n       \"0           NaN  2.0  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN   \\n\",\n       \"1           NaN  NaN  6.0  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN   \\n\",\n       \"2           NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN   \\n\",\n       \"3           NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN   \\n\",\n       \"4           NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN  NaN   \\n\",\n       \"\\n\",\n       \"productId   292  293  294  295  296  297  298  299  \\n\",\n       \"customerId                                          \\n\",\n       \"0           NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  \\n\",\n       \"1           NaN  1.0  NaN  NaN  1.0  NaN  NaN  NaN  \\n\",\n       \"2           NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  \\n\",\n       \"3           NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  \\n\",\n       \"4           NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  \\n\",\n       \"\\n\",\n       \"[5 rows x 300 columns]\"\n      ]\n     },\n     \"execution_count\": 11,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"df_matrix = pd.pivot_table(data, values='purchase_count', index='customerId', columns='productId')\\n\",\n    \"df_matrix.head()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 12,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"(24429, 300)\\n\"\n     ]\n    },\n    {\n     \"data\": {\n      \"text/html\": [\n       \"<div>\\n\",\n       \"<style scoped>\\n\",\n       \"    .dataframe tbody tr th:only-of-type {\\n\",\n       \"        vertical-align: middle;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe tbody tr th {\\n\",\n       \"        vertical-align: top;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe thead th {\\n\",\n       \"        text-align: right;\\n\",\n       \"    }\\n\",\n       \"</style>\\n\",\n       \"<table border=\\\"1\\\" class=\\\"dataframe\\\">\\n\",\n       \"  <thead>\\n\",\n       \"    <tr style=\\\"text-align: right;\\\">\\n\",\n       \"      <th>productId</th>\\n\",\n       \"      <th>0</th>\\n\",\n       \"      <th>1</th>\\n\",\n       \"      <th>2</th>\\n\",\n       \"      <th>3</th>\\n\",\n       \"      <th>4</th>\\n\",\n       \"      <th>5</th>\\n\",\n       \"      <th>6</th>\\n\",\n       \"      <th>7</th>\\n\",\n       \"      <th>8</th>\\n\",\n       \"      <th>9</th>\\n\",\n       \"      <th>...</th>\\n\",\n       \"      <th>290</th>\\n\",\n       \"      <th>291</th>\\n\",\n       \"      <th>292</th>\\n\",\n       \"      
<th>293</th>\\n\",\n       \"      <th>294</th>\\n\",\n       \"      <th>295</th>\\n\",\n       \"      <th>296</th>\\n\",\n       \"      <th>297</th>\\n\",\n       \"      <th>298</th>\\n\",\n       \"      <th>299</th>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>customerId</th>\\n\",\n       \"      <th></th>\\n\",\n       \"      <th></th>\\n\",\n       \"      <th></th>\\n\",\n       \"      <th></th>\\n\",\n       \"      <th></th>\\n\",\n       \"      <th></th>\\n\",\n       \"      <th></th>\\n\",\n       \"      <th></th>\\n\",\n       \"      <th></th>\\n\",\n       \"      <th></th>\\n\",\n       \"      <th></th>\\n\",\n       \"      <th></th>\\n\",\n       \"      <th></th>\\n\",\n       \"      <th></th>\\n\",\n       \"      <th></th>\\n\",\n       \"      <th></th>\\n\",\n       \"      <th></th>\\n\",\n       \"      <th></th>\\n\",\n       \"      <th></th>\\n\",\n       \"      <th></th>\\n\",\n       \"      <th></th>\\n\",\n       \"    </tr>\\n\",\n       \"  </thead>\\n\",\n       \"  <tbody>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>0</th>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>0.1</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>...</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>1</th>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>0.166667</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>...</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>0.0</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>0.0</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>2</th>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>...</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"    </tr>\\n\",\n       \"    
<tr>\\n\",\n       \"      <th>3</th>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>...</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>4</th>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>...</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"      <td>NaN</td>\\n\",\n       \"    </tr>\\n\",\n       \"  </tbody>\\n\",\n       \"</table>\\n\",\n       \"<p>5 rows × 300 columns</p>\\n\",\n       \"</div>\"\n      ],\n      \"text/plain\": [\n       \"productId   0    1         2    3    4    5    6    7    8    9    ...  290  \\\\\\n\",\n       \"customerId                                                         ...        \\n\",\n       \"0           NaN  0.1       NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN   \\n\",\n       \"1           NaN  NaN  0.166667  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN   \\n\",\n       \"2           NaN  NaN       NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN   \\n\",\n       \"3           NaN  NaN       NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  NaN   \\n\",\n       \"4           NaN  NaN       NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...  
NaN   \\n\",\n       \"\\n\",\n       \"productId   291  292  293  294  295  296  297  298  299  \\n\",\n       \"customerId                                               \\n\",\n       \"0           NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  \\n\",\n       \"1           NaN  NaN  0.0  NaN  NaN  0.0  NaN  NaN  NaN  \\n\",\n       \"2           NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  \\n\",\n       \"3           NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  \\n\",\n       \"4           NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  \\n\",\n       \"\\n\",\n       \"[5 rows x 300 columns]\"\n      ]\n     },\n     \"execution_count\": 12,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"df_matrix_norm = (df_matrix-df_matrix.min())/(df_matrix.max()-df_matrix.min())\\n\",\n    \"print(df_matrix_norm.shape)\\n\",\n    \"df_matrix_norm.head()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 13,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"(133585, 3)\\n\"\n     ]\n    },\n    {\n     \"data\": {\n      \"text/html\": [\n       \"<div>\\n\",\n       \"<style scoped>\\n\",\n       \"    .dataframe tbody tr th:only-of-type {\\n\",\n       \"        vertical-align: middle;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe tbody tr th {\\n\",\n       \"        vertical-align: top;\\n\",\n       \"    }\\n\",\n       \"\\n\",\n       \"    .dataframe thead th {\\n\",\n       \"        text-align: right;\\n\",\n       \"    }\\n\",\n       \"</style>\\n\",\n       \"<table border=\\\"1\\\" class=\\\"dataframe\\\">\\n\",\n       \"  <thead>\\n\",\n       \"    <tr style=\\\"text-align: right;\\\">\\n\",\n       \"      <th></th>\\n\",\n       \"      <th>customerId</th>\\n\",\n       \"      <th>productId</th>\\n\",\n       \"      <th>scaled_purchase_freq</th>\\n\",\n       \"    </tr>\\n\",\n       \"  </thead>\\n\",\n       \"  <tbody>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>9</th>\\n\",\n       \"      <td>9</td>\\n\",\n       \"      <td>0</td>\\n\",\n       \"      <td>0.133333</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>25</th>\\n\",\n       \"      <td>25</td>\\n\",\n       \"      <td>0</td>\\n\",\n       \"      <td>0.133333</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>32</th>\\n\",\n       \"      <td>33</td>\\n\",\n       \"      <td>0</td>\\n\",\n       \"      <td>0.133333</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>35</th>\\n\",\n       \"      <td>36</td>\\n\",\n       \"      <td>0</td>\\n\",\n       \"      <td>0.133333</td>\\n\",\n       \"    </tr>\\n\",\n       \"    <tr>\\n\",\n       \"      <th>43</th>\\n\",\n       \"      <td>44</td>\\n\",\n       \"      <td>0</td>\\n\",\n       \"      <td>0.133333</td>\\n\",\n       \"    </tr>\\n\",\n       \"  </tbody>\\n\",\n       \"</table>\\n\",\n       \"</div>\"\n      ],\n      \"text/plain\": [\n       \"    customerId productId  scaled_purchase_freq\\n\",\n       \"9            9         0              0.133333\\n\",\n       \"25          25         0              0.133333\\n\",\n       \"32          33         0              0.133333\\n\",\n       \"35          36         0              0.133333\\n\",\n       \"43          44         0              0.133333\"\n      ]\n     },\n     \"execution_count\": 13,\n     
\"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"# create a table for input to the modeling\\n\",\n    \"\\n\",\n    \"d = df_matrix_norm.reset_index()\\n\",\n    \"d.index.names = ['scaled_purchase_freq']\\n\",\n    \"data_norm = pd.melt(d, id_vars=['customerId'], value_name='scaled_purchase_freq').dropna()\\n\",\n    \"print(data_norm.shape)\\n\",\n    \"data_norm.head()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 4 Preparing data for learning\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 14,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"28606\\n\",\n      \"300\\n\",\n      \"[    9    25    33 ... 26873 26998 28066]\\n\",\n      \"[  0   0   0 ... 299 299 299]\\n\",\n      \"[0.13333333 0.13333333 0.13333333 ... 0.         0.         0.        ]\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"customer_idxs = np.array(data_norm.customerId, dtype = np.int)\\n\",\n    \"product_idxs = np.array(data_norm.productId, dtype = np.int)\\n\",\n    \"\\n\",\n    \"ratings = np.array(data_norm.scaled_purchase_freq)\\n\",\n    \"\\n\",\n    \"n_customers = int(data_norm['customerId'].drop_duplicates().max()) + 1\\n\",\n    \"n_products = int(data_norm['productId'].drop_duplicates().max()) + 1\\n\",\n    \"n_factors = 50\\n\",\n    \"\\n\",\n    \"input_shape = (1,)\\n\",\n    \"\\n\",\n    \"print(n_customers)\\n\",\n    \"print(n_products)\\n\",\n    \"print(customer_idxs)\\n\",\n    \"print(product_idxs)\\n\",\n    \"print(ratings)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 4.1 Tensorflow Session\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 15,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"# create TF session and set it in Keras\\n\",\n    \"sess = tf.Session()\\n\",\n    \"K.set_session(sess)\\n\",\n    \"K.set_learning_phase(1)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 4.2 Model Class\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 16,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"class DeepCollaborativeFiltering(Model):\\n\",\n    \"    def __init__(self, n_customers, n_products, n_factors, p_dropout = 0.2):\\n\",\n    \"        x1 = Input(shape = (1,), name=\\\"user\\\")\\n\",\n    \"\\n\",\n    \"        P = Embedding(n_customers, n_factors, input_length = 1)(x1)\\n\",\n    \"        P = Reshape((n_factors,))(P)\\n\",\n    \"\\n\",\n    \"        x2 = Input(shape = (1,), name=\\\"product\\\")\\n\",\n    \"\\n\",\n    \"        Q = Embedding(n_products, n_factors, input_length = 1)(x2)\\n\",\n    \"        Q = Reshape((n_factors,))(Q)\\n\",\n    \"\\n\",\n    \"        x = concatenate([P, Q], axis=1)\\n\",\n    \"        x = Dropout(p_dropout)(x)\\n\",\n    \"\\n\",\n    \"        x = Dense(n_factors)(x)\\n\",\n    \"        x = Activation('relu')(x)\\n\",\n    \"        x = Dropout(p_dropout)(x)\\n\",\n    \"\\n\",\n    \"        output = Dense(1)(x)       \\n\",\n    \"        \\n\",\n    \"        super(DeepCollaborativeFiltering, self).__init__([x1, x2], output)\\n\",\n    \"    \\n\",\n    \"    def rate(self, customer_idxs, product_idxs):\\n\",\n    \"        if (type(customer_idxs) == int and type(product_idxs) == int):\\n\",\n    \"            
return self.predict([np.array(customer_idxs).reshape((1,)), np.array(product_idxs).reshape((1,))])\\n\",\n    \"        \\n\",\n    \"        if (type(customer_idxs) == str and type(product_idxs) == str):\\n\",\n    \"            return self.predict([np.array(customerMapping[customer_idxs]).reshape((1,)), np.array(productMapping[product_idxs]).reshape((1,))])\\n\",\n    \"        \\n\",\n    \"        return self.predict([\\n\",\n    \"            np.array([customerMapping[customer_idx] for customer_idx in customer_idxs]), \\n\",\n    \"            np.array([productMapping[product_idx] for product_idx in product_idxs])\\n\",\n    \"        ])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 4.3 Hyperparameters\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 17,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"bs = 64\\n\",\n    \"val_per = 0.25\\n\",\n    \"epochs = 3\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 4.4 Model Definition\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 18,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/resource_variable_ops.py:1630: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.\\n\",\n      \"Instructions for updating:\\n\",\n      \"If using Keras pass *_constraint arguments to layers.\\n\",\n      \"Model: \\\"deepcollaborativefiltering_1\\\"\\n\",\n      \"__________________________________________________________________________________________________\\n\",\n      \"Layer (type)                    Output Shape         Param #     Connected to                     \\n\",\n      \"==================================================================================================\\n\",\n      \"user (InputLayer)               (None, 1)            0                                            \\n\",\n      \"__________________________________________________________________________________________________\\n\",\n      \"product (InputLayer)            (None, 1)            0                                            \\n\",\n      \"__________________________________________________________________________________________________\\n\",\n      \"embedding_1 (Embedding)         (None, 1, 50)        1430300     user[0][0]                       \\n\",\n      \"__________________________________________________________________________________________________\\n\",\n      \"embedding_2 (Embedding)         (None, 1, 50)        15000       product[0][0]                    \\n\",\n      \"__________________________________________________________________________________________________\\n\",\n      \"reshape_1 (Reshape)             (None, 50)           0           embedding_1[0][0]                \\n\",\n      \"__________________________________________________________________________________________________\\n\",\n      \"reshape_2 (Reshape)             (None, 50)           0           embedding_2[0][0]                \\n\",\n      \"__________________________________________________________________________________________________\\n\",\n      \"concatenate_1 (Concatenate)     (None, 100)          0         
  reshape_1[0][0]                  \\n\",\n      \"                                                                 reshape_2[0][0]                  \\n\",\n      \"__________________________________________________________________________________________________\\n\",\n      \"dropout_1 (Dropout)             (None, 100)          0           concatenate_1[0][0]              \\n\",\n      \"__________________________________________________________________________________________________\\n\",\n      \"dense_1 (Dense)                 (None, 50)           5050        dropout_1[0][0]                  \\n\",\n      \"__________________________________________________________________________________________________\\n\",\n      \"activation_1 (Activation)       (None, 50)           0           dense_1[0][0]                    \\n\",\n      \"__________________________________________________________________________________________________\\n\",\n      \"dropout_2 (Dropout)             (None, 50)           0           activation_1[0][0]               \\n\",\n      \"__________________________________________________________________________________________________\\n\",\n      \"dense_2 (Dense)                 (None, 1)            51          dropout_2[0][0]                  \\n\",\n      \"==================================================================================================\\n\",\n      \"Total params: 1,450,401\\n\",\n      \"Trainable params: 1,450,401\\n\",\n      \"Non-trainable params: 0\\n\",\n      \"__________________________________________________________________________________________________\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"model = DeepCollaborativeFiltering(n_customers, n_products, n_factors)\\n\",\n    \"model.summary()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 5 Training\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 19,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow_core/python/ops/math_grad.py:1424: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.\\n\",\n      \"Instructions for updating:\\n\",\n      \"Use tf.where in 2.0, which has the same broadcast rule as np.where\\n\",\n      \"WARNING:tensorflow:From /home/jovyan/.local/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:422: The name tf.global_variables is deprecated. 
Please use tf.compat.v1.global_variables instead.\\n\",\n      \"\\n\",\n      \"Train on 100188 samples, validate on 33397 samples\\n\",\n      \"Epoch 1/3\\n\",\n      \"100188/100188 [==============================] - 14s 142us/step - loss: 0.0105 - val_loss: 0.0184\\n\",\n      \"Epoch 2/3\\n\",\n      \"100188/100188 [==============================] - 14s 137us/step - loss: 0.0091 - val_loss: 0.0187\\n\",\n      \"Epoch 3/3\\n\",\n      \"100188/100188 [==============================] - 14s 139us/step - loss: 0.0078 - val_loss: 0.0193\\n\",\n      \"Done training!\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"model.compile(optimizer = 'adam', loss = mean_squared_logarithmic_error)\\n\",\n    \"model.fit(x = [customer_idxs, product_idxs], y = ratings, batch_size = bs, epochs = epochs, validation_split = val_per)\\n\",\n    \"print('Done training!')\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 5.1 Log model and metrics\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 20,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"logmodel = exec.log_output(\\n\",\n    \"    metadata.Model(\\n\",\n    \"            name=\\\"DeepCollaborativeFiltering\\\",\\n\",\n    \"            description=\\\"Model for product recommender\\\",\\n\",\n    \"            uri=\\\"\\\",\\n\",\n    \"            model_type=\\\"neural network\\\",\\n\",\n    \"            version=execTime,\\n\",\n    \"            training_framework={\\n\",\n    \"                \\\"name\\\": \\\"tensorflow\\\",\\n\",\n    \"                \\\"version\\\": \\\"v1.14\\\"\\n\",\n    \"            },\\n\",\n    \"            hyperparameters={\\n\",\n    \"                \\\"batch_size\\\" : 64,\\n\",\n    \"                \\\"validation_split\\\" : 0.25,\\n\",\n    \"                \\\"layers\\\": [n_customers, n_products, n_factors],\\n\",\n    \"                \\\"epochs\\\" : 3\\n\",\n    \"            }))\\n\",\n    \"metrics = exec.log_output(\\n\",\n    \"    metadata.Metrics(\\n\",\n    \"            name=\\\"Model for product recommender evaluation\\\",\\n\",\n    \"            description=\\\"Validating of the recommender model\\\",\\n\",\n    \"            uri=\\\"\\\",\\n\",\n    \"            version=execTime,\\n\",\n    \"            data_set_id=data_set.id,\\n\",\n    \"            model_id=logmodel.id))\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 6 Get current output directory for model\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 21,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Exporting trained model to s3://models/recommender/1/\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"directorystream = minioClient.get_object('data', 'recommender/directory.txt')\\n\",\n    \"directory = \\\"\\\"\\n\",\n    \"for d in directorystream.stream(32*1024):\\n\",\n    \"    directory += d.decode('utf-8')\\n\",\n    \"arg_version = \\\"1\\\"    \\n\",\n    \"export_path = 's3://models/' + directory + '/' + arg_version + '/'\\n\",\n    \"print ('Exporting trained model to', export_path)\\n\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"## 6.1 Export models\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 22,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": 
\"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"WARNING:tensorflow:From <ipython-input-22-58b1f5cc64c6>:2: build_tensor_info (from tensorflow.python.saved_model.utils_impl) is deprecated and will be removed in a future version.\\n\",\n      \"Instructions for updating:\\n\",\n      \"This function will only be available through the v1 compatibility library as tf.compat.v1.saved_model.utils.build_tensor_info or tf.compat.v1.saved_model.build_tensor_info.\\n\",\n      \"tensor_info_users user:0\\n\",\n      \"tensor_info_products product:0\\n\",\n      \"tensor_info_pred dense_2/BiasAdd:0\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# inputs/outputs\\n\",\n    \"tensor_info_users = tf.saved_model.utils.build_tensor_info(model.input[0])\\n\",\n    \"tensor_info_products = tf.saved_model.utils.build_tensor_info(model.input[1])\\n\",\n    \"tensor_info_pred = tf.saved_model.utils.build_tensor_info(model.output)\\n\",\n    \"\\n\",\n    \"print (\\\"tensor_info_users\\\", tensor_info_users.name)\\n\",\n    \"print (\\\"tensor_info_products\\\", tensor_info_products.name)\\n\",\n    \"print (\\\"tensor_info_pred\\\", tensor_info_pred.name)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 23,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"WARNING:tensorflow:From <ipython-input-23-da3077ef7d92>:14: calling SavedModelBuilder.add_meta_graph_and_variables (from tensorflow.python.saved_model.builder_impl) with legacy_init_op is deprecated and will be removed in a future version.\\n\",\n      \"Instructions for updating:\\n\",\n      \"Pass your op to the equivalent parameter main_op instead.\\n\",\n      \"INFO:tensorflow:No assets to save.\\n\",\n      \"INFO:tensorflow:No assets to write.\\n\",\n      \"INFO:tensorflow:SavedModel written to: s3://models/recommender/1/saved_model.pb\\n\"\n     ]\n    },\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"b's3://models/recommender/1/saved_model.pb'\"\n      ]\n     },\n     \"execution_count\": 23,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"# signature\\n\",\n    \"prediction_signature = (tf.saved_model.signature_def_utils.build_signature_def(\\n\",\n    \"        inputs={\\\"users\\\": tensor_info_users, \\\"products\\\": tensor_info_products},\\n\",\n    \"        outputs={\\\"predictions\\\": tensor_info_pred},\\n\",\n    \"        method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME))\\n\",\n    \"# export\\n\",\n    \"legacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op')\\n\",\n    \"builder = tf.saved_model.builder.SavedModelBuilder(export_path)\\n\",\n    \"builder.add_meta_graph_and_variables(\\n\",\n    \"      sess, [tf.saved_model.tag_constants.SERVING],\\n\",\n    \"      signature_def_map={\\n\",\n    \"           tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: prediction_signature,\\n\",\n    \"      },\\n\",\n    \"      legacy_init_op=legacy_init_op)\\n\",\n    \"builder.save()\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# 7 Restarting of the model serving server\\n\",\n    \"\\n\",\n    \"In order for a new model to take effect it is also necessary to restart a model server.\\n\",\n    \"The issue here is that we are not changing the model version version and as a result, \\n\",\n    \"the model will not be 
updated. To ensure model update, we are here restarting a server -\\n\",\n    \"simply killing the running instance, and as a server is installed using deployment, the instance\\n\",\n    \"will be recreated. Additionally for pods operations to work correctly from the notebook,\\n\",\n    \"it is necessary to create permissions allowing for access to pods in another namespace. \\n\",\n    \"Look at the podaccessroles.yaml for details.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 24,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"pod prefix  recommendermodelserver-\\n\",\n      \"pod namespace  kubeflow\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"recommender = \\\"recommendermodelserver-\\\"\\n\",\n    \"if directory == \\\"recommender1\\\":\\n\",\n    \"    recommender = \\\"recommendermodelserver1-\\\"\\n\",\n    \"print(\\\"pod prefix \\\", recommender) \\n\",\n    \"\\n\",\n    \"namespace = \\\"kubeflow\\\"\\n\",\n    \"print(\\\"pod namespace \\\", namespace) \"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 26,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Current pod name  recommendermodelserver-6d5d5c654-snl99\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# Get full pod name for the current model\\n\",\n    \"\\n\",\n    \"k8s_config.load_incluster_config()\\n\",\n    \"\\n\",\n    \"v1 = k8s_client.CoreV1Api()\\n\",\n    \"\\n\",\n    \"pod_list = v1.list_namespaced_pod(namespace)\\n\",\n    \"pod = [item.metadata.name for item in pod_list.items if recommender in item.metadata.name][0]\\n\",\n    \"print(\\\"Current pod name \\\", pod)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 27,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Done deleting\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# Delete pod, so that it gets recreated\\n\",\n    \"v1.delete_namespaced_pod(pod, namespace, grace_period_seconds=0)\\n\",\n    \"\\n\",\n    \"print(\\\"Done deleting\\\")\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 28,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"New pod name  recommendermodelserver-6d5d5c654-xvxf7\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"# Verify that the new instance was created\\n\",\n    \"time.sleep(20)\\n\",\n    \"pod_list = v1.list_namespaced_pod(namespace)\\n\",\n    \"pod = [item.metadata.name for item in pod_list.items if recommender in item.metadata.name][0]\\n\",\n    \"print(\\\"New pod name \\\", pod)\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": []\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"Python 3\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.9\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 2\n}\n"
  },
  {
    "path": "recommender/Recommender_Kubeflow.py",
    "content": "#!/usr/bin/env python\n# coding: utf-8\n\n# # This is implementation of the Recommender training\n#\n# This implementation takes a list of users and their purchasing history to calculate prediction\n# on the probability that they would by a certain product.\n# The implementation is structured in 2 parts:\n# 1. Build rating matrix based on the purchasing history. The implementation is based on this blog post\n# https://medium.com/datadriveninvestor/how-to-build-a-recommendation-system-for-purchase-data-step-by-step-d6d7a78800b6\n# 2. Build collabarative filtering model based on the rating matrix. The implementation is based on this project https://github.com/Piyushdharkar/Collaborative-Filtering-Using-Keras\n#\n# Implementation is leveraging Minio for storing both source data and result models\n#\n# It also uses Python kubernetes client for re starting model server pod\n#\n\n# # 1. Install libraries\n\n# In[1]:\n\nget_ipython().system('pip install pandas --upgrade --user')\nget_ipython().system('pip install keras --upgrade --user')\nget_ipython().system('pip install minio --upgrade --user')\nget_ipython().system('pip install kubernetes --upgrade --user')\nget_ipython().system('pip install kfmd --upgrade --user')\n\n# ## imports\n\n# In[2]:\n\nimport pandas as pd\nimport numpy as np\nimport time\nfrom minio import Minio\nfrom keras.models import Model\nfrom keras.layers import *\nfrom keras.losses import *\nimport tensorflow as tf\nimport os\nfrom kfmd import metadata\nfrom datetime import datetime\nfrom keras import backend as K\nfrom kubernetes import client as k8s_client, config as k8s_config\n\n# Create a workspace, run and execution\n\n# In[3]:\n\nexecTime = datetime.utcnow().isoformat(\"T\")\nws = metadata.Workspace(\n    # Connect to metadata-service in namesapce kubeflow in k8s cluster.\n    backend_url_prefix=\"metadata-service.kubeflow.svc.cluster.local:8080\",\n    name=\"recommender\",\n    description=\"a workspace for saving recommender experiments\")\nr = metadata.Run(\n    workspace=ws,\n    name=\"run-\" + execTime,\n    description=\"recommender run\",\n)\nexec = metadata.Execution(\n    name=\"execution\" + execTime,\n    workspace=ws,\n    run=r,\n    description=\"recommender ML execution\",\n)\n\n# # 2. Read data\n#\n# For reading data we are using two diffierent approaches:\n# 1. We use Tensorflow build in support to write resulting model to Minio\n# 2. We use Minio APIs to read source data using Pandas. 
We could have used the Boto APIs here instead.\n\n# In[4]:\n\nminio_endpoint = os.environ.get(\n    'MINIO_URL', 'minio-service.kubeflow.svc.cluster.local:9000')\nminio_key = os.environ.get('MINIO_KEY', 'minio')\nminio_secret = os.environ.get('MINIO_SECRET', 'minio123')\n\nprint('Minio parameters : URL ', minio_endpoint, ' key ', minio_key,\n      ' secret ', minio_secret)\n\nos.environ['AWS_ACCESS_KEY_ID'] = minio_key\nos.environ['AWS_SECRET_ACCESS_KEY'] = minio_secret\nos.environ['AWS_REGION'] = 'us-west-1'\nos.environ['S3_REGION'] = 'us-west-1'\nos.environ['S3_ENDPOINT'] = minio_endpoint\nos.environ['S3_USE_HTTPS'] = '0'\nos.environ['S3_VERIFY_SSL'] = '0'\n\n# In[5]:\n\nminioClient = Minio(minio_endpoint,\n                    access_key=minio_key,\n                    secret_key=minio_secret,\n                    secure=False)\n\nminioClient.fget_object('data', 'recommender/users.csv', '/tmp/users.csv')\ncustomers = pd.read_csv('/tmp/users.csv')\nminioClient.fget_object('data', 'recommender/transactions.csv',\n                        '/tmp/transactions.csv')\ntransactions = pd.read_csv('/tmp/transactions.csv')\n\n# Log experiment data set\ndata_set = exec.log_input(\n    metadata.DataSet(\n        description=\"recommender current transactions and customers\",\n        name=\"Current transactions and customers\",\n        version=execTime,\n        uri=\"minio:/tmp/transactions.csv; minio:/tmp/users.csv\"))\n\n# In[6]:\n\nprint(customers.shape)\ncustomers.head()\n\n# In[7]:\n\nprint(transactions.shape)\ntransactions.head()\n\n# # 3 Data preparation\n#\n# Our goal here is to break down each list of items in the products column into rows\n# and count the number of products bought by a user.\n\n# In[8]:\n\n# 1: split product items\ntransactions['products'] = transactions['products'].apply(\n    lambda x: [int(i) for i in x.split('|')])\ntransactions.head(2).set_index('customerId')['products'].apply(\n    pd.Series).reset_index()\n\n# In[9]:\n\n# 2: organize a given table into a dataframe with customerId, single productId, and purchase count\npd.melt(transactions.head(2).set_index('customerId')['products'].apply(pd.Series).reset_index(),\n             id_vars=['customerId'],\n             value_name='products') \\\n    .dropna().drop(['variable'], axis=1) \\\n    .groupby(['customerId', 'products']) \\\n    .agg({'products': 'count'}) \\\n    .rename(columns={'products': 'purchase_count'}) \\\n    .reset_index() \\\n    .rename(columns={'products': 'productId'})\n\n# ## 3.1 Create data with user, item, and target field\n\n# In[10]:\n\n\ndata = pd.melt(transactions.set_index('customerId')['products'].apply(pd.Series).reset_index(),\n             id_vars=['customerId'],\n             value_name='products') \\\n    .dropna().drop(['variable'], axis=1) \\\n    .groupby(['customerId', 'products']) \\\n    .agg({'products': 'count'}) \\\n    .rename(columns={'products': 'purchase_count'}) \\\n    .reset_index() \\\n    .rename(columns={'products': 'productId'})\ndata['productId'] = data['productId'].astype(np.int64)\n\nprint(data.shape)\ndata.head()\n\n# ## 3.2 Normalize item values across users\n\n# In[11]:\n\ndf_matrix = pd.pivot_table(data,\n                           values='purchase_count',\n                           index='customerId',\n                           columns='productId')\ndf_matrix.head()\n\n# In[12]:\n\n\ndf_matrix_norm = (df_matrix - df_matrix.min()) / \\\n                  (df_matrix.max() - df_matrix.min())\nprint(df_matrix_norm.shape)\ndf_matrix_norm.head()\n\n# In[13]:\n\n# 
create a table for input to the modeling\n\nd = df_matrix_norm.reset_index()\nd.index.names = ['scaled_purchase_freq']\ndata_norm = pd.melt(d,\n                    id_vars=['customerId'],\n                    value_name='scaled_purchase_freq').dropna()\nprint(data_norm.shape)\ndata_norm.head()\n\n# # 4 Preparing data for learning\n\n# In[14]:\n\ncustomer_idxs = np.array(data_norm.customerId, dtype=np.int)\nproduct_idxs = np.array(data_norm.productId, dtype=np.int)\n\nratings = np.array(data_norm.scaled_purchase_freq)\n\nn_customers = int(data_norm['customerId'].drop_duplicates().max()) + 1\nn_products = int(data_norm['productId'].drop_duplicates().max()) + 1\nn_factors = 50\n\ninput_shape = (1, )\n\nprint(n_customers)\nprint(n_products)\nprint(customer_idxs)\nprint(product_idxs)\nprint(ratings)\n\n# ## 4.1 Tensorflow Session\n\n# In[15]:\n\n# create TF session and set it in Keras\nsess = tf.Session()\nK.set_session(sess)\nK.set_learning_phase(1)\n\n# ## 4.2 Model Class\n\n# In[16]:\n\n\nclass DeepCollaborativeFiltering(Model):\n    def __init__(self, n_customers, n_products, n_factors, p_dropout=0.2):\n        x1 = Input(shape=(1, ), name=\"user\")\n\n        P = Embedding(n_customers, n_factors, input_length=1)(x1)\n        P = Reshape((n_factors, ))(P)\n\n        x2 = Input(shape=(1, ), name=\"product\")\n\n        Q = Embedding(n_products, n_factors, input_length=1)(x2)\n        Q = Reshape((n_factors, ))(Q)\n\n        x = concatenate([P, Q], axis=1)\n        x = Dropout(p_dropout)(x)\n\n        x = Dense(n_factors)(x)\n        x = Activation('relu')(x)\n        x = Dropout(p_dropout)(x)\n\n        output = Dense(1)(x)\n\n        super(DeepCollaborativeFiltering, self).__init__([x1, x2], output)\n\n    def rate(self, customer_idxs, product_idxs):\n        if (type(customer_idxs) == int and type(product_idxs) == int):\n            return self.predict([\n                np.array(customer_idxs).reshape((1, )),\n                np.array(product_idxs).reshape((1, ))\n            ])\n\n        if (type(customer_idxs) == str and type(product_idxs) == str):\n            return self.predict([\n                np.array(customerMapping[customer_idxs]).reshape((1, )),\n                np.array(productMapping[product_idxs]).reshape((1, ))\n            ])\n\n        return self.predict([\n            np.array([\n                customerMapping[customer_idx] for customer_idx in customer_idxs\n            ]),\n            np.array(\n                [productMapping[product_idx] for product_idx in product_idxs])\n        ])\n\n\n# ## 4.3 Hyperparameters\n\n# In[17]:\n\nbs = 64\nval_per = 0.25\nepochs = 3\n\n# ## 4.4 Model Definition\n\n# In[18]:\n\nmodel = DeepCollaborativeFiltering(n_customers, n_products, n_factors)\nmodel.summary()\n\n# # 5 Training\n\n# In[19]:\n\nmodel.compile(optimizer='adam', loss=mean_squared_logarithmic_error)\nmodel.fit(x=[customer_idxs, product_idxs],\n          y=ratings,\n          batch_size=bs,\n          epochs=epochs,\n          validation_split=val_per)\nprint('Done training!')\n\n# ## 5.1 Log model and metrics\n\n# In[20]:\n\nlogmodel = exec.log_output(\n    metadata.Model(name=\"DeepCollaborativeFiltering\",\n                   description=\"Model for product recommender\",\n                   uri=\"\",\n                   model_type=\"neural network\",\n                   version=execTime,\n                   training_framework={\n                       \"name\": \"tensorflow\",\n                       \"version\": \"v1.14\"\n                   },\n                   
hyperparameters={\n                       \"batch_size\": 64,\n                       \"validation_split\": 0.25,\n                       \"layers\": [n_customers, n_products, n_factors],\n                       \"epochs\": 3\n                   }))\nmetrics = exec.log_output(\n    metadata.Metrics(name=\"Model for product recommender evaluation\",\n                     description=\"Validating of the recommender model\",\n                     uri=\"\",\n                     version=execTime,\n                     data_set_id=data_set.id,\n                     model_id=logmodel.id))\n\n# # 6 Get current output directory for model\n\n# In[21]:\n\ndirectorystream = minioClient.get_object('data', 'recommender/directory.txt')\ndirectory = \"\"\nfor d in directorystream.stream(32 * 1024):\n    directory += d.decode('utf-8')\narg_version = \"1\"\nexport_path = 's3://models/' + directory + '/' + arg_version + '/'\nprint('Exporting trained model to', export_path)\n\n# ## 6.1 Export models\n\n# In[22]:\n\n# inputs/outputs\ntensor_info_users = tf.saved_model.utils.build_tensor_info(model.input[0])\ntensor_info_products = tf.saved_model.utils.build_tensor_info(model.input[1])\ntensor_info_pred = tf.saved_model.utils.build_tensor_info(model.output)\n\nprint(\"tensor_info_users\", tensor_info_users.name)\nprint(\"tensor_info_products\", tensor_info_products.name)\nprint(\"tensor_info_pred\", tensor_info_pred.name)\n\n# In[23]:\n\n# signature\nprediction_signature = (tf.saved_model.signature_def_utils.build_signature_def(\n    inputs={\n        \"users\": tensor_info_users,\n        \"products\": tensor_info_products\n    },\n    outputs={\"predictions\": tensor_info_pred},\n    method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME))\n# export\nlegacy_init_op = tf.group(tf.tables_initializer(), name='legacy_init_op')\nbuilder = tf.saved_model.builder.SavedModelBuilder(export_path)\nbuilder.add_meta_graph_and_variables(\n    sess, [tf.saved_model.tag_constants.SERVING],\n    signature_def_map={\n        tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:\n        prediction_signature,\n    },\n    legacy_init_op=legacy_init_op)\nbuilder.save()\n\n# # 7 Restarting the model serving server\n#\n# For a new model to take effect, the model server has to be restarted.\n# Because we are not changing the model version, the model would otherwise not be\n# reloaded. To force the update, we restart the server by simply killing the running\n# instance; as the server is installed using a deployment, the instance will be\n# recreated automatically. 
Additionally, for pod operations to work correctly from the notebook,\n# it is necessary to create permissions allowing access to pods in another namespace.\n# Look at podaccessroles.yaml for details.\n\n# In[24]:\n\nrecommender = \"recommendermodelserver-\"\nif directory == \"recommender1\":\n    recommender = \"recommendermodelserver1-\"\nprint(\"pod prefix \", recommender)\n\nnamespace = \"kubeflow\"\nprint(\"pod namespace \", namespace)\n\n# In[26]:\n\n# Get full pod name for the current model\n\nk8s_config.load_incluster_config()\n\nv1 = k8s_client.CoreV1Api()\n\npod_list = v1.list_namespaced_pod(namespace)\npod = [\n    item.metadata.name for item in pod_list.items\n    if recommender in item.metadata.name\n][0]\nprint(\"Current pod name \", pod)\n\n# In[27]:\n\n# Delete pod, so that it gets recreated\nv1.delete_namespaced_pod(pod, namespace, grace_period_seconds=0)\n\nprint(\"Done deleting\")\n\n# In[28]:\n\n# Verify that the new instance was created\ntime.sleep(20)\npod_list = v1.list_namespaced_pod(namespace)\npod = [\n    item.metadata.name for item in pod_list.items\n    if recommender in item.metadata.name\n][0]\nprint(\"New pod name \", pod)\n\n# In[ ]:\n"
  },
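Note on section 7 of Recommender_Kubeflow.py above: deleting the pod works because the Deployment controller recreates it, and the fresh TF Serving instance re-reads the model base path on startup. The sketch below (not part of the repository) shows how the notebook could additionally confirm that the recreated server has actually loaded the exported model; it assumes the in-cluster service name from recommender/tfservingchart/templates/tfserving.yaml and the standard TF Serving REST model-status endpoint.

import time
import requests

# Service DNS name and MODEL_NAME as configured in tfserving.yaml (assumption:
# this code runs in-cluster, so the kubeflow service DNS is resolvable).
STATUS_URL = ("http://recommendermodelserver.kubeflow.svc.cluster.local:8501"
              "/v1/models/recommender")

def wait_until_model_available(timeout_s=120, poll_s=5):
    """Poll TF Serving's model-status endpoint until a version is AVAILABLE."""
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            status = requests.get(STATUS_URL, timeout=3).json()
            if any(v["state"] == "AVAILABLE"
                   for v in status["model_version_status"]):
                return status
        except (requests.RequestException, KeyError, ValueError):
            pass  # the server may still be restarting
        time.sleep(poll_s)
    raise TimeoutError("model did not become AVAILABLE in time")

Such a readiness check could stand in for the fixed time.sleep(20) used in cell In[28].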
  {
    "path": "recommender/docker/Dockerfile",
    "content": "FROM  tensorflow/tensorflow:1.15.0-py3\nRUN pip3 install --upgrade pip\nRUN pip3 install pandas --upgrade\nRUN pip3 install keras --upgrade\nRUN pip3 install minio --upgrade\nRUN pip3 install kubernetes --upgrade\nRUN pip3 install kfmd --upgrade\n\nRUN mkdir -p /opt/kubeflow\nCOPY Recommender_Kubeflow.py /opt/kubeflow/\nENTRYPOINT [\"python3\", \"/opt/kubeflow/Recommender_Kubeflow.py\"]"
  },
  {
    "path": "recommender/docker/build.sh",
    "content": "#!/bin/bash\n\nimg='lightbend/ml-tf-recommender'\ntag='0.1'\ndocker build -t $img:$tag .\n\n"
  },
  {
    "path": "recommender/tfservingchart/.helmignore",
    "content": "# Patterns to ignore when building packages.\n# This supports shell glob matching, relative path matching, and\n# negation (prefixed with !). Only one pattern per line.\n.DS_Store\n# Common VCS dirs\n.git/\n.gitignore\n.bzr/\n.bzrignore\n.hg/\n.hgignore\n.svn/\n# Common backup files\n*.swp\n*.bak\n*.tmp\n*~\n# Various IDEs\n.project\n.idea/\n*.tmproj\n"
  },
  {
    "path": "recommender/tfservingchart/Chart.yaml",
    "content": "apiVersion: v1\nappVersion: 1.14.0\ndescription: TF Serving\nmaintainers:\n- name: Boris Lublinsky\nname: TF Serving Recommender model server\nversion: 1.0.0"
  },
  {
    "path": "recommender/tfservingchart/templates/NOTES.txt",
    "content": "Kubeflow Model serving components : tfserving is installed\n"
  },
  {
    "path": "recommender/tfservingchart/templates/_helpers.tpl",
    "content": "{{/* vim: set filetype=mustache: */}}\n{{/*\nExpand the name of the chart.\n*/}}\n{{- define \"modelserverchart.name\" -}}\n{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix \"-\" -}}\n{{- end -}}\n\n{{/*\nCreate a default fully qualified app name.\nWe truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec).\n*/}}\n{{- define \"modelserverchart.fullname\" -}}\n{{- $name := default .Chart.Name .Values.nameOverride -}}\n{{- printf \"%s-%s\" .Release.Name $name | trunc 63 | trimSuffix \"-\" -}}\n{{- end -}}\n"
  },
  {
    "path": "recommender/tfservingchart/templates/minioaccess.yaml",
    "content": "apiVersion: v1\nkind: Secret\nmetadata:\n  name: minioaccess\n  namespace: kubeflow\ndata:\n  AWS_ACCESS_KEY_ID: bWluaW8=\n  AWS_SECRET_ACCESS_KEY: bWluaW8xMjM="
  },
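The AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY values in the Secret above are base64-encoded, as Kubernetes requires for the data field. A quick standard-library check confirms they are the default MinIO credentials used throughout this example:

import base64

# Decode the Secret's data values back to plain text.
print(base64.b64decode("bWluaW8=").decode())      # minio
print(base64.b64decode("bWluaW8xMjM=").decode())  # minio123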
  {
    "path": "recommender/tfservingchart/templates/tfserving.yaml",
    "content": "apiVersion: apps/v1\nkind: Deployment\nmetadata:\n  namespace: kubeflow\n  name: recommendermodelserver\n  labels:\n    app: recommendermodelserver\nspec:\n  replicas: 1\n  selector:\n    matchLabels:\n      app: recommendermodelserver\n  strategy:\n    type: RollingUpdate\n  template:\n    metadata:\n      labels:\n        app: recommendermodelserver\n    spec:\n      containers:\n        - name: serving\n          image: \"{{ .Values.image.server }}:{{ .Values.image.version }}\"\n          imagePullPolicy: \"{{ .Values.image.pullPolicy }}\"\n          ports:\n            - containerPort: 8500\n              name: grpc\n              protocol: TCP\n            - containerPort: 8501\n              name: http\n              protocol: TCP\n          readinessProbe:\n            tcpSocket:\n              port: http\n            initialDelaySeconds: 15\n            timeoutSeconds: 1\n          livenessProbe:\n            initialDelaySeconds: 30\n            periodSeconds: 30\n            tcpSocket:\n              port: htttp\n          resources:\n            limits:\n              cpu: \"2\"\n              memory: 2Gi\n            requests:\n              cpu: \"1\"\n              memory: 1Gi\n          env:\n            - name: \"AWS_REGION\"\n              value: \"us-west-1\"\n            - name: \"S3_REGION\"\n              value: \"us-west-1\"\n            - name: \"S3_ENDPOINT\"\n              value: \"minio-service.kubeflow.svc.cluster.local:9000\"\n            - name: \"S3_USE_HTTPS\"\n              value: \"0\"\n            - name: \"S3_VERIFY_SSL\"\n              value: \"0\"\n            - name: \"AWS_ACCESS_KEY_ID\"\n              valueFrom: { secretKeyRef: { name: \"minioaccess\", key: \"AWS_ACCESS_KEY_ID\" } }\n            - name: \"AWS_SECRET_ACCESS_KEY\"\n              valueFrom: { secretKeyRef: { name: \"minioaccess\", key: \"AWS_SECRET_ACCESS_KEY\" } }\n            - name: \"MODEL_BASE_PATH\"\n              value: \"s3://models\"\n            - name: \"MODEL_NAME\"\n              value: \"recommender\"\n          volumes:\n            - name: secret-volume\n              secret:\n                secretName: minioaccess\n---\napiVersion: v1\nkind: Service\nmetadata:\n  namespace: kubeflow\n  name: recommendermodelserver\nspec:\n  selector:\n    app: recommendermodelserver\n  ports:\n    - name: grpc\n      protocol: TCP\n      port: 8500\n      targetPort: 8500\n    - name: http\n      protocol: TCP\n      port: 8501\n      targetPort: 8501\n"
  },
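Once this server is up, it can be queried over the REST port (8501). A hedged client sketch follows: the input keys ("users", "products") and the output key ("predictions") come from the prediction signature built in Recommender_Kubeflow.py, while the id values and shapes below are illustrative assumptions.

import requests

PREDICT_URL = ("http://recommendermodelserver.kubeflow.svc.cluster.local:8501"
               "/v1/models/recommender:predict")

# One instance per (user, product) pair; the ids are hypothetical placeholders.
payload = {"instances": [{"users": [42], "products": [7]}]}

resp = requests.post(PREDICT_URL, json=payload, timeout=5)
resp.raise_for_status()
print(resp.json()["predictions"])  # predicted score per instance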
  {
    "path": "recommender/tfservingchart/templates/tfserving1.yaml",
    "content": "apiVersion: apps/v1\nkind: Deployment\nmetadata:\n  namespace: kubeflow\n  name: recommendermodelserver1\n  labels:\n    app: recommendermodelserver1\nspec:\n  replicas: 1\n  selector:\n    matchLabels:\n      app: recommendermodelserver1\n  strategy:\n    type: RollingUpdate\n  template:\n    metadata:\n      labels:\n        app: recommendermodelserver1\n    spec:\n      containers:\n        - name: serving\n          image: \"{{ .Values.image.server }}:{{ .Values.image.version }}\"\n          imagePullPolicy: \"{{ .Values.image.pullPolicy }}\"\n          ports:\n            - containerPort: 8500\n              name: grpc\n              protocol: TCP\n            - containerPort: 8501\n              name: http\n              protocol: TCP\n          readinessProbe:\n            tcpSocket:\n              port: http\n            initialDelaySeconds: 15\n            timeoutSeconds: 1\n          livenessProbe:\n            initialDelaySeconds: 30\n            periodSeconds: 30\n            tcpSocket:\n              port: htttp\n          resources:\n            limits:\n              cpu: \"2\"\n              memory: 2Gi\n            requests:\n              cpu: \"1\"\n              memory: 1Gi\n          env:\n            - name: \"AWS_REGION\"\n              value: \"us-west-1\"\n            - name: \"S3_REGION\"\n              value: \"us-west-1\"\n            - name: \"S3_ENDPOINT\"\n              value: \"minio-service.kubeflow.svc.cluster.local:9000\"\n            - name: \"S3_USE_HTTPS\"\n              value: \"0\"\n            - name: \"S3_VERIFY_SSL\"\n              value: \"0\"\n            - name: \"AWS_ACCESS_KEY_ID\"\n              valueFrom: { secretKeyRef: { name: \"minioaccess\", key: \"AWS_ACCESS_KEY_ID\" } }\n            - name: \"AWS_SECRET_ACCESS_KEY\"\n              valueFrom: { secretKeyRef: { name: \"minioaccess\", key: \"AWS_SECRET_ACCESS_KEY\" } }\n            - name: \"MODEL_BASE_PATH\"\n              value: \"s3://models\"\n            - name: \"MODEL_NAME\"\n              value: \"recommender1\"\n          volumes:\n            - name: secret-volume\n              secret:\n                secretName: minioaccess\n---\napiVersion: v1\nkind: Service\nmetadata:\n  namespace: kubeflow\n  name: recommendermodelserver1\nspec:\n  selector:\n    app: recommendermodelserver1\n  ports:\n    - name: grpc\n      protocol: TCP\n      port: 8500\n      targetPort: 8500\n    - name: http\n      protocol: TCP\n      port: 8501\n      targetPort: 8501\n\n"
  },
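tfserving1.yaml is a near copy of tfserving.yaml that serves MODEL_NAME recommender1; together the two deployments implement the simple alternating-server scheme driven by recommender/directory.txt, which Recommender_Kubeflow.py reads to decide where to export the model and which server to restart. A sketch (using the MinIO endpoint and credentials configured in this chart) of flipping the active side by rewriting that object:

import io
from minio import Minio

# Endpoint and credentials as defined in minioaccess.yaml and tfserving.yaml.
client = Minio("minio-service.kubeflow.svc.cluster.local:9000",
               access_key="minio", secret_key="minio123", secure=False)

new_dir = b"recommender1"  # or b"recommender" to flip back
client.put_object("data", "recommender/directory.txt",
                  io.BytesIO(new_dir), length=len(new_dir))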
  {
    "path": "recommender/tfservingchart/values.yaml",
    "content": "# application name is a namespace\n# docker images\nimage:\n  server: tensorflow/serving\n  pullPolicy: Always\n  version: 1.15.0\n"
  },
  {
    "path": "runthrough.sh",
    "content": "#!/bin/bash\nset -ex\nexample_repo_home=\"$( cd \"$( dirname \"${BASH_SOURCE[0]}\" )\" >/dev/null 2>&1 && pwd )\"\nKF_PLATFORM=${KF_PLATFORM:-minikube}\nexport KF_PLATFORM\n\nif [ \"$PLATFORM\" == \"gcp\" ]; then\n  # In GCP we also need a default zone\n  gcloud config set compute/zone us-west1-b\nfi\n\npushd dev-setup\ncommand -v kfctl >/dev/null 2>&1 || source install-kf.sh\ncommand -v kustomize >/dev/null 2>&1 || source install-kustomize.sh\ncommand -v argo >/dev/null 2>&1 || source install-argo.sh\nsource install-kf-pipeline-sdk.sh\npopd\nmkdir -p /tmp/abc\npushd /tmp/abc\nsource \"${example_repo_home}/ch2_seldon_examples/setup_example.sh\"\npopd\n# rm -rf /tmp/abc\n"
  },
  {
    "path": "scikitLearn/python/IncomePrediction.ipynb",
    "content": "{\n \"cells\": [\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Income prediction\\n\",\n    \"based on Seldon's implementation\\n\",\n    \"https://github.com/SeldonIO/alibi/blob/master/examples/anchor_tabular_adult.ipynb and\\n\",\n    \"https://github.com/SeldonIO/alibi/blob/5aec3ab4ce651ca2249bf849ecb434371c9278e4/alibi/datasets.py#L183\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 1,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Requirement already up-to-date: pandas in ./.local/lib/python3.6/site-packages (1.0.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: numpy>=1.13.3 in /usr/local/lib/python3.6/dist-packages (from pandas) (1.18.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas) (2019.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: python-dateutil>=2.6.1 in /usr/local/lib/python3.6/dist-packages (from pandas) (2.8.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.6.1->pandas) (1.11.0)\\n\",\n      \"Requirement already up-to-date: scikit-learn in ./.local/lib/python3.6/site-packages (0.22.2.post1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: joblib>=0.11 in ./.local/lib/python3.6/site-packages (from scikit-learn) (0.14.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: numpy>=1.11.0 in /usr/local/lib/python3.6/dist-packages (from scikit-learn) (1.18.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: scipy>=0.17.0 in /usr/local/lib/python3.6/dist-packages (from scikit-learn) (1.4.1)\\n\",\n      \"Requirement already up-to-date: alibi in ./.local/lib/python3.6/site-packages (0.4.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: scikit-learn in ./.local/lib/python3.6/site-packages (from alibi) (0.22.2.post1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: attrs in /usr/local/lib/python3.6/dist-packages (from alibi) (19.3.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: beautifulsoup4 in ./.local/lib/python3.6/site-packages (from alibi) (4.8.2)\\n\",\n      \"Requirement already satisfied, skipping upgrade: spacy in ./.local/lib/python3.6/site-packages (from alibi) (2.2.4)\\n\",\n      \"Requirement already satisfied, skipping upgrade: shap in ./.local/lib/python3.6/site-packages (from alibi) (0.35.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: scipy in /usr/local/lib/python3.6/dist-packages (from alibi) (1.4.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: requests in /usr/local/lib/python3.6/dist-packages (from alibi) (2.22.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: numpy in /usr/local/lib/python3.6/dist-packages (from alibi) (1.18.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: Pillow in ./.local/lib/python3.6/site-packages (from alibi) (7.0.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: tensorflow<2.0 in /usr/local/lib/python3.6/dist-packages (from alibi) (1.15.2)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pandas in ./.local/lib/python3.6/site-packages (from alibi) (1.0.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: prettyprinter in 
./.local/lib/python3.6/site-packages (from alibi) (0.18.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: scikit-image in ./.local/lib/python3.6/site-packages (from alibi) (0.16.2)\\n\",\n      \"Requirement already satisfied, skipping upgrade: joblib>=0.11 in ./.local/lib/python3.6/site-packages (from scikit-learn->alibi) (0.14.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: soupsieve>=1.2 in ./.local/lib/python3.6/site-packages (from beautifulsoup4->alibi) (2.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: srsly<1.1.0,>=1.0.2 in ./.local/lib/python3.6/site-packages (from spacy->alibi) (1.0.2)\\n\",\n      \"Requirement already satisfied, skipping upgrade: preshed<3.1.0,>=3.0.2 in ./.local/lib/python3.6/site-packages (from spacy->alibi) (3.0.2)\\n\",\n      \"Requirement already satisfied, skipping upgrade: plac<1.2.0,>=0.9.6 in ./.local/lib/python3.6/site-packages (from spacy->alibi) (1.1.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: blis<0.5.0,>=0.4.0 in ./.local/lib/python3.6/site-packages (from spacy->alibi) (0.4.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: cymem<2.1.0,>=2.0.2 in ./.local/lib/python3.6/site-packages (from spacy->alibi) (2.0.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: tqdm<5.0.0,>=4.38.0 in ./.local/lib/python3.6/site-packages (from spacy->alibi) (4.43.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: catalogue<1.1.0,>=0.0.7 in ./.local/lib/python3.6/site-packages (from spacy->alibi) (1.0.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: thinc==7.4.0 in ./.local/lib/python3.6/site-packages (from spacy->alibi) (7.4.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: murmurhash<1.1.0,>=0.28.0 in ./.local/lib/python3.6/site-packages (from spacy->alibi) (1.0.2)\\n\",\n      \"Requirement already satisfied, skipping upgrade: setuptools in /usr/local/lib/python3.6/dist-packages (from spacy->alibi) (45.1.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: wasabi<1.1.0,>=0.4.0 in ./.local/lib/python3.6/site-packages (from spacy->alibi) (0.6.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->alibi) (2019.11.28)\\n\",\n      \"Requirement already satisfied, skipping upgrade: idna<2.9,>=2.5 in /usr/lib/python3/dist-packages (from requests->alibi) (2.6)\\n\",\n      \"Requirement already satisfied, skipping upgrade: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in ./.local/lib/python3.6/site-packages (from requests->alibi) (1.24.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->alibi) (3.0.4)\\n\",\n      \"Requirement already satisfied, skipping upgrade: keras-preprocessing>=1.0.5 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2.0->alibi) (1.1.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: gast==0.2.2 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2.0->alibi) (0.2.2)\\n\",\n      \"Requirement already satisfied, skipping upgrade: wheel>=0.26; python_version >= \\\"3\\\" in /usr/lib/python3/dist-packages (from tensorflow<2.0->alibi) (0.30.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: six>=1.10.0 in /usr/lib/python3/dist-packages (from tensorflow<2.0->alibi) (1.11.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: termcolor>=1.1.0 in 
/usr/local/lib/python3.6/dist-packages (from tensorflow<2.0->alibi) (1.1.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: astor>=0.6.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2.0->alibi) (0.8.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: keras-applications>=1.0.8 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2.0->alibi) (1.0.8)\\n\",\n      \"Requirement already satisfied, skipping upgrade: opt-einsum>=2.3.2 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2.0->alibi) (3.1.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: protobuf>=3.6.1 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2.0->alibi) (3.11.2)\\n\",\n      \"Requirement already satisfied, skipping upgrade: tensorflow-estimator==1.15.1 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2.0->alibi) (1.15.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: google-pasta>=0.1.6 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2.0->alibi) (0.1.8)\\n\",\n      \"Requirement already satisfied, skipping upgrade: wrapt>=1.11.1 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2.0->alibi) (1.11.2)\\n\",\n      \"Requirement already satisfied, skipping upgrade: grpcio>=1.8.6 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2.0->alibi) (1.26.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: tensorboard<1.16.0,>=1.15.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2.0->alibi) (1.15.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: absl-py>=0.7.0 in /usr/local/lib/python3.6/dist-packages (from tensorflow<2.0->alibi) (0.9.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pytz>=2017.2 in /usr/local/lib/python3.6/dist-packages (from pandas->alibi) (2019.3)\\n\",\n      \"Requirement already satisfied, skipping upgrade: python-dateutil>=2.6.1 in /usr/local/lib/python3.6/dist-packages (from pandas->alibi) (2.8.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: Pygments>=2.2.0 in /usr/local/lib/python3.6/dist-packages (from prettyprinter->alibi) (2.5.2)\\n\",\n      \"Requirement already satisfied, skipping upgrade: colorful>=0.4.0 in ./.local/lib/python3.6/site-packages (from prettyprinter->alibi) (0.5.4)\\n\",\n      \"Requirement already satisfied, skipping upgrade: networkx>=2.0 in ./.local/lib/python3.6/site-packages (from scikit-image->alibi) (2.4)\\n\",\n      \"Requirement already satisfied, skipping upgrade: imageio>=2.3.0 in ./.local/lib/python3.6/site-packages (from scikit-image->alibi) (2.8.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: matplotlib!=3.0.0,>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from scikit-image->alibi) (3.1.2)\\n\",\n      \"Requirement already satisfied, skipping upgrade: PyWavelets>=0.4.0 in ./.local/lib/python3.6/site-packages (from scikit-image->alibi) (1.1.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: importlib-metadata>=0.20; python_version < \\\"3.8\\\" in /usr/local/lib/python3.6/dist-packages (from catalogue<1.1.0,>=0.0.7->spacy->alibi) (1.4.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: h5py in /usr/local/lib/python3.6/dist-packages (from keras-applications>=1.0.8->tensorflow<2.0->alibi) (2.10.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: werkzeug>=0.11.15 in /usr/local/lib/python3.6/dist-packages (from tensorboard<1.16.0,>=1.15.0->tensorflow<2.0->alibi) (0.16.1)\\n\",\n      
\"Requirement already satisfied, skipping upgrade: markdown>=2.6.8 in /usr/local/lib/python3.6/dist-packages (from tensorboard<1.16.0,>=1.15.0->tensorflow<2.0->alibi) (3.1.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: decorator>=4.3.0 in /usr/local/lib/python3.6/dist-packages (from networkx>=2.0->scikit-image->alibi) (4.4.1)\\n\",\n      \"Requirement already satisfied, skipping upgrade: kiwisolver>=1.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib!=3.0.0,>=2.0.0->scikit-image->alibi) (1.1.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib!=3.0.0,>=2.0.0->scikit-image->alibi) (0.10.0)\\n\",\n      \"Requirement already satisfied, skipping upgrade: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib!=3.0.0,>=2.0.0->scikit-image->alibi) (2.4.6)\\n\",\n      \"Requirement already satisfied, skipping upgrade: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata>=0.20; python_version < \\\"3.8\\\"->catalogue<1.1.0,>=0.0.7->spacy->alibi) (2.1.0)\\r\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"!pip install pandas --upgrade --user\\n\",\n    \"!pip install scikit-learn --upgrade --user\\n\",\n    \"!pip install alibi --upgrade --user\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 2,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"import numpy as np\\n\",\n    \"import pandas as pd\\n\",\n    \"from sklearn.ensemble import RandomForestClassifier\\n\",\n    \"from sklearn.compose import ColumnTransformer\\n\",\n    \"from sklearn.pipeline import Pipeline\\n\",\n    \"from sklearn.impute import SimpleImputer\\n\",\n    \"from sklearn.metrics import accuracy_score\\n\",\n    \"from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder\\n\",\n    \"from alibi.explainers import AnchorTabular\\n\",\n    \"from alibi.datasets import fetch_adult\\n\",\n    \"from alibi.utils.data import Bunch, gen_category_map\\n\",\n    \"from typing import Tuple, Union\\n\",\n    \"import requests\\n\",\n    \"from requests import RequestException\\n\",\n    \"from io import BytesIO, StringIO\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Fetching and preprocessing data\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 3,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"def fetch_adult(features_drop: list = None, return_X_y: bool = False, url_id: int = 0) -> Union[Bunch, Tuple[np.ndarray, np.ndarray]]:\\n\",\n    \"    \\\"\\\"\\\"\\n\",\n    \"    Downloads and pre-processes 'adult' dataset.\\n\",\n    \"    More info: http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/\\n\",\n    \"    Parameters\\n\",\n    \"    ----------\\n\",\n    \"    features_drop\\n\",\n    \"        List of features to be dropped from dataset, by default drops [\\\"fnlwgt\\\", \\\"Education-Num\\\"]\\n\",\n    \"    return_X_y\\n\",\n    \"        If true, return features X and labels y as numpy arrays, if False return a Bunch object\\n\",\n    \"    url_id\\n\",\n    \"        Index specifying which URL to use for downloading\\n\",\n    \"    Returns\\n\",\n    \"    -------\\n\",\n    \"    Bunch\\n\",\n    \"        Dataset, labels, a list of features and a dictionary containing a list with the potential categories\\n\",\n    \"        for each categorical feature where the 
key refers to the feature column.\n\",\n    \"    (data, target)\\n\",\n    \"        Tuple if ``return_X_y`` is true\\n\",\n    \"    \\\"\\\"\\\"\\n\",\n    \"    import logging  # used to report download failures below\\n\",\n    \"    ADULT_URLS = ['https://storage.googleapis.com/seldon-datasets/adult/adult.data',\\n\",\n    \"              'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',\\n\",\n    \"              'http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.data']\\n\",\n    \"    if features_drop is None:\\n\",\n    \"        features_drop = [\\\"fnlwgt\\\", \\\"Education-Num\\\"]\\n\",\n    \"\\n\",\n    \"    # download data\\n\",\n    \"    dataset_url = ADULT_URLS[url_id]\\n\",\n    \"    raw_features = ['Age', 'Workclass', 'fnlwgt', 'Education', 'Education-Num', 'Marital Status',\\n\",\n    \"                    'Occupation', 'Relationship', 'Race', 'Sex', 'Capital Gain', 'Capital Loss',\\n\",\n    \"                    'Hours per week', 'Country', 'Target']\\n\",\n    \"    try:\\n\",\n    \"        resp = requests.get(dataset_url)\\n\",\n    \"        resp.raise_for_status()\\n\",\n    \"    except RequestException:\\n\",\n    \"        logging.exception(\\\"Could not connect, URL may be out of service\\\")\\n\",\n    \"        raise\\n\",\n    \"\\n\",\n    \"    raw_data = pd.read_csv(StringIO(resp.text), names=raw_features, delimiter=', ', engine='python').fillna('?')\\n\",\n    \"\\n\",\n    \"    # get labels, features and drop unnecessary features\\n\",\n    \"    labels = (raw_data['Target'] == '>50K').astype(int).values\\n\",\n    \"    features_drop += ['Target']\\n\",\n    \"    data = raw_data.drop(features_drop, axis=1)\\n\",\n    \"    features = list(data.columns)\\n\",\n    \"\\n\",\n    \"    # map categorical features\\n\",\n    \"    education_map = {\\n\",\n    \"        '10th': 'Dropout', '11th': 'Dropout', '12th': 'Dropout', '1st-4th':\\n\",\n    \"            'Dropout', '5th-6th': 'Dropout', '7th-8th': 'Dropout', '9th':\\n\",\n    \"            'Dropout', 'Preschool': 'Dropout', 'HS-grad': 'High School grad',\\n\",\n    \"        'Some-college': 'High School grad', 'Masters': 'Masters',\\n\",\n    \"        'Prof-school': 'Prof-School', 'Assoc-acdm': 'Associates',\\n\",\n    \"        'Assoc-voc': 'Associates'\\n\",\n    \"    }\\n\",\n    \"    occupation_map = {\\n\",\n    \"        \\\"Adm-clerical\\\": \\\"Admin\\\", \\\"Armed-Forces\\\": \\\"Military\\\",\\n\",\n    \"        \\\"Craft-repair\\\": \\\"Blue-Collar\\\", \\\"Exec-managerial\\\": \\\"White-Collar\\\",\\n\",\n    \"        \\\"Farming-fishing\\\": \\\"Blue-Collar\\\", \\\"Handlers-cleaners\\\":\\n\",\n    \"            \\\"Blue-Collar\\\", \\\"Machine-op-inspct\\\": \\\"Blue-Collar\\\", \\\"Other-service\\\":\\n\",\n    \"            \\\"Service\\\", \\\"Priv-house-serv\\\": \\\"Service\\\", \\\"Prof-specialty\\\":\\n\",\n    \"            \\\"Professional\\\", \\\"Protective-serv\\\": \\\"Other\\\", \\\"Sales\\\":\\n\",\n    \"            \\\"Sales\\\", \\\"Tech-support\\\": \\\"Other\\\", \\\"Transport-moving\\\":\\n\",\n    \"            \\\"Blue-Collar\\\"\\n\",\n    \"    }\\n\",\n    \"    country_map = {\\n\",\n    \"        'Cambodia': 'SE-Asia', 'Canada': 'British-Commonwealth', 'China':\\n\",\n    \"            'China', 'Columbia': 'South-America', 'Cuba': 'Other',\\n\",\n    \"        'Dominican-Republic': 'Latin-America', 'Ecuador': 'South-America',\\n\",\n    \"        'El-Salvador': 'South-America', 'England': 'British-Commonwealth',\\n\",\n    \"        'France': 'Euro_1', 'Germany': 'Euro_1', 
'Greece': 'Euro_2',\\n\",\n    \"        'Guatemala': 'Latin-America', 'Haiti': 'Latin-America',\\n\",\n    \"        'Holand-Netherlands': 'Euro_1', 'Honduras': 'Latin-America',\\n\",\n    \"        'Hong': 'China', 'Hungary': 'Euro_2', 'India':\\n\",\n    \"            'British-Commonwealth', 'Iran': 'Other', 'Ireland':\\n\",\n    \"            'British-Commonwealth', 'Italy': 'Euro_1', 'Jamaica':\\n\",\n    \"            'Latin-America', 'Japan': 'Other', 'Laos': 'SE-Asia', 'Mexico':\\n\",\n    \"            'Latin-America', 'Nicaragua': 'Latin-America',\\n\",\n    \"        'Outlying-US(Guam-USVI-etc)': 'Latin-America', 'Peru':\\n\",\n    \"            'South-America', 'Philippines': 'SE-Asia', 'Poland': 'Euro_2',\\n\",\n    \"        'Portugal': 'Euro_2', 'Puerto-Rico': 'Latin-America', 'Scotland':\\n\",\n    \"            'British-Commonwealth', 'South': 'Euro_2', 'Taiwan': 'China',\\n\",\n    \"        'Thailand': 'SE-Asia', 'Trinadad&Tobago': 'Latin-America',\\n\",\n    \"        'United-States': 'United-States', 'Vietnam': 'SE-Asia'\\n\",\n    \"    }\\n\",\n    \"    married_map = {\\n\",\n    \"        'Never-married': 'Never-Married', 'Married-AF-spouse': 'Married',\\n\",\n    \"        'Married-civ-spouse': 'Married', 'Married-spouse-absent':\\n\",\n    \"            'Separated', 'Separated': 'Separated', 'Divorced':\\n\",\n    \"            'Separated', 'Widowed': 'Widowed'\\n\",\n    \"    }\\n\",\n    \"    mapping = {'Education': education_map, 'Occupation': occupation_map, 'Country': country_map,\\n\",\n    \"               'Marital Status': married_map}\\n\",\n    \"\\n\",\n    \"    data_copy = data.copy()\\n\",\n    \"    for f, f_map in mapping.items():\\n\",\n    \"        data_tmp = data_copy[f].values\\n\",\n    \"        for key, value in f_map.items():\\n\",\n    \"            data_tmp[data_tmp == key] = value\\n\",\n    \"        data[f] = data_tmp\\n\",\n    \"\\n\",\n    \"    # get categorical features and apply labelencoding\\n\",\n    \"    categorical_features = [f for f in features if data[f].dtype == 'O']\\n\",\n    \"    category_map = {}\\n\",\n    \"    for f in categorical_features:\\n\",\n    \"        le = LabelEncoder()\\n\",\n    \"        data_tmp = le.fit_transform(data[f].values)\\n\",\n    \"        data[f] = data_tmp\\n\",\n    \"        category_map[features.index(f)] = list(le.classes_)\\n\",\n    \"\\n\",\n    \"    # only return data values\\n\",\n    \"    data = data.values\\n\",\n    \"    target_names = ['<=50K', '>50K']\\n\",\n    \"\\n\",\n    \"    if return_X_y:\\n\",\n    \"        return data, labels\\n\",\n    \"\\n\",\n    \"    return Bunch(data=data, target=labels, feature_names=features, target_names=target_names, category_map=category_map)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Load adult dataset\\n\",\n    \"The fetch_adult function returns a Bunch object containing the features, the targets, the feature names and a mapping of categorical variables to numbers which are required for formatting the output of the Anchor explainer.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 4,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"dict_keys(['data', 'target', 'feature_names', 'target_names', 'category_map'])\"\n      ]\n     },\n     \"execution_count\": 4,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"adult = fetch_adult()\\n\",\n 
   \"adult.keys()\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 5,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"data = adult.data\\n\",\n    \"target = adult.target\\n\",\n    \"feature_names = adult.feature_names\\n\",\n    \"category_map = adult.category_map\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Define shuffled training and test set\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 6,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"np.random.seed(0)\\n\",\n    \"data_perm = np.random.permutation(np.c_[data, target])\\n\",\n    \"data = data_perm[:,:-1]\\n\",\n    \"target = data_perm[:,-1]\\n\",\n    \"idx = 30000\\n\",\n    \"X_train,Y_train = data[:idx,:], target[:idx]\\n\",\n    \"X_test, Y_test = data[idx+1:,:], target[idx+1:]\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Create feature transformation pipeline\\n\",\n    \"Create feature pre-processor. Needs to have 'fit' and 'transform' methods. Different types of pre-processing can be applied to all or part of the features. In the example below we will standardize ordinal features and apply one-hot-encoding to categorical features.\\n\",\n    \"\\n\",\n    \"Ordinal features:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 7,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"ordinal_features = [x for x in range(len(feature_names)) if x not in list(category_map.keys())]\\n\",\n    \"ordinal_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),\\n\",\n    \"                                      ('scaler', StandardScaler())])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Categorical features:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 8,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"categorical_features = list(category_map.keys())\\n\",\n    \"categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),\\n\",\n    \"                                          ('onehot', OneHotEncoder(handle_unknown='ignore'))])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Combine and fit:\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 9,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,\\n\",\n       \"                  transformer_weights=None,\\n\",\n       \"                  transformers=[('num',\\n\",\n       \"                                 Pipeline(memory=None,\\n\",\n       \"                                          steps=[('imputer',\\n\",\n       \"                                                  SimpleImputer(add_indicator=False,\\n\",\n       \"                                                                copy=True,\\n\",\n       \"                                                                fill_value=None,\\n\",\n       \"                                                                missing_values=nan,\\n\",\n       \"                                                                strategy='median',\\n\",\n       \"                                                                verbose=0)),\\n\",\n       \"                         
                        ('scaler',\\n\",\n       \"                                                  StandardScaler(copy=True,\\n\",\n       \"                                                                 with_mean=True,\\n\",\n       \"                                                                 with_std=True))],\\n\",\n       \"                                          verbose=False),\\n\",\n       \"                                 [0, 8, 9, 10]),\\n\",\n       \"                                ('cat',\\n\",\n       \"                                 Pipeline(memory=None,\\n\",\n       \"                                          steps=[('imputer',\\n\",\n       \"                                                  SimpleImputer(add_indicator=False,\\n\",\n       \"                                                                copy=True,\\n\",\n       \"                                                                fill_value=None,\\n\",\n       \"                                                                missing_values=nan,\\n\",\n       \"                                                                strategy='median',\\n\",\n       \"                                                                verbose=0)),\\n\",\n       \"                                                 ('onehot',\\n\",\n       \"                                                  OneHotEncoder(categories='auto',\\n\",\n       \"                                                                drop=None,\\n\",\n       \"                                                                dtype=<class 'numpy.float64'>,\\n\",\n       \"                                                                handle_unknown='ignore',\\n\",\n       \"                                                                sparse=True))],\\n\",\n       \"                                          verbose=False),\\n\",\n       \"                                 [1, 2, 3, 4, 5, 6, 7, 11])],\\n\",\n       \"                  verbose=False)\"\n      ]\n     },\n     \"execution_count\": 9,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"preprocessor = ColumnTransformer(transformers=[('num', ordinal_transformer, ordinal_features),\\n\",\n    \"                                               ('cat', categorical_transformer, categorical_features)])\\n\",\n    \"preprocessor.fit(X_train)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Train Random Forest model\\n\",\n    \"Fit on pre-processed (imputing, OHE, standardizing) data.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 10,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\\n\",\n       \"                       criterion='gini', max_depth=None, max_features='auto',\\n\",\n       \"                       max_leaf_nodes=None, max_samples=None,\\n\",\n       \"                       min_impurity_decrease=0.0, min_impurity_split=None,\\n\",\n       \"                       min_samples_leaf=1, min_samples_split=2,\\n\",\n       \"                       min_weight_fraction_leaf=0.0, n_estimators=50,\\n\",\n       \"                       n_jobs=None, oob_score=False, random_state=None,\\n\",\n       \"                       verbose=0, warm_start=False)\"\n      ]\n     },\n     \"execution_count\": 10,\n     \"metadata\": {},\n     
\"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"np.random.seed(0)\\n\",\n    \"clf = RandomForestClassifier(n_estimators=50)\\n\",\n    \"clf.fit(preprocessor.transform(X_train), Y_train)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Define predict function\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 11,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Train accuracy:  0.9655333333333334\\n\",\n      \"Test accuracy:  0.855859375\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"predict_fn = lambda x: clf.predict(preprocessor.transform(x))\\n\",\n    \"print('Train accuracy: ', accuracy_score(Y_train, predict_fn(X_train)))\\n\",\n    \"print('Test accuracy: ', accuracy_score(Y_test, predict_fn(X_test)))\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Initialize and fit anchor explainer for tabular data\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 12,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": [\n    \"explainer = AnchorTabular(predict_fn, feature_names, categorical_names=category_map, seed=1)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Discretize the ordinal features into quartiles\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 13,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"data\": {\n      \"text/plain\": [\n       \"AnchorTabular(meta={\\n\",\n       \"    'name': 'AnchorTabular',\\n\",\n       \"    'type': ['blackbox'],\\n\",\n       \"    'explanations': ['local'],\\n\",\n       \"    'params': {'seed': 1, 'disc_perc': [25, 50, 75]}\\n\",\n       \"})\"\n      ]\n     },\n     \"execution_count\": 13,\n     \"metadata\": {},\n     \"output_type\": \"execute_result\"\n    }\n   ],\n   \"source\": [\n    \"explainer.fit(X_train, disc_perc=[25, 50, 75])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# Getting an anchor\\n\",\n    \"Below, we get an anchor for the prediction of the first observation in the test set. An anchor is a sufficient condition - that is, when the anchor holds, the prediction should be the same as the prediction for this instance.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 14,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Prediction:  <=50K\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"idx = 0\\n\",\n    \"class_names = adult.target_names\\n\",\n    \"print('Prediction: ', class_names[explainer.predictor(X_test[idx].reshape(1, -1))[0]])\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"We set the precision threshold to 0.95. 
This means that predictions on observations where the anchor holds will be the same as the prediction on the explained instance at least 95% of the time.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 15,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Anchor: Marital Status = Separated AND Sex = Female\\n\",\n      \"Precision: 0.95\\n\",\n      \"Coverage: 0.18\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"explanation = explainer.explain(X_test[idx], threshold=0.95)\\n\",\n    \"print('Anchor: %s' % (' AND '.join(explanation.anchor)))\\n\",\n    \"print('Precision: %.2f' % explanation.precision)\\n\",\n    \"print('Coverage: %.2f' % explanation.coverage)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"# ...or not?\\n\",\n    \"Let's try getting an anchor for a different observation in the test set - one for which the prediction is >50K.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": 16,\n   \"metadata\": {},\n   \"outputs\": [\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Prediction:  >50K\\n\"\n     ]\n    },\n    {\n     \"name\": \"stderr\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Could not find an result satisfying the 0.95 precision constraint. Now returning the best non-eligible result.\\n\"\n     ]\n    },\n    {\n     \"name\": \"stdout\",\n     \"output_type\": \"stream\",\n     \"text\": [\n      \"Anchor: Capital Loss > 0.00 AND Relationship = Husband AND Marital Status = Married AND Age > 37.00 AND Race = White AND Country = United-States AND Sex = Male\\n\",\n      \"Precision: 0.71\\n\",\n      \"Coverage: 0.05\\n\"\n     ]\n    }\n   ],\n   \"source\": [\n    \"idx = 6\\n\",\n    \"class_names = adult.target_names\\n\",\n    \"print('Prediction: ', class_names[explainer.predictor(X_test[idx].reshape(1, -1))[0]])\\n\",\n    \"\\n\",\n    \"explanation = explainer.explain(X_test[idx], threshold=0.95)\\n\",\n    \"print('Anchor: %s' % (' AND '.join(explanation.anchor)))\\n\",\n    \"print('Precision: %.2f' % explanation.precision)\\n\",\n    \"print('Coverage: %.2f' % explanation.coverage)\"\n   ]\n  },\n  {\n   \"cell_type\": \"markdown\",\n   \"metadata\": {},\n   \"source\": [\n    \"Notice how no anchor satisfying the precision constraint is found!\\n\",\n    \"\\n\",\n    \"This is due to the imbalanced dataset (roughly 25:75 high:low earner proportion): during the sampling stage, feature ranges corresponding to low-earners will be oversampled. This behavior is useful because it surfaces the class imbalance, but it can also be worked around by producing a balanced dataset so that anchors can be found for either class.\"\n   ]\n  },\n  {\n   \"cell_type\": \"code\",\n   \"execution_count\": null,\n   \"metadata\": {},\n   \"outputs\": [],\n   \"source\": []\n  }\n ],\n \"metadata\": {\n  \"kernelspec\": {\n   \"display_name\": \"Python 3\",\n   \"language\": \"python\",\n   \"name\": \"python3\"\n  },\n  \"language_info\": {\n   \"codemirror_mode\": {\n    \"name\": \"ipython\",\n    \"version\": 3\n   },\n   \"file_extension\": \".py\",\n   \"mimetype\": \"text/x-python\",\n   \"name\": \"python\",\n   \"nbconvert_exporter\": \"python\",\n   \"pygments_lexer\": \"ipython3\",\n   \"version\": \"3.6.9\"\n  }\n },\n \"nbformat\": 4,\n \"nbformat_minor\": 4\n}\n"
  }
]
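Note on the closing markdown cell of scikitLearn/python/IncomePrediction.ipynb: the suggested remedy of "producing balanced datasets" can be sketched as follows, assuming the X_train/Y_train arrays defined in the notebook. Fitting the explainer on a class-balanced sample is one way to let anchors be found for the minority >50K class as well.

import numpy as np

def balanced_subset(X, y, seed=0):
    """Downsample the majority class so both classes are equally represented."""
    rng = np.random.RandomState(seed)
    idx0, idx1 = np.where(y == 0)[0], np.where(y == 1)[0]
    n = min(len(idx0), len(idx1))
    keep = np.concatenate([rng.choice(idx0, n, replace=False),
                           rng.choice(idx1, n, replace=False)])
    rng.shuffle(keep)
    return X[keep], y[keep]

# Usage in the notebook (hypothetical):
# X_bal, Y_bal = balanced_subset(X_train, Y_train)
# explainer.fit(X_bal, disc_perc=[25, 50, 75])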