Repository: mlops-for-all/mlops-for-all.github.io Branch: main Commit: fc01e7722ed3 Files: 291 Total size: 1.3 MB Directory structure: gitextract_cdatvxxx/ ├── .github/ │ ├── CODEOWNERS │ ├── PULL_REQUEST_TEMPLATE.md │ └── workflows/ │ ├── deploy.yml │ └── pull-request.yml ├── .gitignore ├── README.md ├── babel.config.js ├── community/ │ ├── community.md │ ├── contributors.md │ └── how-to-contribute.md ├── docs/ │ ├── api-deployment/ │ │ ├── _category_.json │ │ ├── seldon-children.md │ │ ├── seldon-fields.md │ │ ├── seldon-iris.md │ │ ├── seldon-mlflow.md │ │ ├── seldon-pg.md │ │ └── what-is-api-deployment.md │ ├── appendix/ │ │ ├── _category_.json │ │ ├── metallb.md │ │ └── pyenv.md │ ├── further-readings/ │ │ ├── _category_.json │ │ └── info.md │ ├── introduction/ │ │ ├── _category_.json │ │ ├── component.md │ │ ├── intro.md │ │ ├── levels.md │ │ └── why_kubernetes.md │ ├── kubeflow/ │ │ ├── _category_.json │ │ ├── advanced-component.md │ │ ├── advanced-environment.md │ │ ├── advanced-mlflow.md │ │ ├── advanced-pipeline.md │ │ ├── advanced-run.md │ │ ├── basic-component.md │ │ ├── basic-pipeline-upload.md │ │ ├── basic-pipeline.md │ │ ├── basic-requirements.md │ │ ├── basic-run.md │ │ ├── how-to-debug.md │ │ ├── kubeflow-concepts.md │ │ └── kubeflow-intro.md │ ├── kubeflow-dashboard-guide/ │ │ ├── _category_.json │ │ ├── experiments-and-others.md │ │ ├── experiments.md │ │ ├── intro.md │ │ ├── notebooks.md │ │ ├── tensorboards.md │ │ └── volumes.md │ ├── prerequisites/ │ │ ├── _category_.json │ │ └── docker/ │ │ ├── _category_.json │ │ ├── advanced.md │ │ ├── command.md │ │ ├── docker.md │ │ ├── images.md │ │ ├── install.md │ │ └── introduction.md │ ├── setup-components/ │ │ ├── _category_.json │ │ ├── install-components-kf.md │ │ ├── install-components-mlflow.md │ │ ├── install-components-pg.md │ │ └── install-components-seldon.md │ └── setup-kubernetes/ │ ├── _category_.json │ ├── install-kubernetes/ │ │ ├── _category_.json │ │ ├── kubernetes-with-k3s.md │ │ 
├── kubernetes-with-kubeadm.md │ │ └── kubernetes-with-minikube.md │ ├── install-kubernetes-module.md │ ├── install-prerequisite.md │ ├── intro.md │ ├── kubernetes.md │ └── setup-nvidia-gpu.md ├── docusaurus.config.js ├── i18n/ │ ├── en/ │ │ ├── code.json │ │ ├── docusaurus-plugin-content-blog/ │ │ │ └── options.json │ │ ├── docusaurus-plugin-content-docs/ │ │ │ ├── current/ │ │ │ │ ├── api-deployment/ │ │ │ │ │ ├── _category_.json │ │ │ │ │ ├── seldon-children.md │ │ │ │ │ ├── seldon-fields.md │ │ │ │ │ ├── seldon-iris.md │ │ │ │ │ ├── seldon-mlflow.md │ │ │ │ │ ├── seldon-pg.md │ │ │ │ │ └── what-is-api-deployment.md │ │ │ │ ├── appendix/ │ │ │ │ │ ├── _category_.json │ │ │ │ │ ├── metallb.md │ │ │ │ │ └── pyenv.md │ │ │ │ ├── further-readings/ │ │ │ │ │ ├── _category_.json │ │ │ │ │ └── info.md │ │ │ │ ├── introduction/ │ │ │ │ │ ├── _category_.json │ │ │ │ │ ├── component.md │ │ │ │ │ ├── intro.md │ │ │ │ │ ├── levels.md │ │ │ │ │ └── why_kubernetes.md │ │ │ │ ├── kubeflow/ │ │ │ │ │ ├── _category_.json │ │ │ │ │ ├── advanced-component.md │ │ │ │ │ ├── advanced-environment.md │ │ │ │ │ ├── advanced-mlflow.md │ │ │ │ │ ├── advanced-pipeline.md │ │ │ │ │ ├── advanced-run.md │ │ │ │ │ ├── basic-component.md │ │ │ │ │ ├── basic-pipeline-upload.md │ │ │ │ │ ├── basic-pipeline.md │ │ │ │ │ ├── basic-requirements.md │ │ │ │ │ ├── basic-run.md │ │ │ │ │ ├── how-to-debug.md │ │ │ │ │ ├── kubeflow-concepts.md │ │ │ │ │ └── kubeflow-intro.md │ │ │ │ ├── kubeflow-dashboard-guide/ │ │ │ │ │ ├── _category_.json │ │ │ │ │ ├── experiments-and-others.md │ │ │ │ │ ├── experiments.md │ │ │ │ │ ├── intro.md │ │ │ │ │ ├── notebooks.md │ │ │ │ │ ├── tensorboards.md │ │ │ │ │ └── volumes.md │ │ │ │ ├── prerequisites/ │ │ │ │ │ ├── _category_.json │ │ │ │ │ └── docker/ │ │ │ │ │ ├── _category_.json │ │ │ │ │ ├── advanced.md │ │ │ │ │ ├── command.md │ │ │ │ │ ├── docker.md │ │ │ │ │ ├── images.md │ │ │ │ │ ├── install.md │ │ │ │ │ └── introduction.md │ │ │ │ ├── setup-components/ │ │ │ 
│ │ ├── _category_.json │ │ │ │ │ ├── install-components-kf.md │ │ │ │ │ ├── install-components-mlflow.md │ │ │ │ │ ├── install-components-pg.md │ │ │ │ │ └── install-components-seldon.md │ │ │ │ └── setup-kubernetes/ │ │ │ │ ├── _category_.json │ │ │ │ ├── install-kubernetes/ │ │ │ │ │ ├── _category_.json │ │ │ │ │ ├── kubernetes-with-k3s.md │ │ │ │ │ ├── kubernetes-with-kubeadm.md │ │ │ │ │ └── kubernetes-with-minikube.md │ │ │ │ ├── install-kubernetes-module.md │ │ │ │ ├── install-prerequisite.md │ │ │ │ ├── intro.md │ │ │ │ ├── kubernetes.md │ │ │ │ └── setup-nvidia-gpu.md │ │ │ ├── current.json │ │ │ ├── version-1.0/ │ │ │ │ ├── api-deployment/ │ │ │ │ │ ├── _category_.json │ │ │ │ │ ├── seldon-children.md │ │ │ │ │ ├── seldon-fields.md │ │ │ │ │ ├── seldon-iris.md │ │ │ │ │ ├── seldon-mlflow.md │ │ │ │ │ ├── seldon-pg.md │ │ │ │ │ └── what-is-api-deployment.md │ │ │ │ ├── appendix/ │ │ │ │ │ ├── _category_.json │ │ │ │ │ ├── metallb.md │ │ │ │ │ └── pyenv.md │ │ │ │ ├── further-readings/ │ │ │ │ │ ├── _category_.json │ │ │ │ │ └── info.md │ │ │ │ ├── introduction/ │ │ │ │ │ ├── _category_.json │ │ │ │ │ ├── component.md │ │ │ │ │ ├── intro.md │ │ │ │ │ ├── levels.md │ │ │ │ │ └── why_kubernetes.md │ │ │ │ ├── kubeflow/ │ │ │ │ │ ├── _category_.json │ │ │ │ │ ├── advanced-component.md │ │ │ │ │ ├── advanced-environment.md │ │ │ │ │ ├── advanced-mlflow.md │ │ │ │ │ ├── advanced-pipeline.md │ │ │ │ │ ├── advanced-run.md │ │ │ │ │ ├── basic-component.md │ │ │ │ │ ├── basic-pipeline-upload.md │ │ │ │ │ ├── basic-pipeline.md │ │ │ │ │ ├── basic-requirements.md │ │ │ │ │ ├── basic-run.md │ │ │ │ │ ├── how-to-debug.md │ │ │ │ │ ├── kubeflow-concepts.md │ │ │ │ │ └── kubeflow-intro.md │ │ │ │ ├── kubeflow-dashboard-guide/ │ │ │ │ │ ├── _category_.json │ │ │ │ │ ├── experiments-and-others.md │ │ │ │ │ ├── experiments.md │ │ │ │ │ ├── intro.md │ │ │ │ │ ├── notebooks.md │ │ │ │ │ ├── tensorboards.md │ │ │ │ │ └── volumes.md │ │ │ │ ├── prerequisites/ │ │ │ │ │ ├── 
_category_.json │ │ │ │ │ └── docker/ │ │ │ │ │ ├── _category_.json │ │ │ │ │ ├── advanced.md │ │ │ │ │ ├── command.md │ │ │ │ │ ├── docker.md │ │ │ │ │ ├── images.md │ │ │ │ │ ├── install.md │ │ │ │ │ └── introduction.md │ │ │ │ ├── setup-components/ │ │ │ │ │ ├── _category_.json │ │ │ │ │ ├── install-components-kf.md │ │ │ │ │ ├── install-components-mlflow.md │ │ │ │ │ ├── install-components-pg.md │ │ │ │ │ └── install-components-seldon.md │ │ │ │ └── setup-kubernetes/ │ │ │ │ ├── _category_.json │ │ │ │ ├── install-kubernetes/ │ │ │ │ │ ├── _category_.json │ │ │ │ │ ├── kubernetes-with-k3s.md │ │ │ │ │ ├── kubernetes-with-kubeadm.md │ │ │ │ │ └── kubernetes-with-minikube.md │ │ │ │ ├── install-kubernetes-module.md │ │ │ │ ├── install-prerequisite.md │ │ │ │ ├── intro.md │ │ │ │ ├── kubernetes.md │ │ │ │ └── setup-nvidia-gpu.md │ │ │ └── version-1.0.json │ │ ├── docusaurus-plugin-content-docs-community/ │ │ │ ├── current/ │ │ │ │ └── community/ │ │ │ │ ├── community.md │ │ │ │ ├── contributors.md │ │ │ │ └── how-to-contribute.md │ │ │ └── current.json │ │ └── docusaurus-theme-classic/ │ │ ├── footer.json │ │ └── navbar.json │ └── ko/ │ ├── code.json │ ├── docusaurus-plugin-content-blog/ │ │ └── options.json │ ├── docusaurus-plugin-content-docs/ │ │ ├── current.json │ │ └── version-1.0.json │ ├── docusaurus-plugin-content-docs-community/ │ │ └── current.json │ └── docusaurus-theme-classic/ │ ├── footer.json │ └── navbar.json ├── package.json ├── python/ │ ├── env/ │ │ └── .gitkeep │ ├── pyproject.toml │ └── translation/ │ └── main.py ├── sidebars.js ├── sidebarsCommunity.js ├── src/ │ ├── components/ │ │ ├── HomepageFeatures/ │ │ │ ├── index.tsx │ │ │ └── styles.module.css │ │ └── TeamProfileCards/ │ │ └── index.tsx │ ├── css/ │ │ └── custom.css │ └── pages/ │ ├── index.module.css │ ├── index.tsx │ └── markdown-page.md ├── static/ │ ├── .nojekyll │ ├── googlee5904fe980148e9b.html │ └── img/ │ └── site.webmanifest ├── tsconfig.json ├── versioned_docs/ │ └── 
version-1.0/ │ ├── api-deployment/ │ │ ├── _category_.json │ │ ├── seldon-children.md │ │ ├── seldon-fields.md │ │ ├── seldon-iris.md │ │ ├── seldon-mlflow.md │ │ ├── seldon-pg.md │ │ └── what-is-api-deployment.md │ ├── appendix/ │ │ ├── _category_.json │ │ ├── metallb.md │ │ └── pyenv.md │ ├── further-readings/ │ │ ├── _category_.json │ │ └── info.md │ ├── introduction/ │ │ ├── _category_.json │ │ ├── component.md │ │ ├── intro.md │ │ ├── levels.md │ │ └── why_kubernetes.md │ ├── kubeflow/ │ │ ├── _category_.json │ │ ├── advanced-component.md │ │ ├── advanced-environment.md │ │ ├── advanced-mlflow.md │ │ ├── advanced-pipeline.md │ │ ├── advanced-run.md │ │ ├── basic-component.md │ │ ├── basic-pipeline-upload.md │ │ ├── basic-pipeline.md │ │ ├── basic-requirements.md │ │ ├── basic-run.md │ │ ├── how-to-debug.md │ │ ├── kubeflow-concepts.md │ │ └── kubeflow-intro.md │ ├── kubeflow-dashboard-guide/ │ │ ├── _category_.json │ │ ├── experiments-and-others.md │ │ ├── experiments.md │ │ ├── intro.md │ │ ├── notebooks.md │ │ ├── tensorboards.md │ │ └── volumes.md │ ├── prerequisites/ │ │ ├── _category_.json │ │ └── docker/ │ │ ├── _category_.json │ │ ├── advanced.md │ │ ├── command.md │ │ ├── docker.md │ │ ├── images.md │ │ ├── install.md │ │ └── introduction.md │ ├── setup-components/ │ │ ├── _category_.json │ │ ├── install-components-kf.md │ │ ├── install-components-mlflow.md │ │ ├── install-components-pg.md │ │ └── install-components-seldon.md │ └── setup-kubernetes/ │ ├── _category_.json │ ├── install-kubernetes/ │ │ ├── _category_.json │ │ ├── kubernetes-with-k3s.md │ │ ├── kubernetes-with-kubeadm.md │ │ └── kubernetes-with-minikube.md │ ├── install-kubernetes-module.md │ ├── install-prerequisite.md │ ├── intro.md │ ├── kubernetes.md │ └── setup-nvidia-gpu.md ├── versioned_sidebars/ │ └── version-1.0-sidebars.json └── versions.json ================================================ FILE CONTENTS ================================================ 
================================================ FILE: .github/CODEOWNERS ================================================ * @mlops-for-all/maintainers ================================================ FILE: .github/PULL_REQUEST_TEMPLATE.md ================================================ ## Changes? ## Why we need? ## Test? - [ ] `npm run start` 를 수행하여 로컬에서 렌더링된 페이지를 확인하셨나요? - [ ] `npm test` 를 통과하였나요? ## Anything Else? (Optional) ================================================ FILE: .github/workflows/deploy.yml ================================================ name: Deploy to GitHub Pages on: push: branches: - main # Review gh actions docs if you want to further define triggers, paths, etc # https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#on jobs: deploy: name: Deploy to GitHub Pages runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - uses: actions/setup-node@v3 with: node-version: 18 cache: npm - name: Install dependencies run: npm ci - name: Build website run: npm run build # Popular action to deploy to GitHub Pages: # Docs: https://github.com/peaceiris/actions-gh-pages#%EF%B8%8F-docusaurus - name: Deploy to GitHub Pages uses: peaceiris/actions-gh-pages@v3 with: github_token: ${{ secrets.ORG_PAT }} # Build output to publish to the `gh-pages` branch: publish_dir: ./build # The following lines assign commit authorship to the official # GH-Actions bot for deploys to `gh-pages` branch: # https://github.com/actions/checkout/issues/13#issuecomment-724415212 # The GH actions bot is used by default if you didn't specify the two fields. # You can swap them out with your own user credentials. 
user_name: github-actions[bot] user_email: 41898282+github-actions[bot]@users.noreply.github.com ================================================ FILE: .github/workflows/pull-request.yml ================================================ name: "Pull Request" on: pull_request: types: [opened, synchronize, edited, reopened, closed] jobs: label: runs-on: ubuntu-latest steps: - uses: anencore94/labeler@v1.1.0 ================================================ FILE: .gitignore ================================================ # Dependencies /node_modules # Production /build # Generated files .docusaurus .cache-loader # Misc .DS_Store .env.local .env.development.local .env.test.local .env.production.local npm-debug.log* yarn-debug.log* yarn-error.log* v1/ .vscode openai.env .envrc __pycache__ ================================================ FILE: README.md ================================================ ## 모두의 MLOps 모두의 MLOps 프로젝트입니다. 프로젝트에 누구던 자유롭게 기여할 수 있습니다. 자세한 내용은 [How to Contribute](https://mlops-for-all.github.io/community/how-to-contribute/)를 참조하세요. ================================================ FILE: babel.config.js ================================================ module.exports = { presets: [require.resolve('@docusaurus/core/lib/babel/preset')], }; ================================================ FILE: community/community.md ================================================ --- title: "Community" sidebar_position: 1 --- ### *모두의 MLOps* 릴리즈 소식 새로운 포스트나 수정사항은 [Announcements](https://github.com/mlops-for-all/mlops-for-all.github.io/discussions/categories/announcements)에서 확인할 수 있습니다. ### Question 프로젝트 내용과 관련된 궁금점은 [Q&A](https://github.com/mlops-for-all/mlops-for-all.github.io/discussions/categories/q-a)를 통해 질문할 수 있습니다. ### Suggestion 제안점은 [Ideas](https://github.com/mlops-for-all/mlops-for-all.github.io/discussions/categories/ideas)를 통해 제안해 주시면 됩니다. ### Copyright 1. 본 문서를 “비상업적 목적” 사용 시 하기와 같이 출처를 반드시 표시해주세요. - MLOps for ALL, https://mlops-for-all.github.io/ 2. 
상업적 용도로 인용/사용/차용하고자 하는 경우 마키나락스(contact@makinarocks.ai)로 사전에 문의주시기 바랍니다. ================================================ FILE: community/contributors.md ================================================ --- sidebar_position: 3 --- # Contributors ## Main Authors import { MainAuthorRow, } from '@site/src/components/TeamProfileCards'; ## Contributors Thank you for contributing our tutorials! import { ContributorsRow, } from '@site/src/components/TeamProfileCards'; ================================================ FILE: community/how-to-contribute.md ================================================ --- title: "How to Contribute" sidebar_position: 2 --- ## How to Start ### Git Repo 준비 1. [*모두의 MLOps* GitHub Repository](https://github.com/mlops-for-all/mlops-for-all.github.io)에 접속합니다. 2. 여러분의 개인 Repository로 `Fork`합니다. 3. Forked Repository를 여러분의 작업 환경으로 `git clone`합니다. ### 환경 설정 1. 모두의 MLOps는 Hugo 와 Node를 이용하고 있습니다. 다음 명령어를 통해 필요한 패키지가 설치되어 있는지 확인합니다. - node & npm ```bash npm --version ``` - hugo ```bash hugo version ``` 1. 필요한 node module을 설치합니다. ```bash npm install ``` 2. 프로젝트에서는 각 글의 일관성을 위해서 여러 markdown lint를 적용하고 있습니다. 다음 명령어를 실행해 test를 진행한 후 커밋합니다.내용 수정 및 추가 후 lint가 맞는지 확인합니다. ```bash npm test ``` 4. lint 확인 완료 후 ci 를 실행합니다. ```bash npm ci ``` 4. 로컬에서 실행 후 수정한 글이 정상적으로 나오는지 확인합니다. ```bash npm run start ``` ## How to Contribute ### 1. 새로운 포스트를 작성할 때 새로운 포스트는 각 챕터와 포스트의 위치에 맞는 weight를 설정합니다. - Introduction: 1xx - Setup: 2xx - Kubeflow: 3xx - API Deployment: 4xx - Help: 10xx ### 2. 기존의 포스트를 수정할 때 기존의 포스트를 수정할 때 Contributor에 본인의 이름을 입력합니다. ```markdown contributors: ["John Doe", "Adam Smith"] ``` ### 3. 프로젝트에 처음 기여할 때 만약 프로젝트에 처음 기여 할 때 `content/kor/contributors`에 본인의 이름으로 폴더를 생성한 후, `_index.md`라는 파일을 작성합니다. 예를 들어, `minsoo kim`이 본인의 영어 이름이라면, 폴더명은 `minsoo-kim`으로 하여 해당 폴더 내부의 `_index.md`파일에 다음의 내용을 작성합니다. 폴더명은 하이픈(-)으로 연결한 소문자로, title은 띄어쓰기를 포함한 CamelCase로 작성합니다. 
```markdown --- title: "John Doe" draft: false --- ``` ## After Pull Request Pull Request를 생성하면 프로젝트에서는 자동으로 *모두의 MLOps* 운영진에게 리뷰 요청이 전해집니다. 최대 일주일 이내로 확인 후 Comment를 드릴 예정입니다. ================================================ FILE: docs/api-deployment/_category_.json ================================================ { "label": "API Deployment", "position": 7, "link": { "type": "generated-index" } } ================================================ FILE: docs/api-deployment/seldon-children.md ================================================ --- title : "6. Multi Models" description: "" sidebar_position: 6 contributors: ["Jongseob Jeon"] --- ## Multi Models 앞서 설명했던 방법들은 모두 단일 모델을 대상으로 했습니다. 이번 페이지에서는 여러 개의 모델을 연결하는 방법에 대해서 알아봅니다. ## Pipeline 우선 모델을 2개를 생성하는 파이프라인을 작성하겠습니다. 모델은 앞서 사용한 SVC 모델에 StandardScaler를 추가하고 저장하도록 하겠습니다. ```python from functools import partial import kfp from kfp.components import InputPath, OutputPath, create_component_from_func @partial( create_component_from_func, packages_to_install=["pandas", "scikit-learn"], ) def load_iris_data( data_path: OutputPath("csv"), target_path: OutputPath("csv"), ): import pandas as pd from sklearn.datasets import load_iris iris = load_iris() data = pd.DataFrame(iris["data"], columns=iris["feature_names"]) target = pd.DataFrame(iris["target"], columns=["target"]) data.to_csv(data_path, index=False) target.to_csv(target_path, index=False) @partial( create_component_from_func, packages_to_install=["dill", "pandas", "scikit-learn", "mlflow"], ) def train_scaler_from_csv( data_path: InputPath("csv"), scaled_data_path: OutputPath("csv"), model_path: OutputPath("dill"), input_example_path: OutputPath("dill"), signature_path: OutputPath("dill"), conda_env_path: OutputPath("dill"), ): import dill import pandas as pd from sklearn.preprocessing import StandardScaler from mlflow.models.signature import infer_signature from mlflow.utils.environment import _mlflow_conda_env data = pd.read_csv(data_path) scaler = StandardScaler() 
scaled_data = scaler.fit_transform(data) scaled_data = pd.DataFrame(scaled_data, columns=data.columns, index=data.index) scaled_data.to_csv(scaled_data_path, index=False) with open(model_path, mode="wb") as file_writer: dill.dump(scaler, file_writer) input_example = data.sample(1) with open(input_example_path, "wb") as file_writer: dill.dump(input_example, file_writer) signature = infer_signature(data, scaler.transform(data)) with open(signature_path, "wb") as file_writer: dill.dump(signature, file_writer) conda_env = _mlflow_conda_env( additional_pip_deps=["scikit-learn"], install_mlflow=False ) with open(conda_env_path, "wb") as file_writer: dill.dump(conda_env, file_writer) @partial( create_component_from_func, packages_to_install=["dill", "pandas", "scikit-learn", "mlflow"], ) def train_svc_from_csv( train_data_path: InputPath("csv"), train_target_path: InputPath("csv"), model_path: OutputPath("dill"), input_example_path: OutputPath("dill"), signature_path: OutputPath("dill"), conda_env_path: OutputPath("dill"), kernel: str, ): import dill import pandas as pd from sklearn.svm import SVC from mlflow.models.signature import infer_signature from mlflow.utils.environment import _mlflow_conda_env train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) input_example = train_data.sample(1) with open(input_example_path, "wb") as file_writer: dill.dump(input_example, file_writer) signature = infer_signature(train_data, clf.predict(train_data)) with open(signature_path, "wb") as file_writer: dill.dump(signature, file_writer) conda_env = _mlflow_conda_env( additional_pip_deps=["scikit-learn"], install_mlflow=False ) with open(conda_env_path, "wb") as file_writer: dill.dump(conda_env, file_writer) @partial( create_component_from_func, packages_to_install=["dill", "pandas", "scikit-learn", "mlflow", "boto3"], 
) def upload_sklearn_model_to_mlflow( model_name: str, model_path: InputPath("dill"), input_example_path: InputPath("dill"), signature_path: InputPath("dill"), conda_env_path: InputPath("dill"), ): import os import dill from mlflow.sklearn import save_model from mlflow.tracking.client import MlflowClient os.environ["MLFLOW_S3_ENDPOINT_URL"] = "http://minio-service.kubeflow.svc:9000" os.environ["AWS_ACCESS_KEY_ID"] = "minio" os.environ["AWS_SECRET_ACCESS_KEY"] = "minio123" client = MlflowClient("http://mlflow-server-service.mlflow-system.svc:5000") with open(model_path, mode="rb") as file_reader: clf = dill.load(file_reader) with open(input_example_path, "rb") as file_reader: input_example = dill.load(file_reader) with open(signature_path, "rb") as file_reader: signature = dill.load(file_reader) with open(conda_env_path, "rb") as file_reader: conda_env = dill.load(file_reader) save_model( sk_model=clf, path=model_name, serialization_format="cloudpickle", conda_env=conda_env, signature=signature, input_example=input_example, ) run = client.create_run(experiment_id="0") client.log_artifact(run.info.run_id, model_name) from kfp.dsl import pipeline @pipeline(name="multi_model_pipeline") def multi_model_pipeline(kernel: str = "rbf"): iris_data = load_iris_data() scaled_data = train_scaler_from_csv(data=iris_data.outputs["data"]) _ = upload_sklearn_model_to_mlflow( model_name="scaler", model=scaled_data.outputs["model"], input_example=scaled_data.outputs["input_example"], signature=scaled_data.outputs["signature"], conda_env=scaled_data.outputs["conda_env"], ) model = train_svc_from_csv( train_data=scaled_data.outputs["scaled_data"], train_target=iris_data.outputs["target"], kernel=kernel, ) _ = upload_sklearn_model_to_mlflow( model_name="svc", model=model.outputs["model"], input_example=model.outputs["input_example"], signature=model.outputs["signature"], conda_env=model.outputs["conda_env"], ) if __name__ == "__main__": 
kfp.compiler.Compiler().compile(multi_model_pipeline, "multi_model_pipeline.yaml") ``` 파이프라인을 업로드하면 다음과 같이 나옵니다. ![children-kubeflow.png](./img/children-kubeflow.png) MLflow 대시보드를 확인하면 다음과 같이 두 개의 모델이 생성됩니다. ![children-mlflow.png](./img/children-mlflow.png) 각각의 run_id를 확인 후 다음과 같이 SeldonDeployment 스펙을 정의합니다. ```bash apiVersion: machinelearning.seldon.io/v1 kind: SeldonDeployment metadata: name: multi-model-example namespace: kubeflow-user-example-com spec: name: model predictors: - name: model componentSpecs: - spec: volumes: - name: model-provision-location emptyDir: {} initContainers: - name: scaler-initializer image: gcr.io/kfserving/storage-initializer:v0.4.0 args: - "s3://mlflow/mlflow/artifacts/0/7f445015a0e94519b003d316478766ef/artifacts/scaler" - "/mnt/models" volumeMounts: - mountPath: /mnt/models name: model-provision-location envFrom: - secretRef: name: seldon-init-container-secret - name: svc-initializer image: gcr.io/kfserving/storage-initializer:v0.4.0 args: - "s3://mlflow/mlflow/artifacts/0/87eb168e76264b39a24b0e5ca0fe922b/artifacts/svc" - "/mnt/models" volumeMounts: - mountPath: /mnt/models name: model-provision-location envFrom: - secretRef: name: seldon-init-container-secret containers: - name: scaler image: seldonio/mlflowserver:1.8.0-dev volumeMounts: - mountPath: /mnt/models name: model-provision-location readOnly: true securityContext: privileged: true runAsUser: 0 runAsGroup: 0 - name: svc image: seldonio/mlflowserver:1.8.0-dev volumeMounts: - mountPath: /mnt/models name: model-provision-location readOnly: true securityContext: privileged: true runAsUser: 0 runAsGroup: 0 graph: name: scaler type: MODEL parameters: - name: model_uri type: STRING value: "/mnt/models" - name: predict_method type: STRING value: "transform" children: - name: svc type: MODEL parameters: - name: model_uri type: STRING value: "/mnt/models" ``` 모델이 두 개가 되었으므로 각 모델의 initContainer와 container를 정의해주어야 합니다. 이 필드는 입력값을 array로 받으며 순서는 관계없습니다. 모델이 실행하는 순서는 graph에서 정의됩니다. 
```bash graph: name: scaler type: MODEL parameters: - name: model_uri type: STRING value: "/mnt/models" - name: predict_method type: STRING value: "transform" children: - name: svc type: MODEL parameters: - name: model_uri type: STRING value: "/mnt/models" ``` graph의 동작 방식은 처음 받은 값을 정해진 predict_method로 변환한 뒤 children으로 정의된 모델에 전달하는 방식입니다. 이 경우 scaler -> svc 로 데이터가 전달됩니다. 이제 위의 스펙을 yaml파일로 생성해 보겠습니다. ```bash cat <<EOF > multi-model.yaml apiVersion: machinelearning.seldon.io/v1 kind: SeldonDeployment metadata: name: multi-model-example namespace: kubeflow-user-example-com spec: name: model predictors: - name: model componentSpecs: - spec: volumes: - name: model-provision-location emptyDir: {} initContainers: - name: scaler-initializer image: gcr.io/kfserving/storage-initializer:v0.4.0 args: - "s3://mlflow/mlflow/artifacts/0/7f445015a0e94519b003d316478766ef/artifacts/scaler" - "/mnt/models" volumeMounts: - mountPath: /mnt/models name: model-provision-location envFrom: - secretRef: name: seldon-init-container-secret - name: svc-initializer image: gcr.io/kfserving/storage-initializer:v0.4.0 args: - "s3://mlflow/mlflow/artifacts/0/87eb168e76264b39a24b0e5ca0fe922b/artifacts/svc" - "/mnt/models" volumeMounts: - mountPath: /mnt/models name: model-provision-location envFrom: - secretRef: name: seldon-init-container-secret containers: - name: scaler image: ghcr.io/mlops-for-all/mlflowserver volumeMounts: - mountPath: /mnt/models name: model-provision-location readOnly: true securityContext: privileged: true runAsUser: 0 runAsGroup: 0 - name: svc image: ghcr.io/mlops-for-all/mlflowserver volumeMounts: - mountPath: /mnt/models name: model-provision-location readOnly: true securityContext: privileged: true runAsUser: 0 runAsGroup: 0 graph: name: scaler type: MODEL parameters: - name: model_uri type: STRING value: "/mnt/models" - name: predict_method type: STRING value: "transform" children: - name: svc type: MODEL parameters: - name: model_uri type: STRING value: "/mnt/models" EOF ``` 
다음 명령어를 통해 API를 생성합니다. ```bash kubectl apply -f multi-model.yaml ``` 정상적으로 수행되면 다음과 같이 출력됩니다. ```bash seldondeployment.machinelearning.seldon.io/multi-model-example created ``` 정상적으로 생성됐는지 확인합니다. ```bash kubectl get po -n kubeflow-user-example-com | grep multi-model-example ``` 정상적으로 생성되면 다음과 비슷한 pod이 생성됩니다. ```bash multi-model-example-model-0-scaler-svc-9955fb795-n9ffw 4/4 Running 0 2m30s ``` ================================================ FILE: docs/api-deployment/seldon-fields.md ================================================ --- title : "4. Seldon Fields" description: "" sidebar_position: 4 contributors: ["Jongseob Jeon"] --- ## How Seldon Core works? Seldon Core가 API 서버를 생성하는 과정을 요약하면 다음과 같습니다. ![seldon-fields-0.png](./img/seldon-fields-0.png) 1. initContainer는 모델 저장소에서 필요한 모델을 다운로드 받습니다. 2. 다운로드받은 모델을 container로 전달합니다. 3. container는 전달받은 모델을 감싼 API 서버를 실행합니다. 4. 생성된 API 서버 주소로 API를 요청하여 모델의 추론 값을 받을 수 있습니다. ## SeldonDeployment Spec Seldon Core를 사용할 때, 주로 사용하게 되는 커스텀 리소스인 SeldonDeployment를 정의하는 yaml 파일은 다음과 같습니다. ```bash apiVersion: machinelearning.seldon.io/v1 kind: SeldonDeployment metadata: name: seldon-example namespace: kubeflow-user-example-com spec: name: model predictors: - name: model componentSpecs: - spec: volumes: - name: model-provision-location emptyDir: {} initContainers: - name: model-initializer image: gcr.io/kfserving/storage-initializer:v0.4.0 args: - "gs://seldon-models/v1.12.0-dev/sklearn/iris" - "/mnt/models" volumeMounts: - mountPath: /mnt/models name: model-provision-location containers: - name: model image: seldonio/sklearnserver:1.8.0-dev volumeMounts: - mountPath: /mnt/models name: model-provision-location readOnly: true securityContext: privileged: true runAsUser: 0 runAsGroup: 0 graph: name: model type: MODEL parameters: - name: model_uri type: STRING value: "/mnt/models" children: [] ``` SeldonDeployment spe 중 `name` 과 `predictors` 필드는 required 필드입니다. `name`은 쿠버네티스 상에서 pod의 구분을 위한 이름으로 크게 영향을 미치지 않습니다. 
`predictors`는 한 개로 구성된 array로 `name`, `componentSpecs` 와 `graph` 가 정의되어야 합니다. 여기서도 `name`은 pod의 구분을 위한 이름으로 크게 영향을 미치지 않습니다. 이제 `componentSpecs` 와 `graph`에서 정의해야 할 필드들에 대해서 알아보겠습니다. ## componentSpecs `componentSpecs` 는 하나로 구성된 array로 `spec` 키값이 정의되어야 합니다. `spec` 에는 `volumes`, `initContainers`, `containers` 의 필드가 정의되어야 합니다. ### volumes ```bash volumes: - name: model-provision-location emptyDir: {} ``` `volumes`은 initContainer에서 다운로드받는 모델을 저장하기 위한 공간을 의미합니다. array로 입력을 받으며 array의 구성 요소는 `name`과 `emptyDir` 입니다. 이 값들은 모델을 다운로드받고 옮길 때 한번 사용되므로 크게 수정하지 않아도 됩니다. ### initContainer ```bash - name: model-initializer image: gcr.io/kfserving/storage-initializer:v0.4.0 args: - "gs://seldon-models/v1.12.0-dev/sklearn/iris" - "/mnt/models" volumeMounts: - mountPath: /mnt/models name: model-provision-location ``` initContainer는 API에서 사용할 모델을 다운로드받는 역할을 합니다. 그래서 사용되는 필드들은 모델 저장소(Model Registry)로부터 데이터를 다운로드받을 때 필요한 정보들을 정해줍니다. initContainer의 값은 n개의 array로 구성되어 있으며 사용하는 모델마다 각각 지정해주어야 합니다. #### name `name`은 쿠버네티스 상의 pod의 이름입니다. 디버깅을 위해 `{model_name}-initializer` 로 사용하길 권장합니다. #### image `image` 는 모델을 다운로드 받기 위해 사용할 이미지 이름입니다. seldon core에서 권장하는 이미지는 크게 두 가지입니다. - gcr.io/kfserving/storage-initializer:v0.4.0 - seldonio/rclone-storage-initializer:1.13.0-dev 각각의 자세한 내용은 다음을 참고 바랍니다. - [kfserving](https://docs.seldon.io/projects/seldon-core/en/latest/servers/kfserving-storage-initializer.html) - [rclone](https://github.com/SeldonIO/seldon-core/tree/master/components/rclone-storage-initializer) *모두의 MLOps* 에서는 kfserving을 사용합니다. #### args ```bash args: - "gs://seldon-models/v1.12.0-dev/sklearn/iris" - "/mnt/models" ``` gcr.io/kfserving/storage-initializer:v0.4.0 도커 이미지가 실행(`run`)될 때 입력받는 argument를 입력합니다. array로 구성되며 첫 번째 array의 값은 다운로드받을 모델의 주소를 적습니다. 두 번째 array의 값은 다운로드받은 모델을 저장할 주소를 적습니다. (seldon core에서는 주로 `/mnt/models`에 저장합니다.) 
### volumeMounts ```bash volumeMounts: - mountPath: /mnt/models name: model-provision-location ``` `volumeMounts`는 volumes에서 설명한 것과 같이 `/mnt/models`를 쿠버네티스 상에서 공유할 수 있도록 볼륨을 붙여주는 필드입니다. 자세한 내용은 [쿠버네티스 Volume](https://kubernetes.io/docs/concepts/storage/volumes/)을 참조 바랍니다. ### container ```bash containers: - name: model image: seldonio/sklearnserver:1.8.0-dev volumeMounts: - mountPath: /mnt/models name: model-provision-location readOnly: true securityContext: privileged: true runAsUser: 0 runAsGroup: 0 ``` container는 실제로 모델이 API 형식으로 실행될 때의 설정을 정의하는 필드입니다. #### name `name`은 쿠버네티스 상의 pod의 이름입니다. 사용하는 모델의 이름을 적습니다. #### image `image` 는 모델을 API로 만드는 데 사용할 이미지입니다. 이미지에는 모델이 로드될 때 필요한 패키지들이 모두 설치되어 있어야 합니다. Seldon Core에서 지원하는 공식 이미지는 다음과 같습니다. - seldonio/sklearnserver - seldonio/mlflowserver - seldonio/xgboostserver - seldonio/tfserving #### volumeMounts ```bash volumeMounts: - mountPath: /mnt/models name: model-provision-location readOnly: true ``` initContainer에서 다운로드받은 데이터가 있는 경로를 알려주는 필드입니다. 이때 모델이 수정되는 것을 방지하기 위해 `readOnly: true`도 같이 주겠습니다. #### securityContext ```bash securityContext: privileged: true runAsUser: 0 runAsGroup: 0 ``` 필요한 패키지를 설치할 때 pod이 권한이 없어서 패키지 설치를 수행하지 못할 수 있습니다. 이를 위해서 root 권한을 부여합니다. (다만 이 작업은 실제 서빙 시 보안 문제가 생길 수 있습니다.) ## graph ```bash graph: name: model type: MODEL parameters: - name: model_uri type: STRING value: "/mnt/models" children: [] ``` 모델이 동작하는 순서를 정의한 필드입니다. ### name 모델 그래프의 이름입니다. container에서 정의된 이름을 사용합니다. ### type type은 크게 4가지가 있습니다. 1. TRANSFORMER 2. MODEL 3. OUTPUT_TRANSFORMER 4. ROUTER 각 type에 대한 자세한 설명은 [Seldon Core Complex Graphs Metadata Example](https://docs.seldon.io/projects/seldon-core/en/latest/examples/graph-metadata.html)을 참조 바랍니다. ### parameters class init 에서 사용되는 값들입니다. sklearnserver에서 필요한 값은 [다음 파일](https://github.com/SeldonIO/seldon-core/blob/master/servers/sklearnserver/sklearnserver/SKLearnServer.py)에서 확인할 수 있습니다. 
```python class SKLearnServer(SeldonComponent): def __init__(self, model_uri: str = None, method: str = "predict_proba"): ``` 코드를 보면 `model_uri`와 `method`를 정의할 수 있습니다. ### children 순서도를 작성할 때 사용됩니다. 자세한 내용은 다음 페이지에서 설명합니다. ================================================ FILE: docs/api-deployment/seldon-iris.md ================================================ --- title : "2. Deploy SeldonDeployment" description: "" sidebar_position: 2 date: 2021-12-22 lastmod: 2021-12-22 contributors: ["Youngcheol Jang", "SeungTae Kim"] --- ## SeldonDeployment를 통해 배포하기 이번에는 학습된 모델이 있을 때 SeldonDeployment를 통해 API Deployment를 해보겠습니다. SeldonDeployment는 쿠버네티스(Kubernetes)에 모델을 REST/gRPC 서버의 형태로 배포하기 위해 정의된 CRD(CustomResourceDefinition)입니다. ### 1. Prerequisites SeldonDeployment 관련된 실습은 seldon-deploy라는 새로운 네임스페이스(namespace)에서 진행하도록 하겠습니다. 네임스페이스를 생성한 뒤, seldon-deploy를 현재 네임스페이스로 설정합니다. ```bash kubectl create namespace seldon-deploy kubectl config set-context --current --namespace=seldon-deploy ``` ### 2. 스펙 정의 SeldonDeployment를 배포하기 위한 yaml 파일을 생성합니다. 이번 페이지에서는 공개된 iris model을 사용하도록 하겠습니다. 이 iris model은 sklearn 프레임워크를 통해 학습되었기 때문에 SKLEARN_SERVER를 사용합니다. ```bash cat <<EOF > iris-sdep.yaml apiVersion: machinelearning.seldon.io/v1alpha2 kind: SeldonDeployment metadata: name: sklearn namespace: seldon-deploy spec: name: iris predictors: - graph: children: [] implementation: SKLEARN_SERVER modelUri: gs://seldon-models/v1.12.0-dev/sklearn/iris name: classifier name: default replicas: 1 EOF ``` yaml 파일을 배포합니다. ```bash kubectl apply -f iris-sdep.yaml ``` 다음 명령어를 통해 정상적으로 배포가 되었는지 확인합니다. ```bash kubectl get pods --selector seldon-app=sklearn-default -n seldon-deploy ``` 모두 Running 이 되면 다음과 비슷한 결과가 출력됩니다. ```bash NAME READY STATUS RESTARTS AGE sklearn-default-0-classifier-5fdfd7bb77-ls9tr 2/2 Running 0 5m ``` ## Ingress URL 이제 배포된 모델에 추론 요청(predict request)를 보내서 추론 결괏값을 받아옵니다. 배포된 API는 다음과 같은 규칙으로 생성됩니다. 
`http://{NODE_IP}:{NODE_PORT}/seldon/{namespace}/{seldon-deployment-name}/api/v1.0/{method-name}/` ### NODE_IP / NODE_PORT [Seldon Core 설치 시, Ambassador를 Ingress Controller로 설정하였으므로](../setup-components/install-components-seldon.md), SeldonDeployment로 생성된 API 서버는 모두 Ambassador의 Ingress gateway를 통해 요청할 수 있습니다. 따라서 우선 Ambassador Ingress Gateway의 url을 환경 변수로 설정합니다. ```bash export NODE_IP=$(kubectl get nodes -o jsonpath='{ $.items[*].status.addresses[?(@.type=="InternalIP")].address }') export NODE_PORT=$(kubectl get service ambassador -n seldon-system -o jsonpath="{.spec.ports[0].nodePort}") ``` 설정된 url을 확인합니다. ```bash echo "NODE_IP"=$NODE_IP echo "NODE_PORT"=$NODE_PORT ``` 다음과 비슷하게 출력되어야 하며, 클라우드 등을 통해 설정할 경우, internal ip 주소가 설정되는 것을 확인할 수 있습니다. ```bash NODE_IP=192.168.0.19 NODE_PORT=30486 ``` ### namespace / seldon-deployment-name SeldonDeployment가 배포된 `namespace`와 `seldon-deployment-name`를 의미합니다. 이는 스펙을 정의할 때 metadata에 정의된 값을 사용합니다. ```bash metadata: name: sklearn namespace: seldon-deploy ``` 위의 예시에서는 `namespace`는 seldon-deploy, `seldon-deployment-name`은 sklearn 입니다. ### method-name SeldonDeployment에서 주로 사용하는 `method-name`은 두 가지가 있습니다. 1. doc 2. predictions 각각의 method의 자세한 사용 방법은 아래에서 설명합니다. ## Using Swagger 우선 doc method를 사용하는 방법입니다. doc method를 이용하면 seldon에서 생성한 swagger에 접속할 수 있습니다. ### 1. Swagger 접속 위에서 설명한 ingress url 규칙에 따라 아래 주소를 통해 swagger에 접근할 수 있습니다. `http://192.168.0.19:30486/seldon/seldon-deploy/sklearn/api/v1.0/doc/` ![iris-swagger1.png](./img/iris-swagger1.png) ### 2. Swagger Predictions 메뉴 선택 UI에서 `/seldon/seldon-deploy/sklearn/api/v1.0/predictions` 메뉴를 선택합니다. ![iris-swagger2.png](./img/iris-swagger2.png) ### 3. *Try it out* 선택 ![iris-swagger3.png](./img/iris-swagger3.png) ### 4. Request body에 data 입력 ![iris-swagger4.png](./img/iris-swagger4.png) 다음 데이터를 입력합니다. ```bash { "data": { "ndarray":[[1.0, 2.0, 5.0, 6.0]] } } ``` ### 5. 추론 결과 확인 `Execute` 버튼을 눌러서 추론 결과를 확인할 수 있습니다. ![iris-swagger5.png](./img/iris-swagger5.png) 정상적으로 수행되면 다음과 같은 추론 결과를 얻습니다. 
```bash { "data": { "names": [ "t:0", "t:1", "t:2" ], "ndarray": [ [ 9.912315378486697e-7, 0.0007015931307746079, 0.9992974156376876 ] ] }, "meta": { "requestPath": { "classifier": "seldonio/sklearnserver:1.11.2" } } } ``` ## Using CLI 또한, curl과 같은 http client CLI 도구를 활용해서도 API 요청을 수행할 수 있습니다. 예를 들어, 다음과 같이 `/predictions`를 요청하면 ```bash curl -X POST http://$NODE_IP:$NODE_PORT/seldon/seldon-deploy/sklearn/api/v1.0/predictions \ -H 'Content-Type: application/json' \ -d '{ "data": { "ndarray": [[1,2,3,4]] } }' ``` 아래와 같은 응답이 정상적으로 출력되는 것을 확인할 수 있습니다. ```bash {"data":{"names":["t:0","t:1","t:2"],"ndarray":[[0.0006985194531162835,0.00366803903943666,0.995633441507447]]},"meta":{"requestPath":{"classifier":"seldonio/sklearnserver:1.11.2"}}} ``` ================================================ FILE: docs/api-deployment/seldon-mlflow.md ================================================ --- title : "5. Model from MLflow" description: "" sidebar_position: 5 contributors: ["Jongseob Jeon"] --- ## Model from MLflow 이번 페이지에서는 [MLflow Component](../kubeflow/advanced-mlflow.md)에서 저장된 모델을 이용해 API를 생성하는 방법에 대해서 알아보겠습니다. ## Secret initContainer가 minio에 접근해서 모델을 다운로드받으려면 credentials가 필요합니다. minio에 접근하기 위한 credentials는 다음과 같습니다. ```bash apiVersion: v1 type: Opaque kind: Secret metadata: name: seldon-init-container-secret namespace: kubeflow-user-example-com data: AWS_ACCESS_KEY_ID: bWluaW8K= AWS_SECRET_ACCESS_KEY: bWluaW8xMjM= AWS_ENDPOINT_URL: aHR0cDovL21pbmlvLm1ha2luYXJvY2tzLmFp USE_SSL: ZmFsc2U= ``` `AWS_ACCESS_KEY_ID` 의 입력값은 `minio`입니다. 다만 secret의 입력값은 인코딩된 값이여야 되기 때문에 실제로 입력되는 값은 다음을 수행후 나오는 값이어야 합니다. data에 입력되어야 하는 값들은 다음과 같습니다. - AWS_ACCESS_KEY_ID: minio - AWS_SECRET_ACCESS_KEY: minio123 - AWS_ENDPOINT_URL: http://minio-service.kubeflow.svc:9000 - USE_SSL: false 인코딩은 다음 명령어를 통해서 할 수 있습니다. ```bash echo -n minio | base64 ``` 그러면 다음과 같은 값이 출력됩니다. ```bash bWluaW8= ``` 인코딩을 전체 값에 대해서 진행하면 다음과 같이 됩니다. 
- AWS_ACCESS_KEY_ID: bWluaW8= - AWS_SECRET_ACCESS_KEY: bWluaW8xMjM= - AWS_ENDPOINT_URL: aHR0cDovL21pbmlvLXNlcnZpY2Uua3ViZWZsb3cuc3ZjOjkwMDA= - USE_SSL: ZmFsc2U= 다음 명령어를 통해 secret을 생성할 수 있는 yaml파일을 생성합니다. ```bash cat < seldon-init-container-secret.yaml apiVersion: v1 kind: Secret metadata: name: seldon-init-container-secret namespace: kubeflow-user-example-com type: Opaque data: AWS_ACCESS_KEY_ID: bWluaW8= AWS_SECRET_ACCESS_KEY: bWluaW8xMjM= AWS_ENDPOINT_URL: aHR0cDovL21pbmlvLXNlcnZpY2Uua3ViZWZsb3cuc3ZjOjkwMDA= USE_SSL: ZmFsc2U= EOF ``` 다음 명령어를 통해 secret을 생성합니다. ```bash kubectl apply -f seldon-init-container-secret.yaml ``` 정상적으로 수행되면 다음과 같이 출력됩니다. ```bash secret/seldon-init-container-secret created ``` ## Seldon Core yaml 이제 Seldon Core를 생성하는 yaml파일을 작성합니다. ```bash apiVersion: machinelearning.seldon.io/v1 kind: SeldonDeployment metadata: name: seldon-example namespace: kubeflow-user-example-com spec: name: model predictors: - name: model componentSpecs: - spec: volumes: - name: model-provision-location emptyDir: {} initContainers: - name: model-initializer image: gcr.io/kfserving/storage-initializer:v0.4.0 args: - "s3://mlflow/mlflow/artifacts/0/74ba8e33994144f599e50b3be176cdb0/artifacts/svc" - "/mnt/models" volumeMounts: - mountPath: /mnt/models name: model-provision-location envFrom: - secretRef: name: seldon-init-container-secret containers: - name: model image: ghcr.io/mlops-for-all/mlflowserver volumeMounts: - mountPath: /mnt/models name: model-provision-location readOnly: true securityContext: privileged: true runAsUser: 0 runAsGroup: 0 graph: name: model type: MODEL parameters: - name: model_uri type: STRING value: "/mnt/models" children: [] ``` 이 전에 작성한 [Seldon Fields](../api-deployment/seldon-fields.md)와 달라진 점은 크게 두 부분입니다. initContainer에 `envFrom` 필드가 추가되었으며 args의 주소가 `s3://mlflow/mlflow/artifacts/0/74ba8e33994144f599e50b3be176cdb0/artifacts/svc` 로 바뀌었습니다. ### args 앞서 args의 첫번째 array는 우리가 다운로드받을 모델의 경로라고 했습니다. 그럼 mlflow에 저장된 모델의 경로는 어떻게 알 수 있을까요? 
다시 mlflow에 들어가서 run을 클릭하고 모델을 누르면 다음과 같이 확인할 수 있습니다. ![seldon-mlflow-0.png](./img/seldon-mlflow-0.png) 이렇게 확인된 경로를 입력하면 됩니다. ### envFrom minio에 접근해서 모델을 다운로드 받는 데 필요한 환경변수를 입력해주는 과정입니다. 앞서 만든 `seldon-init-container-secret`를 이용합니다. ## API 생성 우선 위에서 정의한 스펙을 yaml 파일로 생성하겠습니다. ```bash apiVersion: machinelearning.seldon.io/v1 kind: SeldonDeployment metadata: name: seldon-example namespace: kubeflow-user-example-com spec: name: model predictors: - name: model componentSpecs: - spec: volumes: - name: model-provision-location emptyDir: {} initContainers: - name: model-initializer image: gcr.io/kfserving/storage-initializer:v0.4.0 args: - "s3://mlflow/mlflow/artifacts/0/74ba8e33994144f599e50b3be176cdb0/artifacts/svc" - "/mnt/models" volumeMounts: - mountPath: /mnt/models name: model-provision-location envFrom: - secretRef: name: seldon-init-container-secret containers: - name: model image: ghcr.io/mlops-for-all/mlflowserver volumeMounts: - mountPath: /mnt/models name: model-provision-location readOnly: true securityContext: privileged: true runAsUser: 0 runAsGroup: 0 graph: name: model type: MODEL parameters: - name: model_uri type: STRING value: "/mnt/models" - name: xtype type: STRING value: "dataframe" children: [] EOF ``` seldon pod을 생성합니다. ```bash kubectl apply -f seldon-mlflow.yaml ``` 정상적으로 수행되면 다음과 같이 출력됩니다. ```bash seldondeployment.machinelearning.seldon.io/seldon-example created ``` 이제 pod이 정상적으로 뜰 때까지 기다립니다. ```bash kubectl get po -n kubeflow-user-example-com | grep seldon ``` 다음과 비슷하게 출력되면 정상적으로 API를 생성했습니다. ```bash seldon-example-model-0-model-5c949bd894-c5f28 3/3 Running 0 69s ``` CLI를 이용해 생성된 API에는 다음 request를 통해 실행을 확인할 수 있습니다. ```bash curl -X POST http://$NODE_IP:$NODE_PORT/seldon/seldon-deploy/sklearn/api/v1.0/predictions \ -H 'Content-Type: application/json' \ -d '{ "data": { "ndarray": [ [ 143.0, 0.0, 30.0, 30.0 ] ], "names": [ "sepal length (cm)", "sepal width (cm)", "petal length (cm)", "petal width (cm)" ] } }' ``` 정상적으로 실행될 경우 다음과 같은 결과를 받을 수 있습니다. 
```bash {"data":{"names":[],"ndarray":["Virginica"]},"meta":{"requestPath":{"model":"ghcr.io/mlops-for-all/mlflowserver:e141f57"}}} ``` ================================================ FILE: docs/api-deployment/seldon-pg.md ================================================ --- title : "3. Seldon Monitoring" description: "Prometheus & Grafana 확인하기" sidebar_position: 3 date: 2021-12-24 lastmod: 2021-12-24 contributors: ["Jongseob Jeon"] --- ## Grafana & Prometheus 이제, [지난 페이지](../api-deployment/seldon-iris.md)에서 생성했던 SeldonDeployment 로 API Request 를 반복적으로 수행해보고, 대시보드에 변화가 일어나는지 확인해봅니다. ### 대시보드 [앞서 생성한 대시보드](../setup-components/install-components-pg.md)를 포트 포워딩합니다. ```bash kubectl port-forward svc/seldon-core-analytics-grafana -n seldon-system 8090:80 ``` ### API 요청 [앞서 생성한 Seldon Deployment](../api-deployment/seldon-iris.md#using-cli)에 요청을 **반복해서** 보냅니다. ```bash curl -X POST http://$NODE_IP:$NODE_PORT/seldon/seldon-deploy/sklearn/api/v1.0/predictions \ -H 'Content-Type: application/json' \ -d '{ "data": { "ndarray": [[1,2,3,4]] } }' ``` 그리고 그라파나 대시보드를 확인하면 다음과 같이 Global Request Rate 이 `0 ops` 에서 순간적으로 상승하는 것을 확인할 수 있습니다. ![repeat-raise.png](./img/repeat-raise.png) 이렇게 프로메테우스와 그라파나가 정상적으로 설치된 것을 확인할 수 있습니다. ================================================ FILE: docs/api-deployment/what-is-api-deployment.md ================================================ --- title : "1. What is API Deployment?" description: "" sidebar_position: 1 date: 2021-12-22 lastmod: 2021-12-22 contributors: ["Youngcheol Jang"] --- ## API Deployment란? 머신러닝 모델을 학습한 뒤에는 어떻게 사용해야 할까요? 머신러닝을 학습할 때는 더 높은 성능의 모델이 나오기를 기대하지만, 학습된 모델을 사용하여 추론을 할 때는 빠르고 쉽게 추론 결과를 받아보고 싶을 것입니다. 모델의 추론 결과를 확인하고자 할 때 주피터 노트북이나 파이썬 스크립트를 통해 학습된 모델을 로드한 뒤 추론할 수 있습니다. 그렇지만 이런 방법은 모델이 클수록 모델을 불러오는 데 많은 시간을 소요하게 되어서 비효율적입니다. 또한 이렇게 이용하면 많은 사람이 모델을 이용할 수 없고 학습된 모델이 있는 환경에서밖에 사용할 수 없습니다. 그래서 실제 서비스에서 머신러닝이 사용될 때는 API를 이용해서 학습된 모델을 사용합니다. 모델은 API 서버가 구동되는 환경에서 한 번만 로드가 되며, DNS를 활용하여 외부에서도 쉽게 추론 결과를 받을 수 있고 다른 서비스와 연동할 수 있습니다. 
하지만 모델을 API로 만드는 작업에는 생각보다 많은 부수적인 작업이 필요합니다. 그래서 API로 만드는 작업을 더 쉽게 하기 위해서 Tensorflow와 같은 머신러닝 프레임워크 진영에서는 추론 엔진(Inference engine)을 개발하였습니다. 추론 엔진들을 이용하면 해당 머신러닝 프레임워크로 개발되고 학습된 모델을 불러와 추론이 가능한 API(REST 또는 gRPC)를 생성합니다. 이러한 추론 엔진을 활용하여 구축한 API 서버로 추론하고자 하는 데이터를 담아 요청을 보내면, 추론 엔진이 추론 결과를 응답에 담아 전송하는 것입니다. 대표적으로 다음과 같은 오픈소스 추론 엔진들이 개발되었습니다. - [Tensorflow : Tensorflow Serving](https://github.com/tensorflow/serving) - [PyTorch : Torchserve](https://github.com/pytorch/serve) - [Onnx : Onnx Runtime](https://github.com/microsoft/onnxruntime) 오프소스에서 공식적으로 지원하지는 않지만, 많이 쓰이는 sklearn, xgboost 프레임워크를 위한 추론 엔진도 개발되어 있습니다. 이처럼 모델의 추론 결과를 API의 형태로 받아볼 수 있도록 배포하는 것을 **API Deployment**라고 합니다. ## Serving Framework 위에서 다양한 추론 엔진들이 개발되었다는 사실을 소개해 드렸습니다. 쿠버네티스 환경에서 이러한 추론 엔진들을 사용하여 API Deployment를 한다면 어떤 작업이 필요할까요? 추론 엔진을 배포하기 위한 Deployment, 추론 요청을 보낼 Endpoint를 생성하기 위한 Service, 외부에서의 추론 요청을 추론 엔진으로 보내기 위한 Ingress 등 많은 쿠버네티스 리소스를 배포해 주어야 합니다. 이것 이외에도, 많은 추론 요청이 들어왔을 경우의 스케일 아웃(scale-out), 추론 엔진 상태에 대한 모니터링, 개선된 모델이 나왔을 경우 버전 업데이트 등 추론 엔진을 운영할 때의 요구사항은 한두 가지가 아닙니다. 이러한 많은 요구사항을 처리하기 위해 추론 엔진들을 쿠버네티스 환경 위에서 한 번 더 추상화한 **Serving Framework**들이 개발되었습니다. 개발된 Serving Framework들은 다음과 같은 오픈소스들이 있습니다. - [Seldon Core](https://github.com/SeldonIO/seldon-core) - [Kserve](https://github.com/kserve) - [BentoML](https://github.com/bentoml/BentoML) *모두의 MLOps*에서는 Seldon Core를 사용하여 API Deployment를 하는 과정을 다루어 보도록 하겠습니다. ================================================ FILE: docs/appendix/_category_.json ================================================ { "label": "Appendix", "position": 9, "link": { "type": "generated-index" } } ================================================ FILE: docs/appendix/metallb.md ================================================ --- title: "2. Bare Metal 클러스터용 load balancer metallb 설치" sidebar_position: 2 --- ## MetalLB란? Kubernetes 사용 시 AWS, GCP, Azure 와 같은 클라우드 플랫폼에서는 자체적으로 로드 벨런서(Load Balancer)를 제공해 주지만, 온프레미스 클러스터에서는 로드 벨런싱 기능을 제공하는 모듈을 추가적으로 설치해야 합니다. 
[MetalLB](https://metallb.universe.tf/)는 베어메탈 환경에서 사용할 수 있는 로드 벨런서를 제공하는 오픈소스 프로젝트 입니다. ## 요구사항 | 요구 사항 | 버전 및 내용 | | ------------------------------------------------------------ | ------------------------------------------------------------ | | Kubernetes | 로드 벨런싱 기능이 없는 >= v1.13.0 | | [호환가능한 네트워크 CNI](https://metallb.universe.tf/installation/network-addons/) | Calico, Canal, Cilium, Flannel, Kube-ovn, Kube-router, Weave Net | | IPv4 주소 | MetalLB 배포에 사용 | | BGP 모드를 사용할 경우 | BGP 기능을 지원하는 하나 이상의 라우터 | | 노드 간 포트 TCP/UDP 7946 오픈 | memberlist 요구 사항 ## MetalLB 설치 ### Preparation IPVS 모드에서 kube-proxy를 사용하는 경우 Kubernetes v1.14.2 이후부터는 엄격한 ARP(strictARP) 모드를 사용하도록 설정해야 합니다. Kube-router는 기본적으로 엄격한 ARP를 활성화하므로 서비스 프록시로 사용할 경우에는 이 기능이 필요하지 않습니다. 엄격한 ARP 모드를 적용하기에 앞서, 현재 모드를 확인합니다. ```bash # see what changes would be made, returns nonzero returncode if different kubectl get configmap kube-proxy -n kube-system -o yaml | \ grep strictARP ``` ```bash strictARP: false ``` strictARP: false 가 출력되는 경우 다음을 실행하여 strictARP: true로 변경합니다. (strictARP: true가 이미 출력된다면 다음 커맨드를 수행하지 않으셔도 됩니다.) ```bash # actually apply the changes, returns nonzero returncode on errors only kubectl get configmap kube-proxy -n kube-system -o yaml | \ sed -e "s/strictARP: false/strictARP: true/" | \ kubectl apply -f - -n kube-system ``` 정상적으로 수행되면 다음과 같이 출력됩니다. ```bash Warning: resource configmaps/kube-proxy is missing the kubectl.kubernetes.io/last-applied-configuration annotation which is required by kubectl apply. kubectl apply should only be used on resources created declaratively by either kubectl create --save-config or kubectl apply. The missing annotation will be patched automatically. configmap/kube-proxy configured ``` ### 설치 - Manifest #### 1. MetalLB 를 설치합니다. ```bash kubectl apply -f https://raw.githubusercontent.com/metallb/metallb/v0.11.0/manifests/namespace.yaml kubectl apply -f https://raw.githubusercontent.com/metallb/metallb/v0.11.0/manifests/metallb.yaml ``` #### 2. 
정상 설치 확인 metallb-system namespace 의 2 개의 pod 이 모두 Running 이 될 때까지 기다립니다. ```bash kubectl get pod -n metallb-system ``` 모두 Running 이 되면 다음과 비슷한 결과가 출력됩니다. ```bash NAME READY STATUS RESTARTS AGE controller-7dcc8764f4-8n92q 1/1 Running 1 1m speaker-fnf8l 1/1 Running 1 1m ``` 매니페스트의 구성 요소는 다음과 같습니다. - metallb-system/controller - deployment 로 배포되며, 로드 벨런싱을 수행할 external IP 주소의 할당을 처리하는 역할을 담당합니다. - metallb-system/speaker - daemonset 형태로 배포되며, 외부 트래픽과 서비스를 연결해 네트워크 통신이 가능하도록 구성하는 역할을 담당합니다. 서비스에는 컨트롤러 및 스피커와 구성 요소가 작동하는 데 필요한 RBAC 사용 권한이 포함됩니다. ## Configuration MetalLB 의 로드 벨런싱 정책 설정은 관련 설정 정보를 담은 configmap 을 배포하여 설정할 수 있습니다. MetalLB 에서 구성할 수 있는 모드로는 다음과 같이 2가지가 있습니다. 1. [Layer 2 모드](https://metallb.universe.tf/concepts/layer2/) 2. [BGP 모드](https://metallb.universe.tf/concepts/bgp/) 여기에서는 Layer 2 모드로 진행하겠습니다. ### Layer 2 Configuration Layer 2 모드는 간단하게 사용할 IP 주소의 대역만 설정하면 됩니다. Layer 2 모드를 사용할 경우 워커 노드의 네트워크 인터페이스에 IP를 바인딩 하지 않아도 되는데 로컬 네트워크의 ARP 요청에 직접 응답하여 컴퓨터의 MAC주소를 클라이언트에 제공하는 방식으로 작동하기 때문입니다. 다음 `metallb_config.yaml` 파일은 MetalLB 가 192.168.35.100 ~ 192.168.35.110의 IP에 대한 제어 권한을 제공하고 Layer 2 모드를 구성하는 설정입니다. 클러스터 노드와 클라이언트 노드가 분리된 경우, 192.168.35.100 ~ 192.168.35.110 대역이 클라이언트 노드와 클러스터 노드 모두 접근 가능한 대역이어야 합니다. #### metallb_config.yaml ```bash apiVersion: v1 kind: ConfigMap metadata: namespace: metallb-system name: config data: config: | address-pools: - name: default protocol: layer2 addresses: - 192.168.35.100-192.168.35.110 # IP 대역폭 ``` 위의 설정을 적용합니다. ```test kubectl apply -f metallb_config.yaml ``` 정상적으로 배포하면 다음과 같이 출력됩니다. ```test configmap/config created ``` ## MetalLB 사용 ### Kubeflow Dashboard 먼저 kubeflow의 Dashboard 를 제공하는 istio-system 네임스페이스의 istio-ingressgateway 서비스의 타입을 `LoadBalancer`로 변경하여 MetalLB로부터 로드 벨런싱 기능을 제공받기 전에, 현재 상태를 확인합니다. ```bash kubectl get svc/istio-ingressgateway -n istio-system ``` 해당 서비스의 타입은 ClusterIP이며, External-IP 값은 `none` 인 것을 확인할 수 있습니다. 
```bash NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE istio-ingressgateway ClusterIP 10.103.72.5 15021/TCP,80/TCP,443/TCP,31400/TCP,15443/TCP 4h21m ``` type 을 LoadBalancer 로 변경하고 원하는 IP 주소를 입력하고 싶은 경우 loadBalancerIP 항목을 추가합니다. 추가 하지 않을 경우에는 위에서 설정한 IP 주소풀에서 순차적으로 IP 주소가 배정됩니다. ```bash kubectl edit svc/istio-ingressgateway -n istio-system ``` ```bash spec: clusterIP: 10.103.72.5 clusterIPs: - 10.103.72.5 ipFamilies: - IPv4 ipFamilyPolicy: SingleStack ports: - name: status-port port: 15021 protocol: TCP targetPort: 15021 - name: http2 port: 80 protocol: TCP targetPort: 8080 - name: https port: 443 protocol: TCP targetPort: 8443 - name: tcp port: 31400 protocol: TCP targetPort: 31400 - name: tls port: 15443 protocol: TCP targetPort: 15443 selector: app: istio-ingressgateway istio: ingressgateway sessionAffinity: None type: LoadBalancer # Change ClusterIP to LoadBalancer loadBalancerIP: 192.168.35.100 # Add IP status: loadBalancer: {} ``` 다시 확인을 해보면 External-IP 값이 `192.168.35.100` 인 것을 확인합니다. ```bash kubectl get svc/istio-ingressgateway -n istio-system ``` ```bash NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE istio-ingressgateway LoadBalancer 10.103.72.5 192.168.35.100 15021:31054/TCP,80:30853/TCP,443:30443/TCP,31400:30012/TCP,15443:31650/TCP 5h1m ``` Web Browser 를 열어 [http://192.168.35.100](http://192.168.35.100) 으로 접속하여, 다음과 같은 화면이 출력되는 것을 확인합니다. ![login-after-istio-ingressgateway-setting.png](./img/login-after-istio-ingressgateway-setting.png) ### minio Dashboard 먼저 minio 의 Dashboard 를 제공하는 kubeflow 네임스페이스의 minio-service 서비스의 타입을 LoadBalancer로 변경하여 MetalLB로부터 로드 벨런싱 기능을 제공받기 전에, 현재 상태를 확인합니다. ```bash kubectl get svc/minio-service -n kubeflow ``` 해당 서비스의 타입은 ClusterIP이며, External-IP 값은 `none` 인 것을 확인할 수 있습니다. ```bash NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE minio-service ClusterIP 10.109.209.87 9000/TCP 5h14m ``` type 을 LoadBalancer 로 변경하고 원하는 IP 주소를 입력하고 싶은 경우 loadBalancerIP 항목을 추가합니다. 추가 하지 않을 경우에는 위에서 설정한 IP 주소풀에서 순차적으로 IP 주소가 배정됩니다. 
```bash kubectl edit svc/minio-service -n kubeflow ``` ```bash apiVersion: v1 kind: Service metadata: annotations: kubectl.kubernetes.io/last-applied-configuration: | {"apiVersion":"v1","kind":"Service","metadata":{"annotations":{},"labels":{"application-crd-id":"kubeflow-pipelines"},"name":"minio-ser> creationTimestamp: "2022-01-05T08:44:23Z" labels: application-crd-id: kubeflow-pipelines name: minio-service namespace: kubeflow resourceVersion: "21120" uid: 0053ee28-4f87-47bb-ad6b-7ad68aa29a48 spec: clusterIP: 10.109.209.87 clusterIPs: - 10.109.209.87 ipFamilies: - IPv4 ipFamilyPolicy: SingleStack ports: - name: http port: 9000 protocol: TCP targetPort: 9000 selector: app: minio application-crd-id: kubeflow-pipelines sessionAffinity: None type: LoadBalancer # Change ClusterIP to LoadBalancer loadBalancerIP: 192.168.35.101 # Add IP status: loadBalancer: {} ``` 다시 확인을 해보면 External-IP 값이 `192.168.35.101` 인 것을 확인할 수 있습니다. ```bash kubectl get svc/minio-service -n kubeflow ``` ```bash NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE minio-service LoadBalancer 10.109.209.87 192.168.35.101 9000:31371/TCP 5h21m ``` Web Browser 를 열어 [http://192.168.35.101:9000](http://192.168.35.101:9000) 으로 접속하여, 다음과 같은 화면이 출력되는 것을 확인합니다. ![login-after-minio-setting.png](./img/login-after-minio-setting.png) ### mlflow Dashboard 먼저 mlflow 의 Dashboard 를 제공하는 mlflow-system 네임스페이스의 mlflow-server-service 서비스의 타입을 LoadBalancer로 변경하여 MetalLB로부터 로드 벨런싱 기능을 제공받기 전에, 현재 상태를 확인합니다. ```bash kubectl get svc/mlflow-server-service -n mlflow-system ``` 해당 서비스의 타입은 ClusterIP이며, External-IP 값은 `none` 인 것을 확인할 수 있습니다. ```bash NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE mlflow-server-service ClusterIP 10.111.173.209 5000/TCP 4m50s ``` type 을 LoadBalancer 로 변경하고 원하는 IP 주소를 입력하고 싶은 경우 loadBalancerIP 항목을 추가합니다. 추가 하지 않을 경우에는 위에서 설정한 IP 주소풀에서 순차적으로 IP 주소가 배정됩니다. 
```bash kubectl edit svc/mlflow-server-service -n mlflow-system ``` ```bash apiVersion: v1 kind: Service metadata: annotations: meta.helm.sh/release-name: mlflow-server meta.helm.sh/release-namespace: mlflow-system creationTimestamp: "2022-01-07T04:00:19Z" labels: app.kubernetes.io/managed-by: Helm name: mlflow-server-service namespace: mlflow-system resourceVersion: "276246" uid: e5d39fb7-ad98-47e7-b512-f9c673055356 spec: clusterIP: 10.111.173.209 clusterIPs: - 10.111.173.209 ipFamilies: - IPv4 ipFamilyPolicy: SingleStack ports: - port: 5000 protocol: TCP targetPort: 5000 selector: app.kubernetes.io/name: mlflow-server sessionAffinity: None type: LoadBalancer # Change ClusterIP to LoadBalancer loadBalancerIP: 192.168.35.102 # Add IP status: loadBalancer: {} ``` 다시 확인을 해보면 External-IP 값이 `192.168.35.102` 인 것을 확인할 수 있습니다. ```bash kubectl get svc/mlflow-server-service -n mlflow-system ``` ```bash NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE mlflow-server-service LoadBalancer 10.111.173.209 192.168.35.102 5000:32287/TCP 6m11s ``` Web Browser 를 열어 [http://192.168.35.102:5000](http://192.168.35.102:5000) 으로 접속하여, 다음과 같은 화면이 출력되는 것을 확인합니다. ![login-after-mlflow-setting.png](./img/login-after-mlflow-setting.png) ### Grafana Dashboard 먼저 Grafana 의 Dashboard 를 제공하는 seldon-system 네임스페이스의 seldon-core-analytics-grafana 서비스의 타입을 LoadBalancer로 변경하여 MetalLB로부터 로드 벨런싱 기능을 제공받기 전에, 현재 상태를 확인합니다. ```bash kubectl get svc/seldon-core-analytics-grafana -n seldon-system ``` 해당 서비스의 타입은 ClusterIP이며, External-IP 값은 `none` 인 것을 확인할 수 있습니다. ```bash NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE seldon-core-analytics-grafana ClusterIP 10.109.20.161 80/TCP 94s ``` type 을 LoadBalancer 로 변경하고 원하는 IP 주소를 입력하고 싶은 경우 loadBalancerIP 항목을 추가합니다. 추가 하지 않을 경우에는 위에서 설정한 IP 주소풀에서 순차적으로 IP 주소가 배정됩니다. 
```bash kubectl edit svc/seldon-core-analytics-grafana -n seldon-system ``` ```bash apiVersion: v1 kind: Service metadata: annotations: meta.helm.sh/release-name: seldon-core-analytics meta.helm.sh/release-namespace: seldon-system creationTimestamp: "2022-01-07T04:16:47Z" labels: app.kubernetes.io/instance: seldon-core-analytics app.kubernetes.io/managed-by: Helm app.kubernetes.io/name: grafana app.kubernetes.io/version: 7.0.3 helm.sh/chart: grafana-5.1.4 name: seldon-core-analytics-grafana namespace: seldon-system resourceVersion: "280605" uid: 75073b78-92ec-472c-b0d5-240038ea8fa5 spec: clusterIP: 10.109.20.161 clusterIPs: - 10.109.20.161 ipFamilies: - IPv4 ipFamilyPolicy: SingleStack ports: - name: service port: 80 protocol: TCP targetPort: 3000 selector: app.kubernetes.io/instance: seldon-core-analytics app.kubernetes.io/name: grafana sessionAffinity: None type: LoadBalancer # Change ClusterIP to LoadBalancer loadBalancerIP: 192.168.35.103 # Add IP status: loadBalancer: {} ``` 다시 확인을 해보면 External-IP 값이 `192.168.35.103` 인 것을 확인할 수 있습니다. ```bash kubectl get svc/seldon-core-analytics-grafana -n seldon-system ``` ```bash NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE seldon-core-analytics-grafana LoadBalancer 10.109.20.161 192.168.35.103 80:31191/TCP 5m14s ``` Web Browser 를 열어 [http://192.168.35.103:80](http://192.168.35.103:80) 으로 접속하여, 다음과 같은 화면이 출력되는 것을 확인합니다. ![login-after-grafana-setting.png](./img/login-after-grafana-setting.png) ================================================ FILE: docs/appendix/pyenv.md ================================================ --- title: "1. Python 가상환경 설치" sidebar_position: 1 --- ## 파이썬 가상환경 Python 환경을 사용하다 보면 여러 버전의 Python 환경을 사용하고 싶은 경우나, 여러 프로젝트별 패키지 버전을 따로 관리하고 싶은 경우가 발생합니다. 이처럼 Python 환경 혹은 Python Package 환경을 가상화하여 관리하는 것을 쉽게 도와주는 도구로는 pyenv, conda, virtualenv, venv 등이 존재합니다. 이 중 *모두의 MLOps*에서는 [pyenv](https://github.com/pyenv/pyenv)와 [pyenv-virtualenv](https://github.com/pyenv/pyenv-virtualenv)를 설치하는 방법을 다룹니다. 
pyenv는 Python 버전을 관리하는 것을 도와주며, pyenv-virtualenv는 pyenv의 plugin으로써 파이썬 패키지 환경을 관리하는 것을 도와줍니다. ## pyenv 설치 ### Prerequisites 운영 체제별로 Prerequisites가 다릅니다. [다음 페이지](https://github.com/pyenv/pyenv/wiki#suggested-build-environment)를 참고하여 필수 패키지들을 설치해주시기 바랍니다. ### 설치 - macOS 1. pyenv, pyenv-virtualenv 설치 ```bash brew update brew install pyenv brew install pyenv-virtualenv ``` 2. pyenv 설정 macOS의 경우 카탈리나 버전 이후 기본 shell이 zsh로 변경되었기 때문에 zsh을 사용하는 경우를 가정하였습니다. ```bash echo 'eval "$(pyenv init -)"' >> ~/.zshrc echo 'eval "$(pyenv virtualenv-init -)"' >> ~/.zshrc source ~/.zshrc ``` pyenv 명령이 정상적으로 수행되는지 확인합니다. ```bash pyenv --help ``` ```bash $ pyenv --help Usage: pyenv [] Some useful pyenv commands are: --version Display the version of pyenv activate Activate virtual environment commands List all available pyenv commands deactivate Deactivate virtual environment exec Run an executable with the selected Python version global Set or show the global Python version(s) help Display help for a command hooks List hook scripts for a given pyenv command init Configure the shell environment for pyenv install Install a Python version using python-build local Set or show the local application-specific Python version(s) prefix Display prefix for a Python version rehash Rehash pyenv shims (run this after installing executables) root Display the root directory where versions and shims are kept shell Set or show the shell-specific Python version shims List existing pyenv shims uninstall Uninstall a specific Python version version Show the current Python version(s) and its origin version-file Detect the file that sets the current pyenv version version-name Show the current Python version version-origin Explain how the current Python version is set versions List all Python versions available to pyenv virtualenv Create a Python virtualenv using the pyenv-virtualenv plugin virtualenv-delete Uninstall a specific Python virtualenv virtualenv-init Configure the shell environment for 
pyenv-virtualenv virtualenv-prefix Display real_prefix for a Python virtualenv version virtualenvs List all Python virtualenvs found in `$PYENV_ROOT/versions/*'. whence List all Python versions that contain the given executable which Display the full path to an executable See `pyenv help ' for information on a specific command. For full documentation, see: https://github.com/pyenv/pyenv#readme ``` ### 설치 - Ubuntu 1. pyenv, pyenv-virtualenv 설치 ```bash curl https://pyenv.run | bash ``` 다음과 같은 내용이 출력되면 정상적으로 설치된 것을 의미합니다. ```bash % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 0 0 0 0 0 0 0 0 --:--:-- --:--:-- 0 0 0 0 0 0 0 0 --:--:-- --:--:-- 100 270 100 270 0 0 239 0 0:00:01 0:00:01 --:--:-- 239 Cloning into '/home/mlops/.pyenv'... r ... 중략... ... remote: Enumerating objects: 10, done. remote: Counting objects: 100% (10/10), done. remote: Compressing objects: 100% (6/6), done. remote: Total 10 (delta 1), reused 6 (delta 0), pack-reused 0 Unpacking objects: 100% (10/10), 2.92 KiB | 2.92 MiB/s, done. WARNING: seems you still have not added 'pyenv' to the load path. # See the README for instructions on how to set up # your shell environment for Pyenv. # Load pyenv-virtualenv automatically by adding # the following to ~/.bashrc: eval "$(pyenv virtualenv-init -)" ``` 2. pyenv 설정 기본 shell로 bash shell을 사용하는 경우를 가정하였습니다. bash에서 pyenv와 pyenv-virtualenv 를 사용할 수 있도록 설정합니다. ```bash sudo vi ~/.bashrc ``` 다음 문자열을 입력한 후 저장합니다. ```bash export PATH="$HOME/.pyenv/bin:$PATH" eval "$(pyenv init -)" eval "$(pyenv virtualenv-init -)" ``` shell을 restart 합니다. ```bash exec $SHELL ``` pyenv 명령이 정상적으로 수행되는지 확인합니다. ```bash pyenv --help ``` 다음과 같은 메시지가 출력되면 정상적으로 설정된 것을 의미합니다. 
```bash $ pyenv pyenv 2.2.2 Usage: pyenv [] Some useful pyenv commands are: --version Display the version of pyenv activate Activate virtual environment commands List all available pyenv commands deactivate Deactivate virtual environment doctor Verify pyenv installation and development tools to build pythons. exec Run an executable with the selected Python version global Set or show the global Python version(s) help Display help for a command hooks List hook scripts for a given pyenv command init Configure the shell environment for pyenv install Install a Python version using python-build local Set or show the local application-specific Python version(s) prefix Display prefix for a Python version rehash Rehash pyenv shims (run this after installing executables) root Display the root directory where versions and shims are kept shell Set or show the shell-specific Python version shims List existing pyenv shims uninstall Uninstall a specific Python version version Show the current Python version(s) and its origin version-file Detect the file that sets the current pyenv version version-name Show the current Python version version-origin Explain how the current Python version is set versions List all Python versions available to pyenv virtualenv Create a Python virtualenv using the pyenv-virtualenv plugin virtualenv-delete Uninstall a specific Python virtualenv virtualenv-init Configure the shell environment for pyenv-virtualenv virtualenv-prefix Display real_prefix for a Python virtualenv version virtualenvs List all Python virtualenvs found in `$PYENV_ROOT/versions/*'. whence List all Python versions that contain the given executable which Display the full path to an executable See `pyenv help ' for information on a specific command. For full documentation, see: https://github.com/pyenv/pyenv#readme ``` ## pyenv 사용 ### Python 버전 설치 `pyenv install ` 명령을 통해 원하는 파이썬 버전을 설치할 수 있습니다. 이번 페이지에서는 예시로 kubeflow에서 기본으로 사용하는 파이썬 3.7.12 버전을 설치하겠습니다. 
```bash pyenv install 3.7.12 ``` 정상적으로 설치되면 다음과 같은 메시지가 출력됩니다. ```bash $ pyenv install 3.7.12 Downloading Python-3.7.12.tar.xz... -> https://www.python.org/ftp/python/3.7.12/Python-3.7.12.tar.xz Installing Python-3.7.12... patching file Doc/library/ctypes.rst patching file Lib/test/test_unicode.py patching file Modules/_ctypes/_ctypes.c patching file Modules/_ctypes/callproc.c patching file Modules/_ctypes/ctypes.h patching file setup.py patching file 'Misc/NEWS.d/next/Core and Builtins/2020-06-30-04-44-29.bpo-41100.PJwA6F.rst' patching file Modules/_decimal/libmpdec/mpdecimal.h Installed Python-3.7.12 to /home/mlops/.pyenv/versions/3.7.12 ``` ### Python 가상환경 생성 `pyenv virtualenv <가상환경-이름>` 명령을 통해 원하는 파이썬 버전의 파이썬 가상환경을 생성할 수 있습니다. 예시로 Python 3.7.12 버전의 `demo`라는 이름의 Python 가상환경을 생성하겠습니다. ```bash pyenv virtualenv 3.7.12 demo ``` ```bash $ pyenv virtualenv 3.7.12 demo Looking in links: /tmp/tmpffqys0gv Requirement already satisfied: setuptools in /home/mlops/.pyenv/versions/3.7.12/envs/demo/lib/python3.7/site-packages (47.1.0) Requirement already satisfied: pip in /home/mlops/.pyenv/versions/3.7.12/envs/demo/lib/python3.7/site-packages (20.1.1) ``` ### Python 가상환경 사용 `pyenv activate <가상환경 이름>` 명령을 통해 위와 같은 방식으로 생성한 가상환경을 사용할 수 있습니다. 예시로는 `demo`라는 이름의 Python 가상환경을 사용하겠습니다. ```bash pyenv activate demo ``` 다음과 같이 현재 가상환경의 정보가 shell의 맨 앞에 출력되는 것을 확인할 수 있습니다. Before ```bash mlops@ubuntu:~$ pyenv activate demo ``` After ```bash pyenv-virtualenv: prompt changing will be removed from future release. configure `export PYENV_VIRTUALENV_DISABLE_PROMPT=1' to simulate the behavior. (demo) mlops@ubuntu:~$ ``` ### Python 가상환경 비활성화 `source deactivate` 명령을 통해 현재 사용 중인 가상환경을 비활성화할 수 있습니다. 
```bash source deactivate ``` Before ```bash (demo) mlops@ubuntu:~$ source deactivate ``` After ```bash mlops@ubuntu:~$ ``` ================================================ FILE: docs/further-readings/_category_.json ================================================ { "label": "Further Readings", "position": 8, "link": { "type": "generated-index" } } ================================================ FILE: docs/further-readings/info.md ================================================ --- title: "다루지 못한 것들" date: 2021-12-21 lastmod: 2021-12-21 --- ## MLOps Component [MLOps Concepts](../introduction/component.md)에서 다루었던 컴포넌트를 도식화하면 다음과 같습니다. ![open-stacks-0.png](./img/open-stacks-0.png) 이 중 *모두의 MLOps* 에서 다룬 기술 스택들은 다음과 같습니다. ![open-stacks-1.png](./img/open-stacks-1.png) 보시는 것처럼 아직 우리가 다루지 못한 많은 MLOps 컴포넌트들이 있습니다. 시간 관계상 이번에 모두 다루지는 못했지만, 만약 필요하다면 다음과 같은 오픈소스들을 먼저 참고해보면 좋을 것 같습니다. ![open-stacks-2.png](./img/open-stacks-2.png) 세부 내용은 다음과 같습니다. | Mgmt. | Component | Open Soruce | | -------------------------- | --------------------------- | ------------------------------------- | | Data Mgmt. | Collection | [Kafka](https://kafka.apache.org/) | | | Validation | [Beam](https://beam.apache.org/) | | | Feature Store | [Flink](https://flink.apache.org/) | | ML Model Dev. & Experiment | Modeling | [Jupyter](https://jupyter.org/) | | | Analysis & Experiment Mgmt. | [MLflow](https://mlflow.org/) | | | HPO Tuning & AutoML | [Katib](https://github.com/kubeflow/katib) | | Deploy Mgmt. | Serving Framework | [Seldon Core](https://docs.seldon.io/projects/seldon-core/en/latest/index.html) | | | A/B Test | [Iter8](https://iter8.tools/) | | | Monitoring | [Grafana](https://grafana.com/oss/grafana/), [Prometheus](https://prometheus.io/) | | Process Mgmt. | pipeline | [Kubeflow](https://www.kubeflow.org/) | | | CI/CD | [Github Action](https://docs.github.com/en/actions) | | | Continuous Training | [Argo Events](https://argoproj.github.io/events/) | | Platform Mgmt. | Configuration Mgmt. 
| [Consul](https://www.consul.io/) | | | Code Version Mgmt. | [Github](https://github.com/), [Minio](https://min.io/) | | | Logging | (EFK) [Elastic Search](https://www.elastic.co/kr/elasticsearch/), [Fluentd](https://www.fluentd.org/), [Kibana](https://www.elastic.co/kr/kibana/) | | | Resource Mgmt. | [Kubernetes](https://kubernetes.io/) | ================================================ FILE: docs/introduction/_category_.json ================================================ { "label": "Introduction", "position": 1, "link": { "type": "generated-index" } } ================================================ FILE: docs/introduction/component.md ================================================ --- title : "3. Components of MLOps" description: "Describe MLOps Components" sidebar_position: 3 date: 2021-12-03 lastmod: 2021-12-10 contributors: ["Youngcheol Jang"] --- ## Practitioners guide to MLOps 2021년 5월에 발표된 구글의 [white paper : Practitioners guide to MLOps: A framework for continuous delivery and automation of machine learning](https://services.google.com/fh/files/misc/practitioners_guide_to_mlops_whitepaper.pdf)에서는 MLOps의 핵심 기능들로 다음과 같은 것들을 언급하였습니다. ![mlops-component](./img/mlops-component.png) 각 기능이 어떤 역할을 하는지 살펴보겠습니다. ### 1. Experimentation 실험(Experimentation)은 머신러닝 엔지니어들이 데이터를 분석하고, 프로토타입 모델을 만들며 학습 기능을 구현할 수 있도록 하는 다음과 같은 기능을 제공합니다. - 깃(Git)과 같은 버전 컨트롤 도구와 통합된 노트북(Jupyter Notebook) 환경 제공 - 사용한 데이터, 하이퍼 파라미터, 평가 지표를 포함한 실험 추적 기능 제공 - 데이터와 모델에 대한 분석 및 시각화 기능 제공 ### 2. Data Processing 데이터 처리(Data Processing)는 머신러닝 모델 개발 단계, 지속적인 학습(Continuous Training) 단계, 그리고 API 배포(API Deployment) 단계에서 많은 양의 데이터를 사용할 수 있게 해 주는 다음과 같은 기능을 제공합니다. - 다양한 데이터 소스와 서비스에 호환되는 데이터 커넥터(connector) 기능 제공 - 다양한 형태의 데이터와 호환되는 데이터 인코더(encoder) & 디코더(decoder) 기능 제공 - 다양한 형태의 데이터에 대한 데이터 변환과 피처 엔지니어링(feature engineering) 기능 제공 - 학습과 서빙을 위한 확장 가능한 배치, 스트림 데이터 처리 기능 제공 ### 3. Model training 모델 학습(Model training)은 모델 학습을 위한 알고리즘을 효율적으로 실행시켜주는 다음과 같은 기능을 제공합니다. 
- ML 프레임워크의 실행을 위한 환경 제공 - 다수의 GPU / 분산 학습 사용을 위한 분산 학습 환경 제공 - 하이퍼 파라미터 튜닝과 최적화 기능 제공 ### 4. Model evaluation 모델 평가(Model evaluation)는 실험 환경과 상용 환경에서 동작하는 모델의 성능을 관찰할 수 있는 다음과 같은 기능을 제공합니다. - 평가 데이터에 대한 모델 성능 평가 기능 - 서로 다른 지속 학습 실행 결과에 대한 예측 성능 추적 - 서로 다른 모델의 성능 비교와 시각화 - 해석할 수 있는 AI 기술을 이용한 모델 출력 해석 기능 제공 ### 5. Model serving 모델 서빙(Model serving)은 상용 환경에 모델을 배포하고 서빙하기 위한 다음과 같은 기능들을 제공합니다. - 저 지연 추론과 고가용성 추론 기능 제공 - 다양한 ML 모델 서빙 프레임워크 지원(Tensorflow Serving, TorchServe, NVIDIA Triton, Scikit-learn, XGBoost. etc) - 복잡한 형태의 추론 루틴 기능 제공, 예를 들어 전처리(preprocess) 또는 후처리(postprocess) 기능과 최종 결과를 위해 다수의 모델이 사용되는 경우를 말합니다. - 순간적으로 치솟는 추론 요청을 처리하기 위한 오토 스케일링(autoscaling) 기능 제공 - 추론 요청과 추론 결과에 대한 로깅 기능 제공 ### 6. Online experimentation 온라인 실험(Online experimentation)은 새로운 모델이 생성되었을 때, 이 모델을 배포하면 어느 정도의 성능을 보일 것인지 검증하는 기능을 제공합니다. 이 기능은 새 모델을 배포하는 것까지 연동하기 위해 모델 저장소(Model Registry)와 연동되어야 합니다. - 카나리(canary) & 섀도(shadow) 배포 기능 제공 - A/B 테스트 기능 제공 - 멀티 암드 밴딧(Multi-armed bandit) 테스트 기능 제공 ### 7. Model Monitoring 모델 모니터링(Model Monitoring)은 상용 환경에 배포된 모델이 정상적으로 동작하고 있는지를 모니터링하는 기능을 제공합니다. 예를 들어 모델의 성능이 떨어져 업데이트가 필요한지에 대한 정보 등을 제공합니다. ### 8. ML Pipeline 머신러닝 파이프라인(ML Pipeline)은 상용 환경에서 복잡한 ML 학습과 추론 작업을 구성하고 제어하고 자동화하기 위한 다음과 같은 기능을 제공합니다. - 다양한 이벤트를 소스를 통한 파이프라인 실행 기능 - 파이프라인 파라미터와 생성되는 산출물 관리를 위한 머신러닝 메타데이터 추적과 연동 기능 - 일반적인 머신러닝 작업을 위한 내장 컴포넌트 지원과 사용자가 직접 구현한 컴포넌트에 대한 지원 기능 - 서로 다른 실행 환경 제공 기능 ### 9. Model Registry 모델 저장소(Model Registry)는 머신러닝 모델의 생명 주기(Lifecycle)을 중앙 저장소에서 관리할 수 있게 해 주는 기능을 제공합니다. - 학습된 모델 그리고 배포된 모델에 대한 등록, 추적, 버저닝 기능 제공 - 배포를 위해 필요한 데이터와 런타임 패키지들에 대한 정보 저장 기능 ### 10. Dataset and Feature Repository - 데이터에 대한 공유, 검색, 재사용 그리고 버전 관리 기능 - 이벤트 스트리밍 및 온라인 추론 작업에 대한 실시간 처리 및 저 지연 서빙 기능 - 사진, 텍스트, 테이블 형태의 데이터와 같은 다양한 형태의 데이터 지원 기능 ### 11. ML Metadata and Artifact Tracking MLOps의 각 단계에서는 다양한 형태의 산출물들이 생성됩니다. ML 메타데이터는 이런 산출물들에 대한 정보를 의미합니다. ML 메타데이터와 산출물 관리는 산출물의 위치, 타입, 속성, 그리고 관련된 실험(experiment)에 대한 정보를 관리하기 위해 다음과 같은 기능들을 제공합니다. 
- ML 산출물에 대한 히스토리 관리 기능 - 실험과 파이프라인 파라미터 설정에 대한 추적, 공유 기능 - ML 산출물에 대한 저장, 접근, 시각화, 다운로드 기능 제공 - 다른 MLOps 기능과의 통합 기능 제공 ================================================ FILE: docs/introduction/intro.md ================================================ --- title : "1. What is MLOps?" description: "Introduction to MLOps" sidebar_position: 1 date: 2021-1./img to MLOps" lastmod: 2022-03-05 contributors: ["Jongseob Jeon"] --- ## Machine Learning Project 2012년 Alexnet 이후 CV, NLP를 비롯하여 데이터가 존재하는 도메인이라면 어디서든 머신러닝과 딥러닝을 도입하고자 하였습니다. 딥러닝과 머신러닝은 AI라는 단어로 묶이며 불렸고 많은 매체에서 AI의 필요성을 외쳤습니다. 그리고 무수히 많은 기업에서 머신러닝과 딥러닝을 이용한 수많은 프로젝트를 진행하였습니다. 하지만 그 결과는 어떻게 되었을까요? 엘리먼트 AI의 음병찬 동북아 지역 총괄책임자는 [*"10개 기업에 AI 프로젝트를 시작한다면 그중 9개는 컨셉검증(POC)만 하다 끝난다"*](https://zdnet.co.kr/view/?no=20200611062002)고 말했습니다. 이처럼 많은 프로젝트에서 머신러닝과 딥러닝은 이 문제를 풀 수 있을 것 같다는 가능성만을 보여주고 사라졌습니다. 그리고 이 시기쯤에 [AI에 다시 겨울](https://www.aifutures.org/2021/ai-winter-is-coming/)이 다가오고 있다는 전망도 나오기 시작했습니다. 왜 프로젝트 대부분이 컨셉검증(POC) 단계에서 끝났을까요? 머신러닝과 딥러닝 코드만으로는 실제 서비스를 운영할 수 없기 때문입니다. 실제 서비스 단계에서 머신러닝과 딥러닝의 코드가 차지하는 부분은 생각보다 크지 않기 때문에, 단순히 모델의 성능만이 아닌 다른 많은 부분을 고려해야 합니다. 구글은 이런 문제를 2015년 [Hidden Technical Debt in Machine Learning Systems](https://proceedings.neurips.cc/paper/2015/file/86df7dcfd896fcaf2674f757a2463eba-Paper.pdf)에서 지적한 바 있습니다. 하지만 이 논문이 나올 당시에는 아직 많은 머신러닝 엔지니어들이 딥러닝과 머신러닝의 가능성을 입증하기 바쁜 시기였기 때문에, 논문이 지적하는 바에 많은 주의를 기울이지는 않았습니다. 그리고 몇 년이 지난 후 머신러닝과 딥러닝은 가능성을 입증해내어, 이제 사람들은 실제 서비스에 적용하고자 했습니다. 하지만 곧 많은 사람이 실제 서비스는 쉽지 않다는 것을 깨달았습니다. ## Devops MLOps는 이전에 없던 새로운 개념이 아니라 DevOps라고 불리는 개발 방법론에서 파생된 단어입니다. 그렇기에 DevOps를 이해한다면 MLOps를 이해하는 데 도움이 됩니다. ### DevOps DevOps는 Development(개발)와 Operations(운영)의 합성어로 소프트웨어의 개발(Development)과 운영(Operations)의 합성어로서 소프트웨어 개발자와 정보기술 전문가 간의 소통, 협업 및 통합을 강조하는 개발 환경이나 문화를 말합니다. DevOps의 목적은 소프트웨어 개발 조직과 운영 조직간의 상호 의존적 대응이며 조직이 소프트웨어 제품과 서비스를 빠른 시간에 개발 및 배포하는 것을 목적으로 합니다. ### Silo Effect 그럼 간단한 상황 설명을 통해 DevOps가 왜 필요한지 알아보도록 하겠습니다. 서비스 초기에는 지원하는 기능이 많지 않으며 팀 또는 회사의 규모가 작습니다. 
이때에는 개발팀과 운영팀의 구분이 없거나 작은 규모의 팀으로 구분되어 있습니다. 핵심은 규모가 작다는 것에 있습니다. 이때는 서로 소통할 수 있는 접점이 많고, 집중해야 하는 서비스가 적기 때문에 빠르게 서비스를 개선해 나갈 수 있습니다. 하지만 서비스의 규모가 커질수록 개발팀과 운영팀은 분리되고 서로 소통할 수 있는 채널의 물리적인 한계가 오게 됩니다. 예를 들어서 다른 팀과 함께하는 미팅에 팀원 전체가 미팅을 하는 것이 아니라 각 팀의 팀장 혹은 소수의 시니어만 참석하여 미팅을 진행하게 됩니다. 이런 소통 채널의 한계는 필연적으로 소통의 부재로 이어지게 됩니다. 그러다 보면 개발팀은 새로운 기능들을 계속해서 개발하고 운영팀 입장에서는 개발팀에서 개발한 기능이 배포 시 장애를 일으키는 등 여러 문제가 생기게 됩니다. 위와 같은 상황이 반복되면 조직 이기주의라고 불리는 사일로 현상이 생길 수 있습니다. ![silo](./img/silo.png) > 사일로(silo)는 곡식이나 사료를 저장하는 굴뚝 모양의 창고를 의미한다. 사일로는 독립적으로 존재하며 저장되는 물품이 서로 섞이지 않도록 철저히 관리할 수 있도록 도와준다. > 사일로 효과(Organizational Silos Effect)는 조직 부서 간에 서로 협력하지 않고 내부 이익만을 추구하는 현상을 의미한다. 조직 내에서 개별 부서끼리 서로 담을 쌓고 각자의 이익에만 몰두하는 부서 이기주의를 일컫는다. 사일로 현상은 서비스 품질의 저하로 이어지게 됩니다. 이러한 사일로 현상을 해결하기 위해 나온 것이 바로 DevOps입니다. ### CI/CD Continuous Integration(CI) 와 Continuous Delivery (CD)는 개발팀과 운영팀의 장벽을 해제하기 위한 구체적인 방법입니다. ![cicd](./img/cicd.png) 이 방법을 통해서 개발팀에서는 운영팀의 환경을 이해하고 개발팀에서 개발 중인 기능이 정상적으로 배포까지 이어질 수 있는지 확인합니다. 운영팀은 검증된 기능 또는 개선된 제품을 더 자주 배포해 고객의 제품 경험을 상승시킵니다. 앞에서 설명한 내용을 종합하자면 DevOps는 개발팀과 운영팀 간의 문제가 있었고 이를 해결하기 위한 방법론입니다. ## MLOps ### 1) ML+Ops MLOps는 Machine Learning 과 Operations의 합성어로 DevOps에서 Dev가 ML로 바뀌었습니다. 이제 앞에서 살펴본 DevOps를 통해 MLOps가 무엇인지 짐작해 볼 수 있습니다. “MLOps는 머신러닝팀과 운영팀의 문제를 해결하기 위한 방법입니다.” 이 말은 머신러닝팀과 운영팀 사이에 문제가 발생했다는 의미입니다. 그럼 왜 머신러닝팀과 운영팀에는 문제가 발생했을까요? 두 팀 간의 문제를 알아보기 위해서 추천시스템을 예시로 알아보겠습니다. #### Rule Based 처음 추천시스템을 만드는 경우 간단한 규칙을 기반으로 아이템을 추천합니다. 예를 들어서 1주일간 판매량이 가장 많은 순서대로 보여주는 식의 방식을 이용합니다. 이 방식으로 모델을 정한다면 특별한 이유가 없는 이상 모델의 수정이 필요 없습니다. #### Machine Learning 서비스의 규모가 조금 커지고 로그 데이터가 많이 쌓인다면 이를 이용해 아이템 기반 혹은 유저 기반의 머신러닝 모델을 생성합니다. 이때 모델은 정해진 주기에 따라 모델을 재학습 후 재배포합니다. #### Deep Learning 개인화 추천에 대한 요구가 더 커지고 더 좋은 성능을 내는 모델을 필요해질 경우 딥러닝을 이용한 모델을 개발하기 시작합니다. 이때 만드는 모델은 머신러닝과 같이 정해진 주기에 따라 모델을 재학습 후 재배포합니다. ![graph](./img/graph.png) 위에서 설명한 것을 x축을 모델의 복잡도, y축을 모델의 성능으로 두고 그래프로 표현한다면 다음과 같이 복잡도가 올라갈 때 모델의 성능이 올라가는 상승 관계를 갖습니다. 머신러닝에서 딥러닝으로 넘어갈 머신러닝 팀이 새로 생기게 됩니다. 
만약 관리해야할 모델이 적다면 서로 협업을 통해서 충분히 해결할 수 있지만 개발해야 할 모델이 많아진다면 DevOps의 경우와 같이 사일로 현상이 나타나게 됩니다. DevOps의 목표와 맞춰서 생각해보면 MLOps의 목표는 개발한 모델이 정상적으로 배포될 수 있는지 테스트하는 것입니다. 개발팀에서 개발한 기능이 정상적으로 배포될 수 있는지 확인하는 것이 DevOps의 목표였다면, MLOps의 목표는 머신러닝 팀에서 개발한 모델이 정상적으로 배포될 수 있는지 확인하는 것입니다. ### 2) ML -> Ops 하지만 최근 나오고 있는 MLOps 관련 제품과 설명을 보면 꼭 앞에서 설명한 목표만을 대상으로 하고 있지 않습니다. 어떤 경우에는 머신러닝 팀에서 만든 모델을 이용해 직접 운영을 할 수 있도록 도와주려고 합니다. 이러한 니즈는 최근 머신러닝 프로젝트가 진행되는 과정에서 알 수 있습니다. 추천시스템의 경우 운영에서 간단한 모델부터 시작해 운영할 수 있었습니다. 하지만 자연어, 이미지와 같은 곳에서는 규칙 기반의 모델보다는 딥러닝을 이용해 주어진 태스크를 해결할 수 있는지 검증(POC)를 선행하는 경우가 많습니다. 검증이 끝난 프로젝트는 이제 서비스를 위한 운영 환경을 개발하기 시작합니다. 하지만 머신러닝 팀 내의 자체 역량으로는 이 문제를 해결하기 쉽지 않습니다. 이를 해결하기 위해서 MLOps가 필요한 경우도 있습니다. ### 3) 결론 요약하자면 MLOps는 두 가지 목표가 있습니다. 앞에서 설명한 MLOps는 ML+Ops 로 두 팀의 생산성 향상을 위한 것이였습니다. 반면, 뒤에서 설명한 것은 ML->Ops 로 머신러닝 팀에서 직접 운영을 할 수 있도록 도와주는 것을 말합니다. ================================================ FILE: docs/introduction/levels.md ================================================ --- title : "2. Levels of MLOps" description: "Levels of MLOps" sidebar_position: 2 date: 2021-12-03 lastmod: 2022-03-05 contributors: ["Jongseob Jeon", "Chanmin Cho"] --- 이번 페이지에서는 구글에서 발표한 MLOps의 단계를 보며 MLOps의 핵심 기능은 무엇인지 알아 보겠습니다. ## Hidden Technical Debt in ML System 구글은 무려 2015년부터 MLOps의 필요성을 말했습니다. Hidden Technical Debt in Machine Learning Systems 은 그런 구글의 생각을 담은 논문입니다. ![paper](./img/paper.png) 이 논문의 핵심은 바로 머신러닝을 이용한 제품을 만드는데 있어서 머신러닝 코드는 전체 시스템을 구성하는데 있어서 아주 일부일 뿐이라는 것입니다. ![paper-2](./img/paper-2.png) 구글은 이 논문을 더 발전시켜서 MLOps라는 용어를 만들어 확장시켰습니다. 더 자세한 내용은 [구글 클라우드 홈페이지](https://cloud.google.com/architecture/mlops-continuous-delivery-and-automation-pipelines-in-machine-learning)에서 더 자세한 내용을 확인할 수 있습니다. 이번 포스트에서는 구글에서 말하는 MLOps란 어떤 것인지에 대해서 설명해보고자 합니다. 구글에서는 MLOps의 발전 단계를 총 3(0~2)단계로 나누었습니다. 각 단계들에 대해 설명하기 앞서 이전 포스트에서 설명했던 개념 중 필요한 부분을 다시 한번 보겠습니다. 머신러닝 모델을 운영하기 위해서는 모델을 개발하는 머신러닝 팀과 배포 및 운영을 담당하는 운영팀이 있습니다. 이 두 팀의 원할한 협업을 위해서 MLOps가 필요하게 되었습니다. 
이전에는 간단히 Continuous Integration(CI)/Continuous Deployment(CD)를 통해서 할 수 있다고 하였는데, 어떻게 CI/CD를 하는지에 대해서 알아 보겠습니다. ## 0단계: 수동 프로세스 ![level-0](./img/level-0.png) 0단계에서 두 팀은 “모델”을 통해 소통합니다. 머신 러닝팀은 쌓여있는 데이터로 모델을 학습시키고 학습된 모델을 운영팀에게 전달 합니다. 운영팀은 이렇게 전달받은 모델을 배포합니다. ![toon](./img/toon.png) 초기의 머신 러닝 모델들은 이 “모델” 중심의 소통을 통해 배포합니다. 그런데 이런 배포 방식은 여러 문제가 있습니다. 예를 들어서 어떤 기능에서는 파이썬 3.7을 쓰고 어떤 기능에서는 파이썬 3.8을 쓴다면 다음과 같은 상황을 자주 목격할 수 있습니다. 이러한 상황이 일어나는 이유는 머신러닝 모델의 특성에 있습니다. 학습된 머신러닝 모델이 동작하기 위해서는 3가지가 필요합니다. 1. 파이썬 코드 2. 학습된 가중치 3. 환경 (패키지, 버전 등) 만약 이 3가지 중 한 가지라도 전달이 잘못 된다면 모델이 동작하지 않거나 예상하지 못한 예측을 할수 있습니다. 그런데 많은 경우 환경이 일치하지 않아서 동작하지 않는 경우가 많습니다. 머신러닝은 다양한 오픈소스를 사용하는데 오픈소스는 특성상 어떤 버전을 쓰는지에 따라서 같은 함수라도 결과가 다를 수 있습니다. 이러한 문제는 서비스 초기에는 관리할 모델이 많지 않기 때문에 금방 해결할 수 있습니다. 하지만 관리하는 기능들이 많아지고 서로 소통에 어려움을 겪게 된다면 성능이 더 좋은 모델을 빠르게 배포할 수 없게 됩니다. ## 1단계: ML 파이프라인 자동화 ### Pipeline ![level-1-pipeline](./img/level-1-pipeline.png) 그래서 MLOps에서는 “파이프라인(Pipeline)”을 이용해 이러한 문제를 방지하고자 했습니다. MLOps의 파이프라인은 도커와 같은 컨테이너를 이용해 머신러닝 엔지니어가 모델 개발에 사용한 것과 동일한 환경으로 동작되는 것을 보장합니다. 이를 통해서 환경이 달라서 모델이 동작하지 않는 상황을 방지합니다. 그런데 파이프라인은 범용적인 용어로 여러 다양한 태스크에서 사용됩니다. 머신러닝 엔지니어가 작성하는 파이프라인의 역할은 무엇일까요? 머신러닝 엔지니어가 작성하는 파이프라인은 학습된 모델을 생산합니다. 그래서 파이프라인 대신 학습 파이프라인(Training Pipeline)이 더 정확하다고 볼 수 있습니다. ### Continuous Training ![level-1-ct.png](./img/level-1-ct.png) 그리고 Continuous Training(CT) 개념이 추가됩니다. 그렇다면 CT는 왜 필요할까요? #### Auto Retrain Real World에서 데이터는 Data Shift라는 데이터의 분포가 계속해서 변하는 특징이 있습니다. 그래서 과거에 학습한 모델이 시간이 지남에 따라 모델의 성능이 저하되는 문제가 있습니다. 이 문제를 해결하는 가장 간단하고 효과적인 해결책은 바로 최근 데이터를 이용해 모델을 재학습하는 것입니다. 변화된 데이터 분포에 맞춰서 모델을 재학습하면 다시 준수한 성능을 낼 수 있습니다. #### Auto Deploy 하지만 제조업과 같이 한 공장에서 여러 레시피를 처리하는 경우 무조건 재학습을 하는 것이 좋지 않을 수 도 있습니다. Blind Spot이 대표적인 예입니다. 예를 들어서 자동차 생산 라인에서 모델 A에 대해서 모델을 만들고 이를 이용해 예측을 진행하고 있었습니다. 만약 전혀 다른 모델 B가 들어오면 이전에 보지 못한 데이터 패턴이기 때문에 모델 B에 대해서 새로운 모델을 학습합니다. 이제 모델 B에 대해서 모델을 만들었기 때문에 모델은 예측을 진행할 것 입니다. 그런데 만약 데이터가 다시 모델 A로 바뀐다면 어떻게 할까요? 만약 Retraining 규칙만 있다면 다시 모델 A에 대해서 새로운 모델을 학습하게 됩니다. 
그런데 머신러닝 모델이 충분한 성능을 보이기 위해서는 충분한 양의 데이터가 모여야 합니다. Blind Spot이란 이렇게 데이터를 모으기 위해서 모델이 동작하지 않는 구간을 말합니다. 이러한 Blind Spot을 해결하는 방법은 간단할 수 있습니다. 바로 모델 A에 대한 모델이 과거에 있었는지 확인하고 만약 있었다면 새로운 모델을 바로 학습하기 보다는 이 전 모델을 이용해 다시 예측을 하면 이런 Blind Spot을 해결할 수 있습니다. 이렇게 모델와 같은 메타 데이터를 이용해 모델을 자동으로 변환해주는 것을 Auto Deploy라고 합니다. 정리하자면 CT를 위해서는 Auto Retraining과 Auto Deploy 두 가지 기능이 필요합니다. 둘은 서로의 단점을 보완해 계속해서 모델의 성능을 유지할 수 있게 합니다. ### Model Serving ![level-1-modelserving](./img/level-1-modelserving.png) 프로덕션 환경에서의 머신러닝 파이프라인은 새로운 데이터에 기반한 최신 모델을 예측 서비스에 지속적으로 배포합니다. 이 과정에서, 훈련되고 검증된 모델을 온라인 예측 서비스에 자동적으로 배포하는 작업이 포함됩니다. ## 2단계: CI/CD 파이프라인의 자동화 ![level-2](./img/level-2.png) 2단계의 제목은 CI와 CD의 자동화 입니다. DevOps에서의 CI/CD의 대상은 소스 코드입니다. 그렇다면 MLOps는 어떤 것이 CI/CD의 대상일까요? MLOps의 CI/CD 대상 또한 소스 코드인 것은 맞지만 조금 더 엄밀히 정의하자면 학습 파이프라인이라고 볼 수 있습니다. 그래서 모델을 학습하는데 있어서 영향이 있는 변화에 대해서 실제로 모델이 정상적으로 학습이 되는지 (CI), 학습된 모델이 정상적으로 동작하는지 (CD)를 확인해야 합니다. 그래서 학습을 하는 코드에 직접적인 수정이 있는 경우에는 CI/CD를 진행해야 합니다. 코드 외에도 사용하는 패키지의 버전, 파이썬의 버전 변경도 CI/CD의 대상입니다. 많은 경우 머신 러닝은 오픈 소스를 이용합니다. 하지만 오픈 소스는 그 특성상 버전이 바뀌었을 때 함수의 내부 로직이 변하는 경우도 있습니다. 물론 어느 정도 버전이 올라 갈 때 이와 관련된 알림을 주지만 한 번에 버전이 크게 바뀐다면 이러한 변화를 모를 수도 있습니다. 그래서 사용하는 패키지의 버전이 변하는 경우에도 CI/CD를 통해 정상적으로 모델이 학습, 동작하는지 확인을 해야 합니다. ================================================ FILE: docs/introduction/why_kubernetes.md ================================================ --- title : "4. Why Kubernetes?" description: "Reason for using k8s in MLOps" sidebar_position: 4 date: 2021-12-03 lastmod: 2021-12-10 contributors: ["Jaeyeon Kim"] --- ## MLOps & Kubernetes 그렇다면 MLOps를 이야기할 때, 쿠버네티스(Kubernetes)라는 단어가 항상 함께 들리는 이유가 무엇일까요? 성공적인 MLOps 시스템을 구축하기 위해서는 [MLOps의 구성요소](../introduction/component.md) 에서 설명한 것처럼 다양한 구성 요소들이 필요하지만, 각각의 구성 요소들이 유기적으로 운영되기 위해서는 인프라 레벨에서 수많은 이슈를 해결해야 합니다. 간단하게는 수많은 머신러닝 모델의 학습 요청을 차례대로 실행하는 것, 다른 작업 공간에서도 같은 실행 환경을 보장해야 하는 것, 배포된 서비스에 장애가 생겼을 때 빠르게 대응해야 하는 것 등의 이슈 등을 생각해볼 수 있습니다. 여기서 컨테이너(Container)와 컨테이너 오케스트레이션 시스템(Container Orchestration System)의 필요성이 등장합니다. 
쿠버네티스와 같은 컨테이너 오케스트레이션 시스템을 도입하면 실행 환경의 격리와 관리를 효율적으로 수행할 수 있습니다. 컨테이너 오케스트레이션 시스템을 도입한다면, 머신러닝 모델을 개발하고 배포하는 과정에서 다수의 개발자가 소수의 클러스터를 공유하면서 *'1번 클러스터 사용 중이신가요?', 'GPU 사용 중이던 제 프로세스 누가 죽였나요?', '누가 클러스터에 x 패키지 업데이트했나요?'* 와 같은 상황을 방지할 수 있습니다. ## Container 그렇다면 컨테이너란 무엇일까요? 마이크로소프트에서는 컨테이너를 [다음](https://azure.microsoft.com/ko-kr/overview/what-is-a-container/)과 같이 정의하고 있습니다. > 컨테이너란 : 애플리케이션의 표준화된 이식 가능한 패키징 그런데 왜 머신러닝에서 컨테이너가 필요할까요? 머신러닝 모델들은 운영체제나 Python 실행 환경, 패키지 버전 등에 따라 다르게 동작할 수 있습니다. 이를 방지하기 위해서 머신러닝에 사용된 소스 코드와 함께 종속적인 실행 환경 전체를 **하나로 묶어서(패키징해서)** 공유하고 실행하는 데 활용할 수 있는 기술이 컨테이너라이제이션(Containerization) 기술입니다. 이렇게 패키징된 형태를 컨테이너 이미지라고 부르며, 컨테이너 이미지를 공유함으로써 사용자들은 어떤 시스템에서든 같은 실행 결과를 보장할 수 있게 됩니다. 즉, 단순히 Jupyter Notebook 파일이나, 모델의 소스 코드와 requirements.txt 파일을 공유하는 것이 아닌, 모든 실행 환경이 담긴 컨테이너 이미지를 공유한다면 *"제 노트북에서는 잘 되는데요?"* 와 같은 상황을 피할 수 있습니다. 컨테이너를 처음 접하시는 분들이 흔히 하시는 오해 중 하나는 "**컨테이너 == 도커**"라고 받아들이는 것입니다. 도커는 컨테이너와 같은 의미를 지니는 개념이 아니라, 컨테이너를 띄우거나, 컨테이너 이미지를 만들고 공유하는 것과 같이 컨테이너를 더욱더 쉽고 유연하게 사용할 수 있는 기능을 제공해주는 도구입니다. 정리하자면 컨테이너는 가상화 기술이고, 도커는 가상화 기술의 구현체라고 말할 수 있습니다. 다만, 도커는 여러 컨테이너 가상화 도구 중에서 쉬운 사용성과 높은 효율성을 바탕으로 가장 빠르게 성장하여 대세가 되었기에 컨테이너하면 도커라는 이미지가 자동으로 떠오르게 되었습니다. 이렇게 컨테이너와 도커 생태계가 대세가 되기까지는 다양한 이유가 있지만, 기술적으로 자세한 이야기는 *모두의 MLOps*의 범위를 넘어서기 때문에 다루지는 않겠습니다. 컨테이너 혹은 도커를 처음 들어보시는 분들에게는 *모두의 MLOps*의 내용이 다소 어렵게 느껴질 수 있으므로, [생활코딩](https://opentutorials.org/course/4781), [subicura 님의 개인 블로그 글](https://subicura.com/2017/01/19/docker-guide-for-beginners-1.html) 등의 자료를 먼저 살펴보는 것을 권장합니다. ## Container Orchestration System 그렇다면 컨테이너 오케스트레이션 시스템은 무엇일까요? **오케스트레이션**이라는 단어에서 추측해 볼 수 있듯이, 수많은 컨테이너가 있을 때 컨테이너들이 서로 조화롭게 구동될 수 있도록 지휘하는 시스템에 비유할 수 있습니다. 컨테이너 기반의 시스템에서 서비스는 컨테이너의 형태로 사용자들에게 제공됩니다. 이때 관리해야 할 컨테이너의 수가 적다면 운영 담당자 한 명이서도 충분히 모든 상황에 대응할 수 있습니다. 하지만, 수백 개 이상의 컨테이너가 수 십 대 이상의 클러스터에서 구동되고 있고 장애를 일으키지 않고 항상 정상 동작해야 한다면, 모든 서비스의 정상 동작 여부를 담당자 한 명이 파악하고 이슈에 대응하는 것은 불가능에 가깝습니다. 예를 들면, 모든 서비스가 정상적으로 동작하고 있는지를 계속해서 모니터링(Monitoring)해야 합니다. 
만약, 특정 서비스가 장애를 일으켰다면 여러 컨테이너의 로그를 확인해가며 문제를 파악해야 합니다. 또한, 특정 클러스터나 특정 컨테이너에 작업이 몰리지 않도록 스케줄링(Scheduling)하고 로드 밸런싱(Load Balancing)하며, 스케일링(Scaling)하는 등의 수많은 작업을 담당해야 합니다. 이렇게 수많은 컨테이너의 상태를 지속해서 관리하고 운영하는 과정을 조금이나마 쉽게, 자동으로 할 수 있는 기능을 제공해주는 소프트웨어가 바로 컨테이너 오케스트레이션 시스템입니다. 머신러닝에서는 어떻게 쓰일 수 있을까요? 예를 들어서 GPU가 있어야 하는 딥러닝 학습 코드가 패키징된 컨테이너는 사용 가능한 GPU가 있는 클러스터에서 수행하고, 많은 메모리를 필요로 하는 데이터 전처리 코드가 패키징된 컨테이너는 메모리의 여유가 많은 클러스터에서 수행하고, 학습 중에 클러스터에 문제가 생기면 자동으로 같은 컨테이너를 다른 클러스터로 이동시키고 다시 학습을 진행하는 등의 작업을 사람이 일일이 수행하지 않고, 자동으로 관리하는 시스템을 개발한 뒤 맡기는 것입니다. 집필을 하는 2022년을 기준으로 쿠버네티스는 컨테이너 오케스트레이션 시스템의 사실상의 표준(De facto standard)입니다. CNCF에서 2018년 발표한 [Survey](https://www.cncf.io/blog/2018/08/29/cncf-survey-use-of-cloud-native-technologies-in-production-has-grown-over-200-percent/) 에 따르면 다음 그림과 같이 이미 두각을 나타내고 있었으며, 2019년 발표한 [Survey](https://www.cncf.io/wp-content/uploads/2020/08/CNCF_Survey_Report.pdf)에 따르면 그중 78%가 상용 수준(Production Level)에서 사용하고 있다는 것을 알 수 있습니다. ![k8s-graph](./img/k8s-graph.png) 쿠버네티스 생태계가 이처럼 커지게 된 이유에는 여러 가지 이유가 있습니다. 하지만 도커와 마찬가지로 쿠버네티스 역시 머신러닝 기반의 서비스에서만 사용하는 기술이 아니기에, 자세히 다루기에는 상당히 많은 양의 기술적인 내용을 다루어야 하므로 이번 *모두의 MLOps*에서는 자세한 내용은 생략할 예정입니다. 다만, *모두의 MLOps*에서 앞으로 다룰 내용은 도커와 쿠버네티스에 대한 내용을 어느 정도 알고 계신 분들을 대상으로 작성하였습니다. 따라서 쿠버네티스에 대해 익숙하지 않으신 분들은 다음 [쿠버네티스 공식 문서](https://kubernetes.io/ko/docs/concepts/overview/what-is-kubernetes/), [subicura 님의 개인 블로그 글](https://subicura.com/k8s/) 등의 쉽고 자세한 자료들을 먼저 참고해주시는 것을 권장합니다. ================================================ FILE: docs/kubeflow/_category_.json ================================================ { "label": "Kubeflow", "position": 6, "link": { "type": "generated-index" } } ================================================ FILE: docs/kubeflow/advanced-component.md ================================================ --- title : "8. 
Component - InputPath/OutputPath" description: "" sidebar_position: 8 contributors: ["Jongseob Jeon", "SeungTae Kim"] --- ## Complex Outputs 이번 페이지에서는 [Kubeflow Concepts](../kubeflow/kubeflow-concepts.md#component-contents) 예시로 나왔던 코드를 컴포넌트로 작성해 보겠습니다. ## Component Contents 아래 코드는 [Kubeflow Concepts](../kubeflow/kubeflow-concepts.md#component-contents)에서 사용했던 컴포넌트 콘텐츠입니다. ```python import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) ``` ## Component Wrapper ### Define a standalone Python function 컴포넌트 래퍼에 필요한 Config들과 함께 작성하면 다음과 같이 됩니다. ```python def train_from_csv( train_data_path: str, train_target_path: str, model_path: str, kernel: str, ): import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) ``` [Basic Usage Component](../kubeflow/basic-component)에서 설명할 때 입력과 출력에 대한 타입 힌트를 적어야 한다고 설명 했었습니다. 그런데 만약 json에서 사용할 수 있는 기본 타입이 아닌 dataframe, model와 같이 복잡한 객체들은 어떻게 할까요? 파이썬에서 함수간에 값을 전달할 때, 객체를 반환해도 그 값이 호스트의 메모리에 저장되어 있으므로 다음 함수에서도 같은 객체를 사용할 수 있습니다. 하지만 kubeflow에서 컴포넌트들은 각각 컨테이너 위에서 서로 독립적으로 실행됩니다. 즉, 같은 메모리를 공유하고 있지 않기 때문에, 보통의 파이썬 함수에서 사용하는 방식과 같이 객체를 전달할 수 없습니다. 컴포넌트 간에 넘겨 줄 수 있는 정보는 `json` 으로만 가능합니다. 따라서 Model이나 DataFrame과 같이 json 형식으로 변환할 수 없는 타입의 객체는 다른 방법을 통해야 합니다. Kubeflow에서는 이를 해결하기 위해 json-serializable 하지 않은 타입의 객체는 메모리 대신 파일에 데이터를 저장한 뒤, 그 파일을 이용해 정보를 전달합니다. 저장된 파일의 경로는 str이기 때문에 컴포넌트 간에 전달할 수 있기 때문입니다. 그런데 kubeflow에서는 minio를 이용해 파일을 저장하는데 유저는 실행을 하기 전에는 각 파일의 경로를 알 수 없습니다. 이를 위해서 kubeflow에서는 입력과 출력의 경로와 관련된 매직을 제공하는데 바로 `InputPath`와 `OutputPath` 입니다. 
`InputPath`는 단어 그대로 입력 경로를 `OutputPath` 는 단어 그대로 출력 경로를 의미합니다. 예를 들어서 데이터를 생성하고 반환하는 컴포넌트에서는 `data_path: OutputPath()`를 argument로 만듭니다. 그리고 데이터를 받는 컴포넌트에서는 `data_path: InputPath()`을 argument로 생성합니다. 이렇게 만든 후 파이프라인에서 서로 연결을 하면 kubeflow에서 필요한 경로를 자동으로 생성후 입력해 주기 때문에 더 이상 유저는 경로를 신경쓰지 않고 컴포넌트간의 관계만 신경쓰면 됩니다. 이제 이 내용을 바탕으로 다시 컴포넌트 래퍼를 작성하면 다음과 같이 됩니다. ```python from kfp.components import InputPath, OutputPath def train_from_csv( train_data_path: InputPath("csv"), train_target_path: InputPath("csv"), model_path: OutputPath("dill"), kernel: str, ): import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) ``` InputPath나 OutputPath는 string을 입력할 수 있습니다. 이 string은 입력 또는 출력하려고 하는 파일의 포맷입니다. 그렇다고 꼭 이 포맷으로 파일 형태로 저장이 강제되는 것은 아닙니다. 다만 파이프라인을 컴파일할 때 최소한의 타입 체크를 위한 도우미 역할을 합니다. 만약 파일 포맷이 고정되지 않는다면 입력하지 않으면 됩니다 (타입 힌트 에서 `Any` 와 같은 역할을 합니다). ### Convert to Kubeflow Format 작성한 컴포넌트를 kubeflow에서 사용할 수 있는 포맷으로 변환합니다. ```python from kfp.components import InputPath, OutputPath, create_component_from_func @create_component_from_func def train_from_csv( train_data_path: InputPath("csv"), train_target_path: InputPath("csv"), model_path: OutputPath("dill"), kernel: str, ): import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) ``` ## Rule to use InputPath/OutputPath InputPath나 OutputPath argument는 파이프라인으로 작성할 때 지켜야하는 규칙이 있습니다. ### Load Data Component 위에서 작성한 컴포넌트를 실행하기 위해서는 데이터가 필요하므로 데이터를 생성하는 컴포넌트를 작성합니다. 
```python from functools import partial from kfp.components import InputPath, OutputPath, create_component_from_func @create_component_from_func def load_iris_data( data_path: OutputPath("csv"), target_path: OutputPath("csv"), ): import pandas as pd from sklearn.datasets import load_iris iris = load_iris() data = pd.DataFrame(iris["data"], columns=iris["feature_names"]) target = pd.DataFrame(iris["target"], columns=["target"]) data.to_csv(data_path, index=False) target.to_csv(target_path, index=False) ``` ### Write Pipeline 이제 파이프라인을 작성해 보도록 하겠습니다. ```python from kfp.dsl import pipeline @pipeline(name="complex_pipeline") def complex_pipeline(kernel: str): iris_data = load_iris_data() model = train_from_csv( train_data=iris_data.outputs["data"], train_target=iris_data.outputs["target"], kernel=kernel, ) ``` 한 가지 이상한 점을 확인하셨나요? 바로 입력과 출력에서 받는 argument중 경로와 관련된 것들에 `_path` 접미사가 모두 사라졌습니다. `iris_data.outputs["data_path"]` 가 아닌 `iris_data.outputs["data"]` 으로 접근하는 것을 확인할 수 있습니다. 이는 kubeflow에서 정한 법칙으로 `InputPath` 와 `OutputPath` 으로 생성된 경로들은 파이프라인에서 접근할 때는 `_path` 접미사를 생략하여 접근합니다. 다만 방금 작성한 파이프라인을 업로드할 경우 실행이 되지 않습니다. 이유는 다음 페이지에서 설명합니다. ================================================ FILE: docs/kubeflow/advanced-environment.md ================================================ --- title : "9. Component - Environment" description: "" sidebar_position: 9 contributors: ["Jongseob Jeon"] --- ## Component Environment 앞서 [8. Component - InputPath/OutputPath](../kubeflow/advanced-component.md)에서 작성한 파이프라인을 실행하면 실패하게 됩니다. 왜 실패하는지 알아보고 정상적으로 실행될 수 있도록 수정합니다. ### Convert to Kubeflow Format [앞에서 작성한 컴포넌트](../kubeflow/advanced-component.md#convert-to-kubeflow-format)를 yaml파일로 변환하도록 하겠습니다. 
```python from kfp.components import InputPath, OutputPath, create_component_from_func @create_component_from_func def train_from_csv( train_data_path: InputPath("csv"), train_target_path: InputPath("csv"), model_path: OutputPath("dill"), kernel: str, ): import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) if __name__ == "__main__": train_from_csv.component_spec.save("train_from_csv.yaml") ``` 위의 스크립트를 실행하면 다음과 같은 `train_from_csv.yaml` 파일을 얻을 수 있습니다. ```bash name: Train from csv inputs: - {name: train_data, type: csv} - {name: train_target, type: csv} - {name: model, type: dill} - {name: kernel, type: String} implementation: container: image: python:3.7 command: - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def train_from_csv( train_data_path, train_target_path, model_path, kernel, ): import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) import argparse _parser = argparse.ArgumentParser(prog='Train from csv', description='') _parser.add_argument("--train-data", dest="train_data_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--train-target", dest="train_target_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--model", dest="model_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--kernel", dest="kernel", type=str, required=True, default=argparse.SUPPRESS) _parsed_args = vars(_parser.parse_args()) _outputs = train_from_csv(**_parsed_args) args: - 
--train-data - {inputPath: train_data} - --train-target - {inputPath: train_target} - --model - {inputPath: model} - --kernel - {inputValue: kernel} ``` 앞서 [Basic Usage Component](../kubeflow/basic-component.md#convert-to-kubeflow-format)에서 설명한 내용에 따르면 이 컴포넌트는 다음과 같이 실행됩니다. 1. `docker pull python:3.7` 2. run `command` 하지만 위에서 생성된 컴포넌트를 실행하면 오류가 발생하게 됩니다. 그 이유는 컴포넌트 래퍼가 실행되는 방식에 있습니다. Kubeflow는 쿠버네티스를 이용하기 때문에 컴포넌트 래퍼는 각각 독립된 컨테이너 위에서 컴포넌트 콘텐츠를 실행합니다. 자세히 보면 생성된 만든 `train_from_csv.yaml` 에서 정해진 이미지는 `image: python:3.7` 입니다. 이제 어떤 이유 때문에 실행이 안 되는지 눈치채신 분들도 있을 것입니다. `python:3.7` 이미지에는 우리가 사용하고자 하는 `dill`, `pandas`, `sklearn` 이 설치되어 있지 않습니다. 그러므로 실행할 때 해당 패키지가 존재하지 않는다는 에러와 함께 실행이 안 됩니다. 그럼 어떻게 패키지를 추가할 수 있을까요? ## 패키지 추가 방법 Kubeflow를 변환하는 과정에서 두 가지 방법을 통해 패키지를 추가할 수 있습니다. 1. `base_image` 사용 2. `package_to_install` 사용 컴포넌트를 컴파일할 때 사용했던 함수 `create_component_from_func` 가 어떤 argument들을 받을 수 있는지 확인해 보겠습니다. ```bash def create_component_from_func( func: Callable, output_component_file: Optional[str] = None, base_image: Optional[str] = None, packages_to_install: List[str] = None, annotations: Optional[Mapping[str, str]] = None, ): ``` - `func`: 컴포넌트로 만들 컴포넌트 래퍼 함수 - `base_image`: 컴포넌트 래퍼가 실행할 이미지 - `packages_to_install`: 컴포넌트에서 사용해서 추가로 설치해야 하는 패키지 ### 1. base_image 컴포넌트가 실행되는 순서를 좀 더 자세히 들여다보면 다음과 같습니다. 1. `docker pull base_image` 2. `pip install packages_to_install` 3. run `command` 만약 컴포넌트가 사용하는 base_image에 패키지들이 전부 설치되어 있다면 추가적인 패키지 설치 없이 바로 사용할 수 있습니다. 예를 들어, 이번 페이지에서는 다음과 같은 Dockerfile을 작성하겠습니다. ```dockerfile FROM python:3.7 RUN pip install dill pandas scikit-learn ``` 위의 Dockerfile을 이용해 이미지를 빌드해 보겠습니다. 실습에서 사용해볼 도커 허브는 ghcr입니다. 각자 환경에 맞추어서 도커 허브를 선택 후 업로드하면 됩니다. ```bash docker build . -f Dockerfile -t ghcr.io/mlops-for-all/base-image docker push ghcr.io/mlops-for-all/base-image ``` 이제 base_image를 입력해 보겠습니다. 
```python from functools import partial from kfp.components import InputPath, OutputPath, create_component_from_func @partial( create_component_from_func, base_image="ghcr.io/mlops-for-all/base-image:latest", ) def train_from_csv( train_data_path: InputPath("csv"), train_target_path: InputPath("csv"), model_path: OutputPath("dill"), kernel: str, ): import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) if __name__ == "__main__": train_from_csv.component_spec.save("train_from_csv.yaml") ``` 이제 생성된 컴포넌트를 컴파일하면 다음과 같이 나옵니다. ```bash name: Train from csv inputs: - {name: train_data, type: csv} - {name: train_target, type: csv} - {name: kernel, type: String} outputs: - {name: model, type: dill} implementation: container: image: ghcr.io/mlops-for-all/base-image:latest command: - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def _make_parent_dirs_and_return_path(file_path: str): import os os.makedirs(os.path.dirname(file_path), exist_ok=True) return file_path def train_from_csv( train_data_path, train_target_path, model_path, kernel, ): import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) import argparse _parser = argparse.ArgumentParser(prog='Train from csv', description='') _parser.add_argument("--train-data", dest="train_data_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--train-target", dest="train_target_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--kernel", dest="kernel", type=str, 
required=True, default=argparse.SUPPRESS) _parser.add_argument("--model", dest="model_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) _parsed_args = vars(_parser.parse_args()) _outputs = train_from_csv(**_parsed_args) args: - --train-data - {inputPath: train_data} - --train-target - {inputPath: train_target} - --kernel - {inputValue: kernel} - --model - {outputPath: model} ``` base_image가 우리가 설정한 값으로 바뀐 것을 확인할 수 있습니다. ### 2. packages_to_install 하지만 패키지가 추가될 때마다 docker 이미지를 계속해서 새로 생성하는 작업은 많은 시간이 소요됩니다. 이 때, `packages_to_install` argument 를 사용하면 패키지를 컨테이너에 쉽게 추가할 수 있습니다. ```python from functools import partial from kfp.components import InputPath, OutputPath, create_component_from_func @partial( create_component_from_func, packages_to_install=["dill==0.3.4", "pandas==1.3.4", "scikit-learn==1.0.1"], ) def train_from_csv( train_data_path: InputPath("csv"), train_target_path: InputPath("csv"), model_path: OutputPath("dill"), kernel: str, ): import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) if __name__ == "__main__": train_from_csv.component_spec.save("train_from_csv.yaml") ``` 스크립트를 실행하면 다음과 같은 `train_from_csv.yaml` 파일이 생성됩니다. 
```bash name: Train from csv inputs: - {name: train_data, type: csv} - {name: train_target, type: csv} - {name: kernel, type: String} outputs: - {name: model, type: dill} implementation: container: image: python:3.7 command: - sh - -c - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'dill==0.3.4' 'pandas==1.3.4' 'scikit-learn==1.0.1' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'dill==0.3.4' 'pandas==1.3.4' 'scikit-learn==1.0.1' --user) && "$0" "$@" - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def _make_parent_dirs_and_return_path(file_path: str): import os os.makedirs(os.path.dirname(file_path), exist_ok=True) return file_path def train_from_csv( train_data_path, train_target_path, model_path, kernel, ): import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) import argparse _parser = argparse.ArgumentParser(prog='Train from csv', description='') _parser.add_argument("--train-data", dest="train_data_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--train-target", dest="train_target_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--kernel", dest="kernel", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--model", dest="model_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) _parsed_args = vars(_parser.parse_args()) _outputs = train_from_csv(**_parsed_args) args: - --train-data - {inputPath: train_data} - --train-target - {inputPath: train_target} - --kernel - {inputValue: kernel} - --model - {outputPath: model} ``` 위에 작성한 컴포넌트가 실행되는 순서를 좀 더 자세히 들여다보면 다음과 같습니다. 1. 
`docker pull python:3.7` 2. `pip install dill==0.3.4 pandas==1.3.4 scikit-learn==1.0.1` 3. run `command` 생성된 yaml 파일을 자세히 보면, 다음과 같은 줄이 자동으로 추가되어 필요한 패키지가 설치되기 때문에 오류 없이 정상적으로 실행됩니다. ```bash command: - sh - -c - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'dill==0.3.4' 'pandas==1.3.4' 'scikit-learn==1.0.1' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'dill==0.3.4' 'pandas==1.3.4' 'scikit-learn==1.0.1' --user) && "$0" "$@" ``` ================================================ FILE: docs/kubeflow/advanced-mlflow.md ================================================ --- title : "12. Component - MLFlow" description: "" sidebar_position: 12 date: 2021-12-13 lastmod: 2021-12-20 contributors: ["Jongseob Jeon", "SeungTae Kim"] --- ## MLFlow Component [Advanced Usage Component](../kubeflow/advanced-component.md) 에서 학습한 모델이 API Deployment까지 이어지기 위해서는 MLFlow에 모델을 저장해야 합니다. 이번 페이지에서는 MLFlow에 모델을 저장할 수 있는 컴포넌트를 작성하는 과정을 설명합니다. ## MLFlow in Local MLFlow에서 모델을 저장하고 서빙에서 사용하기 위해서는 다음의 항목들이 필요합니다. - model - signature - input_example - conda_env 파이썬 코드를 통해서 MLFlow에 모델을 저장하는 과정에 대해서 알아보겠습니다. ### 1. 모델 학습 아래 과정은 iris 데이터를 이용해 SVC 모델을 학습하는 과정입니다. ```python import pandas as pd from sklearn.datasets import load_iris from sklearn.svm import SVC iris = load_iris() data = pd.DataFrame(iris["data"], columns=iris["feature_names"]) target = pd.DataFrame(iris["target"], columns=["target"]) clf = SVC(kernel="rbf") clf.fit(data, target) ``` ### 2. MLFlow Infos mlflow에 필요한 정보들을 만드는 과정입니다. ```python from mlflow.models.signature import infer_signature from mlflow.utils.environment import _mlflow_conda_env input_example = data.sample(1) signature = infer_signature(data, clf.predict(data)) conda_env = _mlflow_conda_env(additional_pip_deps=["dill", "pandas", "scikit-learn"]) ``` 각 변수의 내용을 확인하면 다음과 같습니다. 
- `input_example` | sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) | | --- | --- | --- | --- | | 6.7 | 3.1 | 4.4 | 1.4 | - `signature` ```python inputs: ['sepal length (cm)': double, 'sepal width (cm)': double, 'petal length (cm)': double, 'petal width (cm)': double] outputs: [Tensor('int64', (-1,))] ``` - `conda_env` ```python {'name': 'mlflow-env', 'channels': ['conda-forge'], 'dependencies': ['python=3.8.10', 'pip', {'pip': ['mlflow', 'dill', 'pandas', 'scikit-learn']}]} ``` ### 3. Save MLFlow Infos 다음으로 학습한 정보들과 모델을 저장합니다. 학습한 모델이 sklearn 패키지를 이용하기 때문에 `mlflow.sklearn` 을 이용하면 쉽게 모델을 저장할 수 있습니다. ```python from mlflow.sklearn import save_model save_model( sk_model=clf, path="svc", serialization_format="cloudpickle", conda_env=conda_env, signature=signature, input_example=input_example, ) ``` 로컬에서 작업하면 다음과 같은 svc 폴더가 생기며 아래와 같은 파일들이 생성됩니다. ```bash ls svc ``` 위의 명령어를 실행하면 다음의 출력값을 확인할 수 있습니다. ```bash MLmodel conda.yaml input_example.json model.pkl requirements.txt ``` 각 파일을 확인하면 다음과 같습니다. 
- MLmodel ```bash flavors: python_function: env: conda.yaml loader_module: mlflow.sklearn model_path: model.pkl python_version: 3.8.10 sklearn: pickled_model: model.pkl serialization_format: cloudpickle sklearn_version: 1.0.1 saved_input_example_info: artifact_path: input_example.json pandas_orient: split type: dataframe signature: inputs: '[{"name": "sepal length (cm)", "type": "double"}, {"name": "sepal width (cm)", "type": "double"}, {"name": "petal length (cm)", "type": "double"}, {"name": "petal width (cm)", "type": "double"}]' outputs: '[{"type": "tensor", "tensor-spec": {"dtype": "int64", "shape": [-1]}}]' utc_time_created: '2021-12-06 06:52:30.612810' ``` - conda.yaml ```bash channels: - conda-forge dependencies: - python=3.8.10 - pip - pip: - mlflow - dill - pandas - scikit-learn name: mlflow-env ``` - input_example.json ```bash { "columns": [ "sepal length (cm)", "sepal width (cm)", "petal length (cm)", "petal width (cm)" ], "data": [ [6.7, 3.1, 4.4, 1.4] ] } ``` - requirements.txt ```bash mlflow dill pandas scikit-learn ``` - model.pkl ## MLFlow on Server 이제 저장된 모델을 mlflow 서버에 올리는 작업을 해보겠습니다. ```python import mlflow with mlflow.start_run(): mlflow.log_artifact("svc/") ``` 저장하고 `mlruns` 가 생성된 경로에서 `mlflow ui` 명령어를 이용해 mlflow 서버와 대시보드를 띄웁니다. mlflow 대시보드에 접속하여 생성된 run을 클릭하면 다음과 같이 보입니다. ![mlflow-0.png](./img/mlflow-0.png) (해당 화면은 mlflow 버전에 따라 다를 수 있습니다.) ## MLFlow Component 이제 Kubeflow에서 재사용할 수 있는 컴포넌트를 작성해 보겠습니다. 재사용할 수 있는 컴포넌트를 작성하는 방법은 크게 3가지가 있습니다. 1. 모델을 학습하는 컴포넌트에서 필요한 환경을 저장 후 MLFlow 컴포넌트는 업로드만 담당 ![mlflow-1.png](./img/mlflow-1.png) 2. 학습된 모델과 데이터를 MLFlow 컴포넌트에 전달 후 컴포넌트에서 저장과 업로드 담당 ![mlflow-2.png](./img/mlflow-2.png) 3. 모델을 학습하는 컴포넌트에서 저장과 업로드를 담당 ![mlflow-3.png](./img/mlflow-3.png) 저희는 이 중 1번의 접근 방법을 통해 모델을 관리하려고 합니다. 이유는 MLFlow 모델을 업로드하는 코드는 바뀌지 않기 때문에 매번 3번처럼 컴포넌트 작성마다 작성할 필요는 없기 때문입니다. 컴포넌트를 재활용하는 방법은 1번과 2번의 방법으로 가능합니다. 다만 2번의 경우 모델이 학습된 이미지와 패키지들을 전달해야 하므로 결국 컴포넌트에 대한 추가 정보를 전달해야 합니다. 1번의 방법으로 진행하기 위해서는 학습하는 컴포넌트 또한 변경되어야 합니다. 
모델을 저장하는데 필요한 환경들을 저장해주는 코드가 추가되어야 합니다. ```python from functools import partial from kfp.components import InputPath, OutputPath, create_component_from_func @partial( create_component_from_func, packages_to_install=["dill", "pandas", "scikit-learn", "mlflow"], ) def train_from_csv( train_data_path: InputPath("csv"), train_target_path: InputPath("csv"), model_path: OutputPath("dill"), input_example_path: OutputPath("dill"), signature_path: OutputPath("dill"), conda_env_path: OutputPath("dill"), kernel: str, ): import dill import pandas as pd from sklearn.svm import SVC from mlflow.models.signature import infer_signature from mlflow.utils.environment import _mlflow_conda_env train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) input_example = train_data.sample(1) with open(input_example_path, "wb") as file_writer: dill.dump(input_example, file_writer) signature = infer_signature(train_data, clf.predict(train_data)) with open(signature_path, "wb") as file_writer: dill.dump(signature, file_writer) conda_env = _mlflow_conda_env( additional_pip_deps=["dill", "pandas", "scikit-learn"] ) with open(conda_env_path, "wb") as file_writer: dill.dump(conda_env, file_writer) ``` 그리고 MLFlow에 업로드하는 컴포넌트를 작성합니다. 이 때 업로드되는 MLflow의 endpoint를 우리가 설치한 [mlflow service](../setup-components/install-components-mlflow.md) 로 이어지게 설정해주어야 합니다. 이 때 S3 Endpoint의 주소는 MLflow Server 설치 당시 설치한 minio의 [쿠버네티스 서비스 DNS 네임을 활용](https://kubernetes.io/ko/docs/concepts/services-networking/dns-pod-service/)합니다. 해당 service 는 kubeflow namespace에서 minio-service라는 이름으로 생성되었으므로, `http://minio-service.kubeflow.svc:9000` 로 설정합니다. 이와 비슷하게 tracking_uri의 주소는 mlflow server의 쿠버네티스 서비스 DNS 네임을 활용하여, `http://mlflow-server-service.mlflow-system.svc:5000` 로 설정합니다. 
```python from functools import partial from kfp.components import InputPath, create_component_from_func @partial( create_component_from_func, packages_to_install=["dill", "pandas", "scikit-learn", "mlflow", "boto3"], ) def upload_sklearn_model_to_mlflow( model_name: str, model_path: InputPath("dill"), input_example_path: InputPath("dill"), signature_path: InputPath("dill"), conda_env_path: InputPath("dill"), ): import os import dill from mlflow.sklearn import save_model from mlflow.tracking.client import MlflowClient os.environ["MLFLOW_S3_ENDPOINT_URL"] = "http://minio-service.kubeflow.svc:9000" os.environ["AWS_ACCESS_KEY_ID"] = "minio" os.environ["AWS_SECRET_ACCESS_KEY"] = "minio123" client = MlflowClient("http://mlflow-server-service.mlflow-system.svc:5000") with open(model_path, mode="rb") as file_reader: clf = dill.load(file_reader) with open(input_example_path, "rb") as file_reader: input_example = dill.load(file_reader) with open(signature_path, "rb") as file_reader: signature = dill.load(file_reader) with open(conda_env_path, "rb") as file_reader: conda_env = dill.load(file_reader) save_model( sk_model=clf, path=model_name, serialization_format="cloudpickle", conda_env=conda_env, signature=signature, input_example=input_example, ) run = client.create_run(experiment_id="0") client.log_artifact(run.info.run_id, model_name) ``` ## MLFlow Pipeline 이제 작성한 컴포넌트들을 연결해서 파이프라인으로 만들어 보겠습니다. ### Data Component 모델을 학습할 때 쓸 데이터는 sklearn의 iris 입니다. 데이터를 생성하는 컴포넌트를 작성합니다. 
```python from functools import partial from kfp.components import InputPath, OutputPath, create_component_from_func @partial( create_component_from_func, packages_to_install=["pandas", "scikit-learn"], ) def load_iris_data( data_path: OutputPath("csv"), target_path: OutputPath("csv"), ): import pandas as pd from sklearn.datasets import load_iris iris = load_iris() data = pd.DataFrame(iris["data"], columns=iris["feature_names"]) target = pd.DataFrame(iris["target"], columns=["target"]) data.to_csv(data_path, index=False) target.to_csv(target_path, index=False) ``` ### Pipeline 파이프라인 코드는 다음과 같이 작성할 수 있습니다. ```python from kfp.dsl import pipeline @pipeline(name="mlflow_pipeline") def mlflow_pipeline(kernel: str, model_name: str): iris_data = load_iris_data() model = train_from_csv( train_data=iris_data.outputs["data"], train_target=iris_data.outputs["target"], kernel=kernel, ) _ = upload_sklearn_model_to_mlflow( model_name=model_name, model=model.outputs["model"], input_example=model.outputs["input_example"], signature=model.outputs["signature"], conda_env=model.outputs["conda_env"], ) ``` ### Run 위에서 작성된 컴포넌트와 파이프라인을 하나의 파이썬 파일에 정리하면 다음과 같습니다. 
```python from functools import partial import kfp from kfp.components import InputPath, OutputPath, create_component_from_func from kfp.dsl import pipeline @partial( create_component_from_func, packages_to_install=["pandas", "scikit-learn"], ) def load_iris_data( data_path: OutputPath("csv"), target_path: OutputPath("csv"), ): import pandas as pd from sklearn.datasets import load_iris iris = load_iris() data = pd.DataFrame(iris["data"], columns=iris["feature_names"]) target = pd.DataFrame(iris["target"], columns=["target"]) data.to_csv(data_path, index=False) target.to_csv(target_path, index=False) @partial( create_component_from_func, packages_to_install=["dill", "pandas", "scikit-learn", "mlflow"], ) def train_from_csv( train_data_path: InputPath("csv"), train_target_path: InputPath("csv"), model_path: OutputPath("dill"), input_example_path: OutputPath("dill"), signature_path: OutputPath("dill"), conda_env_path: OutputPath("dill"), kernel: str, ): import dill import pandas as pd from sklearn.svm import SVC from mlflow.models.signature import infer_signature from mlflow.utils.environment import _mlflow_conda_env train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) input_example = train_data.sample(1) with open(input_example_path, "wb") as file_writer: dill.dump(input_example, file_writer) signature = infer_signature(train_data, clf.predict(train_data)) with open(signature_path, "wb") as file_writer: dill.dump(signature, file_writer) conda_env = _mlflow_conda_env( additional_pip_deps=["dill", "pandas", "scikit-learn"] ) with open(conda_env_path, "wb") as file_writer: dill.dump(conda_env, file_writer) @partial( create_component_from_func, packages_to_install=["dill", "pandas", "scikit-learn", "mlflow", "boto3"], ) def upload_sklearn_model_to_mlflow( model_name: str, model_path: InputPath("dill"), 
input_example_path: InputPath("dill"), signature_path: InputPath("dill"), conda_env_path: InputPath("dill"), ): import os import dill from mlflow.sklearn import save_model from mlflow.tracking.client import MlflowClient os.environ["MLFLOW_S3_ENDPOINT_URL"] = "http://minio-service.kubeflow.svc:9000" os.environ["AWS_ACCESS_KEY_ID"] = "minio" os.environ["AWS_SECRET_ACCESS_KEY"] = "minio123" client = MlflowClient("http://mlflow-server-service.mlflow-system.svc:5000") with open(model_path, mode="rb") as file_reader: clf = dill.load(file_reader) with open(input_example_path, "rb") as file_reader: input_example = dill.load(file_reader) with open(signature_path, "rb") as file_reader: signature = dill.load(file_reader) with open(conda_env_path, "rb") as file_reader: conda_env = dill.load(file_reader) save_model( sk_model=clf, path=model_name, serialization_format="cloudpickle", conda_env=conda_env, signature=signature, input_example=input_example, ) run = client.create_run(experiment_id="0") client.log_artifact(run.info.run_id, model_name) @pipeline(name="mlflow_pipeline") def mlflow_pipeline(kernel: str, model_name: str): iris_data = load_iris_data() model = train_from_csv( train_data=iris_data.outputs["data"], train_target=iris_data.outputs["target"], kernel=kernel, ) _ = upload_sklearn_model_to_mlflow( model_name=model_name, model=model.outputs["model"], input_example=model.outputs["input_example"], signature=model.outputs["signature"], conda_env=model.outputs["conda_env"], ) if __name__ == "__main__": kfp.compiler.Compiler().compile(mlflow_pipeline, "mlflow_pipeline.yaml") ```

mlflow_pipeline.yaml ```bash apiVersion: argoproj.io/v1alpha1 kind: Workflow metadata: generateName: mlflow-pipeline- annotations: {pipelines.kubeflow.org/kfp_sdk_version: 1.8.10, pipelines.kubeflow.org/pipeline_compilation_time: '2022-01-19T14:14:11.999807', pipelines.kubeflow.org/pipeline_spec: '{"inputs": [{"name": "kernel", "type": "String"}, {"name": "model_name", "type": "String"}], "name": "mlflow_pipeline"}'} labels: {pipelines.kubeflow.org/kfp_sdk_version: 1.8.10} spec: entrypoint: mlflow-pipeline templates: - name: load-iris-data container: args: [--data, /tmp/outputs/data/data, --target, /tmp/outputs/target/data] command: - sh - -c - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'pandas' 'scikit-learn' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'pandas' 'scikit-learn' --user) && "$0" "$@" - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def _make_parent_dirs_and_return_path(file_path: str): import os os.makedirs(os.path.dirname(file_path), exist_ok=True) return file_path def load_iris_data( data_path, target_path, ): import pandas as pd from sklearn.datasets import load_iris iris = load_iris() data = pd.DataFrame(iris["data"], columns=iris["feature_names"]) target = pd.DataFrame(iris["target"], columns=["target"]) data.to_csv(data_path, index=False) target.to_csv(target_path, index=False) import argparse _parser = argparse.ArgumentParser(prog='Load iris data', description='') _parser.add_argument("--data", dest="data_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) _parser.add_argument("--target", dest="target_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) _parsed_args = vars(_parser.parse_args()) _outputs = load_iris_data(**_parsed_args) image: python:3.7 outputs: artifacts: - {name: load-iris-data-data, path: 
/tmp/outputs/data/data} - {name: load-iris-data-target, path: /tmp/outputs/target/data} metadata: labels: pipelines.kubeflow.org/kfp_sdk_version: 1.8.10 pipelines.kubeflow.org/pipeline-sdk-type: kfp pipelines.kubeflow.org/enable_caching: "true" annotations: {pipelines.kubeflow.org/component_spec: '{"implementation": {"container": {"args": ["--data", {"outputPath": "data"}, "--target", {"outputPath": "target"}], "command": ["sh", "-c", "(PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location ''pandas'' ''scikit-learn'' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location ''pandas'' ''scikit-learn'' --user) && \"$0\" \"$@\"", "sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def _make_parent_dirs_and_return_path(file_path: str):\n import os\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\n return file_path\n\ndef load_iris_data(\n data_path,\n target_path,\n):\n import pandas as pd\n from sklearn.datasets import load_iris\n\n iris = load_iris()\n\n data = pd.DataFrame(iris[\"data\"], columns=iris[\"feature_names\"])\n target = pd.DataFrame(iris[\"target\"], columns=[\"target\"])\n\n data.to_csv(data_path, index=False)\n target.to_csv(target_path, index=False)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Load iris data'', description='''')\n_parser.add_argument(\"--data\", dest=\"data_path\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--target\", dest=\"target_path\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs = load_iris_data(**_parsed_args)\n"], "image": "python:3.7"}}, "name": "Load iris data", "outputs": [{"name": "data", "type": "csv"}, {"name": "target", "type": "csv"}]}', pipelines.kubeflow.org/component_ref: '{}'} - name: mlflow-pipeline 
inputs: parameters: - {name: kernel} - {name: model_name} dag: tasks: - {name: load-iris-data, template: load-iris-data} - name: train-from-csv template: train-from-csv dependencies: [load-iris-data] arguments: parameters: - {name: kernel, value: '{{inputs.parameters.kernel}}'} artifacts: - {name: load-iris-data-data, from: '{{tasks.load-iris-data.outputs.artifacts.load-iris-data-data}}'} - {name: load-iris-data-target, from: '{{tasks.load-iris-data.outputs.artifacts.load-iris-data-target}}'} - name: upload-sklearn-model-to-mlflow template: upload-sklearn-model-to-mlflow dependencies: [train-from-csv] arguments: parameters: - {name: model_name, value: '{{inputs.parameters.model_name}}'} artifacts: - {name: train-from-csv-conda_env, from: '{{tasks.train-from-csv.outputs.artifacts.train-from-csv-conda_env}}'} - {name: train-from-csv-input_example, from: '{{tasks.train-from-csv.outputs.artifacts.train-from-csv-input_example}}'} - {name: train-from-csv-model, from: '{{tasks.train-from-csv.outputs.artifacts.train-from-csv-model}}'} - {name: train-from-csv-signature, from: '{{tasks.train-from-csv.outputs.artifacts.train-from-csv-signature}}'} - name: train-from-csv container: args: [--train-data, /tmp/inputs/train_data/data, --train-target, /tmp/inputs/train_target/data, --kernel, '{{inputs.parameters.kernel}}', --model, /tmp/outputs/model/data, --input-example, /tmp/outputs/input_example/data, --signature, /tmp/outputs/signature/data, --conda-env, /tmp/outputs/conda_env/data] command: - sh - -c - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'dill' 'pandas' 'scikit-learn' 'mlflow' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'dill' 'pandas' 'scikit-learn' 'mlflow' --user) && "$0" "$@" - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def _make_parent_dirs_and_return_path(file_path: str): import os 
os.makedirs(os.path.dirname(file_path), exist_ok=True) return file_path def train_from_csv( train_data_path, train_target_path, model_path, input_example_path, signature_path, conda_env_path, kernel, ): import dill import pandas as pd from sklearn.svm import SVC from mlflow.models.signature import infer_signature from mlflow.utils.environment import _mlflow_conda_env train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) input_example = train_data.sample(1) with open(input_example_path, "wb") as file_writer: dill.dump(input_example, file_writer) signature = infer_signature(train_data, clf.predict(train_data)) with open(signature_path, "wb") as file_writer: dill.dump(signature, file_writer) conda_env = _mlflow_conda_env( additional_pip_deps=["dill", "pandas", "scikit-learn"] ) with open(conda_env_path, "wb") as file_writer: dill.dump(conda_env, file_writer) import argparse _parser = argparse.ArgumentParser(prog='Train from csv', description='') _parser.add_argument("--train-data", dest="train_data_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--train-target", dest="train_target_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--kernel", dest="kernel", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--model", dest="model_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) _parser.add_argument("--input-example", dest="input_example_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) _parser.add_argument("--signature", dest="signature_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) _parser.add_argument("--conda-env", dest="conda_env_path", type=_make_parent_dirs_and_return_path, required=True, 
default=argparse.SUPPRESS) _parsed_args = vars(_parser.parse_args()) _outputs = train_from_csv(**_parsed_args) image: python:3.7 inputs: parameters: - {name: kernel} artifacts: - {name: load-iris-data-data, path: /tmp/inputs/train_data/data} - {name: load-iris-data-target, path: /tmp/inputs/train_target/data} outputs: artifacts: - {name: train-from-csv-conda_env, path: /tmp/outputs/conda_env/data} - {name: train-from-csv-input_example, path: /tmp/outputs/input_example/data} - {name: train-from-csv-model, path: /tmp/outputs/model/data} - {name: train-from-csv-signature, path: /tmp/outputs/signature/data} metadata: labels: pipelines.kubeflow.org/kfp_sdk_version: 1.8.10 pipelines.kubeflow.org/pipeline-sdk-type: kfp pipelines.kubeflow.org/enable_caching: "true" annotations: {pipelines.kubeflow.org/component_spec: '{"implementation": {"container": {"args": ["--train-data", {"inputPath": "train_data"}, "--train-target", {"inputPath": "train_target"}, "--kernel", {"inputValue": "kernel"}, "--model", {"outputPath": "model"}, "--input-example", {"outputPath": "input_example"}, "--signature", {"outputPath": "signature"}, "--conda-env", {"outputPath": "conda_env"}], "command": ["sh", "-c", "(PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location ''dill'' ''pandas'' ''scikit-learn'' ''mlflow'' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location ''dill'' ''pandas'' ''scikit-learn'' ''mlflow'' --user) && \"$0\" \"$@\"", "sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def _make_parent_dirs_and_return_path(file_path: str):\n import os\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\n return file_path\n\ndef train_from_csv(\n train_data_path,\n train_target_path,\n model_path,\n input_example_path,\n signature_path,\n conda_env_path,\n kernel,\n):\n import dill\n import pandas as pd\n from sklearn.svm import SVC\n\n from 
mlflow.models.signature import infer_signature\n from mlflow.utils.environment import _mlflow_conda_env\n\n train_data = pd.read_csv(train_data_path)\n train_target = pd.read_csv(train_target_path)\n\n clf = SVC(kernel=kernel)\n clf.fit(train_data, train_target)\n\n with open(model_path, mode=\"wb\") as file_writer:\n dill.dump(clf, file_writer)\n\n input_example = train_data.sample(1)\n with open(input_example_path, \"wb\") as file_writer:\n dill.dump(input_example, file_writer)\n\n signature = infer_signature(train_data, clf.predict(train_data))\n with open(signature_path, \"wb\") as file_writer:\n dill.dump(signature, file_writer)\n\n conda_env = _mlflow_conda_env(\n additional_pip_deps=[\"dill\", \"pandas\", \"scikit-learn\"]\n )\n with open(conda_env_path, \"wb\") as file_writer:\n dill.dump(conda_env, file_writer)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Train from csv'', description='''')\n_parser.add_argument(\"--train-data\", dest=\"train_data_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--train-target\", dest=\"train_target_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--kernel\", dest=\"kernel\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--model\", dest=\"model_path\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--input-example\", dest=\"input_example_path\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--signature\", dest=\"signature_path\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--conda-env\", dest=\"conda_env_path\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs = train_from_csv(**_parsed_args)\n"], "image": "python:3.7"}}, "inputs": [{"name": 
"train_data", "type": "csv"}, {"name": "train_target", "type": "csv"}, {"name": "kernel", "type": "String"}], "name": "Train from csv", "outputs": [{"name": "model", "type": "dill"}, {"name": "input_example", "type": "dill"}, {"name": "signature", "type": "dill"}, {"name": "conda_env", "type": "dill"}]}', pipelines.kubeflow.org/component_ref: '{}', pipelines.kubeflow.org/arguments.parameters: '{"kernel": "{{inputs.parameters.kernel}}"}'} - name: upload-sklearn-model-to-mlflow container: args: [--model-name, '{{inputs.parameters.model_name}}', --model, /tmp/inputs/model/data, --input-example, /tmp/inputs/input_example/data, --signature, /tmp/inputs/signature/data, --conda-env, /tmp/inputs/conda_env/data] command: - sh - -c - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'dill' 'pandas' 'scikit-learn' 'mlflow' 'boto3' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'dill' 'pandas' 'scikit-learn' 'mlflow' 'boto3' --user) && "$0" "$@" - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def upload_sklearn_model_to_mlflow( model_name, model_path, input_example_path, signature_path, conda_env_path, ): import os import dill from mlflow.sklearn import save_model from mlflow.tracking.client import MlflowClient os.environ["MLFLOW_S3_ENDPOINT_URL"] = "http://minio-service.kubeflow.svc:9000" os.environ["AWS_ACCESS_KEY_ID"] = "minio" os.environ["AWS_SECRET_ACCESS_KEY"] = "minio123" client = MlflowClient("http://mlflow-server-service.mlflow-system.svc:5000") with open(model_path, mode="rb") as file_reader: clf = dill.load(file_reader) with open(input_example_path, "rb") as file_reader: input_example = dill.load(file_reader) with open(signature_path, "rb") as file_reader: signature = dill.load(file_reader) with open(conda_env_path, "rb") as file_reader: conda_env = dill.load(file_reader) save_model( sk_model=clf, path=model_name, 
serialization_format="cloudpickle", conda_env=conda_env, signature=signature, input_example=input_example, ) run = client.create_run(experiment_id="0") client.log_artifact(run.info.run_id, model_name) import argparse _parser = argparse.ArgumentParser(prog='Upload sklearn model to mlflow', description='') _parser.add_argument("--model-name", dest="model_name", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--model", dest="model_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--input-example", dest="input_example_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--signature", dest="signature_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--conda-env", dest="conda_env_path", type=str, required=True, default=argparse.SUPPRESS) _parsed_args = vars(_parser.parse_args()) _outputs = upload_sklearn_model_to_mlflow(**_parsed_args) image: python:3.7 inputs: parameters: - {name: model_name} artifacts: - {name: train-from-csv-conda_env, path: /tmp/inputs/conda_env/data} - {name: train-from-csv-input_example, path: /tmp/inputs/input_example/data} - {name: train-from-csv-model, path: /tmp/inputs/model/data} - {name: train-from-csv-signature, path: /tmp/inputs/signature/data} metadata: labels: pipelines.kubeflow.org/kfp_sdk_version: 1.8.10 pipelines.kubeflow.org/pipeline-sdk-type: kfp pipelines.kubeflow.org/enable_caching: "true" annotations: {pipelines.kubeflow.org/component_spec: '{"implementation": {"container": {"args": ["--model-name", {"inputValue": "model_name"}, "--model", {"inputPath": "model"}, "--input-example", {"inputPath": "input_example"}, "--signature", {"inputPath": "signature"}, "--conda-env", {"inputPath": "conda_env"}], "command": ["sh", "-c", "(PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location ''dill'' ''pandas'' ''scikit-learn'' ''mlflow'' ''boto3'' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip 
install --quiet --no-warn-script-location ''dill'' ''pandas'' ''scikit-learn'' ''mlflow'' ''boto3'' --user) && \"$0\" \"$@\"", "sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def upload_sklearn_model_to_mlflow(\n model_name,\n model_path,\n input_example_path,\n signature_path,\n conda_env_path,\n):\n import os\n import dill\n from mlflow.sklearn import save_model\n\n from mlflow.tracking.client import MlflowClient\n\n os.environ[\"MLFLOW_S3_ENDPOINT_URL\"] = \"http://minio-service.kubeflow.svc:9000\"\n os.environ[\"AWS_ACCESS_KEY_ID\"] = \"minio\"\n os.environ[\"AWS_SECRET_ACCESS_KEY\"] = \"minio123\"\n\n client = MlflowClient(\"http://mlflow-server-service.mlflow-system.svc:5000\")\n\n with open(model_path, mode=\"rb\") as file_reader:\n clf = dill.load(file_reader)\n\n with open(input_example_path, \"rb\") as file_reader:\n input_example = dill.load(file_reader)\n\n with open(signature_path, \"rb\") as file_reader:\n signature = dill.load(file_reader)\n\n with open(conda_env_path, \"rb\") as file_reader:\n conda_env = dill.load(file_reader)\n\n save_model(\n sk_model=clf,\n path=model_name,\n serialization_format=\"cloudpickle\",\n conda_env=conda_env,\n signature=signature,\n input_example=input_example,\n )\n run = client.create_run(experiment_id=\"0\")\n client.log_artifact(run.info.run_id, model_name)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Upload sklearn model to mlflow'', description='''')\n_parser.add_argument(\"--model-name\", dest=\"model_name\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--model\", dest=\"model_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--input-example\", dest=\"input_example_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--signature\", dest=\"signature_path\", type=str, required=True, 
default=argparse.SUPPRESS)\n_parser.add_argument(\"--conda-env\", dest=\"conda_env_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs = upload_sklearn_model_to_mlflow(**_parsed_args)\n"], "image": "python:3.7"}}, "inputs": [{"name": "model_name", "type": "String"}, {"name": "model", "type": "dill"}, {"name": "input_example", "type": "dill"}, {"name": "signature", "type": "dill"}, {"name": "conda_env", "type": "dill"}], "name": "Upload sklearn model to mlflow"}', pipelines.kubeflow.org/component_ref: '{}', pipelines.kubeflow.org/arguments.parameters: '{"model_name": "{{inputs.parameters.model_name}}"}'} arguments: parameters: - {name: kernel} - {name: model_name} serviceAccountName: pipeline-runner ```

실행후 생성된 mlflow_pipeline.yaml 파일을 파이프라인 업로드한 후, 실행하여 run 의 결과를 확인합니다. ![mlflow-svc-0](./img/mlflow-svc-0.png) mlflow service를 포트포워딩해서 MLflow ui에 접속합니다. ```bash kubectl port-forward svc/mlflow-server-service -n mlflow-system 5000:5000 ``` 웹 브라우저를 열어 localhost:5000으로 접속하면, 다음과 같이 run이 생성된 것을 확인할 수 있습니다. ![mlflow-svc-1](./img/mlflow-svc-1.png) run 을 클릭해서 확인하면 학습한 모델 파일이 있는 것을 확인할 수 있습니다. ![mlflow-svc-2](./img/mlflow-svc-2.png) ================================================ FILE: docs/kubeflow/advanced-pipeline.md ================================================ --- title : "10. Pipeline - Setting" description: "" sidebar_position: 10 contributors: ["Jongseob Jeon"] --- ## Pipeline Setting 이번 페이지에서는 파이프라인에서 설정할 수 있는 값들에 대해 알아보겠습니다. ## Display Name 생성된 파이프라인 내에서 컴포넌트는 두 개의 이름을 갖습니다. - task_name: 컴포넌트를 작성할 때 작성한 함수 이름 - display_name: kubeflow UI상에 보이는 이름 예를 들어서 다음과 같은 경우 두 컴포넌트 모두 Print and return number로 설정되어 있어서 어떤 컴포넌트가 1번인지 2번인지 확인하기 어렵습니다. ![run-7](./img/run-7.png) ### set_display_name 이를 위한 것이 바로 display_name 입니다. 설정하는 방법은 파이프라인에서 컴포넌트에 다음과 같이 `set_display_name` [attribute](https://kubeflow-pipelines.readthedocs.io/en/latest/source/kfp.dsl.html#kfp.dsl.ContainerOp.set_display_name)를 이용하면 됩니다. 
```python import kfp from kfp.components import create_component_from_func from kfp.dsl import pipeline @create_component_from_func def print_and_return_number(number: int) -> int: print(number) return number @create_component_from_func def sum_and_print_numbers(number_1: int, number_2: int): print(number_1 + number_2) @pipeline(name="example_pipeline") def example_pipeline(number_1: int, number_2: int): number_1_result = print_and_return_number(number_1).set_display_name("This is number 1") number_2_result = print_and_return_number(number_2).set_display_name("This is number 2") sum_result = sum_and_print_numbers( number_1=number_1_result.output, number_2=number_2_result.output ).set_display_name("This is sum of number 1 and number 2") if __name__ == "__main__": kfp.compiler.Compiler().compile(example_pipeline, "example_pipeline.yaml") ``` 이 스크립트를 실행해서 나온 `example_pipeline.yaml`을 확인하면 다음과 같습니다.

example_pipeline.yaml ```bash apiVersion: argoproj.io/v1alpha1 kind: Workflow metadata: generateName: example-pipeline- annotations: {pipelines.kubeflow.org/kfp_sdk_version: 1.8.9, pipelines.kubeflow.org/pipeline_compilation_time: '2021-12-09T18:11:43.193190', pipelines.kubeflow.org/pipeline_spec: '{"inputs": [{"name": "number_1", "type": "Integer"}, {"name": "number_2", "type": "Integer"}], "name": "example_pipeline"}'} labels: {pipelines.kubeflow.org/kfp_sdk_version: 1.8.9} spec: entrypoint: example-pipeline templates: - name: example-pipeline inputs: parameters: - {name: number_1} - {name: number_2} dag: tasks: - name: print-and-return-number template: print-and-return-number arguments: parameters: - {name: number_1, value: '{{inputs.parameters.number_1}}'} - name: print-and-return-number-2 template: print-and-return-number-2 arguments: parameters: - {name: number_2, value: '{{inputs.parameters.number_2}}'} - name: sum-and-print-numbers template: sum-and-print-numbers dependencies: [print-and-return-number, print-and-return-number-2] arguments: parameters: - {name: print-and-return-number-2-Output, value: '{{tasks.print-and-return-number-2.outputs.parameters.print-and-return-number-2-Output}}'} - {name: print-and-return-number-Output, value: '{{tasks.print-and-return-number.outputs.parameters.print-and-return-number-Output}}'} - name: print-and-return-number container: args: [--number, '{{inputs.parameters.number_1}}', '----output-paths', /tmp/outputs/Output/data] command: - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def print_and_return_number(number): print(number) return number def _serialize_int(int_value: int) -> str: if isinstance(int_value, str): return int_value if not isinstance(int_value, int): raise TypeError('Value "{}" has type "{}" instead of int.'.format( str(int_value), str(type(int_value)))) return str(int_value) import argparse _parser = argparse.ArgumentParser(prog='Print and 
return number', description='') _parser.add_argument("--number", dest="number", type=int, required=True, default=argparse.SUPPRESS) _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1) _parsed_args = vars(_parser.parse_args()) _output_files = _parsed_args.pop("_output_paths", []) _outputs = print_and_return_number(**_parsed_args) _outputs = [_outputs] _output_serializers = [ _serialize_int, ] import os for idx, output_file in enumerate(_output_files): try: os.makedirs(os.path.dirname(output_file)) except OSError: pass with open(output_file, 'w') as f: f.write(_output_serializers[idx](_outputs[idx])) image: python:3.7 inputs: parameters: - {name: number_1} outputs: parameters: - name: print-and-return-number-Output valueFrom: {path: /tmp/outputs/Output/data} artifacts: - {name: print-and-return-number-Output, path: /tmp/outputs/Output/data} metadata: annotations: {pipelines.kubeflow.org/task_display_name: This is number 1, pipelines.kubeflow.org/component_spec: '{"implementation": {"container": {"args": ["--number", {"inputValue": "number"}, "----output-paths", {"outputPath": "Output"}], "command": ["sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def print_and_return_number(number):\n print(number)\n return number\n\ndef _serialize_int(int_value: int) -> str:\n if isinstance(int_value, str):\n return int_value\n if not isinstance(int_value, int):\n raise TypeError(''Value \"{}\" has type \"{}\" instead of int.''.format(\n str(int_value), str(type(int_value))))\n return str(int_value)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Print and return number'', description='''')\n_parser.add_argument(\"--number\", dest=\"number\", type=int, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"----output-paths\", dest=\"_output_paths\", type=str, nargs=1)\n_parsed_args = vars(_parser.parse_args())\n_output_files = 
_parsed_args.pop(\"_output_paths\", [])\n\n_outputs = print_and_return_number(**_parsed_args)\n\n_outputs = [_outputs]\n\n_output_serializers = [\n _serialize_int,\n\n]\n\nimport os\nfor idx, output_file in enumerate(_output_files):\n try:\n os.makedirs(os.path.dirname(output_file))\n except OSError:\n pass\n with open(output_file, ''w'') as f:\n f.write(_output_serializers[idx](_outputs[idx]))\n"], "image": "python:3.7"}}, "inputs": [{"name": "number", "type": "Integer"}], "name": "Print and return number", "outputs": [{"name": "Output", "type": "Integer"}]}', pipelines.kubeflow.org/component_ref: '{}', pipelines.kubeflow.org/arguments.parameters: '{"number": "{{inputs.parameters.number_1}}"}'} labels: pipelines.kubeflow.org/kfp_sdk_version: 1.8.9 pipelines.kubeflow.org/pipeline-sdk-type: kfp pipelines.kubeflow.org/enable_caching: "true" - name: print-and-return-number-2 container: args: [--number, '{{inputs.parameters.number_2}}', '----output-paths', /tmp/outputs/Output/data] command: - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def print_and_return_number(number): print(number) return number def _serialize_int(int_value: int) -> str: if isinstance(int_value, str): return int_value if not isinstance(int_value, int): raise TypeError('Value "{}" has type "{}" instead of int.'.format( str(int_value), str(type(int_value)))) return str(int_value) import argparse _parser = argparse.ArgumentParser(prog='Print and return number', description='') _parser.add_argument("--number", dest="number", type=int, required=True, default=argparse.SUPPRESS) _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1) _parsed_args = vars(_parser.parse_args()) _output_files = _parsed_args.pop("_output_paths", []) _outputs = print_and_return_number(**_parsed_args) _outputs = [_outputs] _output_serializers = [ _serialize_int, ] import os for idx, output_file in enumerate(_output_files): try: 
os.makedirs(os.path.dirname(output_file)) except OSError: pass with open(output_file, 'w') as f: f.write(_output_serializers[idx](_outputs[idx])) image: python:3.7 inputs: parameters: - {name: number_2} outputs: parameters: - name: print-and-return-number-2-Output valueFrom: {path: /tmp/outputs/Output/data} artifacts: - {name: print-and-return-number-2-Output, path: /tmp/outputs/Output/data} metadata: annotations: {pipelines.kubeflow.org/task_display_name: This is number 2, pipelines.kubeflow.org/component_spec: '{"implementation": {"container": {"args": ["--number", {"inputValue": "number"}, "----output-paths", {"outputPath": "Output"}], "command": ["sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def print_and_return_number(number):\n print(number)\n return number\n\ndef _serialize_int(int_value: int) -> str:\n if isinstance(int_value, str):\n return int_value\n if not isinstance(int_value, int):\n raise TypeError(''Value \"{}\" has type \"{}\" instead of int.''.format(\n str(int_value), str(type(int_value))))\n return str(int_value)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Print and return number'', description='''')\n_parser.add_argument(\"--number\", dest=\"number\", type=int, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"----output-paths\", dest=\"_output_paths\", type=str, nargs=1)\n_parsed_args = vars(_parser.parse_args())\n_output_files = _parsed_args.pop(\"_output_paths\", [])\n\n_outputs = print_and_return_number(**_parsed_args)\n\n_outputs = [_outputs]\n\n_output_serializers = [\n _serialize_int,\n\n]\n\nimport os\nfor idx, output_file in enumerate(_output_files):\n try:\n os.makedirs(os.path.dirname(output_file))\n except OSError:\n pass\n with open(output_file, ''w'') as f:\n f.write(_output_serializers[idx](_outputs[idx]))\n"], "image": "python:3.7"}}, "inputs": [{"name": "number", "type": "Integer"}], "name": "Print and return number", 
"outputs": [{"name": "Output", "type": "Integer"}]}', pipelines.kubeflow.org/component_ref: '{}', pipelines.kubeflow.org/arguments.parameters: '{"number": "{{inputs.parameters.number_2}}"}'} labels: pipelines.kubeflow.org/kfp_sdk_version: 1.8.9 pipelines.kubeflow.org/pipeline-sdk-type: kfp pipelines.kubeflow.org/enable_caching: "true" - name: sum-and-print-numbers container: args: [--number-1, '{{inputs.parameters.print-and-return-number-Output}}', --number-2, '{{inputs.parameters.print-and-return-number-2-Output}}'] command: - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def sum_and_print_numbers(number_1, number_2): print(number_1 + number_2) import argparse _parser = argparse.ArgumentParser(prog='Sum and print numbers', description='') _parser.add_argument("--number-1", dest="number_1", type=int, required=True, default=argparse.SUPPRESS) _parser.add_argument("--number-2", dest="number_2", type=int, required=True, default=argparse.SUPPRESS) _parsed_args = vars(_parser.parse_args()) _outputs = sum_and_print_numbers(**_parsed_args) image: python:3.7 inputs: parameters: - {name: print-and-return-number-2-Output} - {name: print-and-return-number-Output} metadata: annotations: {pipelines.kubeflow.org/task_display_name: This is sum of number 1 and number 2, pipelines.kubeflow.org/component_spec: '{"implementation": {"container": {"args": ["--number-1", {"inputValue": "number_1"}, "--number-2", {"inputValue": "number_2"}], "command": ["sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def sum_and_print_numbers(number_1, number_2):\n print(number_1 + number_2)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Sum and print numbers'', description='''')\n_parser.add_argument(\"--number-1\", dest=\"number_1\", type=int, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--number-2\", dest=\"number_2\", type=int, 
required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs = sum_and_print_numbers(**_parsed_args)\n"], "image": "python:3.7"}}, "inputs": [{"name": "number_1", "type": "Integer"}, {"name": "number_2", "type": "Integer"}], "name": "Sum and print numbers"}', pipelines.kubeflow.org/component_ref: '{}', pipelines.kubeflow.org/arguments.parameters: '{"number_1": "{{inputs.parameters.print-and-return-number-Output}}", "number_2": "{{inputs.parameters.print-and-return-number-2-Output}}"}'} labels: pipelines.kubeflow.org/kfp_sdk_version: 1.8.9 pipelines.kubeflow.org/pipeline-sdk-type: kfp pipelines.kubeflow.org/enable_caching: "true" arguments: parameters: - {name: number_1} - {name: number_2} serviceAccountName: pipeline-runner ```

이 전의 파일과 비교하면 `pipelines.kubeflow.org/task_display_name` key가 새로 생성되었습니다. ### UI in Kubeflow 위에서 만든 파일을 이용해 이전에 생성한 [파이프라인](../kubeflow/basic-pipeline-upload.md#upload-pipeline-version)의 버전을 올리겠습니다. ![adv-pipeline-0.png](./img/adv-pipeline-0.png) 그러면 위와 같이 설정한 이름이 노출되는 것을 확인할 수 있습니다. ## Resources ### GPU 특별한 설정이 없다면 파이프라인은 컴포넌트를 쿠버네티스 파드(pod)로 실행할 때, 기본 리소스 스펙으로 실행하게 됩니다. 만약 GPU를 사용해 모델을 학습해야 할 때 쿠버네티스상에서 GPU를 할당받지 못해 제대로 학습이 이루어지지 않습니다. 이를 위해 `set_gpu_limit()` [attribute](https://kubeflow-pipelines.readthedocs.io/en/latest/source/kfp.dsl.html?highlight=set_gpu_limit#kfp.dsl.UserContainer.set_gpu_limit)을 이용해 설정할 수 있습니다. ```python import kfp from kfp.components import create_component_from_func from kfp.dsl import pipeline @create_component_from_func def print_and_return_number(number: int) -> int: print(number) return number @create_component_from_func def sum_and_print_numbers(number_1: int, number_2: int): print(number_1 + number_2) @pipeline(name="example_pipeline") def example_pipeline(number_1: int, number_2: int): number_1_result = print_and_return_number(number_1).set_display_name("This is number 1") number_2_result = print_and_return_number(number_2).set_display_name("This is number 2") sum_result = sum_and_print_numbers( number_1=number_1_result.output, number_2=number_2_result.output ).set_display_name("This is sum of number 1 and number 2").set_gpu_limit(1) if __name__ == "__main__": kfp.compiler.Compiler().compile(example_pipeline, "example_pipeline.yaml") ``` 위의 스크립트를 실행하면 생성된 파일에서 `sum-and-print-numbers`를 자세히 보면 resources에 `{nvidia.com/gpu: 1}` 도 추가된 것을 볼 수 있습니다. 이를 통해 GPU를 할당받을 수 있습니다. 
```bash - name: sum-and-print-numbers container: args: [--number-1, '{{inputs.parameters.print-and-return-number-Output}}', --number-2, '{{inputs.parameters.print-and-return-number-2-Output}}'] command: - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def sum_and_print_numbers(number_1, number_2): print(number_1 + number_2) import argparse _parser = argparse.ArgumentParser(prog='Sum and print numbers', description='') _parser.add_argument("--number-1", dest="number_1", type=int, required=True, default=argparse.SUPPRESS) _parser.add_argument("--number-2", dest="number_2", type=int, required=True, default=argparse.SUPPRESS) _parsed_args = vars(_parser.parse_args()) _outputs = sum_and_print_numbers(**_parsed_args) image: python:3.7 resources: limits: {nvidia.com/gpu: 1} ``` ### CPU cpu의 개수를 정하기 위해서 이용하는 함수는 `.set_cpu_limit()` [attribute](https://kubeflow-pipelines.readthedocs.io/en/latest/source/kfp.dsl.html?highlight=set_gpu_limit#kfp.dsl.Sidecar.set_cpu_limit)을 이용해 설정할 수 있습니다. gpu와는 다른 점은 int가 아닌 string으로 입력해야 한다는 점입니다. 
```python import kfp from kfp.components import create_component_from_func from kfp.dsl import pipeline @create_component_from_func def print_and_return_number(number: int) -> int: print(number) return number @create_component_from_func def sum_and_print_numbers(number_1: int, number_2: int): print(number_1 + number_2) @pipeline(name="example_pipeline") def example_pipeline(number_1: int, number_2: int): number_1_result = print_and_return_number(number_1).set_display_name("This is number 1") number_2_result = print_and_return_number(number_2).set_display_name("This is number 2") sum_result = sum_and_print_numbers( number_1=number_1_result.output, number_2=number_2_result.output ).set_display_name("This is sum of number 1 and number 2").set_gpu_limit(1).set_cpu_limit("16") if __name__ == "__main__": kfp.compiler.Compiler().compile(example_pipeline, "example_pipeline.yaml") ``` 바뀐 부분만 확인하면 다음과 같습니다. ```bash resources: limits: {nvidia.com/gpu: 1, cpu: '16'} ``` ### Memory 메모리는 `.set_memory_limit()` [attribute](https://kubeflow-pipelines.readthedocs.io/en/latest/source/kfp.dsl.html?highlight=set_gpu_limit#kfp.dsl.Sidecar.set_memory_limit)을 이용해 설정할 수 있습니다. 
```python import kfp from kfp.components import create_component_from_func from kfp.dsl import pipeline @create_component_from_func def print_and_return_number(number: int) -> int: print(number) return number @create_component_from_func def sum_and_print_numbers(number_1: int, number_2: int): print(number_1 + number_2) @pipeline(name="example_pipeline") def example_pipeline(number_1: int, number_2: int): number_1_result = print_and_return_number(number_1).set_display_name("This is number 1") number_2_result = print_and_return_number(number_2).set_display_name("This is number 2") sum_result = sum_and_print_numbers( number_1=number_1_result.output, number_2=number_2_result.output ).set_display_name("This is sum of number 1 and number 2").set_gpu_limit(1).set_memory_limit("1G") if __name__ == "__main__": kfp.compiler.Compiler().compile(example_pipeline, "example_pipeline.yaml") ``` 바뀐 부분만 확인하면 다음과 같습니다. ```bash resources: limits: {nvidia.com/gpu: 1, memory: 1G} ``` ================================================ FILE: docs/kubeflow/advanced-run.md ================================================ --- title : "11. Pipeline - Run Result" description: "" sidebar_position: 11 contributors: ["Jongseob Jeon", "SeungTae Kim"] --- ## Run Result Run 실행 결과를 눌러보면 3개의 탭이 존재합니다. 각각 Graph, Run output, Config 입니다. ![advanced-run-0.png](./img/advanced-run-0.png) ## Graph ![advanced-run-1.png](./img/advanced-run-1.png) 그래프에서는 실행된 컴포넌트를 누르면 컴포넌트의 실행 정보를 확인할 수 있습니다. ### Input/Output Input/Output 탭은 컴포넌트에서 사용한 Config들과 Input, Output Artifacts를 확인하고 다운로드 받을 수 있습니다. ### Logs Logs에서는 파이썬 코드 실행 중 나오는 모든 stdout을 확인할 수 있습니다. 다만 pod은 일정 시간이 지난 후 지워지기 때문에 일정 시간이 지나면 이 탭에서는 확인할 수 없습니다. 이때는 Output artifacts의 main-logs에서 확인할 수 있습니다. ### Visualizations Visualizations에서는 컴포넌트에서 생성된 플랏을 보여줍니다. 플랏을 생성하기 위해서는 `mlpipeline_ui_metadata: OutputPath("UI_Metadata")` argument로 보여주고 싶은 값을 저장하면 됩니다. 이 때 플랏의 형태는 html 포맷이어야 합니다. 변환하는 과정은 다음과 같습니다. 
```python @partial( create_component_from_func, packages_to_install=["matplotlib"], ) def plot_linear( mlpipeline_ui_metadata: OutputPath("UI_Metadata") ): import base64 import json from io import BytesIO import matplotlib.pyplot as plt plt.plot(x=[1, 2, 3], y=[1, 2,3]) tmpfile = BytesIO() plt.savefig(tmpfile, format="png") encoded = base64.b64encode(tmpfile.getvalue()).decode("utf-8") html = f'<img src="data:image/png;base64,{encoded}">' metadata = { "outputs": [ { "type": "web-app", "storage": "inline", "source": html, }, ], } with open(mlpipeline_ui_metadata, "w") as html_writer: json.dump(metadata, html_writer) ``` 파이프라인으로 작성하면 다음과 같이 됩니다. ```python from functools import partial import kfp from kfp.components import create_component_from_func, OutputPath from kfp.dsl import pipeline @partial( create_component_from_func, packages_to_install=["matplotlib"], ) def plot_linear(mlpipeline_ui_metadata: OutputPath("UI_Metadata")): import base64 import json from io import BytesIO import matplotlib.pyplot as plt plt.plot([1, 2, 3], [1, 2, 3]) tmpfile = BytesIO() plt.savefig(tmpfile, format="png") encoded = base64.b64encode(tmpfile.getvalue()).decode("utf-8") html = f'<img src="data:image/png;base64,{encoded}">' metadata = { "outputs": [ { "type": "web-app", "storage": "inline", "source": html, }, ], } with open(mlpipeline_ui_metadata, "w") as html_writer: json.dump(metadata, html_writer) @pipeline(name="plot_pipeline") def plot_pipeline(): plot_linear() if __name__ == "__main__": kfp.compiler.Compiler().compile(plot_pipeline, "plot_pipeline.yaml") ``` 이 스크립트를 실행해서 나온 `plot_pipeline.yaml`을 확인하면 다음과 같습니다.

plot_pipeline.yaml ```bash apiVersion: argoproj.io/v1alpha1 kind: Workflow metadata: generateName: plot-pipeline- annotations: {pipelines.kubeflow.org/kfp_sdk_version: 1.8.9, pipelines.kubeflow.org/pipeline_compilation_time: '2 022-01-17T13:31:32.963214', pipelines.kubeflow.org/pipeline_spec: '{"name": "plot_pipeline"}'} labels: {pipelines.kubeflow.org/kfp_sdk_version: 1.8.9} spec: entrypoint: plot-pipeline templates: - name: plot-linear container: args: [--mlpipeline-ui-metadata, /tmp/outputs/mlpipeline_ui_metadata/data] command: - sh - -c - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'matplotlib' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'matplotlib' --user) && "$0" "$@" - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def _make_parent_dirs_and_return_path(file_path: str): import os os.makedirs(os.path.dirname(file_path), exist_ok=True) return file_path def plot_linear(mlpipeline_ui_metadata): import base64 import json from io import BytesIO import matplotlib.pyplot as plt plt.plot([1, 2, 3], [1, 2, 3]) tmpfile = BytesIO() plt.savefig(tmpfile, format="png") encoded = base64.b64encode(tmpfile.getvalue()).decode("utf-8") html = f"" metadata = { "outputs": [ { "type": "web-app", "storage": "inline", "source": html, }, ], } with open(mlpipeline_ui_metadata, "w") as html_writer: json.dump(metadata, html_writer) import argparse _parser = argparse.ArgumentParser(prog='Plot linear', description='') _parser.add_argument("--mlpipeline-ui-metadata", dest="mlpipeline_ui_metadata", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) _parsed_args = vars(_parser.parse_args()) _outputs = plot_linear(**_parsed_args) image: python:3.7 outputs: artifacts: - {name: mlpipeline-ui-metadata, path: /tmp/outputs/mlpipeline_ui_metadata/data} metadata: labels: pipelines.kubeflow.org/kfp_sdk_version: 1.8.9 
pipelines.kubeflow.org/pipeline-sdk-type: kfp pipelines.kubeflow.org/enable_caching: "true" annotations: {pipelines.kubeflow.org/component_spec: '{"implementation": {"container": {"args": ["--mlpipeline-ui-metadata", {"outputPath": "mlpipeline_ui_metadata"}], "command": ["sh", "-c", "(PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location ''matplotlib'' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location ''matplotlib'' --user) && \"$0\" \"$@\"", "sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def _make_parent_dirs_and_return_path(file_path: str):\n import os\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\n return file_path\n\ndef plot_linear(mlpipeline_ui_metadata):\n import base64\n import json\n from io import BytesIO\n\n import matplotlib.pyplot as plt\n\n plt.plot([1, 2, 3], [1, 2, 3])\n\n tmpfile = BytesIO()\n plt.savefig(tmpfile, format=\"png\")\n encoded = base64.b64encode(tmpfile.getvalue()).decode(\"utf-8\")\n\n html = f\"\"\n metadata = {\n \"outputs\": [\n {\n \"type\": \"web-app\",\n \"storage\": \"inline\",\n \"source\": html,\n },\n ],\n }\n with open(mlpipeline_ui_metadata, \"w\") as html_writer:\n json.dump(metadata, html_writer)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Plot linear'', description='''')\n_parser.add_argument(\"--mlpipeline-ui-metadata\", dest=\"mlpipeline_ui_metadata\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs = plot_linear(**_parsed_args)\n"], "image": "python:3.7"}}, "name": "Plot linear", "outputs": [{"name": "mlpipeline_ui_metadata", "type": "UI_Metadata"}]}', pipelines.kubeflow.org/component_ref: '{}'} - name: plot-pipeline dag: tasks: - {name: plot-linear, template: plot-linear} arguments: parameters: [] serviceAccountName: pipeline-runner ```

실행 후 Visualization을 클릭합니다. ![advanced-run-5.png](./img/advanced-run-5.png) ## Run output ![advanced-run-2.png](./img/advanced-run-2.png) Run output은 kubeflow에서 지정한 형태로 생긴 Artifacts를 모아서 보여주는 곳이며 평가 지표(Metric)를 보여줍니다. 평가 지표(Metric)을 보여주기 위해서는 `mlpipeline_metrics_path: OutputPath("Metrics")` argument에 보여주고 싶은 이름과 값을 json 형태로 저장하면 됩니다. 예를 들어서 다음과 같이 작성할 수 있습니다. ```python @create_component_from_func def show_metric_of_sum( number: int, mlpipeline_metrics_path: OutputPath("Metrics"), ): import json metrics = { "metrics": [ { "name": "sum_value", "numberValue": number, }, ], } with open(mlpipeline_metrics_path, "w") as f: json.dump(metrics, f) ``` 평가 지표를 생성하는 컴포넌트를 [파이프라인](../kubeflow/basic-pipeline.md)에서 생성한 파이프라인에 추가 후 실행해 보겠습니다. 전체 파이프라인은 다음과 같습니다. ```python import kfp from kfp.components import create_component_from_func, OutputPath from kfp.dsl import pipeline @create_component_from_func def print_and_return_number(number: int) -> int: print(number) return number @create_component_from_func def sum_and_print_numbers(number_1: int, number_2: int) -> int: sum_number = number_1 + number_2 print(sum_number) return sum_number @create_component_from_func def show_metric_of_sum( number: int, mlpipeline_metrics_path: OutputPath("Metrics"), ): import json metrics = { "metrics": [ { "name": "sum_value", "numberValue": number, }, ], } with open(mlpipeline_metrics_path, "w") as f: json.dump(metrics, f) @pipeline(name="example_pipeline") def example_pipeline(number_1: int, number_2: int): number_1_result = print_and_return_number(number_1) number_2_result = print_and_return_number(number_2) sum_result = sum_and_print_numbers( number_1=number_1_result.output, number_2=number_2_result.output ) show_metric_of_sum(sum_result.output) if __name__ == "__main__": kfp.compiler.Compiler().compile(example_pipeline, "example_pipeline.yaml") ``` 실행 후 Run Output을 클릭하면 다음과 같이 나옵니다. 
![advanced-run-4.png](./img/advanced-run-4.png) ## Config ![advanced-run-3.png](./img/advanced-run-3.png) Config에서는 파이프라인 Config로 입력받은 모든 값을 확인할 수 있습니다. ================================================ FILE: docs/kubeflow/basic-component.md ================================================ --- title : "4. Component - Write" description: "" sidebar_position: 4 contributors: ["Jongseob Jeon"] --- ## Component 컴포넌트(Component)를 작성하기 위해서는 다음과 같은 내용을 작성해야 합니다. 1. 컴포넌트 콘텐츠(Component Contents) 작성 2. 컴포넌트 래퍼(Component Wrapper) 작성 이제 각 과정에 대해서 알아보도록 하겠습니다. ## Component Contents 컴포넌트 콘텐츠는 우리가 흔히 작성하는 파이썬 코드와 다르지 않습니다. 예를 들어서 숫자를 입력으로 받고 입력받은 숫자를 출력한 뒤 반환하는 컴포넌트를 작성해 보겠습니다. 파이썬 코드로 작성하면 다음과 같이 작성할 수 있습니다. ```python print(number) ``` 그런데 이 코드를 실행하면 에러가 나고 동작하지 않는데 그 이유는 출력해야 할 `number`가 정의되어 있지 않기 때문입니다. [Kubeflow Concepts](../kubeflow/kubeflow-concepts.md)에서 `number` 와 같이 컴포넌트 콘텐츠에서 필요한 값들은 **Config**로 정의한다고 했습니다. 컴포넌트 콘텐츠를 실행시키기 위해 필요한 Config들은 컴포넌트 래퍼에서 전달이 되어야 합니다. ## Component Wrapper ### Define a standalone Python function 이제 필요한 Config를 전달할 수 있도록 컴포넌트 래퍼를 만들어야 합니다. 별도의 Config 없이 컴포넌트 래퍼로 감쌀 경우 다음과 같이 됩니다. ```python def print_and_return_number(): print(number) return number ``` 이제 콘텐츠에서 필요한 Config를 래퍼의 argument로 추가합니다. 다만, argument 만을 적는 것이 아니라 argument의 타입 힌트도 작성해야 합니다. Kubeflow에서는 파이프라인을 Kubeflow 포맷으로 변환할 때, 컴포넌트 간의 연결에서 정해진 입력과 출력의 타입이 일치하는지 체크합니다. 만약 컴포넌트가 필요로 하는 입력과 다른 컴포넌트로부터 전달받은 출력의 포맷이 일치하지 않을 경우 파이프라인 생성을 할 수 없습니다. 이제 다음과 같이 argument와 그 타입, 그리고 반환하는 타입을 적어서 컴포넌트 래퍼를 완성합니다. ```python def print_and_return_number(number: int) -> int: print(number) return number ``` Kubeflow에서 반환 값으로 사용할 수 있는 타입은 json에서 표현할 수 있는 타입들만 사용할 수 있습니다. 대표적으로 사용되며 권장하는 타입들은 다음과 같습니다. - int - float - str 만약 단일 값이 아닌 여러 값을 반환하려면 `collections.namedtuple` 을 이용해야 합니다. 자세한 내용은 [Kubeflow 공식 문서](https://www.kubeflow.org/docs/components/pipelines/sdk/python-function-components/#passing-parameters-by-value)를 참고 하시길 바랍니다. 예를 들어서 입력받은 숫자를 2로 나눈 몫과 나머지를 반환하는 컴포넌트는 다음과 같이 작성해야 합니다. 
```python from typing import NamedTuple def divide_and_return_number( number: int, ) -> NamedTuple("DivideOutputs", [("quotient", int), ("remainder", int)]): from collections import namedtuple quotient, remainder = divmod(number, 2) print("quotient is", quotient) print("remainder is", remainder) divide_outputs = namedtuple( "DivideOutputs", [ "quotient", "remainder", ], ) return divide_outputs(quotient, remainder) ``` ### Convert to Kubeflow Format 이제 작성한 컴포넌트를 kubeflow에서 사용할 수 있는 포맷으로 변환해야 합니다. 변환은 `kfp.components.create_component_from_func` 를 통해서 할 수 있습니다. 이렇게 변환된 형태는 파이썬에서 함수로 import 하여서 파이프라인에서 사용할 수 있습니다. ```python from kfp.components import create_component_from_func @create_component_from_func def print_and_return_number(number: int) -> int: print(number) return number ``` ### Share component with yaml file 만약 파이썬 코드로 공유를 할 수 없는 경우 YAML 파일로 컴포넌트를 공유해서 사용할 수 있습니다. 이를 위해서는 우선 컴포넌트를 YAML 파일로 변환한 뒤 `kfp.components.load_component_from_file` 을 통해 파이프라인에서 사용할 수 있습니다. 우선 작성한 컴포넌트를 YAML 파일로 변환하는 과정에 대해서 설명합니다. ```python from kfp.components import create_component_from_func @create_component_from_func def print_and_return_number(number: int) -> int: print(number) return number if __name__ == "__main__": print_and_return_number.component_spec.save("print_and_return_number.yaml") ``` 작성한 파이썬 코드를 실행하면 `print_and_return_number.yaml` 파일이 생성됩니다. 파일을 확인하면 다음과 같습니다. 
```bash name: Print and return number inputs: - {name: number, type: Integer} outputs: - {name: Output, type: Integer} implementation: container: image: python:3.7 command: - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def print_and_return_number(number): print(number) return number def _serialize_int(int_value: int) -> str: if isinstance(int_value, str): return int_value if not isinstance(int_value, int): raise TypeError('Value "{}" has type "{}" instead of int.'.format(str(int_value), str(type(int_value)))) return str(int_value) import argparse _parser = argparse.ArgumentParser(prog='Print and return number', description='') _parser.add_argument("--number", dest="number", type=int, required=True, default=argparse.SUPPRESS) _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1) _parsed_args = vars(_parser.parse_args()) _output_files = _parsed_args.pop("_output_paths", []) _outputs = print_and_return_number(**_parsed_args) _outputs = [_outputs] _output_serializers = [ _serialize_int, ] import os for idx, output_file in enumerate(_output_files): try: os.makedirs(os.path.dirname(output_file)) except OSError: pass with open(output_file, 'w') as f: f.write(_output_serializers[idx](_outputs[idx])) args: - --number - {inputValue: number} - '----output-paths' - {outputPath: Output} ``` 이제 생성된 파일을 공유해서 파이프라인에서 다음과 같이 사용할 수 있습니다. ```python from kfp.components import load_component_from_file print_and_return_number = load_component_from_file("print_and_return_number.yaml") ``` ## How Kubeflow executes component Kubeflow에서 컴포넌트가 실행되는 순서는 다음과 같습니다. 1. `docker pull `: 정의된 컴포넌트의 실행 환경 정보가 담긴 이미지를 pull 2. run `command`: pull 한 이미지에서 컴포넌트 콘텐츠를 실행합니다. `print_and_return_number.yaml` 를 예시로 들자면 `@create_component_from_func` 의 default image 는 python:3.7 이므로 해당 이미지를 기준으로 컴포넌트 콘텐츠를 실행하게 됩니다. 1. `docker pull python:3.7` 2. 
`print(number)` ## References: - [Getting Started With Python function based components](https://www.kubeflow.org/docs/components/pipelines/sdk/python-function-components/#getting-started-with-python-function-based-components) ================================================ FILE: docs/kubeflow/basic-pipeline-upload.md ================================================ --- title : "6. Pipeline - Upload" description: "" sidebar_position: 6 contributors: ["Jongseob Jeon"] --- ## Upload Pipeline 이제 우리가 만든 파이프라인을 직접 kubeflow에서 업로드 해 보겠습니다. 파이프라인 업로드는 kubeflow 대시보드 UI를 통해 진행할 수 있습니다. [Install Kubeflow](../setup-components/install-components-kf.md#정상-설치-확인) 에서 사용한 방법을 이용해 포트포워딩합니다. ```bash kubectl port-forward svc/istio-ingressgateway -n istio-system 8080:80 ``` [http://localhost:8080](http://localhost:8080)에 접속해 대시보드를 열어줍니다. ### 1. Pipelines 탭 선택 ![pipeline-gui-0.png](./img/pipeline-gui-0.png) ### 2. Upload Pipeline 선택 ![pipeline-gui-1.png](./img/pipeline-gui-1.png) ### 3. Choose file 선택 ![pipeline-gui-2.png](./img/pipeline-gui-2.png) ### 4. 생성된 yaml파일 업로드 ![pipeline-gui-3.png](./img/pipeline-gui-3.png) ### 5. Create ![pipeline-gui-4.png](./img/pipeline-gui-4.png) ## Upload Pipeline Version 업로드된 파이프라인은 업로드를 통해서 버전을 관리할 수 있습니다. 다만 깃헙과 같은 코드 차원의 버전 관리가 아닌 같은 이름의 파이프라인을 모아서 보여주는 역할을 합니다. 위의 예시에서 파이프라인을 업로드한 경우 다음과 같이 example_pipeline이 생성된 것을 확인할 수 있습니다. ![pipeline-gui-5.png](./img/pipeline-gui-5.png) 클릭하면 다음과 같은 화면이 나옵니다. ![pipeline-gui-4.png](./img/pipeline-gui-4.png) Upload Version을 클릭하면 다음과 같이 파이프라인을 업로드할 수 있는 화면이 생성됩니다. ![pipeline-gui-6.png](./img/pipeline-gui-6.png) 파이프라인을 업로드 합니다. ![pipeline-gui-7.png](./img/pipeline-gui-7.png) 업로드된 경우 다음과 같이 파이프라인 버전을 확인할 수 있습니다. ![pipeline-gui-8.png](./img/pipeline-gui-8.png) ================================================ FILE: docs/kubeflow/basic-pipeline.md ================================================ --- title : "5. 
Pipeline - Write" description: "" sidebar_position: 5 contributors: ["Jongseob Jeon"] --- ## Pipeline 컴포넌트는 독립적으로 실행되지 않고 파이프라인의 구성요소로써 실행됩니다. 그러므로 컴포넌트를 실행해 보려면 파이프라인을 작성해야 합니다. 그리고 파이프라인을 작성하기 위해서는 컴포넌트의 집합과 컴포넌트의 실행 순서가 필요합니다. 이번 페이지에서는 숫자를 입력받고 출력하는 컴포넌트와 두 개의 컴포넌트로부터 숫자를 받아서 합을 출력하는 컴포넌트가 있는 파이프라인을 만들어 보도록 하겠습니다. ## Component Set 우선 파이프라인에서 사용할 컴포넌트들을 작성합니다. 1. `print_and_return_number` 입력받은 숫자를 출력하고 반환하는 컴포넌트입니다. 컴포넌트가 입력받은 값을 반환하기 때문에 int를 return의 타입 힌트로 입력합니다. ```python @create_component_from_func def print_and_return_number(number: int) -> int: print(number) return number ``` 2. `sum_and_print_numbers` 입력받은 두 개의 숫자의 합을 출력하는 컴포넌트입니다. 이 컴포넌트 역시 두 숫자의 합을 반환하기 때문에 int를 return의 타입 힌트로 입력합니다. ```python @create_component_from_func def sum_and_print_numbers(number_1: int, number_2: int) -> int: sum_num = number_1 + number_2 print(sum_num) return sum_num ``` ## Component Order ### Define Order 필요한 컴포넌트의 집합을 만들었으면, 다음으로는 이들의 순서를 정의해야 합니다. 이번 페이지에서 만들 파이프라인의 순서를 그림으로 표현하면 다음과 같이 됩니다. ![pipeline-0.png](./img/pipeline-0.png) ### Single Output 이제 이 순서를 코드로 옮겨보겠습니다. 우선 위의 그림에서 `print_and_return_number_1` 과 `print_and_return_number_2` 를 작성하면 다음과 같이 됩니다. ```python def example_pipeline(): number_1_result = print_and_return_number(number_1) number_2_result = print_and_return_number(number_2) ``` 컴포넌트를 실행하고 그 반환 값을 각각 `number_1_result` 와 `number_2_result` 에 저장합니다. 저장된 `number_1_result` 의 반환 값은 `number_1_result.output` 를 통해 사용할 수 있습니다. ### Multi Output 위의 예시에서 컴포넌트는 단일 값만을 반환하기 때문에 `output`을 이용해 바로 사용할 수 있습니다. 만약, 여러 개의 반환 값이 있다면 `outputs`에 저장이 되며 dict 타입이기에 key를 이용해 원하는 반환 값을 사용할 수 있습니다. 예를 들어서 앞에서 작성한 여러 개를 반환하는 [컴포넌트](../kubeflow/basic-component.md#define-a-standalone-python-function) 의 경우를 보겠습니다. `divde_and_return_number` 의 return 값은 `quotient` 와 `remainder` 가 있습니다. 이 두 값을 `print_and_return_number` 에 전달하는 예시를 보면 다음과 같습니다. 
```python def multi_pipeline(): divided_result = divde_and_return_number(number) num_1_result = print_and_return_number(divided_result.outputs["quotient"]) num_2_result = print_and_return_number(divided_result.outputs["remainder"]) ``` `divde_and_return_number`의 결과를 `divided_result`에 저장하고 각각 `divided_result.outputs["quotient"]`, `divided_result.outputs["remainder"]`로 값을 가져올 수 있습니다. ### Write to python code 이제 다시 본론으로 돌아와서 이 두 값의 결과를 `sum_and_print_numbers` 에 전달합니다. ```python def example_pipeline(): number_1_result = print_and_return_number(number_1) number_2_result = print_and_return_number(number_2) sum_result = sum_and_print_numbers( number_1=number_1_result.output, number_2=number_2_result.output ) ``` 다음으로 각 컴포넌트에 필요한 Config들을 모아서 파이프라인 Config로 정의 합니다. ```python def example_pipeline(number_1: int, number_2:int): number_1_result = print_and_return_number(number_1) number_2_result = print_and_return_number(number_2) sum_result = sum_and_print_numbers( number_1=number_1_result.output, number_2=number_2_result.output ) ``` ## Convert to Kubeflow Format 마지막으로 kubeflow에서 사용할 수 있는 형식으로 변환합니다. 변환은 `kfp.dsl.pipeline` 함수를 이용해 할 수 있습니다. ```python from kfp.dsl import pipeline @pipeline(name="example_pipeline") def example_pipeline(number_1: int, number_2: int): number_1_result = print_and_return_number(number_1) number_2_result = print_and_return_number(number_2) sum_result = sum_and_print_numbers( number_1=number_1_result.output, number_2=number_2_result.output ) ``` Kubeflow에서 파이프라인을 실행하기 위해서는 yaml 형식으로만 가능하기 때문에 생성한 파이프라인을 정해진 yaml 형식으로 컴파일(Compile) 해 주어야 합니다. 컴파일은 다음 명령어를 이용해 생성할 수 있습니다. ```python if __name__ == "__main__": import kfp kfp.compiler.Compiler().compile(example_pipeline, "example_pipeline.yaml") ``` ## Conclusion 앞서 설명한 내용을 한 파이썬 코드로 모으면 다음과 같이 됩니다. 
```python import kfp from kfp.components import create_component_from_func from kfp.dsl import pipeline @create_component_from_func def print_and_return_number(number: int) -> int: print(number) return number @create_component_from_func def sum_and_print_numbers(number_1: int, number_2: int): print(number_1 + number_2) @pipeline(name="example_pipeline") def example_pipeline(number_1: int, number_2: int): number_1_result = print_and_return_number(number_1) number_2_result = print_and_return_number(number_2) sum_result = sum_and_print_numbers( number_1=number_1_result.output, number_2=number_2_result.output ) if __name__ == "__main__": kfp.compiler.Compiler().compile(example_pipeline, "example_pipeline.yaml") ``` 컴파일된 결과를 보면 다음과 같습니다.
example_pipeline.yaml ```bash apiVersion: argoproj.io/v1alpha1 kind: Workflow metadata: generateName: example-pipeline- annotations: {pipelines.kubeflow.org/kfp_sdk_version: 1.6.3, pipelines.kubeflow.org/pipeline_compilation_time: '2021-12-05T13:38:51.566777', pipelines.kubeflow.org/pipeline_spec: '{"inputs": [{"name": "number_1", "type": "Integer"}, {"name": "number_2", "type": "Integer"}], "name": "example_pipeline"}'} labels: {pipelines.kubeflow.org/kfp_sdk_version: 1.6.3} spec: entrypoint: example-pipeline templates: - name: example-pipeline inputs: parameters: - {name: number_1} - {name: number_2} dag: tasks: - name: print-and-return-number template: print-and-return-number arguments: parameters: - {name: number_1, value: '{{inputs.parameters.number_1}}'} - name: print-and-return-number-2 template: print-and-return-number-2 arguments: parameters: - {name: number_2, value: '{{inputs.parameters.number_2}}'} - name: sum-and-print-numbers template: sum-and-print-numbers dependencies: [print-and-return-number, print-and-return-number-2] arguments: parameters: - {name: print-and-return-number-2-Output, value: '{{tasks.print-and-return-number-2.outputs.parameters.print-and-return-number-2-Output}}'} - {name: print-and-return-number-Output, value: '{{tasks.print-and-return-number.outputs.parameters.print-and-return-number-Output}}'} - name: print-and-return-number container: args: [--number, '{{inputs.parameters.number_1}}', '----output-paths', /tmp/outputs/Output/data] command: - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def print_and_return_number(number): print(number) return number def _serialize_int(int_value: int) -> str: if isinstance(int_value, str): return int_value if not isinstance(int_value, int): raise TypeError('Value "{}" has type "{}" instead of int.'.format(str(int_value), str(type(int_value)))) return str(int_value) import argparse _parser = argparse.ArgumentParser(prog='Print and 
return number', description='') _parser.add_argument("--number", dest="number", type=int, required=True, default=argparse.SUPPRESS) _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1) _parsed_args = vars(_parser.parse_args()) _output_files = _parsed_args.pop("_output_paths", []) _outputs = print_and_return_number(**_parsed_args) _outputs = [_outputs] _output_serializers = [ _serialize_int, ] import os for idx, output_file in enumerate(_output_files): try: os.makedirs(os.path.dirname(output_file)) except OSError: pass with open(output_file, 'w') as f: f.write(_output_serializers[idx](_outputs[idx])) image: python:3.7 inputs: parameters: - {name: number_1} outputs: parameters: - name: print-and-return-number-Output valueFrom: {path: /tmp/outputs/Output/data} artifacts: - {name: print-and-return-number-Output, path: /tmp/outputs/Output/data} metadata: labels: {pipelines.kubeflow.org/kfp_sdk_version: 1.6.3, pipelines.kubeflow.org/pipeline-sdk-type: kfp} annotations: {pipelines.kubeflow.org/component_spec: '{"implementation": {"container": {"args": ["--number", {"inputValue": "number"}, "----output-paths", {"outputPath": "Output"}], "command": ["sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def print_and_return_number(number):\n print(number)\n return number\n\ndef _serialize_int(int_value: int) -> str:\n if isinstance(int_value, str):\n return int_value\n if not isinstance(int_value, int):\n raise TypeError(''Value \"{}\" has type \"{}\" instead of int.''.format(str(int_value), str(type(int_value))))\n return str(int_value)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Print and return number'', description='''')\n_parser.add_argument(\"--number\", dest=\"number\", type=int, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"----output-paths\", dest=\"_output_paths\", type=str, nargs=1)\n_parsed_args = 
vars(_parser.parse_args())\n_output_files = _parsed_args.pop(\"_output_paths\", [])\n\n_outputs = print_and_return_number(**_parsed_args)\n\n_outputs = [_outputs]\n\n_output_serializers = [\n _serialize_int,\n\n]\n\nimport os\nfor idx, output_file in enumerate(_output_files):\n try:\n os.makedirs(os.path.dirname(output_file))\n except OSError:\n pass\n with open(output_file, ''w'') as f:\n f.write(_output_serializers[idx](_outputs[idx]))\n"], "image": "python:3.7"}}, "inputs": [{"name": "number", "type": "Integer"}], "name": "Print and return number", "outputs": [{"name": "Output", "type": "Integer"}]}', pipelines.kubeflow.org/component_ref: '{}', pipelines.kubeflow.org/arguments.parameters: '{"number": "{{inputs.parameters.number_1}}"}'} - name: print-and-return-number-2 container: args: [--number, '{{inputs.parameters.number_2}}', '----output-paths', /tmp/outputs/Output/data] command: - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def print_and_return_number(number): print(number) return number def _serialize_int(int_value: int) -> str: if isinstance(int_value, str): return int_value if not isinstance(int_value, int): raise TypeError('Value "{}" has type "{}" instead of int.'.format(str(int_value), str(type(int_value)))) return str(int_value) import argparse _parser = argparse.ArgumentParser(prog='Print and return number', description='') _parser.add_argument("--number", dest="number", type=int, required=True, default=argparse.SUPPRESS) _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1) _parsed_args = vars(_parser.parse_args()) _output_files = _parsed_args.pop("_output_paths", []) _outputs = print_and_return_number(**_parsed_args) _outputs = [_outputs] _output_serializers = [ _serialize_int, ] import os for idx, output_file in enumerate(_output_files): try: os.makedirs(os.path.dirname(output_file)) except OSError: pass with open(output_file, 'w') as f: 
f.write(_output_serializers[idx](_outputs[idx])) image: python:3.7 inputs: parameters: - {name: number_2} outputs: parameters: - name: print-and-return-number-2-Output valueFrom: {path: /tmp/outputs/Output/data} artifacts: - {name: print-and-return-number-2-Output, path: /tmp/outputs/Output/data} metadata: labels: {pipelines.kubeflow.org/kfp_sdk_version: 1.6.3, pipelines.kubeflow.org/pipeline-sdk-type: kfp} annotations: {pipelines.kubeflow.org/component_spec: '{"implementation": {"container": {"args": ["--number", {"inputValue": "number"}, "----output-paths", {"outputPath": "Output"}], "command": ["sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def print_and_return_number(number):\n print(number)\n return number\n\ndef _serialize_int(int_value: int) -> str:\n if isinstance(int_value, str):\n return int_value\n if not isinstance(int_value, int):\n raise TypeError(''Value \"{}\" has type \"{}\" instead of int.''.format(str(int_value), str(type(int_value))))\n return str(int_value)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Print and return number'', description='''')\n_parser.add_argument(\"--number\", dest=\"number\", type=int, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"----output-paths\", dest=\"_output_paths\", type=str, nargs=1)\n_parsed_args = vars(_parser.parse_args())\n_output_files = _parsed_args.pop(\"_output_paths\", [])\n\n_outputs = print_and_return_number(**_parsed_args)\n\n_outputs = [_outputs]\n\n_output_serializers = [\n _serialize_int,\n\n]\n\nimport os\nfor idx, output_file in enumerate(_output_files):\n try:\n os.makedirs(os.path.dirname(output_file))\n except OSError:\n pass\n with open(output_file, ''w'') as f:\n f.write(_output_serializers[idx](_outputs[idx]))\n"], "image": "python:3.7"}}, "inputs": [{"name": "number", "type": "Integer"}], "name": "Print and return number", "outputs": [{"name": "Output", "type": "Integer"}]}', 
pipelines.kubeflow.org/component_ref: '{}', pipelines.kubeflow.org/arguments.parameters: '{"number": "{{inputs.parameters.number_2}}"}'} - name: sum-and-print-numbers container: args: [--number-1, '{{inputs.parameters.print-and-return-number-Output}}', --number-2, '{{inputs.parameters.print-and-return-number-2-Output}}'] command: - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def sum_and_print_numbers(number_1, number_2): print(number_1 + number_2) import argparse _parser = argparse.ArgumentParser(prog='Sum and print numbers', description='') _parser.add_argument("--number-1", dest="number_1", type=int, required=True, default=argparse.SUPPRESS) _parser.add_argument("--number-2", dest="number_2", type=int, required=True, default=argparse.SUPPRESS) _parsed_args = vars(_parser.parse_args()) _outputs = sum_and_print_numbers(**_parsed_args) image: python:3.7 inputs: parameters: - {name: print-and-return-number-2-Output} - {name: print-and-return-number-Output} metadata: labels: {pipelines.kubeflow.org/kfp_sdk_version: 1.6.3, pipelines.kubeflow.org/pipeline-sdk-type: kfp} annotations: {pipelines.kubeflow.org/component_spec: '{"implementation": {"container": {"args": ["--number-1", {"inputValue": "number_1"}, "--number-2", {"inputValue": "number_2"}], "command": ["sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def sum_and_print_numbers(number_1, number_2):\n print(number_1 + number_2)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Sum and print numbers'', description='''')\n_parser.add_argument(\"--number-1\", dest=\"number_1\", type=int, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--number-2\", dest=\"number_2\", type=int, required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs = sum_and_print_numbers(**_parsed_args)\n"], "image": "python:3.7"}}, "inputs": 
[{"name": "number_1", "type": "Integer"}, {"name": "number_2", "type": "Integer"}], "name": "Sum and print numbers"}', pipelines.kubeflow.org/component_ref: '{}', pipelines.kubeflow.org/arguments.parameters: '{"number_1": "{{inputs.parameters.print-and-return-number-Output}}", "number_2": "{{inputs.parameters.print-and-return-number-2-Output}}"}'} arguments: parameters: - {name: number_1} - {name: number_2} serviceAccountName: pipeline-runner ```
================================================ FILE: docs/kubeflow/basic-requirements.md ================================================ --- title : "3. Install Requirements" description: "" sidebar_position: 3 contributors: ["Jongseob Jeon"] --- 실습을 위해 권장하는 파이썬 버전은 python>=3.7입니다. 파이썬 환경에 익숙하지 않은 분들은 다음 [Appendix 1. 파이썬 가상환경](../appendix/pyenv)을 참고하여 **클라이언트 노드**에 설치해주신 뒤 패키지 설치를 진행해주시기를 바랍니다. 실습을 진행하기에서 필요한 패키지들과 버전은 다음과 같습니다. - requirements.txt ```bash kfp==1.8.9 scikit-learn==1.0.1 mlflow==1.21.0 pandas==1.3.4 dill==0.3.4 ``` [앞에서 만든 파이썬 가상환경](../appendix/pyenv.md#python-가상환경-생성)을 활성화합니다. ```bash pyenv activate demo ``` 패키지 설치를 진행합니다. ```bash pip3 install -U pip pip3 install kfp==1.8.9 scikit-learn==1.0.1 mlflow==1.21.0 pandas==1.3.4 dill==0.3.4 ``` ================================================ FILE: docs/kubeflow/basic-run.md ================================================ --- title : "7. Pipeline - Run" description: "" sidebar_position: 7 contributors: ["Jongseob Jeon"] --- ## Run Pipeline 이제 업로드한 파이프라인을 실행시켜 보겠습니다. ## Before Run ### 1. Create Experiment Experiment란 Kubeflow 에서 실행되는 Run을 논리적으로 관리하는 단위입니다. Kubeflow에서 namespace를 처음 들어오면 생성되어 있는 Experiment가 없습니다. 따라서 파이프라인을 실행하기 전에 미리 Experiment를 생성해두어야 합니다. Experiment이 있다면 [Run Pipeline](../kubeflow/basic-run.md#run-pipeline-1)으로 넘어가도 무방합니다. Experiment는 Create Experiment 버튼을 통해 생성할 수 있습니다. ![run-0.png](./img/run-0.png) ### 2. Name 입력 Experiment로 사용할 이름을 입력합니다. ![run-1.png](./img/run-1.png) ## Run Pipeline ### 1. Create Run 선택 ![run-2.png](./img/run-2.png) ### 2. Experiment 선택 ![run-9.png](./img/run-9.png) ![run-10.png](./img/run-10.png) ### 3. Pipeline Config 입력 파이프라인을 생성할 때 입력한 Config 값들을 채워 넣습니다. 업로드한 파이프라인은 number_1과 number_2를 입력해야 합니다. ![run-3.png](./img/run-3.png) ### 4. Start 입력 후 Start 버튼을 누르면 파이프라인이 실행됩니다. ![run-4.png](./img/run-4.png) ## Run Result 실행된 파이프라인들은 Runs 탭에서 확인할 수 있습니다. Run을 클릭하면 실행된 파이프라인과 관련된 자세한 내용을 확인해 볼 수 있습니다. ![run-5.png](./img/run-5.png) 클릭하면 다음과 같은 화면이 나옵니다. 
아직 실행되지 않은 컴포넌트는 회색 표시로 나옵니다. ![run-6.png](./img/run-6.png) 컴포넌트가 실행이 완료되면 초록색 체크 표시가 나옵니다. ![run-7.png](./img/run-7.png) 가장 마지막 컴포넌트를 보면 입력한 Config인 3과 5의 합인 8이 출력된 것을 확인할 수 있습니다. ![run-8.png](./img/run-8.png) ================================================ FILE: docs/kubeflow/how-to-debug.md ================================================ --- title : "13. Component - Debugging" description: "" sidebar_position: 13 contributors: ["Jongseob Jeon"] --- ## Debugging Pipeline 이번 페이지에서는 Kubeflow 컴포넌트를 디버깅하는 방법에 대해서 알아봅니다. ## Failed Component 이번 페이지에서는 [Component - MLFlow](../kubeflow/advanced-mlflow.md#mlflow-pipeline) 에서 이용한 파이프라인을 조금 수정해서 사용합니다. 우선 컴포넌트가 실패하도록 파이프라인을 변경하도록 하겠습니다. ```python from functools import partial import kfp from kfp.components import InputPath, OutputPath, create_component_from_func from kfp.dsl import pipeline @partial( create_component_from_func, packages_to_install=["pandas", "scikit-learn"], ) def load_iris_data( data_path: OutputPath("csv"), target_path: OutputPath("csv"), ): import pandas as pd from sklearn.datasets import load_iris iris = load_iris() data = pd.DataFrame(iris["data"], columns=iris["feature_names"]) target = pd.DataFrame(iris["target"], columns=["target"]) data["sepal length (cm)"] = None data.to_csv(data_path, index=False) target.to_csv(target_path, index=False) @partial( create_component_from_func, packages_to_install=["pandas"], ) def drop_na_from_csv( data_path: InputPath("csv"), output_path: OutputPath("csv"), ): import pandas as pd data = pd.read_csv(data_path) data = data.dropna() data.to_csv(output_path, index=False) @partial( create_component_from_func, packages_to_install=["dill", "pandas", "scikit-learn", "mlflow"], ) def train_from_csv( train_data_path: InputPath("csv"), train_target_path: InputPath("csv"), model_path: OutputPath("dill"), input_example_path: OutputPath("dill"), signature_path: OutputPath("dill"), conda_env_path: OutputPath("dill"), kernel: str, ): import dill import pandas as pd from 
sklearn.svm import SVC from mlflow.models.signature import infer_signature from mlflow.utils.environment import _mlflow_conda_env train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) input_example = train_data.sample(1) with open(input_example_path, "wb") as file_writer: dill.dump(input_example, file_writer) signature = infer_signature(train_data, clf.predict(train_data)) with open(signature_path, "wb") as file_writer: dill.dump(signature, file_writer) conda_env = _mlflow_conda_env( additional_pip_deps=["dill", "pandas", "scikit-learn"] ) with open(conda_env_path, "wb") as file_writer: dill.dump(conda_env, file_writer) @pipeline(name="debugging_pipeline") def debugging_pipeline(kernel: str): iris_data = load_iris_data() drop_data = drop_na_from_csv(data=iris_data.outputs["data"]) model = train_from_csv( train_data=drop_data.outputs["output"], train_target=iris_data.outputs["target"], kernel=kernel, ) if __name__ == "__main__": kfp.compiler.Compiler().compile(debugging_pipeline, "debugging_pipeline.yaml") ``` 수정한 점은 다음과 같습니다. 1. 데이터를 불러오는 `load_iris_data` 컴포넌트에서 `sepal length (cm)` 피처에 `None` 값을 주입 2. `drop_na_from_csv` 컴포넌트에서 `drop_na()` 함수를 이용해 na 값이 포함된 `row`를 제거 이제 파이프라인을 업로드하고 실행해 보겠습니다. 실행 후 Run을 눌러서 확인해보면 `Train from csv` 컴포넌트에서 실패했다고 나옵니다. ![debug-0.png](./img/debug-0.png) 실패한 컴포넌트를 클릭하고 로그를 확인해서 실패한 이유를 확인해 보겠습니다. ![debug-2.png](./img/debug-2.png) 로그를 확인하면 데이터의 개수가 0이여서 실행되지 않았다고 나옵니다. 분명 정상적으로 데이터를 전달했는데 왜 데이터의 개수가 0개일까요? 이제 입력받은 데이터에 어떤 문제가 있었는지 확인해 보겠습니다. 우선 컴포넌트를 클릭하고 Input/Ouput 탭에서 입력값으로 들어간 데이터들을 다운로드 받습니다. 다운로드는 빨간색 네모로 표시된 곳의 링크를 클릭하면 됩니다. ![debug-5.png](./img/debug-5.png) 두 개의 파일을 같은 경로에 다운로드합니다. 그리고 해당 경로로 이동해서 파일을 확인합니다. ```bash ls ``` 다음과 같이 두 개의 파일이 있습니다. ```bash drop-na-from-csv-output.tgz load-iris-data-target.tgz ``` 압축을 풀어보겠습니다. 
```bash tar -xzvf load-iris-data-target.tgz ; mv data target.csv tar -xzvf drop-na-from-csv-output.tgz ; mv data data.csv ``` 그리고 이를 주피터 노트북을 이용해 컴포넌트 코드를 실행합니다. ![debug-3.png](./img/debug-3.png) 디버깅을 해본 결과 dropna 할 때 column을 기준으로 drop을 해야 하는데 row를 기준으로 drop을 해서 데이터가 모두 사라졌습니다. 이제 문제의 원인을 알아냈으니 column을 기준으로 drop이 되게 컴포넌트를 수정합니다. ```python @partial( create_component_from_func, packages_to_install=["pandas"], ) def drop_na_from_csv( data_path: InputPath("csv"), output_path: OutputPath("csv"), ): import pandas as pd data = pd.read_csv(data_path) data = data.dropna(axis="columns") data.to_csv(output_path, index=False) ``` 수정 후 파이프라인을 다시 업로드하고 실행하면 다음과 같이 정상적으로 수행하는 것을 확인할 수 있습니다. ![debug-6.png](./img/debug-6.png) ================================================ FILE: docs/kubeflow/kubeflow-concepts.md ================================================ --- title : "2. Kubeflow Concepts" description: "" sidebar_position: 2 contributors: ["Jongseob Jeon"] --- ## Component 컴포넌트(Component)는 컴포넌트 콘텐츠(Component contents)와 컴포넌트 래퍼(Component wrapper)로 구성되어 있습니다. 하나의 컴포넌트는 컴포넌트 래퍼를 통해 kubeflow에 전달되며 전달된 컴포넌트는 정의된 컴포넌트 콘텐츠를 실행(execute)하고 아티팩트(artifacts)들을 생산합니다. ![concept-0.png](./img/concept-0.png) ### Component Contents 컴포넌트 콘텐츠를 구성하는 것은 총 3가지가 있습니다. ![concept-1.png](./img/concept-1.png) 1. Environment 2. Python code w\ Config 3. Generates Artifacts 예시와 함께 각 구성 요소가 어떤 것인지 알아보도록 하겠습니다. 다음과 같이 데이터를 불러와 SVC(Support Vector Classifier)를 학습한 후 SVC 모델을 저장하는 과정을 적은 파이썬 코드가 있습니다. ```python import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target= pd.read_csv(train_target_path) clf= SVC( kernel=kernel ) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) ``` 위의 파이썬 코드는 다음과 같이 컴포넌트 콘텐츠로 나눌 수 있습니다. ![concept-2.png](./img/concept-2.png) Environment는 파이썬 코드에서 사용하는 패키지들을 import하는 부분입니다. 다음으로 Python Code w\ Config 에서는 주어진 Config를 이용해 실제로 학습을 수행합니다. 마지막으로 아티팩트를 저장하는 과정이 있습니다. 
### Component Wrapper 컴포넌트 래퍼는 컴포넌트 콘텐츠에 필요한 Config를 전달하고 실행시키는 작업을 합니다. ![concept-3.png](./img/concept-3.png) Kubeflow에서는 컴포넌트 래퍼를 위의 `train_svc_from_csv`와 같이 함수의 형태로 정의합니다. 컴포넌트 래퍼가 콘텐츠를 감싸면 다음과 같이 됩니다. ![concept-4.png](./img/concept-4.png) ### Artifacts 위의 설명에서 컴포넌트는 아티팩트(Artifacts)를 생성한다고 했습니다. 아티팩트란 evaluation result, log 등 어떤 형태로든 파일로 생성되는 것을 통틀어서 칭하는 용어입니다. 그중 우리가 관심을 두는 유의미한 것들은 다음과 같은 것들이 있습니다. ![concept-5.png](./img/concept-5.png) - Model - Data - Metric - etc #### Model 저희는 모델을 다음과 같이 정의 했습니다. > 모델이란 파이썬 코드와 학습된 Weights와 Network 구조 그리고 이를 실행시키기 위한 환경이 모두 포함된 형태 #### Data 데이터는 전 처리된 피처, 모델의 예측 값 등을 포함합니다. #### Metric Metric은 동적 지표와 정적 지표 두 가지로 나누었습니다. - 동적 지표란 train loss와 같이 학습이 진행되는 중 에폭(Epoch)마다 계속해서 변화하는 값을 의미합니다. - 정적 지표란 학습이 끝난 후 최종적으로 모델을 평가하는 정확도 등을 의미합니다. ## Pipeline 파이프라인은 컴포넌트의 집합과 컴포넌트를 실행시키는 순서도로 구성되어 있습니다. 이 때, 순서도는 방향 순환이 없는 그래프로 이루어져 있으며, 간단한 조건문을 포함할 수 있습니다. ![concept-6.png](./img/concept-6.png) ### Pipeline Config 앞서 컴포넌트를 실행시키기 위해서는 Config가 필요하다고 설명했습니다. 파이프라인을 구성하는 컴포넌트의 Config 들을 모아 둔 것이 파이프라인 Config입니다. ![concept-7.png](./img/concept-7.png) ## Run 파이프라인이 필요로 하는 파이프라인 Config가 주어져야지만 파이프라인을 실행할 수 있습니다. Kubeflow에서는 실행된 파이프라인을 Run 이라고 부릅니다. ![concept-8.png](./img/concept-8.png) 파이프라인이 실행되면 각 컴포넌트가 아티팩트들을 생성합니다. Kubeflow pipeline에서는 Run 하나당 고유한 ID 를 생성하고, Run에서 생성되는 모든 아티팩트들을 저장합니다. ![concept-9.png](./img/concept-9.png) 그러면 이제 직접 컴포넌트와 파이프라인을 작성하는 방법에 대해서 알아보도록 하겠습니다. ================================================ FILE: docs/kubeflow/kubeflow-intro.md ================================================ --- title : "1. Kubeflow Introduction" description: "" sidebar_position: 1 contributors: ["Jongseob Jeon"] --- Kubeflow를 사용하기 위해서는 컴포넌트(Component)와 파이프라인(Pipeline)을 작성해야 합니다. *모두의 MLOps*에서 설명하는 방식은 [Kubeflow Pipeline 공식 홈페이지](https://www.kubeflow.org/docs/components/pipelines/overview/quickstart/)에서 설명하는 방식과는 다소 차이가 있습니다. 
여기에서는 Kubeflow Pipeline을 워크플로(Workflow)가 아닌 앞서 설명한 [MLOps를 구성하는 요소](../kubeflow/kubeflow-concepts.md#component-contents) 중 하나의 컴포넌트로 사용하기 때문입니다. 그럼 이제 컴포넌트와 파이프라인은 무엇이며 어떻게 작성할 수 있는지 알아보도록 하겠습니다. ================================================ FILE: docs/kubeflow-dashboard-guide/_category_.json ================================================ { "label": "Kubeflow UI Guide", "position": 5, "link": { "type": "generated-index" } } ================================================ FILE: docs/kubeflow-dashboard-guide/experiments-and-others.md ================================================ --- title : "6. Kubeflow Pipeline 관련" description: "" sidebar_position: 6 contributors: ["Jaeyeon Kim"] --- Central Dashboard의 왼쪽 탭의 Experiments(KFP), Pipelines, Runs, Recurring Runs, Artifacts, Executions 페이지들에서는 Kubeflow Pipeline과 Pipeline의 실행 그리고 Pipeline Run의 결과를 관리합니다. ![left-tabs](./img/left-tabs.png) Kubeflow Pipeline이 *모두의 MLOps*에서 Kubeflow를 사용하는 주된 이유이며, Kubeflow Pipeline을 만드는 방법, 실행하는 방법, 결과를 확인하는 방법 등 자세한 내용은 [3.Kubeflow](../kubeflow/kubeflow-intro)에서 다룹니다. ================================================ FILE: docs/kubeflow-dashboard-guide/experiments.md ================================================ --- title : "5. Experiments(AutoML)" description: "" sidebar_position: 5 contributors: ["Jaeyeon Kim"] --- 다음으로는 Central Dashboard의 왼쪽 탭의 Experiments(AutoML)을 클릭해보겠습니다. ![left-tabs](./img/left-tabs.png) ![automl](./img/automl.png) Experiments(AutoML) 페이지는 Kubeflow에서 Hyperparameter Tuning과 Neural Architecture Search를 통한 AutoML을 담당하는 [Katib](https://www.kubeflow.org/docs/components/katib/overview/)를 관리할 수 있는 페이지입니다. Katib와 Experiments(AutoML)에 대한 사용법은 *모두의 MLOps* v1.0에서는 다루지 않으며, v2.0에 추가될 예정입니다. ================================================ FILE: docs/kubeflow-dashboard-guide/intro.md ================================================ --- title : "1. 
Central Dashboard" description: "" sidebar_position: 1 contributors: ["Jaeyeon Kim", "SeungTae Kim"] --- [Kubeflow 설치](../setup-components/install-components-kf.md)를 완료하면, 다음 커맨드를 통해 대시보드에 접속할 수 있습니다. ```bash kubectl port-forward --address 0.0.0.0 svc/istio-ingressgateway -n istio-system 8080:80 ``` ![after-login](./img/after-login.png) Central Dashboard는 Kubeflow에서 제공하는 모든 기능을 통합하여 제공하는 UI입니다. Central Dashboard에서 제공하는 기능은 크게 왼쪽의 탭을 기준으로 구분할 수 있습니다. ![left-tabs](./img/left-tabs.png) - Home - Notebooks - Tensorboards - Volumes - Models - Experiments(AutoML) - Experiments(KFP) - Pipelines - Runs - Recurring Runs - Artifacts - Executions 그럼 이제 기능별 간단한 사용법을 알아보겠습니다. ================================================ FILE: docs/kubeflow-dashboard-guide/notebooks.md ================================================ --- title : "2. Notebooks" description: "" sidebar_position: 2 contributors: ["Jaeyeon Kim"] --- ## 노트북 서버(Notebook Server) 생성하기 다음 Central Dashboard의 왼쪽 탭의 Notebooks를 클릭해보겠습니다. ![left-tabs](./img/left-tabs.png) 다음과 같은 화면을 볼 수 있습니다. Notebooks 탭은 JupyterHub와 비슷하게 유저별로 jupyter notebook 및 code server 환경(이하 노트북 서버)을 독립적으로 생성하고 접속할 수 있는 페이지입니다. ![notebook-home](./img/notebook-home.png) 오른쪽 위의 `+ NEW NOTEBOOK` 버튼을 클릭합니다. ![new-notebook](./img/new-notebook.png) 아래와 같은 화면이 나타나면, 이제 생성할 노트북 서버의 스펙(Spec)을 명시하여 생성합니다. ![create](./img/create.png)
각 스펙에 대한 자세한 내용은 아래와 같습니다. - **name**: - 노트북 서버를 구분할 수 있는 이름으로 생성합니다. - **namespace** : - 따로 변경할 수 없습니다. (현재 로그인한 user 계정의 namespace이 자동으로 지정되어 있습니다.) - **Image**: - sklearn, pytorch, tensorflow 등의 파이썬 패키지가 미리 설치된 jupyter lab 이미지 중 사용할 이미지를 선택합니다. - 노트북 서버 내에서 GPU를 사용하여 tensorflow-cuda, pytorch-cuda 등의 이미지를 사용하는 경우, **하단의 GPUs** 부분을 확인하시기 바랍니다. - 추가적인 패키지나 소스코드 등을 포함한 커스텀(Custom) 노트북 서버를 사용하고 싶은 경우에는 커스텀 이미지(Custom Image)를 만들고 배포 후 사용할 수도 있습니다. - **CPU / RAM** - 필요한 자원 사용량을 입력합니다. - cpu : core 단위 - 가상 core 개수 단위를 의미하며, int 형식이 아닌 `1.5`, `2.7` 등의 float 형식도 입력할 수 있습니다. - memory : Gi 단위 - **GPUs** - 주피터 노트북에 할당할 GPU 개수를 입력합니다. - `None` - GPU 자원이 필요하지 않은 상황 - 1, 2, 4 - GPU 1, 2, 4 개 할당 - GPU Vendor - 앞의 [(Optional) Setup GPU](../setup-kubernetes/setup-nvidia-gpu.md) 를 따라 nvidia gpu plugin을 설치하였다면 NVIDIA를 선택합니다. - **Workspace Volume** - 노트북 서버 내에서 필요한 만큼의 디스크 용량을 입력합니다. - Type 과 Name 은 변경하지 않고, **디스크 용량을 늘리고 싶거나** **AccessMode 를 변경하고 싶을** 때에만 변경해서 사용하시면 됩니다. - **"Don't use Persistent Storage for User's home"** 체크박스는 노트북 서버의 작업 내용을 저장하지 않아도 상관없을 때에만 클릭합니다. **일반적으로는 누르지 않는 것을 권장합니다.** - 기존에 미리 생성해두었던 PVC를 사용하고 싶을 때에는, Type을 "Existing" 으로 입력하여 해당 PVC의 이름을 입력하여 사용하시면 됩니다. - **Data Volumes** - 추가적인 스토리지 자원이 필요하다면 **"+ ADD VOLUME"** 버튼을 클릭하여 생성할 수 있습니다. - ~~Configurations, Affinity/Tolerations, Miscellaneous Settings~~ - 일반적으로는 필요하지 않으므로 *모두의 MLOps*에서는 자세한 설명을 생략합니다.
모두 정상적으로 입력하였다면 하단의 **LAUNCH** 버튼이 활성화되며, 버튼을 클릭하면 노트북 서버 생성이 시작됩니다. ![creating](./img/creating.png) 생성 후 아래와 같이 **Status** 가 초록색 체크 표시 아이콘으로 변하며, **CONNECT 버튼**이 활성화됩니다. ![created](./img/created.png) --- ## 노트북 서버 접속하기 **CONNECT 버튼**을 클릭하면 브라우저에 새 창이 열리며, 다음과 같은 화면이 보입니다. ![notebook-access](./img/notebook-access.png) **Launcher**의 Notebook, Console, Terminal 아이콘을 클릭하여 사용할 수 있습니다. 생성된 Notebook 화면 ![notebook-console](./img/notebook-console.png) 생성된 Terminal 화면 ![terminal-console](./img/terminal-console.png) --- ## 노트북 서버 중단하기 노트북 서버를 오랜 시간 사용하지 않는 경우, 쿠버네티스 클러스터의 효율적인 리소스 사용을 위해서 노트북 서버를 중단(Stop)할 수 있습니다. **단, 이 경우 노트북 서버 생성 시 Workspace Volume 또는 Data Volume으로 지정해놓은 경로 외에 저장된 데이터는 모두 초기화되는 것에 주의하시기 바랍니다.** 노트북 서버 생성 당시 경로를 변경하지 않았다면, 디폴트(Default) Workspace Volume의 경로는 노트북 서버 내의 `/home/jovyan` 이므로, `/home/jovyan` 의 하위 경로 이외의 경로에 저장된 데이터는 모두 사라집니다. 다음과 같이 `STOP` 버튼을 클릭하면 노트북 서버가 중단됩니다. ![notebook-stop](./img/notebook-stop.png) 중단이 완료되면 다음과 같이 `CONNECT` 버튼이 비활성화되며, `PLAY` 버튼을 클릭하면 다시 정상적으로 사용할 수 있습니다. ![notebook-restart](./img/notebook-restart.png) ================================================ FILE: docs/kubeflow-dashboard-guide/tensorboards.md ================================================ --- title : "3. Tensorboards" description: "" sidebar_position: 3 contributors: ["Jaeyeon Kim"] --- 다음으로는 Central Dashboard의 왼쪽 탭의 Tensorboards를 클릭해보겠습니다. ![left-tabs](./img/left-tabs.png) 다음과 같은 화면을 볼 수 있습니다. ![tensorboard](./img/tensorboard.png) Tensorboards 탭은 Tensorflow, PyTorch 등의 프레임워크에서 제공하는 Tensorboard 유틸이 생성한 ML 학습 관련 데이터를 시각화하는 텐서보드 서버(Tensorboard Server)를 쿠버네티스 클러스터에 생성하는 기능을 제공합니다. 이렇게 생성한 텐서보드 서버는, 일반적인 원격 텐서보드 서버의 사용법과 같이 사용할 수도 있으며, [Kubeflow 파이프라인 런에서 바로 텐서보드 서버에 데이터를 저장하는 용도](https://www.kubeflow.org/docs/components/pipelines/sdk/output-viewer/#tensorboard)로 활용할 수 있습니다. 
Kubeflow 파이프라인 런의 결과를 시각화하는 방법에는 [다양한 방식](https://www.kubeflow.org/docs/components/pipelines/sdk/output-viewer/)이 있으며, *모두의 MLOps*에서는 더 일반적으로 활용할 수 있도록 Kubeflow 컴포넌트의 Visualization 기능과 MLflow의 시각화 기능을 활용할 예정이므로, Tensorboards 페이지에 대한 자세한 설명은 생략하겠습니다. ================================================ FILE: docs/kubeflow-dashboard-guide/volumes.md ================================================ --- title : "4. Volumes" description: "" sidebar_position: 4 contributors: ["Jaeyeon Kim"] --- ## Volumes 다음으로는 Central Dashboard의 왼쪽 탭의 Volumes를 클릭해보겠습니다. ![left-tabs](./img/left-tabs.png) 다음과 같은 화면을 볼 수 있습니다. ![volumes](./img/volumes.png) Volumes 탭은 [Kubernetes의 볼륨(Volume)](https://kubernetes.io/ko/docs/concepts/storage/volumes/), 정확히는 [퍼시스턴트 볼륨 클레임(Persistent Volume Claim, 이하 pvc)](https://kubernetes.io/ko/docs/concepts/storage/persistent-volumes/) 중 현재 user의 namespace에 속한 pvc를 관리하는 기능을 제공합니다. 위 스크린샷을 보면, [1. Notebooks](../kubeflow-dashboard-guide/notebooks) 페이지에서 생성한 Volume의 정보를 확인할 수 있습니다. 해당 Volume의 Storage Class는 쿠버네티스 클러스터 설치 당시 설치한 Default Storage Class인 local-path로 설정되어있음을 확인할 수 있습니다. 이외에도 user namespace에 새로운 볼륨을 생성하거나, 조회하거나, 삭제하고 싶은 경우에 Volumes 페이지를 활용할 수 있습니다. --- ## 볼륨 생성하기 오른쪽 위의 `+ NEW VOLUME` 버튼을 클릭하면 다음과 같은 화면을 볼 수 있습니다. ![new-volume](./img/new-volume.png) name, size, storage class, access mode를 지정하여 생성할 수 있습니다. 원하는 리소스 스펙을 지정하여 생성하면 다음과 같이 볼륨의 Status가 `Pending`으로 조회됩니다. `Status` 아이콘에 마우스 커서를 가져다 대면 *해당 볼륨은 mount하여 사용하는 first consumer가 나타날 때 실제로 생성을 진행한다(This volume will be bound when its first consumer is created.)*는 메시지를 확인할 수 있습니다. 이는 실습을 진행하는 [StorageClass](https://kubernetes.io/ko/docs/concepts/storage/storage-classes/)인 `local-path`의 볼륨 생성 정책에 해당하며, **문제 상황이 아닙니다.** 해당 페이지에서 Status가 `Pending` 으로 보이더라도 해당 볼륨을 사용하길 원하는 노트북 서버 혹은 파드(Pod)에서는 해당 볼륨의 이름을 지정하여 사용할 수 있으며, 그때 실제로 볼륨 생성이 진행됩니다. 
![creating-volume](./img/creating-volume.png) ================================================ FILE: docs/prerequisites/_category_.json ================================================ { "label": "Prerequisites", "position": 1, "link": { "type": "generated-index" } } ================================================ FILE: docs/prerequisites/docker/_category_.json ================================================ { "label": "Docker", "position": 1, "link": { "type": "generated-index" } } ================================================ FILE: docs/prerequisites/docker/advanced.md ================================================ --- title : "[Practice] Docker Advanced" description: "Practice to use docker more advanced way." sidebar_position: 6 contributors: ["Jongseob Jeon", "Jaeyeon Kim"] --- ## 도커 이미지 잘 만들기 ### 도커 이미지를 만들 때 고려해야 될 점 Dockerfile 을 활용하여 도커 이미지를 만들 때는 명령어의 **순서**가 중요합니다. 그 이유는 도커 이미지는 여러 개의 Read-Only Layer 로 구성되어있고, 이미지를 빌드할 때 이미 존재하는 레이어는 **캐시되어** 재사용되기 때문에, 이를 생각해서 Dockerfile 을 구성한다면 **빌드 시간을 줄일 수 있습니다.** Dockerfile에서 `RUN`, `ADD`, `COPY` 명령어 하나가 하나의 레이어로 저장됩니다. 예를 들어서 다음과 같은 `Dockerfile`이 있습니다. ```docker # Layer 1 FROM ubuntu:latest # Layer 2 RUN apt-get update && apt-get install python3 pip3 -y # Layer 3 RUN pip3 install -U pip && pip3 install torch # Layer 4 COPY src/ src/ # Layer 5 CMD python src/app.py ``` 위의 `Dockerfile`로 빌드된 이미지를 `docker run -it app:latest /bin/bash` 명령어로 실행하면 다음과 같은 레이어로 표현할 수 있습니다. ![layers.png](./img/layers.png) 최상단의 R/W Layer 는 이미지에 영향을 주지 않습니다. 즉, 컨테이너 내부에서 작업한 내역은 모두 휘발성입니다. 하단의 레이어가 변경되면, 그 위의 레이어는 모두 새로 빌드됩니다. 그래서 Dockerfile 내장 명령어의 순서가 중요합니다. 예를 들면, **자주 변경**되는 부분은 **최대한 뒤쪽으로** 정렬하는 것을 추천합니다. (ex. `COPY src/ app/src/`) 그렇기 때문에 반대로 변경되지 않는 부분은 최대한 앞쪽으로 정렬하는게 좋습니다. 만약 거의 **변경되지 않지만**, 여러 곳에서 **자주** 쓰이는 부분을 공통화할 수도 있습니다. 해당 공통부분만 묶어서 별도의 이미지는 미리 만들어둔 다음, **베이스 이미지** 로 활용하는 것이 좋습니다. 예를 들어, 다른 건 거의 똑같은데, tensorflow-cpu 를 사용하는 이미지와, tensorflow-gpu 를 사용하는 환경을 분리해서 이미지로 만들고 싶은 경우에는 다음과 같이 할 수 있습니다. 
python 과 기타 기본적인 패키지가 설치된 [`ghcr.io/makinarocks/python:3.8-base`](http://ghcr.io/makinarocks/python:3.8-base-cpu) 를 만들어두고, **tensorflow cpu 버전과 gpu 버전이** 설치된 이미지 새로 만들때는, 위의 이미지를 `FROM` 으로 불러온 다음, tensorflow install 하는 부분만 별도로 작성해서 Dockerfile 을 2 개로 관리한다면 가독성도 좋고 빌드 시간도 줄일 수 있습니다. **합칠 수 있는 Layer 는 합치는 것**이 Old version 의 도커에서는 성능 향상 효과를 이끌었습니다. 여러분의 도커 컨테이너가 어떤 도커 버전에서 실행될 것인지 보장할 수 없으며, **가독성**을 위해서도 합칠 수 있는 Layer 는 적절히 합치는 것이 좋습니다. 예를 들면, 다음과 같이 작성된 `Dockerfile`이 있습니다. ```docker # Bad Case RUN apt-get update RUN apt-get install build-essential -y RUN apt-get install curl -y RUN apt-get install jq -y RUN apt-get install git -y ``` 이를 아래와 같이 합쳐서 적을 수 있습니다. ```docker # Better Case RUN apt-get update && \ apt-get install -y \ build-essential \ curl \ jq \ git ``` 편의를 위해서는 `.dockerignore` 도 사용하는게 좋습니다. `.dockerignore`는 `.gitignore` 와 비슷한 역할을 한다고 이해하면 됩니다. (git add 할 때 제외할 수 있듯이, docker build 할 때 자동으로 제외) 더 많은 정보는 [Docker 공식 문서](https://docs.docker.com/develop/develop-images/dockerfile_best-practices/)에서 확인하실 수 있습니다. ### ENTRYPOINT vs CMD `ENTRYPOINT` 와 `CMD` 는 모두 컨테이너의 실행 시점에서 어떤 명령어를 실행시키고 싶을 때 사용합니다. 그리고 이 둘 중 하나는 반드시 존재해야 합니다. - **차이점** - `CMD`: docker run 을 수행할 때, 쉽게 변경하여 사용할 수 있음 - `ENTRYPOINT`: `--entrypoint` 를 사용해야 변경할 수 있음 `ENTRYPOINT` 와 `CMD` 가 함께 쓰일 때는 보통 `CMD`는 `ENTRYPOINT` 에서 적은 명령의 arguments(parameters) 를 의미합니다. 예를 들어서 다음과 같은 `Dockerfile` 이 있습니다. ```docker FROM ubuntu:latest # 아래 4 가지 option 을 바꿔가며 직접 테스트해보시면 이해하기 편합니다. # 단, NO ENTRYPOINT 옵션은 base image 인 ubuntu:latest 에 이미 있어서 테스트해볼 수는 없고 나머지 v2, 3, 5, 6, 8, 9, 11, 12 를 테스트해볼 수 있습니다. # ENTRYPOINT echo "Hello ENTRYPOINT" # ENTRYPOINT ["echo", "Hello ENTRYPOINT"] # CMD echo "Hello CMD" # CMD ["echo", "Hello CMD"] ``` 위의 `Dockerfile`에서 주석으로 표시된 부분들을 해제하며 빌드하고 실행하면 다음과 같은 결과를 얻을 수 있습니다. | | No ENTRYPOINT | ENTRYPOINT a b | ENTRYPOINT ["a", "b"] | | ------------------ | -------------- | -------------- | --------------------- | | **NO CMD** | Error! 
| /bin/sh -c a b | a b | | **CMD ["x", "y"]** | x y | /bin/sh -c a b | a b x y | | **CMD x y** | /bin/sh -c x y | /bin/sh -c a b | a b /bin/sh -c x y | - In Kubernetes pod - `ENTRYPOINT` → command - `CMD` → args ### Docker tag 이름 짓기 도커 이미지의 tag 로 **latest 는 사용하지 않는 것을 권장**합니다. 이유는 latest 는 default tag name 이므로 **의도치 않게 overwritten** 되는 경우가 너무 많이 발생하기 때문입니다. 하나의 이미지는 하나의 태그를 가짐(**uniqueness**)을 보장해야 추후 Production 단계에서 **협업/디버깅**에 용이합니다. 내용은 다르지만, 동일한 tag 를 사용하게 되면 추후 dangling image 로 취급되어 관리하기 어려워집니다. dangling image는 `docker images`에는 나오지 않지만 계속해서 저장소를 차지하고 있습니다. ### ETC 1. log 등의 정보는 container 내부가 아닌 곳에 따로 저장합니다. container 내부에서 write 한 data 는 언제든지 사라질 수 있기 때문입니다. 2. secret 한 정보, 환경(dev/prod) dependent 한 정보 등은 Dockerfile 에 직접 적는 게 아니라, env var 또는 .env config file 을 사용합니다. 3. Dockerfile **linter** 도 존재하므로, 협업 시에는 활용하면 좋습니다. [https://github.com/hadolint/hadolint](https://github.com/hadolint/hadolint) ## docker run 의 다양한 옵션 ### docker run with volume Docker container 사용 시 불편한 점이 있습니다. 바로 Docker는 기본적으로 Docker **container 내부에서 작업한 모든 사항은 저장되지 않습니다.** 이유는 Docker container 는 각각 격리된 파일시스템을 사용합니다. 따라서, **여러 docker container 끼리 데이터를 공유하기 어렵습니다.** 이 문제를 해결하기 위해서 Docker에서 제공하는 방식은 **2 가지**가 있습니다. ![storage.png](./img/storage.png) #### Docker volume - docker cli 를 사용해 `volume` 이라는 리소스를 직접 관리 - host 에서 Docker area(`/var/lib/docker`) 아래에 특정 디렉토리를 생성한 다음, 해당 경로를 docker container 에 mount #### Bind mount - host 의 특정 경로를 docker container 에 mount #### How to use? 사용 방식은 **동일한 인터페이스**로 `-v` 옵션을 통해 사용할 수 있습니다. 다만, volume 을 사용할 때에는 `docker volume create`, `docker volume ls`, `docker volume rm` 등을 수행하여 직접 관리해주어야 합니다. - Docker volume ```bash docker run \ -v my_volume:/app \ nginx:latest ```` - Blind mount ```bash docker run \ -v /home/user/some/path:/app \ nginx:latest ``` 로컬에서 개발할 때는 bind mount 가 편하긴 하지만, 환경을 깔끔하게 유지하고 싶다면 docker volume 을 사용하여 create, rm 을 명시적으로 수행하는 것도 하나의 방법입니다. 쿠버네티스에서 스토리지를 제공하는 방식도 결국 docker 의 bind mount 를 활용하여 제공합니다. 
### docker run with resource limit 기본적으로 docker container 는 **host OS 의 cpu, memory 자원을 fully 사용**할 수 있습니다. 하지만 이렇게 사용하게 되면 host OS 의 자원 상황에 따라서 **OOM** 등의 이슈로 docker container 가 비정상적으로 종료되는 상황이 발생할 수 있습니다. 이런 문제를 다루기 위해 **docker container 실행 시, cpu 와 memory 의 사용량 제한**을 걸 수 있는 `-m` [옵션](https://docs.docker.com/config/containers/resource_constraints/#limit-a-containers-access-to-memory)을 제공합니다. ```bash docker run -d -m 512m --memory-reservation=256m --name 512-limit ubuntu sleep 3600 docker run -d -m 1g --memory-reservation=256m --name 1g-limit ubuntu sleep 3600 ``` 위의 도커를 실행 후 `docker stats` 커맨드를 통해 사용량을 확인할 수 있습니다. ```bash CONTAINER ID NAME CPU % MEM USAGE / LIMIT MEM % NET I/O BLOCK I/O PIDS 4ea1258e2e09 1g-limit 0.00% 300KiB / 1GiB 0.03% 1kB / 0B 0B / 0B 1 4edf94b9a3e5 512-limit 0.00% 296KiB / 512MiB 0.06% 1.11kB / 0B 0B / 0B 1 ``` 쿠버네티스에서 pod 라는 리소스에 cpu, memory 제한을 줄 때, 이 방식을 활용하여 제공합니다. ### docker run with restart policy 특정 컨테이너가 계속해서 running 상태를 유지시켜야 하는 경우가 존재합니다. 이런 경우를 위해서 해당 컨테이너가 종료되자마자 바로 재생성을 시도할 수 있는 `--restart=always` 옵션을 제공하고 있습니다. 옵션 입력 후 도커를 실행합니다. ```bash docker run --restart=always ubuntu ``` `watch -n1 docker ps`를 통해 재실행이 되고 있는지 확인합니다. 정상적으로 수행되고 있다면 다음과 같이 STATUS에 `Restarting (0)` 이 출력됩니다. ```bash CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES a911850276e8 ubuntu "bash" 35 seconds ago Restarting (0) 6 seconds ago hungry_vaughan ``` - [https://docs.docker.com/engine/reference/commandline/run/#restart-policies---restart](https://docs.docker.com/engine/reference/commandline/run/#restart-policies---restart) - on-failure with max retries - always 등의 선택지 제공 쿠버네티스에서 job 이라는 resource 의 restart 옵션을 줄 때, 이 방식을 활용하여 제공합니다. ### docker run as a background process 도커 컨테이너를 실행할 때는 기본적으로 foreground process 로 실행됩니다. 즉, 컨테이너를 실행한 터미널이 해당 컨테이너에 자동으로 attach 되어 있어, 다른 명령을 실행할 수 없습니다. 다음과 같은 예시를 수행해봅니다. 우선 터미널 2 개를 열어, 하나의 터미널에서는 `docker ps` 를 지켜보고, 다른 하나의 터미널에서는 다음과 같은 명령을 차례로 실행해보며 동작을 지켜봅니다. 
#### First Practice ```bash docker run -it ubuntu sleep 10 ``` 10 초동안 멈춰 있어야 하고, 해당 컨테이너에서 다른 명령을 수행할 수 없습니다. 10초 뒤에는 docker ps 에서 container 가 종료되는 것을 확인할 수 있습니다. #### Second Practice ```bash docker run -it ubuntu sleep 10 ``` 이후, `ctrl + p` -> `ctrl + q` 해당 터미널에서 이제 다른 명령을 수행할 수 있게 되었으며, docker ps 로도 10초까지는 해당 컨테이너가 살아있는 것을 확인할 수 있습니다. 이렇게 docker container 내부에서 빠져나온 상황을 detached 라고 부릅니다. 도커에서는 run 을 실행함과 동시에 detached mode 로 실행시킬 수 있는 옵션을 제공합니다. #### Third Practice ```bash docker run -d ubuntu sleep 10 ``` detached mode 이므로 해당 명령을 실행시킨 터미널에서 다른 액션을 수행시킬 수 있습니다. 상황에 따라 detached mode 를 적절히 활용하면 좋습니다. 예를 들어, DB 와 통신하는 Backend API server 를 개발할 때 Backend API server 는 source code 를 변경시켜가면서 hot-loading 으로 계속해서 로그를 확인해봐야 하지만, DB 는 로그를 지켜볼 필요는 없는 경우라면 다음과 같이 실행할 수 있습니다. DB 는 docker container 를 detached mode 로 실행시키고, Backend API server 는 attached mode 로 log 를 following 하면서 실행시키면 효율적입니다. ## References - [https://towardsdatascience.com/docker-storage-598e385f4efe](https://towardsdatascience.com/docker-storage-598e385f4efe) - [https://vsupalov.com/docker-latest-tag/](https://vsupalov.com/docker-latest-tag/) - [https://docs.microsoft.com/ko-kr/azure/container-registry/container-registry-image-tag-version](https://docs.microsoft.com/ko-kr/azure/container-registry/container-registry-image-tag-version) - [https://stevelasker.blog/2018/03/01/docker-tagging-best-practices-for-tagging-and-versioning-docker-images/](https://stevelasker.blog/2018/03/01/docker-tagging-best-practices-for-tagging-and-versioning-docker-images/) ================================================ FILE: docs/prerequisites/docker/command.md ================================================ --- title : "[Practice] Docker command" description: "Practice to use docker command." sidebar_position: 4 contributors: ["Jongseob Jeon", "Jaeyeon Kim"] --- ## 1. 정상 설치 확인 ```bash docker run hello-world ``` 정상적으로 설치된 경우 다음과 같은 메시지를 확인할 수 있습니다. ```bash Hello from Docker! 
This message shows that your installation appears to be working correctly. .... ``` **(For ubuntu)** sudo 없이 사용하고 싶다면 아래 사이트를 참고합니다. - [https://docs.docker.com/engine/install/linux-postinstall/#manage-docker-as-a-non-root-user](https://docs.docker.com/engine/install/linux-postinstall/#manage-docker-as-a-non-root-user) ## 2. Docker Pull docker image registry(도커 이미지를 저장하고 공유할 수 있는 저장소)로부터 Docker image 를 로컬에 다운로드 받는 커맨드입니다. 아래 커맨드를 통해 docker pull에서 사용 가능한 argument들을 확인할 수 있습니다. ```bash docker pull --help ``` 정상적으로 수행되면 아래와 같이 출력됩니다. ```bash Usage: docker pull [OPTIONS] NAME[:TAG|@DIGEST] Pull an image or a repository from a registry Options: -a, --all-tags Download all tagged images in the repository --disable-content-trust Skip image verification (default true) --platform string Set platform if server is multi-platform capable -q, --quiet Suppress verbose output ``` 여기서 알 수 있는 것은 바로 docker pull은 두 개 타입의 argument를 받는다는 것을 알 수 있습니다. 1. `[OPTIONS]` 2. `NAME[:TAG|@DIGEST]` help에서 나온 `-a`, -`q` 옵션을 사용하기 위해서는 NAME 앞에서 사용해야 합니다. 직접 `ubuntu:18.04` 이미지를 pull 해보겠습니다. ```bash docker pull ubuntu:18.04 ``` 위 명령어를 해석하면 `ubuntu` 라는 이름을 가진 이미지 중 `18.04` 태그가 달려있는 이미지를 가져오라는 뜻입니다. 만약, 정상적으로 수행된다면 다음과 비슷하게 출력됩니다. ```bash 18.04: Pulling from library/ubuntu 20d796c36622: Pull complete Digest: sha256:42cd9143b6060261187a72716906187294b8b66653b50d70bc7a90ccade5c984 Status: Downloaded newer image for ubuntu:18.04 docker.io/library/ubuntu:18.04 ``` 위의 명령어를 수행하면 [docker.io/library](http://docker.io/library/) 라는 이름의 registry 에서 ubuntu:18.04 라는 image 를 여러분의 노트북에 다운로드 받게됩니다. - 참고사항 - 추후 [docker.io](http://docker.io) 나 public 한 docker hub 와 같은 registry 대신에, 특정 **private** 한 registry 에서 docker image 를 가져와야 하는 경우에는, [`docker login`](https://docs.docker.com/engine/reference/commandline/login/) 을 통해서 특정 registry 를 바라보도록 한 뒤, docker pull 을 수행하는 형태로 사용합니다. 혹은 insecure registry 를 설정하는 [방안](https://stackoverflow.com/questions/42211380/add-insecure-registry-to-docker)도 활용할 수 있습니다. 
- 폐쇄망에서 docker image 를 `.tar` 파일과 같은 형태로 저장하고 공유할 수 있도록 [`docker save`](https://docs.docker.com/engine/reference/commandline/save/), [`docker load`](https://docs.docker.com/engine/reference/commandline/load/) 와 같은 명령어도 존재합니다. ## 3. Docker images 로컬에 존재하는 docker image 리스트를 출력하는 커맨드입니다. ```bash docker images --help ``` docker images에서 사용할 수 있는 argument는 다음과 같습니다. ```bash Usage: docker images [OPTIONS] [REPOSITORY[:TAG]] List images Options: -a, --all Show all images (default hides intermediate images) --digests Show digests -f, --filter filter Filter output based on conditions provided --format string Pretty-print images using a Go template --no-trunc Don't truncate output -q, --quiet Only show image IDs ``` 아래 명령어를 이용해 직접 실행해 보겠습니다. ```bash docker images ``` 만약 도커를 최초 설치 후 이 실습을 진행한다면 다음과 비슷하게 출력됩니다. ```bash REPOSITORY TAG IMAGE ID CREATED SIZE ubuntu 18.04 29e70752d7b2 2 days ago 56.7MB ``` 줄 수 있는 argument중 `-q`를 사용하면 `IMAGE ID` 만 출력됩니다. ```bash docker images -q ``` ```bash 29e70752d7b2 ``` ## 4. Docker ps 현재 실행 중인 도커 컨테이너 리스트를 출력하는 커맨드입니다. ```bash docker ps --help ``` docker ps에서 사용할 수 있는 argument는 다음과 같습니다. ```bash Usage: docker ps [OPTIONS] List containers Options: -a, --all Show all containers (default shows just running) -f, --filter filter Filter output based on conditions provided --format string Pretty-print containers using a Go template -n, --last int Show n last created containers (includes all states) (default -1) -l, --latest Show the latest created container (includes all states) --no-trunc Don't truncate output -q, --quiet Only display container IDs -s, --size Display total file sizes ``` 아래 명령어를 이용해 직접 실행해 보겠습니다. ```bash docker ps ``` 현재 실행 중인 컨테이너가 없다면 다음과 같이 나옵니다. ```bash CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES ``` 만약 실행되는 컨테이너가 있다면 다음과 비슷하게 나옵니다. ```bash CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES c1e8f5e89d8d ubuntu "sleep 3600" 13 seconds ago Up 12 seconds trusting_newton ``` ## 5. Docker run 도커 컨테이너를 실행시키는 커맨드입니다. 
```bash docker run --help ``` docker run을 실행하는 명령어는 다음과 같습니다. ```bash Usage: docker run [OPTIONS] IMAGE [COMMAND] [ARG...] Run a command in a new container ``` 여기서 우리가 확인해야 하는 것은 바로 docker run은 세 개 타입의 argument를 받는다는 것을 알 수 있습니다. 1. `[OPTIONS]` 2. `[COMMAND]` 3. `[ARG...]` 직접 도커 컨테이너를 실행해 보겠습니다. ```bash ## Usage: docker run [OPTIONS] IMAGE [COMMAND] [ARG...] docker run -it --name demo1 ubuntu:18.04 /bin/bash ``` - `-it` : `-i` 옵션 + `-t` 옵션 - container 를 실행시킴과 동시에 interactive 한 terminal 로 접속시켜주는 옵션 - `--name` : name - 컨테이너 id 대신, 구분하기 쉽도록 지정해주는 이름 - `/bin/bash` - 컨테이너를 실행시킴과 동시에 실행할 커맨드로, `/bin/bash` 는 bash 쉘을 여는 것을 의미합니다. 실행 후 `exit` 명령어를 통해 컨테이너를 종료합니다. 이 제 앞서 배웠던 `docker ps` 명령어를 치면 다음과 같이 나옵니다. ```bash CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES ``` 실행되고 있는 컨테이너가 나온다고 했지만 어째서인지 방금 실행한 컨테이너가 보이지 않습니다. 그 이유는 `docker ps`는 기본값으로 현재 실행 중인 컨테이너를 보여주기 때문입니다. 만약 종료된 컨테이너들도 보고 싶다면 `-a` 옵션을 주어야 합니다. ```bash docker ps -a ``` 그러면 다음과 같이 종료된 컨테이너 목록도 나옵니다. ```bash CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES 4c1aa74a382a ubuntu:18.04 "/bin/bash" 2 minutes ago Exited (0) 2 minutes ago demo1 ``` ## 6. Docker exec Docker 컨테이너 내부에서 명령을 내리거나, 내부로 접속하는 커맨드입니다. ```bash docker exec --help ``` 예를 들어서 다음과 같은 명령어를 실행해 보겠습니다. ```bash docker run -d --name demo2 ubuntu:18.04 sleep 3600 ``` 여기서 `-d` 옵션은 도커 컨테이너를 백그라운드에서 실행시켜서, 컨테이너에서 접속 종료를 하더라도, 계속 실행 중이 되도록 하는 커맨드입니다. `docker ps`를 통해 현재 실행중인지 확인합니다. 다음과 같이 실행 중임을 확인할 수 있습니다. ```bash CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES fc88a83e90f0 ubuntu:18.04 "sleep 3600" 4 seconds ago Up 3 seconds demo2 ``` 이제 `docker exec` 명령어를 통해서 실행중인 도커 컨테이너에 접속해 보겠습니다. ```bash docker exec -it demo2 /bin/bash ``` 이 전의 `docker run`과 동일하게 container 내부에 접속할 수 있습니다. `exit`을 통해 종료합니다. ## 7. Docker logs 도커 컨테이너의 log를 확인하는 커맨드 입니다. ```bash docker logs --help ``` 다음과 같은 컨테이너를 실행시키도록 하겠습니다. 
```bash docker run --name demo3 -d busybox sh -c "while true; do $(echo date); sleep 1; done" ``` 위 명령어를 통해서 test 라는 이름의 busybox 컨테이너를 백그라운드에서 도커 컨테이너로 실행하여, 1초에 한 번씩 현재 시간을 출력하도록 했습니다. 이제 아래 명령어를 통해 log를 확인해 보겠습니다. ```bash docker logs demo3 ``` 정상적으로 수행되면 아래와 비슷하게 나옵니다. ```bash Sun Mar 6 11:06:49 UTC 2022 Sun Mar 6 11:06:50 UTC 2022 Sun Mar 6 11:06:51 UTC 2022 Sun Mar 6 11:06:52 UTC 2022 Sun Mar 6 11:06:53 UTC 2022 Sun Mar 6 11:06:54 UTC 2022 ``` 그런데 이렇게 사용할 경우 여태까지 찍힌 log 밖에 확인할 수 없습니다. 이 때 `-f` 옵션을 이용해 계속 watch 하며 출력할 수 있습니다. ```bash docker logs demo3 -f ``` ## 8. Docker stop 실행 중인 도커 컨테이너를 중단시키는 커맨드입니다. ```bash docker stop --help ``` `docker ps`를 통해 현재 실행 중인 컨테이너를 확인하면 다음과 같습니다. ```bash CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES 730391669c39 busybox "sh -c 'while true; …" About a minute ago Up About a minute demo3 fc88a83e90f0 ubuntu:18.04 "sleep 3600" 4 minutes ago Up 4 minutes demo2 ``` 이제 `docker stop` 을 통해 도커를 정지해 보겠습니다. ```bash docker stop demo2 ``` 실행 후 `docker ps`를 다시 입력합니다. ```bash CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES 730391669c39 busybox "sh -c 'while true; …" 2 minutes ago Up 2 minutes demo3 ``` 위의 결과와 비교했을 때 demo2 컨테이너가 현재 실행 중인 컨테이너 목록에서 사라진 것을 확인할 수 있습니다. 나머지 컨테이너도 정지합니다. ```bash docker stop demo3 ``` ## 9. Docker rm 도커 컨테이너를 삭제하는 커맨드입니다. ```bash docker rm --help ``` 도커 컨테이너는 기본적으로 종료가 된 상태로 있습니다. 그래서 `docker ps -a`를 통해서 종료된 컨테이너도 볼 수 있습니다. 그런데 종료된 컨테이너는 왜 지워야 할까요? 종료되어 있는 도커에는 이전에 사용한 데이터가 아직 컨테이너 내부에 남아있습니다. 그래서 restart 등을 통해서 컨테이너를 재시작할 수 있습니다. 그런데 이 과정에서 disk를 사용하게 됩니다. 그래서 완전히 사용하지 않는 컨테이너를 지우기 위해서는 `docker rm` 명령어를 사용해야 합니다. 우선 현재 컨테이너들을 확인합니다. ```bash docker ps -a ``` 다음과 같이 3개의 컨테이너가 있습니다. 
```bash CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES 730391669c39 busybox "sh -c 'while true; …" 4 minutes ago Exited (137) About a minute ago demo3 fc88a83e90f0 ubuntu:18.04 "sleep 3600" 7 minutes ago Exited (137) 2 minutes ago demo2 4c1aa74a382a ubuntu:18.04 "/bin/bash" 10 minutes ago Exited (0) 10 minutes ago demo1 ``` 아래 명령어를 통해 `demo3` 컨테이너를 삭제해 보겠습니다. ```bash docker rm demo3 ``` `docker ps -a` 명령어를 치면 다음과 같이 2개로 줄었습니다. ```bash CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES fc88a83e90f0 ubuntu:18.04 "sleep 3600" 13 minutes ago Exited (137) 8 minutes ago demo2 4c1aa74a382a ubuntu:18.04 "/bin/bash" 16 minutes ago Exited (0) 16 minutes ago demo1 ``` 나머지 컨테이너들도 삭제합니다. ```bash docker rm demo2 docker rm demo1 ``` ## 10. Docker rmi 도커 이미지를 삭제하는 커맨드입니다. ```bash docker rmi --help ``` 아래 명령어를 통해 현재 어떤 이미지들이 로컬에 있는지 확인합니다. ```bash docker images ``` 다음과 같이 출력됩니다. ```bash REPOSITORY TAG IMAGE ID CREATED SIZE busybox latest a8440bba1bc0 32 hours ago 1.41MB ubuntu 18.04 29e70752d7b2 2 days ago 56.7MB ``` `busybox` 이미지를 삭제해 보겠습니다. ```bash docker rmi busybox ``` 다시 `docker images`를 칠 경우 다음과 같이 나옵니다. ```bash REPOSITORY TAG IMAGE ID CREATED SIZE ubuntu 18.04 29e70752d7b2 2 days ago 56.7MB ``` ## References - [https://docs.github.com/en/packages/working-with-a-github-packages-registry/working-with-the-container-registry](https://docs.github.com/en/packages/working-with-a-github-packages-registry/working-with-the-container-registry) ================================================ FILE: docs/prerequisites/docker/docker.md ================================================ --- title : "What is Docker?" description: "Introduction to Docker." sidebar_position: 3 contributors: ["Jongseob Jeon", "Jaeyeon Kim"] --- ## 컨테이너 - 컨테이너 가상화 - 어플리케이션을 어디에서나 동일하게 실행하는 기술 - 컨테이너 이미지 - 어플리케이션을 실행시키기 위해 필요한 모든 파일들의 집합 - → 붕어빵 틀 - 컨테이너란? - 컨테이너 이미지를 기반으로 실행된 한 개의 프로세스 - → 붕어빵 틀로 찍어낸 붕어빵 ## 도커 도커는 **컨테이너를 관리**하고 사용할 수 있게 해주는 플랫폼입니다. 
이러한 도커의 슬로건은 바로 **Build Once, Run Anywhere** 로 어디에서나 동일한 실행 결과를 보장합니다. 도커 내부에서 동작하는 과정을 보자면 실제로 container 를 위한 리소스를 분리하고, lifecycle 을 제어하는 기능은 linux kernel 의 cgroup 등이 수행합니다. 하지만 이러한 인터페이스를 바로 사용하는 것은 **너무 어렵기 때문에** 다음과 같은 추상화 layer를 만들게 됩니다. ![docker-layer.png](./img/docker-layer.png) 이를 통해 사용자는 사용자 친화적인 API 인 **Docker CLI** 만으로 쉽게 컨테이너를 제어할 수 있습니다. ## Layer 해석 위에서 나온 layer들의 역할은 다음과 같습니다. 1. runC: linux kernel 의 기능을 직접 사용해서, container 라는 하나의 프로세스가 사용할 네임스페이스와 cpu, memory, filesystem 등을 격리시켜주는 기능을 수행합니다. 2. containerd: runC(OCI layer) 에게 명령을 내리기 위한 추상화 단계이며, 표준화된 인터페이스(OCI)를 사용합니다. 3. dockerd: containerd 에게 명령을 내리는 역할만 합니다. 4. docker cli: 사용자는 docker cli 로 dockerd (Docker daemon)에게 명령을 내리기만 하면 됩니다. - 이 통신 과정에서 unix socket 을 사용하기 때문에 가끔 도커 관련 에러가 나면 `/var/run/docker.sock` 가 사용 중이다, 권한이 없다 등등의 에러 메시지가 나오는 것입니다. 이처럼 도커는 많은 단계를 감싸고 있지만, 흔히 도커라는 용어를 사용할 때는 Docker CLI 를 말할 때도 있고, Dockerd 를 말할 때도 있고 Docker Container 하나를 말할 때도 있어서 혼란이 생길 수 있습니다. 앞으로 나오는 글에서도 도커가 여러가지 의미로 쓰일 수 있습니다. ## For ML Engineer 머신러닝 엔지니어가 도커를 사용하는 이유는 다음과 같습니다. 1. 나의 ML 학습/추론 코드를 OS, python version, python 환경, 특정 python package 버전에 independent 하도록 해야 한다. 2. 그래서 코드 뿐만이 아닌 **해당 코드가 실행되기 위해 필요한 모든 종속적인 패키지, 환경 변수, 폴더명 등등을 하나의 패키지로** 묶을 수 있는 기술이 컨테이너화 기술이다. 3. 이 기술을 쉽게 사용하고 관리할 수 있는 소프트웨어 중 하나가 도커이며, 패키지를 도커 이미지라고 부른다. ================================================ FILE: docs/prerequisites/docker/images.md ================================================ --- title : "[Practice] Docker images" description: "Practice to use docker image." sidebar_position: 5 contributors: ["Jongseob Jeon", "Jaeyeon Kim"] --- ## 1. Dockerfile 만들기 도커 이미지를 만드는 가장 쉬운 방법은 도커에서 제공하는 템플릿인 Dockerfile을 사용하는 것입니다. 이외에는 running container 를 docker image 로 만드는 `docker commit` 등을 활용하는 방법이 있습니다. - `Dockerfile` - 사용자가 도커 이미지를 쉽게 만들 수 있도록, 제공하는 템플릿 - 파일명은 꼭 `Dockerfile` 이 아니어도 상관없지만, `docker build` 수행 시, default 로 사용하는 파일명이 `Dockerfile` 입니다. - 도커 이미지를 만드는 `docker build` 를 수행할 때, `-f` 옵션을 주면 다른 파일명으로도 사용 가능합니다. 
- ex) `docker build -f dockerfile-asdf .` 도 가능 1. 실습을 위해서 편한 디렉토리로 이동합니다. ```bash cd ``` 2. docker-practice 라는 이름의 폴더를 생성합니다. ```bash mkdir docker-practice ``` 3. docker-practice 폴더로 이동합니다. ```bash cd docker-practice ``` 4. Dockerfile 이라는 빈 파일을 생성합니다. ```bash touch Dockerfile ``` 5. 정상적으로 생성되었는지 확인합니다. ```bash ls ``` ## 2. Dockerfile 내장 명령어 Dockerfile 에서 사용할 수 있는 기본적인 명령어에 대해서 하나씩 알아보겠습니다. ### FROM Dockerfile 이 base image 로 어떠한 이미지를 사용할 것인지를 명시하는 명령어입니다. 도커 이미지를 만들 때, 아무것도 없는 빈 환경에서부터 하나하나씩 제가 의도한 환경을 만들어가는게 아니라, python 3.9 버전이 설치된 환경을 베이스로해두고, 저는 pytorch 를 설치하고, 제 소스코드만 넣어두는 형태로 활용할 수가 있습니다. 이러한 경우에는 `python:3.9`, `python-3.9-alpine`, ... 등의 잘 만들어진 이미지를 베이스로 활용합니다. ```docker FROM [:] [AS ] # 예시 FROM ubuntu FROM ubuntu:18.04 FROM nginx:latest AS ngx ``` ### COPY **host(로컬)에서의 ``** 경로의 파일 혹은 디렉토리를 **container 내부에서의 ``** 경로에 복사하는 명령어입니다. ```docker COPY ... # 예시 COPY a.txt /some-directory/b.txt COPY my-directory /some-directory-2 ``` `ADD` 는 `COPY` 와 비슷하지만 추가적인 기능을 품고 있습니다. ```docker # 1 - 호스트에 압축되어있는 파일을 풀면서 컨테이너 내부로 copy 할 수 있음 ADD scripts.tar.gz /tmp # 2 - Remote URLs 에 있는 파일을 소스 경로로 지정할 수 있음 ADD http://www.example.com/script.sh /tmp # 위 두 가지 기능을 사용하고 싶을 경우에만 COPY 대신 ADD 를 사용하는 것을 권장 ``` ### RUN 명시한 커맨드를 도커 컨테이너 내부에서 실행하는 명령어입니다. 도커 이미지는 해당 커맨드들이 실행된 상태를 유지합니다. ```docker RUN RUN ["executable-command", "parameter1", "parameter2"] # 예시 RUN pip install torch RUN pip install -r requirements.txt ``` ### CMD 명시한 커맨드를 도커 컨테이너가 **시작될 때**, 실행하는 것을 명시하는 명령어입니다. 비슷한 역할을 하는 명령어로 **ENTRYPOINT** 가 있습니다. 이 둘의 차이에 대해서는 **뒤에서** 다룹니다. 하나의 도커 이미지에서는 하나의 **CMD** 만 실행할 수 있다는 점에서 **RUN** 명령어와 다릅니다. ```docker CMD CMD ["executable-command", "parameter1", "parameter2"] CMD ["parameter1", "parameter2"] # ENTRYPOINT 와 함께 사용될 때 # 예시 CMD python main.py ``` ### WORKDIR 이후 추가될 명령어를 컨테이너 내의 어떤 디렉토리에서 수행할 것인지를 명시하는 명령어입니다. 만약, 해당 디렉토리가 없다면 생성합니다. 
```docker WORKDIR /path/to/workdir # 예시 WORKDIR /home/demo RUN pwd # /home/demo 가 출력됨 ``` ### ENV 컨테이너 내부에서 지속적으로 사용될 environment variable 의 값을 설정하는 명령어입니다. ```docker ENV ENV = # 예시 # default 언어 설정 RUN locale-gen ko_KR.UTF-8 ENV LANG ko_KR.UTF-8 ENV LANGUAGE ko_KR.UTF-8 ENV LC_ALL ko_KR.UTF-8 ``` ### EXPOSE 컨테이너에서 뚫어줄 포트/프로토콜을 지정할 수 있습니다. `` 을 지정하지 않으면 TCP 가 디폴트로 설정됩니다. ```docker EXPOSE EXPOSE / # 예시 EXPOSE 8080 ``` ## 3. 간단한 Dockerfile 작성해보기 `vim Dockerfile` 혹은 vscode 등 본인이 사용하는 편집기로 `Dockerfile` 을 열어 다음과 같이 작성해줍니다. ```docker # base image 를 ubuntu 18.04 로 설정합니다. FROM ubuntu:18.04 # apt-get update 명령을 실행합니다. RUN apt-get update # TEST env var의 값을 hello 로 지정합니다. ENV TEST hello # DOCKER CONTAINER 가 시작될 때, 환경변수 TEST 의 값을 출력합니다. CMD echo $TEST ``` ## 4. Docker build from Dockerfile `docker build` 명령어로 Dockerfile 로부터 Docker Image 를 만들어봅니다. ```bash docker build --help ``` Dockerfile 이 있는 경로에서 다음 명령을 실행합니다. ```bash docker build -t my-image:v1.0.0 . ``` 위 커맨드를 설명하면 다음과 같습니다. - `.` : **현재 경로**에 있는 Dockerfile 로부터 - `-t` : my-image 라는 **이름**과 v1.0.0 이라는 **태그**로 **이미지**를 - 빌드하겠다라는 명령어 정상적으로 이미지 빌드되었는지 확인해 보겠습니다. ```bash # grep : my-image 가 있는지를 잡아내는 (grep) 하는 명령어 docker images | grep my-image ``` 정상적으로 수행된다면 다음과 같이 출력됩니다. ```bash my-image v1.0.0 143114710b2d 3 seconds ago 87.9MB ``` ## 5. Docker run from Dockerfile 그럼 이제 방금 빌드한 `my-image:v1.0.0` 이미지로 docker 컨테이너를 **run** 해보겠습니다. ```bash docker run my-image:v1.0.0 ``` 정상적으로 수행된다면 다음과 같이 나옵니다. ```bash hello ``` ## 6. Docker run with env 이번에는 방금 빌드한 `my-image:v1.0.0` 이미지를 실행하는 시점에, `TEST` env var 의 값을 변경하여 docker 컨테이너를 run 해보겠습니다. ```bash docker run -e TEST=bye my-image:v1.0.0 ``` 정상적으로 수행된다면 다음과 같이 나옵니다. ```bash bye ``` ================================================ FILE: docs/prerequisites/docker/install.md ================================================ --- title : "Install Docker" description: "Install docker to start." sidebar_position: 1 contributors: ["Jongseob Jeon", "Jaeyeon Kim"] --- ## Docker 도커 실습을 위해 도커를 설치해야 합니다. 
도커 설치는 어떤 OS를 사용하는지에 따라 달라집니다. 각 환경에 맞는 도커 설치는 공식 홈페이지를 참고해주세요. - [ubuntu](https://docs.docker.com/engine/install/ubuntu/) - [mac](https://docs.docker.com/desktop/mac/install/) - [windows](https://docs.docker.com/desktop/windows/install/) ## 설치 확인 `docker run hello-world` 가 정상적으로 수행되는 OS, 터미널 환경이 필요합니다. | OS | Docker Engine | Terminal | | ------- | -------------- | ------------------ | | MacOS | Docker Desktop | zsh | | Windows | Docker Desktop | Powershell | | Windows | Docker Desktop | WSL2 | | Ubuntu | Docker Engine | bash | ## 들어가기 앞서서.. MLOps를 사용하기 위해 필요한 도커 사용법을 설명하니 많은 비유와 예시가 MLOps 쪽으로 치중되어 있을 수 있습니다. ================================================ FILE: docs/prerequisites/docker/introduction.md ================================================ --- title : "Why Docker & Kubernetes ?" description: "Introduction to Docker." sidebar_position: 2 contributors: ["Jongseob Jeon", "Jaeyeon Kim"] --- ## Why Kubernetes ? 머신러닝 모델을 서비스화하기 위해서는 모델 개발 외에도 많은 **부가적인** 기능들이 필요합니다. 1. 학습 단계 - 모델 학습 명령의 스케줄 관리 - 학습된 모델의 Reproducibility 보장 2. 배포 단계 - 트래픽 분산 - 서비스 장애 모니터링 - 장애 시 트러블슈팅 다행히도 이런 기능들에 대한 needs는 소프트웨어 개발 쪽에서 이미 많은 고민을 거쳐 발전되어 왔습니다. 따라서 머신러닝 모델을 배포할 때도 이런 고민의 결과물들을 활용하면 큰 도움을 받을 수 있습니다. MLOps에서 대표적으로 활용하는 소프트웨어 제품이 바로 도커와 쿠버네티스입니다. ## 도커와 쿠버네티스 ### 기술 이름이 아니라 제품 이름 도커와 쿠버네티스는 각각 컨테이너라이제이션(Containerization) 기능과 컨테이너 오케스트레이션(Container Orchestration) 기능을 제공하는 대표 소프트웨어(제품)입니다. #### 도커 도커는 과거에 대세였지만 유료화 관련 정책들을 하나씩 추가하면서 점점 사용 빈도가 하락세입니다. 하지만 2022년 3월 기준으로 아직까지도 가장 일반적으로 사용되는 컨테이너 가상화 소프트웨어입니다. ![sysdig-2019.png](./img/sysdig-2019.png)
[from sysdig 2019]
![sysdig-2021.png](./img/sysdig-2021.png)
[from sysdig 2021]
#### 쿠버네티스 쿠버네티스는 지금까지는 비교 대상조차 거의 없는 제품입니다. ![cncf-survey.png](./img/cncf-survey.png)
[from cncf survey]
![t4-ai.png](./img/t4-ai.png)
[from t4.ai]
### **재미있는 오픈소스 역사 이야기** #### 초기 도커 & 쿠버네티스 초기 도커 개발시에는 Docker Engine이라는 **하나의 패키지**에 API, CLI, 네트워크, 스토리지 등 여러 기능들을 모두 포함했으나, **MSA** 의 철학을 담아 **하나씩 분리**하기 시작했습니다. 하지만 초기의 쿠버네티스는 컨테이너 가상화를 위해 Docker Engine을 내장하고 있었습니다. 따라서 도커 버전이 업데이트될 때마다 Docker Engine 의 인터페이스가 변경되어 쿠버네티스에서 크게 영향을 받는 일이 계속해서 발생하였습니다. #### Open Container Initiative 그래서 **이런 불편함을 해소**하고자, 도커를 중심으로 구글 등 컨테이너 기술에 관심있는 **여러 집단**들이 한데 모여 **Open Container Initiative,** 이하 **OCI**라는 프로젝트를 시작하여 컨테이너에 관한 **표준**을 정하는 일들을 시작하였습니다. 도커에서도 인터페이스를 **한 번 더 분리**해서, OCI 표준을 준수하는 **containerd**라는 Container Runtime 를 개발하고, **dockerd** 가 containerd 의 API 를 호출하도록 추상화 레이어를 추가하였습니다. 이러한 흐름에 맞추어서 쿠버네티스에서도 이제부터는 도커만을 지원하지 않고, **OCI 표준을** 준수하고, 정해진 스펙을 지키는 컨테이너 런타임은 무엇이든 쿠버네티스에서 사용할 수 있도록, Container Runtime Interface, 이하 **CRI 스펙**을 버전 1.5부터 제공하기 시작했습니다. #### CRI-O Red Hat, Intel, SUSE, IBM에서 **OCI 표준+CRI 스펙을** 따라 Kubernetes 전용 Container Runtime 을 목적으로 개발한 컨테이너 런타임입니다. #### 지금의 도커 & 쿠버네티스 쿠버네티스는 Docker Engine 을 디폴트 컨테이너 런타임으로 사용해왔지만, 도커의 API 가 **CRI** 스펙에 맞지 않아(*OCI 는 따름*) 도커의 API를 **CRI**와 호환되게 바꿔주는 **dockershim**을 쿠버네티스 자체적으로 개발 및 지원해왔었는데,(*도커 측이 아니라 쿠버네티스 측에서 지원했다는 점이 굉장히 큰 짐이었습니다.*) 이걸 쿠버네티스 **v1.20 부터는 Deprecated하고,** **v1.23 부터는 지원을 포기**하기로 결정하였습니다. - v1.23 은 2021 년 12월 릴리즈 그래서 쿠버네티스 v1.23 부터는 도커를 native 하게 쓸 수 없습니다다. 그렇지만 **사용자들은 이런 변화에 크게 관련이 있진 않습니다.** 왜냐하면 Docker Engine을 통해 만들어진 도커 이미지는 OCI 표준을 준수하기 때문에, 쿠버네티스가 어떤 컨테이너 런타임으로 이루어져있든 사용 가능하기 때문입니다. 
### References - [*https://www.linkedin.com/pulse/containerd는-무엇이고-왜-중요할까-sean-lee/?originalSubdomain=kr*](https://www.linkedin.com/pulse/containerd%EB%8A%94-%EB%AC%B4%EC%97%87%EC%9D%B4%EA%B3%A0-%EC%99%9C-%EC%A4%91%EC%9A%94%ED%95%A0%EA%B9%8C-sean-lee/?originalSubdomain=kr) - [https://kubernetes.io/blog/2021/12/07/kubernetes-1-23-release-announcement/](https://kubernetes.io/blog/2021/12/07/kubernetes-1-23-release-announcement/) - [https://kubernetes.io/blog/2020/12/02/dockershim-faq/](https://kubernetes.io/blog/2020/12/02/dockershim-faq/) - [https://kubernetes.io/blog/2020/12/02/dont-panic-kubernetes-and-docker/](https://kubernetes.io/blog/2020/12/02/dont-panic-kubernetes-and-docker/) - [https://kubernetes.io/ko/blog/2020/12/02/dont-panic-kubernetes-and-docker/](https://kubernetes.io/ko/blog/2020/12/02/dont-panic-kubernetes-and-docker/) ================================================ FILE: docs/setup-components/_category_.json ================================================ { "label": "Setup Components", "position": 3, "link": { "type": "generated-index" } } ================================================ FILE: docs/setup-components/install-components-kf.md ================================================ --- title : "1. Kubeflow" description: "구성요소 설치 - Kubeflow" sidebar_position: 1 date: 2021-12-13 lastmod: 2021-12-20 contributors: ["Jaeyeon Kim", "SeungTae Kim"] --- ## 설치 파일 준비 Kubeflow **v1.4.0** 버전을 설치하기 위해서, 설치에 필요한 manifests 파일들을 준비합니다. [kubeflow/manifests Repository](https://github.com/kubeflow/manifests) 를 **v1.4.0** 태그로 깃 클론한 뒤, 해당 폴더로 이동합니다. ```bash git clone -b v1.4.0 https://github.com/kubeflow/manifests.git cd manifests ``` ## 각 구성 요소별 설치 kubeflow/manifests Repository 에 각 구성 요소별 설치 커맨드가 적혀져 있지만, 설치하며 발생할 수 있는 이슈 혹은 정상적으로 설치되었는지 확인하는 방법이 적혀져 있지 않아 처음 설치하는 경우 어려움을 겪는 경우가 많습니다. 따라서, 각 구성 요소별로 정상적으로 설치되었는지 확인하는 방법을 함께 작성합니다. 또한, 본 문서에서는 **모두의 MLOps** 에서 다루지 않는 구성요소인 Knative, KFServing, MPI Operator 의 설치는 리소스의 효율적 사용을 위해 따로 설치하지 않습니다. ### Cert-manager 1. 
cert-manager 를 설치합니다. ```bash kustomize build common/cert-manager/cert-manager/base | kubectl apply -f - ``` 정상적으로 설치되면 다음과 같이 출력됩니다. ```bash namespace/cert-manager created customresourcedefinition.apiextensions.k8s.io/certificaterequests.cert-manager.io created customresourcedefinition.apiextensions.k8s.io/certificates.cert-manager.io created customresourcedefinition.apiextensions.k8s.io/challenges.acme.cert-manager.io created customresourcedefinition.apiextensions.k8s.io/clusterissuers.cert-manager.io created customresourcedefinition.apiextensions.k8s.io/issuers.cert-manager.io created customresourcedefinition.apiextensions.k8s.io/orders.acme.cert-manager.io created serviceaccount/cert-manager created serviceaccount/cert-manager-cainjector created serviceaccount/cert-manager-webhook created role.rbac.authorization.k8s.io/cert-manager-webhook:dynamic-serving created role.rbac.authorization.k8s.io/cert-manager-cainjector:leaderelection created role.rbac.authorization.k8s.io/cert-manager:leaderelection created clusterrole.rbac.authorization.k8s.io/cert-manager-cainjector created clusterrole.rbac.authorization.k8s.io/cert-manager-controller-approve:cert-manager-io created clusterrole.rbac.authorization.k8s.io/cert-manager-controller-certificates created clusterrole.rbac.authorization.k8s.io/cert-manager-controller-challenges created clusterrole.rbac.authorization.k8s.io/cert-manager-controller-clusterissuers created clusterrole.rbac.authorization.k8s.io/cert-manager-controller-ingress-shim created clusterrole.rbac.authorization.k8s.io/cert-manager-controller-issuers created clusterrole.rbac.authorization.k8s.io/cert-manager-controller-orders created clusterrole.rbac.authorization.k8s.io/cert-manager-edit created clusterrole.rbac.authorization.k8s.io/cert-manager-view created clusterrole.rbac.authorization.k8s.io/cert-manager-webhook:subjectaccessreviews created rolebinding.rbac.authorization.k8s.io/cert-manager-webhook:dynamic-serving created 
rolebinding.rbac.authorization.k8s.io/cert-manager-cainjector:leaderelection created rolebinding.rbac.authorization.k8s.io/cert-manager:leaderelection created clusterrolebinding.rbac.authorization.k8s.io/cert-manager-cainjector created clusterrolebinding.rbac.authorization.k8s.io/cert-manager-controller-approve:cert-manager-io created clusterrolebinding.rbac.authorization.k8s.io/cert-manager-controller-certificates created clusterrolebinding.rbac.authorization.k8s.io/cert-manager-controller-challenges created clusterrolebinding.rbac.authorization.k8s.io/cert-manager-controller-clusterissuers created clusterrolebinding.rbac.authorization.k8s.io/cert-manager-controller-ingress-shim created clusterrolebinding.rbac.authorization.k8s.io/cert-manager-controller-issuers created clusterrolebinding.rbac.authorization.k8s.io/cert-manager-controller-orders created clusterrolebinding.rbac.authorization.k8s.io/cert-manager-webhook:subjectaccessreviews created service/cert-manager created service/cert-manager-webhook created deployment.apps/cert-manager created deployment.apps/cert-manager-cainjector created deployment.apps/cert-manager-webhook created mutatingwebhookconfiguration.admissionregistration.k8s.io/cert-manager-webhook created validatingwebhookconfiguration.admissionregistration.k8s.io/cert-manager-webhook created ``` cert-manager namespace 의 3 개의 pod 가 모두 Running 이 될 때까지 기다립니다. ```bash kubectl get pod -n cert-manager ``` 모두 Running 이 되면 다음과 비슷한 결과가 출력됩니다. ```bash NAME READY STATUS RESTARTS AGE cert-manager-7dd5854bb4-7nmpd 1/1 Running 0 2m10s cert-manager-cainjector-64c949654c-2scxr 1/1 Running 0 2m10s cert-manager-webhook-6b57b9b886-7q6g2 1/1 Running 0 2m10s ``` 2. kubeflow-issuer 를 설치합니다. ```bash kustomize build common/cert-manager/kubeflow-issuer/base | kubectl apply -f - ``` 정상적으로 설치되면 다음과 같이 출력됩니다. 
```bash clusterissuer.cert-manager.io/kubeflow-self-signing-issuer created ``` - cert-manager-webhook 이슈 cert-manager-webhook deployment 가 Running 이 아닌 경우, 다음과 비슷한 에러가 발생하며 kubeflow-issuer가 설치되지 않을 수 있음에 주의하시기 바랍니다. 해당 에러가 발생한 경우, cert-manager 의 3개의 pod 가 모두 Running 이 되는 것을 확인한 이후 다시 명령어를 수행하시기 바랍니다. ```bash Error from server: error when retrieving current configuration of: Resource: "cert-manager.io/v1alpha2, Resource=clusterissuers", GroupVersionKind: "cert-manager.io/v1alpha2, Kind=ClusterIssuer" Name: "kubeflow-self-signing-issuer", Namespace: "" from server for: "STDIN": conversion webhook for cert-manager.io/v1, Kind=ClusterIssuer failed: Post "https://cert-manager-webhook.cert-manager.svc:443/convert?timeout=30s": dial tcp 10.101.177.157:443: connect: connection refused ``` ### Istio 1. istio 관련 Custom Resource Definition(CRD) 를 설치합니다. ```bash kustomize build common/istio-1-9/istio-crds/base | kubectl apply -f - ``` 정상적으로 수행되면 다음과 같이 출력됩니다. ```bash customresourcedefinition.apiextensions.k8s.io/authorizationpolicies.security.istio.io created customresourcedefinition.apiextensions.k8s.io/destinationrules.networking.istio.io created customresourcedefinition.apiextensions.k8s.io/envoyfilters.networking.istio.io created customresourcedefinition.apiextensions.k8s.io/gateways.networking.istio.io created customresourcedefinition.apiextensions.k8s.io/istiooperators.install.istio.io created customresourcedefinition.apiextensions.k8s.io/peerauthentications.security.istio.io created customresourcedefinition.apiextensions.k8s.io/requestauthentications.security.istio.io created customresourcedefinition.apiextensions.k8s.io/serviceentries.networking.istio.io created customresourcedefinition.apiextensions.k8s.io/sidecars.networking.istio.io created customresourcedefinition.apiextensions.k8s.io/virtualservices.networking.istio.io created customresourcedefinition.apiextensions.k8s.io/workloadentries.networking.istio.io created 
customresourcedefinition.apiextensions.k8s.io/workloadgroups.networking.istio.io created ``` 2. istio namespace 를 설치합니다. ```bash kustomize build common/istio-1-9/istio-namespace/base | kubectl apply -f - ``` 정상적으로 수행되면 다음과 같이 출력됩니다. ```bash namespace/istio-system created ``` 3. istio 를 설치합니다. ```bash kustomize build common/istio-1-9/istio-install/base | kubectl apply -f - ``` 정상적으로 수행되면 다음과 같이 출력됩니다. ```bash serviceaccount/istio-ingressgateway-service-account created serviceaccount/istio-reader-service-account created serviceaccount/istiod-service-account created role.rbac.authorization.k8s.io/istio-ingressgateway-sds created role.rbac.authorization.k8s.io/istiod-istio-system created clusterrole.rbac.authorization.k8s.io/istio-reader-istio-system created clusterrole.rbac.authorization.k8s.io/istiod-istio-system created rolebinding.rbac.authorization.k8s.io/istio-ingressgateway-sds created rolebinding.rbac.authorization.k8s.io/istiod-istio-system created clusterrolebinding.rbac.authorization.k8s.io/istio-reader-istio-system created clusterrolebinding.rbac.authorization.k8s.io/istiod-istio-system created configmap/istio created configmap/istio-sidecar-injector created service/istio-ingressgateway created service/istiod created deployment.apps/istio-ingressgateway created deployment.apps/istiod created envoyfilter.networking.istio.io/metadata-exchange-1.8 created envoyfilter.networking.istio.io/metadata-exchange-1.9 created envoyfilter.networking.istio.io/stats-filter-1.8 created envoyfilter.networking.istio.io/stats-filter-1.9 created envoyfilter.networking.istio.io/tcp-metadata-exchange-1.8 created envoyfilter.networking.istio.io/tcp-metadata-exchange-1.9 created envoyfilter.networking.istio.io/tcp-stats-filter-1.8 created envoyfilter.networking.istio.io/tcp-stats-filter-1.9 created envoyfilter.networking.istio.io/x-forwarded-host created gateway.networking.istio.io/istio-ingressgateway created authorizationpolicy.security.istio.io/global-deny-all created 
authorizationpolicy.security.istio.io/istio-ingressgateway created mutatingwebhookconfiguration.admissionregistration.k8s.io/istio-sidecar-injector created validatingwebhookconfiguration.admissionregistration.k8s.io/istiod-istio-system created ``` istio-system namespace 의 2 개의 pod 가 모두 Running 이 될 때까지 기다립니다. ```bash kubectl get po -n istio-system ``` 모두 Running 이 되면 다음과 비슷한 결과가 출력됩니다. ```bash NAME READY STATUS RESTARTS AGE istio-ingressgateway-79b665c95-xm22l 1/1 Running 0 16s istiod-86457659bb-5h58w 1/1 Running 0 16s ``` ### Dex dex 를 설치합니다. ```bash kustomize build common/dex/overlays/istio | kubectl apply -f - ``` 정상적으로 수행되면 다음과 같이 출력됩니다. ```bash namespace/auth created customresourcedefinition.apiextensions.k8s.io/authcodes.dex.coreos.com created serviceaccount/dex created clusterrole.rbac.authorization.k8s.io/dex created clusterrolebinding.rbac.authorization.k8s.io/dex created configmap/dex created secret/dex-oidc-client created service/dex created deployment.apps/dex created virtualservice.networking.istio.io/dex created ``` auth namespace 의 1 개의 pod 가 모두 Running 이 될 때까지 기다립니다. ```bash kubectl get po -n auth ``` 모두 Running 이 되면 다음과 비슷한 결과가 출력됩니다. ```bash NAME READY STATUS RESTARTS AGE dex-5ddf47d88d-458cs 1/1 Running 1 12s ``` ### OIDC AuthService OIDC AuthService 를 설치합니다. ```bash kustomize build common/oidc-authservice/base | kubectl apply -f - ``` 정상적으로 수행되면 다음과 같이 출력됩니다. ```bash configmap/oidc-authservice-parameters created secret/oidc-authservice-client created service/authservice created persistentvolumeclaim/authservice-pvc created statefulset.apps/authservice created envoyfilter.networking.istio.io/authn-filter created ``` istio-system namespace 에 authservice-0 pod 가 Running 이 될 때까지 기다립니다. ```bash kubectl get po -n istio-system -w ``` 모두 Running 이 되면 다음과 비슷한 결과가 출력됩니다. 
```bash NAME READY STATUS RESTARTS AGE authservice-0 1/1 Running 0 14s istio-ingressgateway-79b665c95-xm22l 1/1 Running 0 2m37s istiod-86457659bb-5h58w 1/1 Running 0 2m37s ``` ### Kubeflow Namespace kubeflow namespace 를 생성합니다. ```bash kustomize build common/kubeflow-namespace/base | kubectl apply -f - ``` 정상적으로 수행되면 다음과 같이 출력됩니다. ```bash namespace/kubeflow created ``` kubeflow namespace 를 조회합니다. ```bash kubectl get ns kubeflow ``` 정상적으로 생성되면 다음과 비슷한 결과가 출력됩니다. ```bash NAME STATUS AGE kubeflow Active 8s ``` ### Kubeflow Roles kubeflow-roles 를 설치합니다. ```bash kustomize build common/kubeflow-roles/base | kubectl apply -f - ``` 정상적으로 수행되면 다음과 같이 출력됩니다. ```bash clusterrole.rbac.authorization.k8s.io/kubeflow-admin created clusterrole.rbac.authorization.k8s.io/kubeflow-edit created clusterrole.rbac.authorization.k8s.io/kubeflow-kubernetes-admin created clusterrole.rbac.authorization.k8s.io/kubeflow-kubernetes-edit created clusterrole.rbac.authorization.k8s.io/kubeflow-kubernetes-view created clusterrole.rbac.authorization.k8s.io/kubeflow-view created ``` 방금 생성한 kubeflow roles 를 조회합니다. ```bash kubectl get clusterrole | grep kubeflow ``` 다음과 같이 총 6개의 clusterrole 이 출력됩니다. ```bash kubeflow-admin 2021-12-03T08:51:36Z kubeflow-edit 2021-12-03T08:51:36Z kubeflow-kubernetes-admin 2021-12-03T08:51:36Z kubeflow-kubernetes-edit 2021-12-03T08:51:36Z kubeflow-kubernetes-view 2021-12-03T08:51:36Z kubeflow-view 2021-12-03T08:51:36Z ``` ### Kubeflow Istio Resources kubeflow-istio-resources 를 설치합니다. ```bash kustomize build common/istio-1-9/kubeflow-istio-resources/base | kubectl apply -f - ``` 정상적으로 수행되면 다음과 같이 출력됩니다. ```bash clusterrole.rbac.authorization.k8s.io/kubeflow-istio-admin created clusterrole.rbac.authorization.k8s.io/kubeflow-istio-edit created clusterrole.rbac.authorization.k8s.io/kubeflow-istio-view created gateway.networking.istio.io/kubeflow-gateway created ``` 방금 생성한 kubeflow roles 를 조회합니다. 
```bash kubectl get clusterrole | grep kubeflow-istio ``` 다음과 같이 총 3개의 clusterrole 이 출력됩니다. ```bash kubeflow-istio-admin 2021-12-03T08:53:17Z kubeflow-istio-edit 2021-12-03T08:53:17Z kubeflow-istio-view 2021-12-03T08:53:17Z ``` Kubeflow namespace 에 gateway 가 정상적으로 설치되었는지 확인합니다. ```bash kubectl get gateway -n kubeflow ``` 정상적으로 생성되면 다음과 비슷한 결과가 출력됩니다. ```bash NAME AGE kubeflow-gateway 31s ``` ### Kubeflow Pipelines kubeflow pipelines 를 설치합니다. ```bash kustomize build apps/pipeline/upstream/env/platform-agnostic-multi-user | kubectl apply -f - ``` 정상적으로 수행되면 다음과 같이 출력됩니다. ```bash customresourcedefinition.apiextensions.k8s.io/clusterworkflowtemplates.argoproj.io created customresourcedefinition.apiextensions.k8s.io/cronworkflows.argoproj.io created customresourcedefinition.apiextensions.k8s.io/workfloweventbindings.argoproj.io created ...(생략) authorizationpolicy.security.istio.io/ml-pipeline-visualizationserver created authorizationpolicy.security.istio.io/mysql created authorizationpolicy.security.istio.io/service-cache-server created ``` 위 명령어는 여러 resources 를 한 번에 설치하고 있지만, 설치 순서의 의존성이 있는 리소스가 존재합니다. 따라서 때에 따라 다음과 비슷한 에러가 발생할 수 있습니다. ```bash "error: unable to recognize "STDIN": no matches for kind "CompositeController" in version "metacontroller.k8s.io/v1alpha1"" ``` 위와 비슷한 에러가 발생한다면, 10 초 정도 기다린 뒤 다시 위의 명령을 수행합니다. ```bash kustomize build apps/pipeline/upstream/env/platform-agnostic-multi-user | kubectl apply -f - ``` 정상적으로 설치되었는지 확인합니다. ```bash kubectl get po -n kubeflow ``` 다음과 같이 총 16개의 pod 가 모두 Running 이 될 때까지 기다립니다. 
```bash NAME READY STATUS RESTARTS AGE cache-deployer-deployment-79fdf9c5c9-bjnbg 2/2 Running 1 5m3s cache-server-5bdf4f4457-48gbp 2/2 Running 0 5m3s kubeflow-pipelines-profile-controller-7b947f4748-8d26b 1/1 Running 0 5m3s metacontroller-0 1/1 Running 0 5m3s metadata-envoy-deployment-5b4856dd5-xtlkd 1/1 Running 0 5m3s metadata-grpc-deployment-6b5685488-kwvv7 2/2 Running 3 5m3s metadata-writer-548bd879bb-zjkcn 2/2 Running 1 5m3s minio-5b65df66c9-k5gzg 2/2 Running 0 5m3s ml-pipeline-8c4b99589-85jw6 2/2 Running 1 5m3s ml-pipeline-persistenceagent-d6bdc77bd-ssxrv 2/2 Running 0 5m3s ml-pipeline-scheduledworkflow-5db54d75c5-zk2cw 2/2 Running 0 5m2s ml-pipeline-ui-5bd8d6dc84-j7wqr 2/2 Running 0 5m2s ml-pipeline-viewer-crd-68fb5f4d58-mbcbg 2/2 Running 1 5m2s ml-pipeline-visualizationserver-8476b5c645-wljfm 2/2 Running 0 5m2s mysql-f7b9b7dd4-xfnw4 2/2 Running 0 5m2s workflow-controller-5cbbb49bd8-5zrwx 2/2 Running 1 5m2s ``` 추가로 ml-pipeline UI가 정상적으로 접속되는지 확인합니다. ```bash kubectl port-forward svc/ml-pipeline-ui -n kubeflow 8888:80 ``` 웹 브라우저를 열어 [http://localhost:8888/#/pipelines/](http://localhost:8888/#/pipelines/) 경로에 접속합니다. 다음과 같은 화면이 출력되는 것을 확인합니다. ![pipeline-ui](./img/pipeline-ui.png) - localhost 연결 거부 이슈 ![localhost-reject](./img/localhost-reject.png) 만약 다음과 같이 `localhost에서 연결을 거부했습니다` 라는 에러가 출력될 경우, 커맨드로 address 설정을 통해 접근하는 것이 가능합니다. **보안상의 문제가 되지 않는다면,** 아래와 같이 `0.0.0.0` 로 모든 주소의 bind를 열어주는 방향으로 ml-pipeline UI가 정상적으로 접속되는지 확인합니다. ```bash kubectl port-forward --address 0.0.0.0 svc/ml-pipeline-ui -n kubeflow 8888:80 ``` - 위의 옵션으로 실행했음에도 여전히 연결 거부 이슈가 발생할 경우 방화벽 설정으로 접속해 모든 tcp 프로토콜의 포트에 대한 접속을 허가 또는 8888번 포트의 접속 허가를 추가해 접근 권한을 허가해줍니다. 웹 브라우저를 열어 `http://<당신의 가상 인스턴스 공인 ip 주소>:8888/#/pipelines/` 경로에 접속하면, ml-pipeline UI 화면이 출력되는 것을 확인할 수 있습니다. 하단에서 진행되는 다른 포트의 경로에 접속할 때도 위의 절차와 동일하게 커맨드를 실행하고, 방화벽에 포트 번호를 추가해주면 실행하는 것이 가능합니다. ### Katib Katib 를 설치합니다. 
```bash kustomize build apps/katib/upstream/installs/katib-with-kubeflow | kubectl apply -f - ``` 정상적으로 수행되면 다음과 같이 출력됩니다. ```bash customresourcedefinition.apiextensions.k8s.io/experiments.kubeflow.org created customresourcedefinition.apiextensions.k8s.io/suggestions.kubeflow.org created customresourcedefinition.apiextensions.k8s.io/trials.kubeflow.org created serviceaccount/katib-controller created serviceaccount/katib-ui created clusterrole.rbac.authorization.k8s.io/katib-controller created clusterrole.rbac.authorization.k8s.io/katib-ui created clusterrole.rbac.authorization.k8s.io/kubeflow-katib-admin created clusterrole.rbac.authorization.k8s.io/kubeflow-katib-edit created clusterrole.rbac.authorization.k8s.io/kubeflow-katib-view created clusterrolebinding.rbac.authorization.k8s.io/katib-controller created clusterrolebinding.rbac.authorization.k8s.io/katib-ui created configmap/katib-config created configmap/trial-templates created secret/katib-mysql-secrets created service/katib-controller created service/katib-db-manager created service/katib-mysql created service/katib-ui created persistentvolumeclaim/katib-mysql created deployment.apps/katib-controller created deployment.apps/katib-db-manager created deployment.apps/katib-mysql created deployment.apps/katib-ui created certificate.cert-manager.io/katib-webhook-cert created issuer.cert-manager.io/katib-selfsigned-issuer created virtualservice.networking.istio.io/katib-ui created mutatingwebhookconfiguration.admissionregistration.k8s.io/katib.kubeflow.org created validatingwebhookconfiguration.admissionregistration.k8s.io/katib.kubeflow.org created ``` 정상적으로 설치되었는지 확인합니다. ```bash kubectl get po -n kubeflow | grep katib ``` 다음과 같이 총 4 개의 pod 가 Running 이 될 때까지 기다립니다. ```bash katib-controller-68c47fbf8b-b985z 1/1 Running 0 82s katib-db-manager-6c948b6b76-2d9gr 1/1 Running 0 82s katib-mysql-7894994f88-scs62 1/1 Running 0 82s katib-ui-64bb96d5bf-d89kp 1/1 Running 0 82s ``` 추가로 katib UI가 정상적으로 접속되는지 확인합니다. 
```bash kubectl port-forward svc/katib-ui -n kubeflow 8081:80 ``` 웹 브라우저를 열어 [http://localhost:8081/katib/](http://localhost:8081/katib/) 경로에 접속합니다. 다음과 같은 화면이 출력되는 것을 확인합니다. ![katib-ui](./img/katib-ui.png) ### Central Dashboard Dashboard 를 설치합니다. ```bash kustomize build apps/centraldashboard/upstream/overlays/istio | kubectl apply -f - ``` 정상적으로 수행되면 다음과 같이 출력됩니다. ```bash serviceaccount/centraldashboard created role.rbac.authorization.k8s.io/centraldashboard created clusterrole.rbac.authorization.k8s.io/centraldashboard created rolebinding.rbac.authorization.k8s.io/centraldashboard created clusterrolebinding.rbac.authorization.k8s.io/centraldashboard created configmap/centraldashboard-config created configmap/centraldashboard-parameters created service/centraldashboard created deployment.apps/centraldashboard created virtualservice.networking.istio.io/centraldashboard created ``` 정상적으로 설치되었는지 확인합니다. ```bash kubectl get po -n kubeflow | grep centraldashboard ``` kubeflow namespace 에 centraldashboard 관련 1 개의 pod 가 Running 이 될 때까지 기다립니다. ```bash centraldashboard-8fc7d8cc-xl7ts 1/1 Running 0 52s ``` 추가로 Central Dashboard UI가 정상적으로 접속되는지 확인합니다. ```bash kubectl port-forward svc/centraldashboard -n kubeflow 8082:80 ``` 웹 브라우저를 열어 [http://localhost:8082/](http://localhost:8082/) 경로에 접속합니다. 다음과 같은 화면이 출력되는 것을 확인합니다. ![central-dashboard](./img/central-dashboard.png) ### Admission Webhook ```bash kustomize build apps/admission-webhook/upstream/overlays/cert-manager | kubectl apply -f - ``` 정상적으로 수행되면 다음과 같이 출력됩니다. 
```bash customresourcedefinition.apiextensions.k8s.io/poddefaults.kubeflow.org created serviceaccount/admission-webhook-service-account created clusterrole.rbac.authorization.k8s.io/admission-webhook-cluster-role created clusterrole.rbac.authorization.k8s.io/admission-webhook-kubeflow-poddefaults-admin created clusterrole.rbac.authorization.k8s.io/admission-webhook-kubeflow-poddefaults-edit created clusterrole.rbac.authorization.k8s.io/admission-webhook-kubeflow-poddefaults-view created clusterrolebinding.rbac.authorization.k8s.io/admission-webhook-cluster-role-binding created service/admission-webhook-service created deployment.apps/admission-webhook-deployment created certificate.cert-manager.io/admission-webhook-cert created issuer.cert-manager.io/admission-webhook-selfsigned-issuer created mutatingwebhookconfiguration.admissionregistration.k8s.io/admission-webhook-mutating-webhook-configuration created ``` 정상적으로 설치되었는지 확인합니다. ```bash kubectl get po -n kubeflow | grep admission-webhook ``` 1 개의 pod 가 Running 이 될 때까지 기다립니다. ```bash admission-webhook-deployment-667bd68d94-2hhrx 1/1 Running 0 11s ``` ### Notebooks & Jupyter Web App 1. Notebook controller 를 설치합니다. ```bash kustomize build apps/jupyter/notebook-controller/upstream/overlays/kubeflow | kubectl apply -f - ``` 정상적으로 수행되면 다음과 같이 출력됩니다. 
```bash customresourcedefinition.apiextensions.k8s.io/notebooks.kubeflow.org created serviceaccount/notebook-controller-service-account created role.rbac.authorization.k8s.io/notebook-controller-leader-election-role created clusterrole.rbac.authorization.k8s.io/notebook-controller-kubeflow-notebooks-admin created clusterrole.rbac.authorization.k8s.io/notebook-controller-kubeflow-notebooks-edit created clusterrole.rbac.authorization.k8s.io/notebook-controller-kubeflow-notebooks-view created clusterrole.rbac.authorization.k8s.io/notebook-controller-role created rolebinding.rbac.authorization.k8s.io/notebook-controller-leader-election-rolebinding created clusterrolebinding.rbac.authorization.k8s.io/notebook-controller-role-binding created configmap/notebook-controller-config-m44cmb547t created service/notebook-controller-service created deployment.apps/notebook-controller-deployment created ``` 정상적으로 설치되었는지 확인합니다. ```bash kubectl get po -n kubeflow | grep notebook-controller ``` 1 개의 pod 가 Running 이 될 때까지 기다립니다. ```bash notebook-controller-deployment-75b4f7b578-w4d4l 1/1 Running 0 105s ``` 2. Jupyter Web App 을 설치합니다. ```bash kustomize build apps/jupyter/jupyter-web-app/upstream/overlays/istio | kubectl apply -f - ``` 정상적으로 수행되면 다음과 같이 출력됩니다. 
```bash serviceaccount/jupyter-web-app-service-account created role.rbac.authorization.k8s.io/jupyter-web-app-jupyter-notebook-role created clusterrole.rbac.authorization.k8s.io/jupyter-web-app-cluster-role created clusterrole.rbac.authorization.k8s.io/jupyter-web-app-kubeflow-notebook-ui-admin created clusterrole.rbac.authorization.k8s.io/jupyter-web-app-kubeflow-notebook-ui-edit created clusterrole.rbac.authorization.k8s.io/jupyter-web-app-kubeflow-notebook-ui-view created rolebinding.rbac.authorization.k8s.io/jupyter-web-app-jupyter-notebook-role-binding created clusterrolebinding.rbac.authorization.k8s.io/jupyter-web-app-cluster-role-binding created configmap/jupyter-web-app-config-76844k4cd7 created configmap/jupyter-web-app-logos created configmap/jupyter-web-app-parameters-chmg88cm48 created service/jupyter-web-app-service created deployment.apps/jupyter-web-app-deployment created virtualservice.networking.istio.io/jupyter-web-app-jupyter-web-app created ``` 정상적으로 설치되었는지 확인합니다. ```bash kubectl get po -n kubeflow | grep jupyter-web-app ``` 1개의 pod 가 Running 이 될 때까지 기다립니다. ```bash jupyter-web-app-deployment-6f744fbc54-p27ts 1/1 Running 0 2m ``` ### Profiles + KFAM Profile Controller를 설치합니다. ```bash kustomize build apps/profiles/upstream/overlays/kubeflow | kubectl apply -f - ``` 정상적으로 수행되면 다음과 같이 출력됩니다. ```bash customresourcedefinition.apiextensions.k8s.io/profiles.kubeflow.org created serviceaccount/profiles-controller-service-account created role.rbac.authorization.k8s.io/profiles-leader-election-role created rolebinding.rbac.authorization.k8s.io/profiles-leader-election-rolebinding created clusterrolebinding.rbac.authorization.k8s.io/profiles-cluster-role-binding created configmap/namespace-labels-data-48h7kd55mc created configmap/profiles-config-46c7tgh6fd created service/profiles-kfam created deployment.apps/profiles-deployment created virtualservice.networking.istio.io/profiles-kfam created ``` 정상적으로 설치되었는지 확인합니다. 
```bash kubectl get po -n kubeflow | grep profiles-deployment ``` 1 개의 pod 가 Running 이 될 때까지 기다립니다. ```bash profiles-deployment-89f7d88b-qsnrd 2/2 Running 0 42s ``` ### Volumes Web App Volumes Web App 을 설치합니다. ```bash kustomize build apps/volumes-web-app/upstream/overlays/istio | kubectl apply -f - ``` 정상적으로 수행되면 다음과 같이 출력됩니다. ```bash serviceaccount/volumes-web-app-service-account created clusterrole.rbac.authorization.k8s.io/volumes-web-app-cluster-role created clusterrole.rbac.authorization.k8s.io/volumes-web-app-kubeflow-volume-ui-admin created clusterrole.rbac.authorization.k8s.io/volumes-web-app-kubeflow-volume-ui-edit created clusterrole.rbac.authorization.k8s.io/volumes-web-app-kubeflow-volume-ui-view created clusterrolebinding.rbac.authorization.k8s.io/volumes-web-app-cluster-role-binding created configmap/volumes-web-app-parameters-4gg8cm2gmk created service/volumes-web-app-service created deployment.apps/volumes-web-app-deployment created virtualservice.networking.istio.io/volumes-web-app-volumes-web-app created ``` 정상적으로 설치되었는지 확인합니다. ```bash kubectl get po -n kubeflow | grep volumes-web-app ``` 1개의 pod가 Running 이 될 때까지 기다립니다. ```bash volumes-web-app-deployment-8589d664cc-62svl 1/1 Running 0 27s ``` ### Tensorboard & Tensorboard Web App 1. Tensorboard Web App 를 설치합니다. ```bash kustomize build apps/tensorboard/tensorboards-web-app/upstream/overlays/istio | kubectl apply -f - ``` 정상적으로 수행되면 다음과 같이 출력됩니다. 
```bash serviceaccount/tensorboards-web-app-service-account created clusterrole.rbac.authorization.k8s.io/tensorboards-web-app-cluster-role created clusterrole.rbac.authorization.k8s.io/tensorboards-web-app-kubeflow-tensorboard-ui-admin created clusterrole.rbac.authorization.k8s.io/tensorboards-web-app-kubeflow-tensorboard-ui-edit created clusterrole.rbac.authorization.k8s.io/tensorboards-web-app-kubeflow-tensorboard-ui-view created clusterrolebinding.rbac.authorization.k8s.io/tensorboards-web-app-cluster-role-binding created configmap/tensorboards-web-app-parameters-g28fbd6cch created service/tensorboards-web-app-service created deployment.apps/tensorboards-web-app-deployment created virtualservice.networking.istio.io/tensorboards-web-app-tensorboards-web-app created ``` 정상적으로 설치되었는지 확인합니다. ```bash kubectl get po -n kubeflow | grep tensorboards-web-app ``` 1 개의 pod 가 Running 이 될 때까지 기다립니다. ```bash tensorboards-web-app-deployment-6ff79b7f44-qbzmw 1/1 Running 0 22s ``` 2. Tensorboard Controller 를 설치합니다. ```bash kustomize build apps/tensorboard/tensorboard-controller/upstream/overlays/kubeflow | kubectl apply -f - ``` 정상적으로 수행되면 다음과 같이 출력됩니다. 
```bash customresourcedefinition.apiextensions.k8s.io/tensorboards.tensorboard.kubeflow.org created serviceaccount/tensorboard-controller created role.rbac.authorization.k8s.io/tensorboard-controller-leader-election-role created clusterrole.rbac.authorization.k8s.io/tensorboard-controller-manager-role created clusterrole.rbac.authorization.k8s.io/tensorboard-controller-proxy-role created rolebinding.rbac.authorization.k8s.io/tensorboard-controller-leader-election-rolebinding created clusterrolebinding.rbac.authorization.k8s.io/tensorboard-controller-manager-rolebinding created clusterrolebinding.rbac.authorization.k8s.io/tensorboard-controller-proxy-rolebinding created configmap/tensorboard-controller-config-bf88mm96c8 created service/tensorboard-controller-controller-manager-metrics-service created deployment.apps/tensorboard-controller-controller-manager created ``` 정상적으로 설치되었는지 확인합니다. ```bash kubectl get po -n kubeflow | grep tensorboard-controller ``` 1 개의 pod 가 Running 이 될 때까지 기다립니다. ```bash tensorboard-controller-controller-manager-954b7c544-vjpzj 3/3 Running 1 73s ``` ### Training Operator Training Operator 를 설치합니다. ```bash kustomize build apps/training-operator/upstream/overlays/kubeflow | kubectl apply -f - ``` 정상적으로 수행되면 다음과 같이 출력됩니다. 
```bash customresourcedefinition.apiextensions.k8s.io/mxjobs.kubeflow.org created customresourcedefinition.apiextensions.k8s.io/pytorchjobs.kubeflow.org created customresourcedefinition.apiextensions.k8s.io/tfjobs.kubeflow.org created customresourcedefinition.apiextensions.k8s.io/xgboostjobs.kubeflow.org created serviceaccount/training-operator created clusterrole.rbac.authorization.k8s.io/kubeflow-training-admin created clusterrole.rbac.authorization.k8s.io/kubeflow-training-edit created clusterrole.rbac.authorization.k8s.io/kubeflow-training-view created clusterrole.rbac.authorization.k8s.io/training-operator created clusterrolebinding.rbac.authorization.k8s.io/training-operator created service/training-operator created deployment.apps/training-operator created ``` 정상적으로 설치되었는지 확인합니다. ```bash kubectl get po -n kubeflow | grep training-operator ``` 1 개의 pod 가 Running 이 될 때까지 기다립니다. ```bash training-operator-7d98f9dd88-6887f 1/1 Running 0 28s ``` ### User Namespace Kubeflow 사용을 위해, 사용할 User의 Kubeflow Profile 을 생성합니다. ```bash kustomize build common/user-namespace/base | kubectl apply -f - ``` 정상적으로 수행되면 다음과 같이 출력됩니다. ```bash configmap/default-install-config-9h2h2b6hbk created profile.kubeflow.org/kubeflow-user-example-com created ``` kubeflow-user-example-com profile 이 생성된 것을 확인합니다. ```bash kubectl get profile ``` ```bash kubeflow-user-example-com 37s ``` ## 정상 설치 확인 Kubeflow central dashboard에 web browser로 접속하기 위해 포트 포워딩합니다. ```bash kubectl port-forward svc/istio-ingressgateway -n istio-system 8080:80 ``` Web Browser 를 열어 [http://localhost:8080](http://localhost:8080) 으로 접속하여, 다음과 같은 화면이 출력되는 것을 확인합니다. ![login-ui](./img/login-after-install.png) 다음 접속 정보를 입력하여 접속합니다. - Email Address: `user@example.com` - Password: `12341234` ![central-dashboard](./img/after-login.png) ================================================ FILE: docs/setup-components/install-components-mlflow.md ================================================ --- title : "2. 
MLflow Tracking Server" description: "구성요소 설치 - MLflow" sidebar_position: 2 date: 2021-12-13 lastmod: 2021-12-13 contributors: ["Jaeyeon Kim"] --- ## Install MLflow Tracking Server MLflow는 대표적인 오픈소스 ML 실험 관리 도구입니다. MLflow는 [실험 관리 용도](https://mlflow.org/docs/latest/tracking.html#tracking) 외에도 [ML Model 패키징](https://mlflow.org/docs/latest/projects.html#projects), [ML 모델 배포 관리](https://mlflow.org/docs/latest/models.html#models), [ML 모델 저장](https://mlflow.org/docs/latest/model-registry.html#registry)과 같은 기능도 제공하고 있습니다. *모두의 MLOps*에서는 MLflow를 실험 관리 용도로 사용합니다. 그래서 MLflow에서 관리하는 데이터를 저장하고 UI를 제공하는 MLflow Tracking Server를 쿠버네티스 클러스터에 배포하여 사용할 예정입니다. ## Before Install MLflow Tracking Server ### PostgreSQL DB 설치 MLflow Tracking Server가 Backend Store로 사용할 용도의 PostgreSQL DB를 쿠버네티스 클러스터에 배포합니다. 먼저 `mlflow-system`이라는 namespace 를 생성합니다. ```bash kubectl create ns mlflow-system ``` 다음과 같은 메시지가 출력되면 정상적으로 생성된 것을 의미합니다. ```bash namespace/mlflow-system created ``` postgresql DB를 `mlflow-system` namespace 에 생성합니다. ```bash kubectl -n mlflow-system apply -f https://raw.githubusercontent.com/mlops-for-all/helm-charts/b94b5fe4133f769c04b25068b98ccfa7a505aa60/mlflow/manifests/postgres.yaml ``` 정상적으로 수행되면 다음과 같이 출력됩니다. ```bash service/postgresql-mlflow-service created deployment.apps/postgresql-mlflow created persistentvolumeclaim/postgresql-mlflow-pvc created ``` mlflow-system namespace 에 1개의 postgresql 관련 pod 가 Running 이 될 때까지 기다립니다. ```bash kubectl get pod -n mlflow-system | grep postgresql ``` 다음과 비슷하게 출력되면 정상적으로 실행된 것입니다. ```bash postgresql-mlflow-7b9bc8c79f-srkh7 1/1 Running 0 38s ``` ### Minio 설정 MLflow Tracking Server가 Artifacts Store로 사용할 용도의 Minio는 이전 Kubeflow 설치 단계에서 설치한 Minio를 활용합니다. 단, kubeflow 용도와 mlflow 용도를 분리하기 위해, mlflow 전용 버킷(bucket)을 생성하겠습니다. minio 에 접속하여 버킷을 생성하기 위해, 우선 minio-service 를 포트포워딩합니다. ```bash kubectl port-forward svc/minio-service -n kubeflow 9000:9000 ``` 웹 브라우저를 열어 [localhost:9000](http://localhost:9000)으로 접속하면 다음과 같은 화면이 출력됩니다. 
![minio-install](./img/minio-install.png) 다음과 같은 접속 정보를 입력하여 로그인합니다. - Username: `minio` - Password: `minio123` 우측 하단의 **`+`** 버튼을 클릭하여, `Create Bucket`를 클릭합니다. ![create-bucket](./img/create-bucket.png) `Bucket Name`에 `mlflow`를 입력하여 버킷을 생성합니다. 정상적으로 생성되면 다음과 같이 왼쪽에 `mlflow`라는 이름의 버킷이 생성됩니다. ![mlflow-bucket](./img/mlflow-bucket.png) --- ## Let's Install MLflow Tracking Server ### Helm Repository 추가 ```bash helm repo add mlops-for-all https://mlops-for-all.github.io/helm-charts ``` 다음과 같은 메시지가 출력되면 정상적으로 추가된 것을 의미합니다. ```bash "mlops-for-all" has been added to your repositories ``` ### Helm Repository 업데이트 ```bash helm repo update ``` 다음과 같은 메시지가 출력되면 정상적으로 업데이트된 것을 의미합니다. ```bash Hang tight while we grab the latest from your chart repositories... ...Successfully got an update from the "mlops-for-all" chart repository Update Complete. ⎈Happy Helming!⎈ ``` ### Helm Install mlflow-server Helm Chart 0.2.0 버전을 설치합니다. ```bash helm install mlflow-server mlops-for-all/mlflow-server \ --namespace mlflow-system \ --version 0.2.0 ``` - **주의**: 위의 helm chart는 MLflow 의 backend store 와 artifacts store 의 접속 정보를 kubeflow 설치 과정에서 생성한 minio와 위의 [PostgreSQL DB 설치](#postgresql-db-설치)에서 생성한 postgresql 정보를 default로 하여 설치합니다. - 별개로 생성한 DB 혹은 Object storage를 활용하고 싶은 경우, [Helm Chart Repo](https://github.com/mlops-for-all/helm-charts/tree/main/mlflow/chart)를 참고하여 helm install 시 value를 따로 설정하여 설치하시기 바랍니다. 다음과 같은 메시지가 출력되어야 합니다. ```bash NAME: mlflow-server LAST DEPLOYED: Sat Dec 18 22:02:13 2021 NAMESPACE: mlflow-system STATUS: deployed REVISION: 1 TEST SUITE: None ``` 정상적으로 설치되었는지 확인합니다. ```bash kubectl get pod -n mlflow-system | grep mlflow-server ``` mlflow-system namespace 에 1 개의 mlflow-server 관련 pod 가 Running 이 될 때까지 기다립니다. 다음과 비슷하게 출력되면 정상적으로 실행된 것입니다. ```bash mlflow-server-ffd66d858-6hm62 1/1 Running 0 74s ``` ### 정상 설치 확인 그럼 이제 MLflow Server에 정상적으로 접속되는지 확인해보겠습니다. 우선 클라이언트 노드에서 접속하기 위해, 포트포워딩을 수행합니다. 
```bash kubectl port-forward svc/mlflow-server-service -n mlflow-system 5000:5000 ``` 웹 브라우저를 열어 [localhost:5000](http://localhost:5000)으로 접속하면 다음과 같은 화면이 출력됩니다. ![mlflow-install](./img/mlflow-install.png) ================================================ FILE: docs/setup-components/install-components-pg.md ================================================ --- title : "4. Prometheus & Grafana" description: "구성요소 설치 - Prometheus & Grafana" sidebar_position: 4 date: 2021-12-13 lastmod: 2021-12-13 contributors: ["Jaeyeon Kim"] --- ## Prometheus & Grafana 프로메테우스(Prometheus) 와 그라파나(Grafana) 는 모니터링을 위한 도구입니다. 안정적인 서비스 운영을 위해서는 서비스와 서비스가 운영되고 있는 인프라의 상태를 지속해서 관찰하고, 관찰한 메트릭을 바탕으로 문제가 생길 때 빠르게 대응해야 합니다. 이러한 모니터링을 효율적으로 수행하기 위한 많은 도구 중 *모두의 MLOps*에서는 오픈소스인 프로메테우스와 그라파나를 사용할 예정입니다. 더 자세한 내용은 [Prometheus 공식 문서](https://prometheus.io/docs/introduction/overview/), [Grafana 공식 문서](https://grafana.com/docs/)를 확인해주시기를 바랍니다. 프로메테우스는 다양한 대상으로부터 Metric을 수집하는 도구이며, 그라파나는 모인 데이터를 시각화하는 것을 도와주는 도구입니다. 서로 간의 종속성은 없지만 상호 보완적으로 사용할 수 있어 함께 사용되는 경우가 많습니다. 이번 페이지에서는 쿠버네티스 클러스터에 프로메테우스와 그라파나를 설치한 뒤, Seldon-Core 로 생성한 SeldonDeployment 로 API 요청을 보내, 정상적으로 Metrics 이 수집되는지 확인해보겠습니다. 본 글에서는 seldonio/seldon-core-analytics Helm Chart 1.12.0 버전을 활용해 쿠버네티스 클러스터에 프로메테우스와 그라파나를 설치하고, Seldon-Core 에서 생성한 SeldonDeployment의 Metrics 을 효율적으로 확인하기 위한 대시보드도 함께 설치합니다. ### Helm Repository 추가 ```bash helm repo add seldonio https://storage.googleapis.com/seldon-charts ``` 다음과 같은 메시지가 출력되면 정상적으로 추가된 것을 의미합니다. ```bash "seldonio" has been added to your repositories ``` ### Helm Repository 업데이트 ```bash helm repo update ``` 다음과 같은 메시지가 출력되면 정상적으로 업데이트된 것을 의미합니다. ```bash Hang tight while we grab the latest from your chart repositories... ...Successfully got an update from the "seldonio" chart repository ...Successfully got an update from the "datawire" chart repository Update Complete. ⎈Happy Helming!⎈ ``` ### Helm Install seldon-core-analytics Helm Chart 1.12.0 버전을 설치합니다. 
```bash helm install seldon-core-analytics seldonio/seldon-core-analytics \ --namespace seldon-system \ --version 1.12.0 ``` 다음과 같은 메시지가 출력되어야 합니다. ```bash 생략... NAME: seldon-core-analytics LAST DEPLOYED: Tue Dec 14 18:29:38 2021 NAMESPACE: seldon-system STATUS: deployed REVISION: 1 ``` 정상적으로 설치되었는지 확인합니다. ```bash kubectl get pod -n seldon-system | grep seldon-core-analytics ``` seldon-system namespace 에 6개의 seldon-core-analytics 관련 pod 가 Running 이 될 때까지 기다립니다. ```bash seldon-core-analytics-grafana-657c956c88-ng8wn 2/2 Running 0 114s seldon-core-analytics-kube-state-metrics-94bb6cb9-svs82 1/1 Running 0 114s seldon-core-analytics-prometheus-alertmanager-64cf7b8f5-nxbl8 2/2 Running 0 114s seldon-core-analytics-prometheus-node-exporter-5rrj5 1/1 Running 0 114s seldon-core-analytics-prometheus-pushgateway-8476474cff-sr4n6 1/1 Running 0 114s seldon-core-analytics-prometheus-seldon-685c664894-7cr45 2/2 Running 0 114s ``` ### 정상 설치 확인 그럼 이제 그라파나에 정상적으로 접속되는지 확인해보겠습니다. 우선 클라이언트 노드에서 접속하기 위해, 포트포워딩을 수행합니다. ```bash kubectl port-forward svc/seldon-core-analytics-grafana -n seldon-system 8090:80 ``` 웹 브라우저를 열어 [localhost:8090](http://localhost:8090)으로 접속하면 다음과 같은 화면이 출력됩니다. ![grafana-install](./img/grafana-install.png) 다음과 같은 접속정보를 입력하여 접속합니다. - Email or username : `admin` - Password : `password` 로그인하면 다음과 같은 화면이 출력됩니다. ![grafana-login](./img/grafana-login.png) 좌측의 대시보드 아이콘을 클릭하여, `Manage` 버튼을 클릭합니다. ![dashboard-click](./img/dashboard-click.png) 기본적인 그라파나 대시보드가 포함되어있는 것을 확인할 수 있습니다. 이 중 `Prediction Analytics` 대시보드를 클릭합니다. ![dashboard](./img/dashboard.png) Seldon Core API Dashboard 가 보이고, 다음과 같이 출력되는 것을 확인할 수 있습니다. ![seldon-dashboard](./img/seldon-dashboard.png) ## References - [Seldon-Core-Analytics Helm Chart](https://github.com/SeldonIO/seldon-core/tree/master/helm-charts/seldon-core-analytics) ================================================ FILE: docs/setup-components/install-components-seldon.md ================================================ --- title : "3. 
Seldon-Core" description: "구성요소 설치 - Seldon-Core" sidebar_position: 3 date: 2021-12-13 lastmod: 2021-12-13 contributors: ["Jaeyeon Kim"] --- ## Seldon-Core Seldon-Core는 쿠버네티스 환경에 수많은 머신러닝 모델을 배포하고 관리할 수 있는 오픈소스 프레임워크 중 하나입니다. 더 자세한 내용은 Seldon-Core 의 공식 [제품 설명 페이지](https://www.seldon.io/tech/products/core/) 와 [깃헙](https://github.com/SeldonIO/seldon-core) 그리고 API Deployment 파트를 참고해주시기를 바랍니다. ## Selon-Core 설치 Seldon-Core를 사용하기 위해서는 쿠버네티스의 인그레스(Ingress)를 담당하는 Ambassador 와 Istio 와 같은 [모듈이 필요합니다](https://docs.seldon.io/projects/seldon-core/en/latest/workflow/install.html). Seldon-Core 에서는 Ambassador 와 Istio 만을 공식적으로 지원하며, *모두의 MLOps*에서는 Ambassador를 사용해 Seldon-core를 사용하므로 Ambassador를 설치하겠습니다. ### Ambassador - Helm Repository 추가 ```bash helm repo add datawire https://www.getambassador.io ``` 다음과 같은 메시지가 출력되면 정상적으로 추가된 것을 의미합니다. ```bash "datawire" has been added to your repositories ``` ### Ambassador - Helm Repository 업데이트 ```bash helm repo update ``` 다음과 같은 메시지가 출력되면 정상적으로 업데이트된 것을 의미합니다. ```bash Hang tight while we grab the latest from your chart repositories... ...Successfully got an update from the "datawire" chart repository Update Complete. ⎈Happy Helming!⎈ ``` ### Ambassador - Helm Install ambassador Chart 6.9.3 버전을 설치합니다. ```bash helm install ambassador datawire/ambassador \ --namespace seldon-system \ --create-namespace \ --set image.repository=quay.io/datawire/ambassador \ --set enableAES=false \ --set crds.keep=false \ --version 6.9.3 ``` 다음과 같은 메시지가 출력되어야 합니다. ```bash 생략... 
W1206 17:01:36.026326 26635 warnings.go:70] rbac.authorization.k8s.io/v1beta1 Role is deprecated in v1.17+, unavailable in v1.22+; use rbac.authorization.k8s.io/v1 Role W1206 17:01:36.029764 26635 warnings.go:70] rbac.authorization.k8s.io/v1beta1 RoleBinding is deprecated in v1.17+, unavailable in v1.22+; use rbac.authorization.k8s.io/v1 RoleBinding NAME: ambassador LAST DEPLOYED: Mon Dec 6 17:01:34 2021 NAMESPACE: seldon-system STATUS: deployed REVISION: 1 NOTES: ------------------------------------------------------------------------------- Congratulations! You've successfully installed Ambassador! ------------------------------------------------------------------------------- To get the IP address of Ambassador, run the following commands: NOTE: It may take a few minutes for the LoadBalancer IP to be available. You can watch the status of by running 'kubectl get svc -w --namespace seldon-system ambassador' On GKE/Azure: export SERVICE_IP=$(kubectl get svc --namespace seldon-system ambassador -o jsonpath='{.status.loadBalancer.ingress[0].ip}') On AWS: export SERVICE_IP=$(kubectl get svc --namespace seldon-system ambassador -o jsonpath='{.status.loadBalancer.ingress[0].hostname}') echo http://$SERVICE_IP: For help, visit our Slack at http://a8r.io/Slack or view the documentation online at https://www.getambassador.io. ``` seldon-system 에 4 개의 pod 가 Running 이 될 때까지 기다립니다. ```bash kubectl get pod -n seldon-system ``` ```bash ambassador-7f596c8b57-4s9xh 1/1 Running 0 7m15s ambassador-7f596c8b57-dt6lr 1/1 Running 0 7m15s ambassador-7f596c8b57-h5l6f 1/1 Running 0 7m15s ambassador-agent-77bccdfcd5-d5jxj 1/1 Running 0 7m15s ``` ### Seldon-Core - Helm Install seldon-core-operator Chart 1.11.2 버전을 설치합니다. ```bash helm install seldon-core seldon-core-operator \ --repo https://storage.googleapis.com/seldon-charts \ --namespace seldon-system \ --set usageMetrics.enabled=true \ --set ambassador.enabled=true \ --version 1.11.2 ``` 다음과 같은 메시지가 출력되어야 합니다. ```bash 생략... 
W1206 17:05:38.336391 28181 warnings.go:70] admissionregistration.k8s.io/v1beta1 ValidatingWebhookConfiguration is deprecated in v1.16+, unavailable in v1.22+; use admissionregistration.k8s.io/v1 ValidatingWebhookConfiguration NAME: seldon-core LAST DEPLOYED: Mon Dec 6 17:05:34 2021 NAMESPACE: seldon-system STATUS: deployed REVISION: 1 TEST SUITE: None ``` seldon-system namespace 에 1 개의 seldon-controller-manager pod 가 Running 이 될 때까지 기다립니다. ```bash kubectl get pod -n seldon-system | grep seldon-controller ``` ```bash seldon-controller-manager-8457b8b5c7-r2frm 1/1 Running 0 2m22s ``` ## References - [Example Model Servers with Seldon](https://docs.seldon.io/projects/seldon-core/en/latest/examples/server_examples.html#examples-server-examples--page-root) ================================================ FILE: docs/setup-kubernetes/_category_.json ================================================ { "label": "Setup Kubernetes", "position": 2, "link": { "type": "generated-index" } } ================================================ FILE: docs/setup-kubernetes/install-kubernetes/_category_.json ================================================ { "label": "4. Install Kubernetes", "position": 4, "link": { "type": "generated-index" } } ================================================ FILE: docs/setup-kubernetes/install-kubernetes/kubernetes-with-k3s.md ================================================ --- title: "4.1. K3s" description: "" sidebar_position: 1 date: 2021-12-13 lastmod: 2021-12-20 draft: false weight: 221 contributors: ["Jongseob Jeon"] menu: docs: parent: "../setup-kubernetes" images: [] --- ## 1. Prerequisite 쿠버네티스 클러스터를 구축하기에 앞서, 필요한 구성 요소들을 **클러스터에** 설치합니다. [Install Prerequisite](../../setup-kubernetes/install-prerequisite.md)을 참고하여 Kubernetes를 설치하기 전에 필요한 요소들을 **클러스터에** 설치해 주시기 바랍니다. k3s 에서는 기본값으로 containerd를 백엔드로 이용해 설치합니다. 하지만 저희는 GPU를 사용하기 위해서 docker를 백엔드로 사용해야 하므로 `--docker` 옵션을 통해 백엔드를 docker로 설치하겠습니다. 
```bash curl -sfL https://get.k3s.io | INSTALL_K3S_VERSION=v1.21.7+k3s1 sh -s - server --disable traefik --disable servicelb --disable local-storage --docker ``` k3s를 설치 후 k3s config를 확인합니다 ```bash sudo cat /etc/rancher/k3s/k3s.yaml ``` 정상적으로 설치되면 다음과 같은 항목이 출력됩니다. (보안 문제와 관련된 키들은 <...>로 가렸습니다.) ```bash apiVersion: v1 clusters: - cluster: certificate-authority-data: <...> server: https://127.0.0.1:6443 name: default contexts: - context: cluster: default user: default name: default current-context: default kind: Config preferences: {} users: - name: default user: client-certificate-data: <...> client-key-data: <...> ``` ## 2. 쿠버네티스 클러스터 셋업 k3s config를 클러스터의 kubeconfig로 사용하기 위해서 복사합니다. ```bash mkdir .kube sudo cp /etc/rancher/k3s/k3s.yaml .kube/config ``` 복사된 config 파일에 user가 접근할 수 있는 권한을 줍니다. ```bash sudo chown $USER:$USER .kube/config ``` ## 3. 쿠버네티스 클라이언트 셋업 이제 클러스터에서 설정한 kubeconfig를 로컬로 이동합니다. 로컬에서는 경로를 `~/.kube/config`로 설정합니다. 처음 복사한 config 파일에는 server ip가 `https://127.0.0.1:6443` 으로 되어 있습니다. 이 값을 클러스터의 ip에 맞게 수정합니다. (이번 페이지에서 사용하는 클러스터의 ip에 맞춰서 `https://192.168.0.19:6443` 으로 수정했습니다.) ```bash apiVersion: v1 clusters: - cluster: certificate-authority-data: <...> server: https://192.168.0.19:6443 name: default contexts: - context: cluster: default user: default name: default current-context: default kind: Config preferences: {} users: - name: default user: client-certificate-data: <...> client-key-data: <...> ``` ## 4. 쿠버네티스 기본 모듈 설치 [Setup Kubernetes Modules](../../setup-kubernetes/install-kubernetes-module.md)을 참고하여 다음 컴포넌트들을 설치해 주시기 바랍니다. - helm - kustomize - CSI plugin - [Optional] nvidia-docker, nvidia-device-plugin ## 5. 정상 설치 확인 최종적으로 node가 Ready 인지, OS, Docker, Kubernetes 버전을 확인합니다. ```bash kubectl get nodes -o wide ``` 다음과 같은 메시지가 보이면 정상적으로 설치된 것을 의미합니다. 
```bash NAME STATUS ROLES AGE VERSION INTERNAL-IP EXTERNAL-IP OS-IMAGE KERNEL-VERSION CONTAINER-RUNTIME ubuntu Ready control-plane,master 11m v1.21.7+k3s1 192.168.0.19 Ubuntu 20.04.3 LTS 5.4.0-91-generic docker://20.10.11 ``` ## 6. References - [https://rancher.com/docs/k3s/latest/en/installation/install-options/](https://rancher.com/docs/k3s/latest/en/installation/install-options/) ================================================ FILE: docs/setup-kubernetes/install-kubernetes/kubernetes-with-kubeadm.md ================================================ --- title: "4.3. Kubeadm" description: "" sidebar_position: 3 date: 2021-12-13 lastmod: 2021-12-20 contributors: ["Youngcheol Jang"] --- ## 1. Prerequisite 쿠버네티스 클러스터를 구축하기에 앞서, 필요한 구성 요소들을 **클러스터에** 설치합니다. [Install Prerequisite](../../setup-kubernetes/install-prerequisite.md)을 참고하여 Kubernetes를 설치하기 전에 필요한 요소들을 **클러스터에** 설치해 주시기 바랍니다. 쿠버네티스를 위한 네트워크의 설정을 변경합니다. ```bash sudo modprobe br_netfilter cat < Ubuntu 20.04.3 LTS 5.4.0-91-generic docker://20.10.11 ``` ================================================ FILE: docs/setup-kubernetes/install-kubernetes-module.md ================================================ --- title: "5. Install Kubernetes Modules" description: "Install Helm, Kustomize" sidebar_position: 5 date: 2021-12-13 lastmod: 2021-12-20 contributors: ["Jaeyeon Kim"] --- ## Setup Kubernetes Modules 이번 페이지에서는 클러스터에서 사용할 모듈을 클라이언트 노드에서 설치하는 과정에 관해서 설명합니다. 앞으로 소개되는 과정은 모두 **클라이언트 노드**에서 진행됩니다. ## Helm Helm은 쿠버네티스 패키지와 관련된 자원을 한 번에 배포하고 관리할 수 있게 도와주는 패키지 매니징 도구 중 하나입니다. 1. 현재 폴더에 Helm v3.7.1 버전을 내려받습니다. - For Linux amd64 ```bash wget https://get.helm.sh/helm-v3.7.1-linux-amd64.tar.gz ``` - 다른 OS는 [공식 홈페이지](https://github.com/helm/helm/releases/tag/v3.7.1)를 참고하시어, 클라이언트 노드의 OS와 CPU에 맞는 바이너리의 다운 경로를 확인하시기 바랍니다. 2. helm을 사용할 수 있도록 압축을 풀고, 파일의 위치를 변경합니다. ```bash tar -zxvf helm-v3.7.1-linux-amd64.tar.gz sudo mv linux-amd64/helm /usr/local/bin/helm ``` 3. 정상적으로 설치되었는지 확인합니다. 
```bash helm help ``` 다음과 같은 메시지가 보이면 정상적으로 설치된 것을 의미합니다. ```bash The Kubernetes package manager Common actions for Helm: - helm search: search for charts - helm pull: download a chart to your local directory to view - helm install: upload the chart to Kubernetes - helm list: list releases of charts Environment variables: | Name | Description | |--------------------------|---------------------------------------------------------------------| | $HELM_CACHE_HOME | set an alternative location for storing cached files. | | $HELM_CONFIG_HOME | set an alternative location for storing Helm configuration. | | $HELM_DATA_HOME | set an alternative location for storing Helm data. | ... ``` ## Kustomize kustomize 또한 여러 쿠버네티스 리소스를 한 번에 배포하고 관리할 수 있게 도와주는 패키지 매니징 도구 중 하나입니다. 1. 현재 폴더에 kustomize v3.10.0 버전의 바이너리를 다운받습니다. - For Linux amd64 ```bash wget https://github.com/kubernetes-sigs/kustomize/releases/download/kustomize%2Fv3.10.0/kustomize_v3.10.0_linux_amd64.tar.gz ``` - 다른 OS는 [kustomize/v3.10.0](https://github.com/kubernetes-sigs/kustomize/releases/tag/kustomize%2Fv3.10.0)에서 확인 후 다운로드 받습니다. 2. kustomize 를 사용할 수 있도록 압축을 풀고, 파일의 위치를 변경합니다. ```bash tar -zxvf kustomize_v3.10.0_linux_amd64.tar.gz sudo mv kustomize /usr/local/bin/kustomize ``` 3. 정상적으로 설치되었는지 확인합니다. ```bash kustomize help ``` 다음과 같은 메시지가 보이면 정상적으로 설치된 것을 의미합니다. ```bash Manages declarative configuration of Kubernetes. See https://sigs.k8s.io/kustomize Usage: kustomize [command] Available Commands: build Print configuration per contents of kustomization.yaml cfg Commands for reading and writing configuration. completion Generate shell completion script create Create a new kustomization in the current directory edit Edits a kustomization file fn Commands for running functions against configuration. ... ``` ## CSI Plugin : Local Path Provisioner 1. CSI Plugin은 kubernetes 내의 스토리지를 담당하는 모듈입니다. 단일 노드 클러스터에서 쉽게 사용할 수 있는 CSI Plugin인 Local Path Provisioner를 설치합니다. 
```bash kubectl apply -f https://raw.githubusercontent.com/rancher/local-path-provisioner/v0.0.20/deploy/local-path-storage.yaml ``` 다음과 같은 메시지가 보이면 정상적으로 설치된 것을 의미합니다. ```bash namespace/local-path-storage created serviceaccount/local-path-provisioner-service-account created clusterrole.rbac.authorization.k8s.io/local-path-provisioner-role created clusterrolebinding.rbac.authorization.k8s.io/local-path-provisioner-bind created deployment.apps/local-path-provisioner created storageclass.storage.k8s.io/local-path created configmap/local-path-config created ``` 2. 또한, 다음과 같이 local-path-storage namespace 에 provisioner pod이 Running 인지 확인합니다. ```bash kubectl -n local-path-storage get pod ``` 정상적으로 수행되면 아래와 같이 출력됩니다. ```bash NAME READY STATUS RESTARTS AGE local-path-provisioner-d744ccf98-xfcbk 1/1 Running 0 7m ``` 3. 다음을 수행하여 default storage class로 변경합니다. ```bash kubectl patch storageclass local-path -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}' ``` 정상적으로 수행되면 아래와 같이 출력됩니다. ```bash storageclass.storage.k8s.io/local-path patched ``` 4. default storage class로 설정되었는지 확인합니다. ```bash kubectl get sc ``` 다음과 같이 NAME에 `local-path (default)` 인 storage class가 존재하는 것을 확인합니다. ```bash NAME PROVISIONER RECLAIMPOLICY VOLUMEBINDINGMODE ALLOWVOLUMEEXPANSION AGE local-path (default) rancher.io/local-path Delete WaitForFirstConsumer false 2h ``` ================================================ FILE: docs/setup-kubernetes/install-prerequisite.md ================================================ --- title: "3. Install Prerequisite" description: "Install docker" sidebar_position: 3 date: 2021-12-13 lastmod: 2023-09-29 contributors: ["Jaeyeon Kim", "Jongsun Shinn", "Sangwoo Shim", "Minwook Je"] --- 이 페이지에서는 쿠버네티스를 설치하기에 앞서, **클러스터**와 **클라이언트**에 설치 혹은 설정해두어야 하는 컴포넌트들에 대한 매뉴얼을 설명합니다. ## Install apt packages 추후 클라이언트와 클러스터의 원활한 통신을 위해서는 Port-Forwarding을 수행해야 할 일이 있습니다. Port-Forwarding을 위해서는 **클러스터**에 다음 패키지를 설치해 주어야 합니다. 
```bash sudo apt-get update sudo apt-get install -y socat ``` ## Install Docker 1. 도커 설치에 필요한 APT 패키지들을 설치합니다. ```bash sudo apt-get update && sudo apt-get install -y ca-certificates curl gnupg lsb-release ``` 2. 도커의 공식 GPG key를 추가합니다. ```bash curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg ``` 3. apt 패키지 매니저로 도커를 설치할 때, stable Repository에서 받아오도록 설정합니다. ```bash echo \ "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu \ $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null ``` 4. 현재 설치할 수 있는 도커 버전을 확인합니다. ```bash sudo apt-get update && apt-cache madison docker-ce ``` 출력되는 버전 중 `5:20.10.11~3-0~ubuntu-focal` 버전이 있는지 확인합니다. ```bash apt-cache madison docker-ce | grep 5:20.10.11~3-0~ubuntu-focal ``` 정상적으로 추가가 된 경우 다음과 같이 출력됩니다. ```bash docker-ce | 5:20.10.11~3-0~ubuntu-focal | https://download.docker.com/linux/ubuntu focal/stable amd64 Packages ``` 5. `5:20.10.11~3-0~ubuntu-focal` 버전의 도커를 설치합니다. ```bash sudo apt-get install -y containerd.io docker-ce=5:20.10.11~3-0~ubuntu-focal docker-ce-cli=5:20.10.11~3-0~ubuntu-focal ``` 6. 도커가 정상적으로 설치된 것을 확인합니다. ```bash sudo docker run hello-world ``` 명령어 실행 후 다음과 같은 메시지가 보이면 정상적으로 설치된 것을 의미합니다. ```bash mlops@ubuntu:~$ sudo docker run hello-world Hello from Docker! This message shows that your installation appears to be working correctly. To generate this message, Docker took the following steps: 1. The Docker client contacted the Docker daemon. 2. The Docker daemon pulled the "hello-world" image from the Docker Hub. (amd64) 3. The Docker daemon created a new container from that image which runs the executable that produces the output you are currently reading. 4. The Docker daemon streamed that output to the Docker client, which sent it to your terminal. 
To try something more ambitious, you can run an Ubuntu container with: $ docker run -it ubuntu bash Share images, automate workflows, and more with a free Docker ID: https://hub.docker.com/ For more examples and ideas, visit: https://docs.docker.com/get-started/ ``` 7. docker 관련 command를 sudo 키워드 없이 사용할 수 있게 하도록 다음 명령어를 통해 권한을 추가합니다. ```bash sudo groupadd docker sudo usermod -aG docker $USER newgrp docker ``` 8. sudo 키워드 없이 docker command를 사용할 수 있게 된 것을 확인하기 위해, 다시 한번 docker run을 실행합니다. ```bash docker run hello-world ``` 명령어 실행 후 다음과 같은 메시지가 보이면 정상적으로 권한이 추가된 것을 의미합니다. ```bash mlops@ubuntu:~$ docker run hello-world Hello from Docker! This message shows that your installation appears to be working correctly. To generate this message, Docker took the following steps: 1. The Docker client contacted the Docker daemon. 2. The Docker daemon pulled the "hello-world" image from the Docker Hub. (amd64) 3. The Docker daemon created a new container from that image which runs the executable that produces the output you are currently reading. 4. The Docker daemon streamed that output to the Docker client, which sent it to your terminal. To try something more ambitious, you can run an Ubuntu container with: $ docker run -it ubuntu bash Share images, automate workflows, and more with a free Docker ID: https://hub.docker.com/ For more examples and ideas, visit: https://docs.docker.com/get-started/ ``` ## Turn off Swap Memory kubelet 이 정상적으로 동작하게 하기 위해서는 **클러스터** 노드에서 swap이라고 불리는 가상메모리를 꺼 두어야 합니다. 다음 명령어를 통해 swap을 꺼 둡니다. **(클러스터와 클라이언트를 같은 데스크톱에서 사용할 때 swap 메모리를 종료하면 속도의 저하가 있을 수 있습니다)** ```bash sudo sed -i '/ swap / s/^\(.*\)$/#\1/g' /etc/fstab sudo swapoff -a ``` ## Install Kubectl kubectl 은 쿠버네티스 클러스터에 API를 요청할 때 사용하는 클라이언트 툴입니다. **클라이언트** 노드에 설치해두어야 합니다. 1. 현재 폴더에 kubectl v1.21.7 버전을 다운받습니다. ```bash curl -LO https://dl.k8s.io/release/v1.21.7/bin/linux/amd64/kubectl # Or if you use arm64 curl -LO https://dl.k8s.io/release/v1.21.7/bin/linux/arm64/kubectl ``` 2. 
kubectl 을 사용할 수 있도록 파일의 권한과 위치를 변경합니다. ```bash sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl ``` 3. 정상적으로 설치되었는지 확인합니다. ```bash kubectl version --client ``` 다음과 같은 메시지가 보이면 정상적으로 설치된 것을 의미합니다. ```bash Client Version: version.Info{Major:"1", Minor:"21", GitVersion:"v1.21.7", GitCommit:"1f86634ff08f37e54e8bfcd86bc90b61c98f84d4", GitTreeState:"clean", BuildDate:"2021-11-17T14:41:19Z", GoVersion:"go1.16.10", Compiler:"gc", Platform:"linux/amd64"} ``` 4. 여러 개의 쿠버네티스 클러스터를 사용하는 경우, 여러 개의 kubeconfig 파일을 관리해야 하는 경우가 있습니다. 여러 개의 kubeconfig 파일 혹은 여러 개의 kube-context를 효율적으로 관리하는 방법은 다음과 같은 문서를 참고하시기 바랍니다. - [https://dev.to/aabiseverywhere/configuring-multiple-kubeconfig-on-your-machine-59eo](https://dev.to/aabiseverywhere/configuring-multiple-kubeconfig-on-your-machine-59eo) - [https://github.com/ahmetb/kubectx](https://github.com/ahmetb/kubectx) ## References - [Install Docker Engine on Ubuntu](https://docs.docker.com/engine/install/ubuntu/) - [리눅스에 kubectl 설치 및 설정](https://kubernetes.io/ko/docs/tasks/tools/install-kubectl-linux/) ================================================ FILE: docs/setup-kubernetes/intro.md ================================================ --- title: "1. Introduction" description: "Setup Introduction" sidebar_position: 1 date: 2021-12-13 lastmod: 2021-12-13 contributors: ["Jaeyeon Kim", "Jongsun Shinn", "Youngdon Tae", "SeungTae Kim"] --- ## MLOps 시스템 구축해보기 MLOps를 공부하는 데 있어서 가장 큰 장벽은 MLOps 시스템을 구성해보고 사용해보기가 어렵다는 점입니다. AWS, GCP 등의 퍼블릭 클라우드 혹은 Weight & Bias, neptune.ai 등의 상용 툴을 사용해보기에는 과금에 대한 부담이 존재하고, 처음부터 모든 환경을 혼자서 구성하기에는 어디서부터 시작해야 할지 막막하게 느껴질 수밖에 없습니다. 이런 이유들로 MLOps를 선뜻 시작해보지 못하시는 분들을 위해, *모두의 MLOps*에서는 우분투가 설치되는 데스크톱 하나만 준비되어 있다면 MLOps 시스템을 밑바닥부터 구축하고 사용해 볼 수 있는 방법을 다룰 예정입니다. 우분투 데스크탑 환경을 준비할 수 없는 경우, 가상머신을 활용하여 환경을 구성하기 >Windows 혹은 Intel Mac을 사용해 `모두의 MLops` 실습을 진행 중인 분들은 `Virtual Box`, `VMware` 등의 가상머신 소프트웨어를 이용하여 우분투 데스크탑 환경을 준비할 수 있습니다. 이 때, 권장 사양을 맞춰 가상 머신을 생성해주시기 바랍니다. 
>또한, M1 Mac을 사용하시는 분들은 작성일(2022년 2월) 기준으로는 Virtual Box, VMware 는 이용할 수 없습니다. ([M1 Apple Silicone Mac에 최적화된 macOS 앱 지원 확인하기](https://isapplesiliconready.com/kr)) >따라서, 클라우드 환경을 이용해 실습하는 것이 아니라면, [UTM , Virtual machines for Mac](https://mac.getutm.app/)을 설치하여 가상 머신을 이용해주세요. >(앱스토어에서 구매하여 다운로드 받는 소프트웨어는 일종의 Donation 개념의 비용 지불입니다. 무료 버전과 자동 업데이트 정도의 차이가 있어, 무료버전을 사용해도 무방합니다.) >해당 가상머신 소프트웨어는 `Ubuntu 20.04.3 LTS` 실습 운영체제를 지원하고 있어, M1 Mac에서 실습을 수행하는 것을 가능하게 합니다. 하지만 [MLOps의 구성요소](../introduction/component.md)에서 설명하는 요소들을 모두 사용해볼 수는 없기에, *모두의 MLOps*에서는 대표적인 오픈소스만을 설치한 뒤, 서로 연동하여 사용하는 부분을 주로 다룰 예정입니다. *모두의 MLOps*에서 설치하는 오픈소스가 표준을 의미하는 것은 아니며, 여러분의 상황에 맞게 적절한 툴을 취사선택하는 것을 권장합니다. ## 구성 요소 이 글에서 만들어 볼 MLOps 시스템의 구성 요소들과 각 버전은 아래와 같은 환경에서 검증되었습니다. 원활한 환경에서 테스트하기 위해 **싱글 노드 클러스터 (혹은 클러스터)** 와 **클라이언트**를 분리하여 설명해 드릴 예정입니다. **클러스터** 는 우분투가 설치되어 있는 데스크톱 하나를 의미합니다. **클라이언트** 는 노트북 혹은 클러스터가 설치되어 있는 데스크톱 외의 클라이언트로 사용할 수 있는 다른 데스크톱을 사용하는 것을 권장합니다. 하지만 두 대의 머신을 준비할 수 없다면 데스크톱 하나를 동시에 클러스터와 클라이언트 용도로 사용하셔도 괜찮습니다. ### 클러스터 #### 1. Software 아래는 클러스터에 설치해야 할 소프트웨어 목록입니다. | Software | Version | | --------------- | ----------- | | Ubuntu | 20.04.3 LTS | | Docker (Server) | 20.10.11 | | NVIDIA-Driver | 470.86 | | Kubernetes | v1.21.7 | | Kubeflow | v1.4.0 | | MLFlow | v1.21.0 | #### 2. Helm Chart 아래는 Helm을 이용해 설치되어야 할 써드파티 소프트웨어 목록입니다. | Helm Chart Repo Name | Version | | ----------------------------- | ------- | | datawire/ambassador | 6.9.3 | | seldonio/seldon-core-operator | 1.11.2 | ### 클라이언트 클라이언트는 MacOS (Intel CPU), Ubuntu 20.04 에서 검증되었습니다. | Software | Version | | --------------- | ----------- | | kubectl | v1.21.7 | | helm | v3.7.1 | | kustomize | v3.10.0 | ### Minimum System Requirements 모두의 MLOps를 설치할 클러스터는 다음과 같은 사양을 만족시키는 것을 권장합니다. 이는 Kubernetes 및 Kubeflow 의 권장 사양에 의존합니다. 
- CPU : 6 core - RAM : 12GB - DISK : 50GB - GPU : NVIDIA GPU (Optional) ================================================ FILE: docs/setup-kubernetes/kubernetes.md ================================================ --- title : "2. Setup Kubernetes" description: "Setup Kubernetes" sidebar_position: 2 date: 2021-12-13 lastmod: 2021-12-13 contributors: ["Jaeyeon Kim"] --- ## Setup Kubernetes Cluster 쿠버네티스를 처음 배우시는 분들에게 첫 진입 장벽은 쿠버네티스 실습 환경을 구축하는 것입니다. 프로덕션 레벨의 쿠버네티스 클러스터를 구축할 수 있게 공식적으로 지원하는 도구는 kubeadm 이지만, 사용자들이 조금 더 쉽게 구축할 수 있도록 도와주는 kubespray, kops 등의 도구도 존재하며, 학습 목적을 위해서 컴팩트한 쿠버네티스 클러스터를 정말 쉽게 구축할 수 있도록 도와주는 k3s, minikube, microk8s, kind 등의 도구도 존재합니다. 각각의 도구는 장단점이 다르기에 사용자마다 선호하는 도구가 다른 점을 고려하여, 본 글에서는 kubeadm, k3s, minikube의 3가지 도구를 활용하여 쿠버네티스 클러스터를 구축하는 방법을 다룹니다. 각 도구에 대한 자세한 비교는 다음 쿠버네티스 [공식 문서](https://kubernetes.io/ko/docs/tasks/tools/)를 확인해주시기를 바랍니다. *모두의 MLOps*에서 권장하는 툴은 **k3s**로 쿠버네티스 클러스터를 구축할 때 쉽게 할 수 있다는 장점이 있습니다. 만약 쿠버네티스의 모든 기능을 사용하고 노드 구성까지 활용하고 싶다면 **kubeadm**을 권장해 드립니다. **minikube** 는 저희가 설명하는 컴포넌트 외에도 다른 쿠버네티스를 add-on 형식으로 쉽게 설치할 수 있다는 장점이 있습니다. 본 *모두의 MLOps*에서는 구축하게 될 MLOps 구성 요소들을 원활히 사용하기 위해, 각각의 도구를 활용해 쿠버네티스 클러스터를 구축할 때, 추가로 설정해 주어야 하는 부분이 추가되어 있습니다. Ubuntu OS까지는 설치되어 있는 데스크탑을 k8s cluster로 구축한 뒤, 외부 클라이언트 노드에서 쿠버네티스 클러스터에 접근하는 것을 확인하는 것까지가 본 **Setup Kubernetes**단원의 범위입니다. 자세한 구축 방법은 3가지 도구마다 다르기에 다음과 같은 흐름으로 구성되어 있습니다. ```bash 3. Setup Prerequisite 4. Setup Kubernetes 4.1. with k3s 4.2. with minikube 4.3. with kubeadm 5. Setup Kubernetes Modules ``` 그럼 이제 각각의 도구를 활용해 쿠버네티스 클러스터를 구축해보겠습니다. 반드시 모든 도구를 사용해 볼 필요는 없으며, 이 중 여러분이 익숙하신 도구를 활용해주시면 충분합니다. ================================================ FILE: docs/setup-kubernetes/setup-nvidia-gpu.md ================================================ --- title: "6. (Optional) Setup GPU" description: "Install nvidia docker, nvidia device plugin" sidebar_position: 6 date: 2021-12-13 lastmod: 2021-12-13 contributors: ["Jaeyeon Kim"] --- 쿠버네티스 및 Kubeflow 등에서 GPU 를 사용하기 위해서는 다음 작업이 필요합니다. ## 1. 
Install NVIDIA Driver `nvidia-smi` 수행 시 다음과 같은 화면이 출력된다면 이 단계는 생략해 주시기 바랍니다. ```bash mlops@ubuntu:~$ nvidia-smi +-----------------------------------------------------------------------------+ | NVIDIA-SMI 470.86 Driver Version: 470.86 CUDA Version: 11.4 | |-------------------------------+----------------------+----------------------+ | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |===============================+======================+======================| | 0 NVIDIA GeForce ... Off | 00000000:01:00.0 Off | N/A | | 25% 32C P8 4W / 120W | 211MiB / 6078MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ | 1 NVIDIA GeForce ... Off | 00000000:02:00.0 Off | N/A | | 0% 34C P8 7W / 175W | 5MiB / 7982MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ +-----------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=============================================================================| | 0 N/A N/A 1644 G /usr/lib/xorg/Xorg 198MiB | | 0 N/A N/A 1893 G /usr/bin/gnome-shell 10MiB | | 1 N/A N/A 1644 G /usr/lib/xorg/Xorg 4MiB | +-----------------------------------------------------------------------------+ ``` `nvidia-smi`의 출력 결과가 위와 같지 않다면 장착된 GPU에 맞는 nvidia driver를 설치해 주시기 바랍니다. 만약 nvidia driver의 설치에 익숙하지 않다면 아래 명령어를 통해 설치하시기 바랍니다. ```bash sudo add-apt-repository ppa:graphics-drivers/ppa sudo apt update && sudo apt install -y ubuntu-drivers-common sudo ubuntu-drivers autoinstall sudo reboot ``` ## 2. NVIDIA-Docker 설치 NVIDIA-Docker를 설치합니다. ```bash curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | \ sudo apt-key add - distribution=$(. 
/etc/os-release;echo $ID$VERSION_ID) curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list sudo apt-get update sudo apt-get install -y nvidia-docker2 && sudo systemctl restart docker ``` 정상적으로 설치되었는지 확인하기 위해, GPU를 사용하는 도커 컨테이너를 실행해봅니다. ```bash sudo docker run --rm --gpus all nvidia/cuda:11.0-base nvidia-smi ``` 다음과 같은 메시지가 보이면 정상적으로 설치된 것을 의미합니다. ```bash mlops@ubuntu:~$ sudo docker run --rm --gpus all nvidia/cuda:11.0-base nvidia-smi +-----------------------------------------------------------------------------+ | NVIDIA-SMI 470.86 Driver Version: 470.86 CUDA Version: 11.4 | |-------------------------------+----------------------+----------------------+ | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |===============================+======================+======================| | 0 NVIDIA GeForce ... Off | 00000000:01:00.0 Off | N/A | | 25% 32C P8 4W / 120W | 211MiB / 6078MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ | 1 NVIDIA GeForce ... Off | 00000000:02:00.0 Off | N/A | | 0% 34C P8 6W / 175W | 5MiB / 7982MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ +-----------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=============================================================================| +-----------------------------------------------------------------------------+ ``` ## 3. NVIDIA-Docker를 Default Container Runtime으로 설정 쿠버네티스는 기본적으로 Docker-CE를 Default Container Runtime으로 사용합니다. 따라서, Docker Container 내에서 NVIDIA GPU를 사용하기 위해서는 NVIDIA-Docker 를 Container Runtime 으로 사용하여 pod를 생성할 수 있도록 Default Runtime을 수정해 주어야 합니다. 1. 
`/etc/docker/daemon.json` 파일을 열어 다음과 같이 수정합니다. ```bash sudo vi /etc/docker/daemon.json { "default-runtime": "nvidia", "runtimes": { "nvidia": { "path": "nvidia-container-runtime", "runtimeArgs": [] } } } ``` 2. 파일이 변경된 것을 확인한 후, Docker를 재시작합니다. ```bash sudo systemctl daemon-reload sudo service docker restart ``` 3. 변경 사항이 반영되었는지 확인합니다. ```bash sudo docker info | grep nvidia ``` 다음과 같은 메시지가 보이면 정상적으로 설치된 것을 의미합니다. ```bash mlops@ubuntu:~$ docker info | grep nvidia Runtimes: io.containerd.runc.v2 io.containerd.runtime.v1.linux nvidia runc Default Runtime: nvidia ``` ## 4. Nvidia-Device-Plugin 1. nvidia-device-plugin daemonset을 생성합니다. ```bash kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.10.0/nvidia-device-plugin.yml ``` 2. nvidia-device-plugin pod이 RUNNING 상태로 생성되었는지 확인합니다. ```bash kubectl get pod -n kube-system | grep nvidia ``` 다음과 같은 결과가 출력되어야 합니다. ```bash kube-system nvidia-device-plugin-daemonset-nlqh2 1/1 Running 0 1h ``` 3. node 정보에 gpu가 사용가능하도록 설정되었는지 확인합니다. ```bash kubectl get nodes "-o=custom-columns=NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu" ``` 다음과 같은 메시지가 보이면 정상적으로 설정된 것을 의미합니다. (*모두의 MLOps* 에서 실습을 진행한 클러스터는 2개의 GPU가 있어서 2가 출력됩니다. 본인의 클러스터의 GPU 개수와 맞는 숫자가 출력된다면 됩니다.) ```bash NAME GPU ubuntu 2 ``` 설정되지 않은 경우, GPU의 value가 `` 으로 표시됩니다. 
================================================ FILE: docusaurus.config.js ================================================ // @ts-check // Note: type annotations allow type checking and IDEs autocompletion const lightCodeTheme = require("prism-react-renderer/themes/github"); const darkCodeTheme = require("prism-react-renderer/themes/dracula"); /** @type {import('@docusaurus/types').Config} */ const config = { title: "MLOps for ALL", tagline: "모두를 위한 MLOps", favicon: "img/favicon.ico", // Set the production url of your site here url: "https://mlops-for-all.github.io", // Set the // pathname under which your site is served // For GitHub pages deployment, it is often '//' baseUrl: "/", // GitHub pages deployment config. // If you aren't using GitHub pages, you don't need these. organizationName: "mlops-for-all", // Usually your GitHub org/user name. projectName: "mlops-for-all.github.io", // Usually your repo name. onBrokenLinks: "throw", onBrokenMarkdownLinks: "warn", // Even if you don't use internalization, you can use this field to set useful // metadata like html lang. For example, if your site is Chinese, you may want // to replace "en" with "zh-Hans". i18n: { defaultLocale: "ko", locales: ["en", "ko"], path: "i18n", }, plugins: [ [ "content-docs", /** @type {import('@docusaurus/plugin-content-docs').Options} */ ({ id: "community", path: "community", routeBasePath: "community", editUrl: "https://github.com/mlops-for-all/mlops-for-all.github.io/tree/main/", editCurrentVersion: true, sidebarPath: require.resolve("./sidebarsCommunity.js"), showLastUpdateAuthor: true, showLastUpdateTime: true, }), ], ], presets: [ [ "classic", /** @type {import('@docusaurus/preset-classic').Options} */ ({ docs: { sidebarPath: require.resolve("./sidebars.js"), // Please change this to your repo. // Remove this to remove the "edit this page" links. 
editUrl: "https://github.com/mlops-for-all/mlops-for-all.github.io/tree/main/", showLastUpdateAuthor: true, showLastUpdateTime: true, lastVersion: "current", versions: { current: { label: "1.0", }, }, }, // blog: { // showReadingTime: true, // // Please change this to your repo. // // Remove this to remove the "edit this page" links. // editUrl: // 'https://github.com/facebook/docusaurus/tree/main/packages/create-docusaurus/templates/shared/', // }, theme: { customCss: require.resolve("./src/css/custom.css"), }, gtag: { trackingID: "G-097K82469K", anonymizeIP: true, }, }), ], ], themeConfig: /** @type {import('@docusaurus/preset-classic').ThemeConfig} */ ({ // Replace with your project's social card image: "img/logo-mlops-for-all.png", navbar: { title: "MLOps for ALL", logo: { alt: "My Site Logo", src: "img/logo-mlops-for-all.png", }, items: [ { type: "docSidebar", sidebarId: "tutorialSidebar", position: "left", label: "Tutorial", }, { type: "docSidebar", sidebarId: "preSidebar", position: "left", label: "Prerequisites", }, { to: "/community/contributors", position: "left", label: "Community", }, // {to: '/blog', label: 'Blog', position: 'left'}, { type: "docsVersionDropdown", position: "right", }, { type: "localeDropdown", position: "right", }, { href: "https://github.com/mlops-for-all/mlops-for-all.github.io/tree/main/", label: "GitHub", position: "right", }, ], }, footer: { style: "dark", logo: { alt: "MakinaRocks", src: "/img/makinarocks.png", href: "https://makinarocks.ai", }, copyright: `Copyright © 2021-${new Date().getFullYear()} MakinaRocks. Built with Docusaurus.`, }, prism: { theme: lightCodeTheme, darkTheme: darkCodeTheme, }, }), }; module.exports = config; ================================================ FILE: i18n/en/code.json ================================================ { "team.profile.Jongseob Jeon.body": { "message": "마키나락스에서 머신러닝 엔지니어로 일하고 있습니다. 모두의 딥러닝을 통해 많은 사람들이 딥러닝을 쉽게 접했듯이 MLOps for ALL를 통해 많은 사람들이 MLOps에 쉽게 접할수 있길 바랍니다." 
}, "team.profile.Jaeyeon Kim.body": { "message": "비효율적인 작업을 자동화하는 것에 관심이 많습니다." }, "team.profile.Youngchel Jang.body": { "message": "마키나락스에서 MLOps Engineer로 일하고 있습니다. 단순하게 생각하는 노력을 하고 있습니다." }, "team.profile.Jongsun Shinn.body": { "message": "마키나락스에서 ML Engineer로 일하고 있습니다." }, "team.profile.Sangwoo Shim.body": { "message": "마키나락스에서 CTO로 일하고 있습니다. 마키나락스는 머신러닝 기반의 산업용 AI 솔루션을 개발하는 스타트업입니다. 산업 현장의 문제 해결을 통해 사람이 본연의 일에 집중할 수 있게 만드는 것, 그것이 우리가 하는 일입니다." }, "team.profile.Seunghyun Ko.body": { "message": "3i에서 MLOps Engineer로 일하고 있습니다. kubeflow에 관심이 많습니다." }, "team.profile.SeungTae Kim.body": { "message": "Genesis Lab이라는 스타트업에서 Applied AI Engineer 인턴 업무를 수행하고 있습니다. 머신러닝 생태계가 우리 산업 전반에 큰 변화을 가져올 것이라 믿으며, 한 걸음씩 나아가고 있습니다." }, "team.profile.Youngdon Tae.body": { "message": "백패커에서 ML 엔지니어로 일하고 있습니다. 자연어처리, 추천시스템, MLOps에 관심이 많습니다." }, "theme.ErrorPageContent.title": { "message": "This page crashed.", "description": "The title of the fallback page when the page crashed" }, "theme.NotFound.title": { "message": "Page Not Found", "description": "The title of the 404 page" }, "theme.NotFound.p1": { "message": "We could not find what you were looking for.", "description": "The first paragraph of the 404 page" }, "theme.NotFound.p2": { "message": "Please contact the owner of the site that linked you to the original URL and let them know their link is broken.", "description": "The 2nd paragraph of the 404 page" }, "theme.admonition.note": { "message": "note", "description": "The default label used for the Note admonition (:::note)" }, "theme.admonition.tip": { "message": "tip", "description": "The default label used for the Tip admonition (:::tip)" }, "theme.admonition.danger": { "message": "danger", "description": "The default label used for the Danger admonition (:::danger)" }, "theme.admonition.info": { "message": "info", "description": "The default label used for the Info admonition (:::info)" }, "theme.admonition.caution": { "message": "caution", "description": "The default label 
used for the Caution admonition (:::caution)" }, "theme.BackToTopButton.buttonAriaLabel": { "message": "Scroll back to top", "description": "The ARIA label for the back to top button" }, "theme.blog.archive.title": { "message": "Archive", "description": "The page & hero title of the blog archive page" }, "theme.blog.archive.description": { "message": "Archive", "description": "The page & hero description of the blog archive page" }, "theme.blog.paginator.navAriaLabel": { "message": "Blog list page navigation", "description": "The ARIA label for the blog pagination" }, "theme.blog.paginator.newerEntries": { "message": "Newer Entries", "description": "The label used to navigate to the newer blog posts page (previous page)" }, "theme.blog.paginator.olderEntries": { "message": "Older Entries", "description": "The label used to navigate to the older blog posts page (next page)" }, "theme.blog.post.paginator.navAriaLabel": { "message": "Blog post page navigation", "description": "The ARIA label for the blog posts pagination" }, "theme.blog.post.paginator.newerPost": { "message": "Newer Post", "description": "The blog post button label to navigate to the newer/previous post" }, "theme.blog.post.paginator.olderPost": { "message": "Older Post", "description": "The blog post button label to navigate to the older/next post" }, "theme.blog.post.plurals": { "message": "One post|{count} posts", "description": "Pluralized label for \"{count} posts\". 
Use as much plural forms (separated by \"|\") as your language support (see https://www.unicode.org/cldr/cldr-aux/charts/34/supplemental/language_plural_rules.html)" }, "theme.blog.tagTitle": { "message": "{nPosts} tagged with \"{tagName}\"", "description": "The title of the page for a blog tag" }, "theme.tags.tagsPageLink": { "message": "View All Tags", "description": "The label of the link targeting the tag list page" }, "theme.colorToggle.ariaLabel": { "message": "Switch between dark and light mode (currently {mode})", "description": "The ARIA label for the navbar color mode toggle" }, "theme.colorToggle.ariaLabel.mode.dark": { "message": "dark mode", "description": "The name for the dark color mode" }, "theme.colorToggle.ariaLabel.mode.light": { "message": "light mode", "description": "The name for the light color mode" }, "theme.docs.breadcrumbs.navAriaLabel": { "message": "Breadcrumbs", "description": "The ARIA label for the breadcrumbs" }, "theme.docs.DocCard.categoryDescription": { "message": "{count} items", "description": "The default description for a category card in the generated index about how many items this category includes" }, "theme.docs.paginator.navAriaLabel": { "message": "Docs pages", "description": "The ARIA label for the docs pagination" }, "theme.docs.paginator.previous": { "message": "Previous", "description": "The label used to navigate to the previous doc" }, "theme.docs.paginator.next": { "message": "Next", "description": "The label used to navigate to the next doc" }, "theme.docs.tagDocListPageTitle.nDocsTagged": { "message": "One doc tagged|{count} docs tagged", "description": "Pluralized label for \"{count} docs tagged\". 
Use as much plural forms (separated by \"|\") as your language support (see https://www.unicode.org/cldr/cldr-aux/charts/34/supplemental/language_plural_rules.html)" }, "theme.docs.tagDocListPageTitle": { "message": "{nDocsTagged} with \"{tagName}\"", "description": "The title of the page for a docs tag" }, "theme.docs.versionBadge.label": { "message": "Version: {versionLabel}" }, "theme.docs.versions.unreleasedVersionLabel": { "message": "This is unreleased documentation for {siteTitle} {versionLabel} version.", "description": "The label used to tell the user that he's browsing an unreleased doc version" }, "theme.docs.versions.unmaintainedVersionLabel": { "message": "This is documentation for {siteTitle} {versionLabel}, which is no longer actively maintained.", "description": "The label used to tell the user that he's browsing an unmaintained doc version" }, "theme.docs.versions.latestVersionSuggestionLabel": { "message": "For up-to-date documentation, see the {latestVersionLink} ({versionLabel}).", "description": "The label used to tell the user to check the latest version" }, "theme.docs.versions.latestVersionLinkLabel": { "message": "latest version", "description": "The label used for the latest version suggestion link label" }, "theme.common.editThisPage": { "message": "Edit this page", "description": "The link label to edit the current page" }, "theme.common.headingLinkTitle": { "message": "Direct link to {heading}", "description": "Title for link to heading" }, "theme.lastUpdated.atDate": { "message": " on {date}", "description": "The words used to describe on which date a page has been last updated" }, "theme.lastUpdated.byUser": { "message": " by {user}", "description": "The words used to describe by who the page has been last updated" }, "theme.lastUpdated.lastUpdatedAtBy": { "message": "Last updated{atDate}{byUser}", "description": "The sentence used to display when a page has been last updated, and by who" }, 
"theme.navbar.mobileVersionsDropdown.label": { "message": "Versions", "description": "The label for the navbar versions dropdown on mobile view" }, "theme.tags.tagsListLabel": { "message": "Tags:", "description": "The label alongside a tag list" }, "theme.AnnouncementBar.closeButtonAriaLabel": { "message": "Close", "description": "The ARIA label for close button of announcement bar" }, "theme.blog.sidebar.navAriaLabel": { "message": "Blog recent posts navigation", "description": "The ARIA label for recent posts in the blog sidebar" }, "theme.CodeBlock.copied": { "message": "Copied", "description": "The copied button label on code blocks" }, "theme.CodeBlock.copyButtonAriaLabel": { "message": "Copy code to clipboard", "description": "The ARIA label for copy code blocks button" }, "theme.CodeBlock.copy": { "message": "Copy", "description": "The copy button label on code blocks" }, "theme.CodeBlock.wordWrapToggle": { "message": "Toggle word wrap", "description": "The title attribute for toggle word wrapping button of code block lines" }, "theme.DocSidebarItem.toggleCollapsedCategoryAriaLabel": { "message": "Toggle the collapsible sidebar category '{label}'", "description": "The ARIA label to toggle the collapsible sidebar category" }, "theme.NavBar.navAriaLabel": { "message": "Main", "description": "The ARIA label for the main navigation" }, "theme.navbar.mobileLanguageDropdown.label": { "message": "Languages", "description": "The label for the mobile language switcher dropdown" }, "theme.TOCCollapsible.toggleButtonLabel": { "message": "On this page", "description": "The label used by the button on the collapsible TOC component" }, "theme.blog.post.readingTime.plurals": { "message": "One min read|{readingTime} min read", "description": "Pluralized label for \"{readingTime} min read\". 
Use as much plural forms (separated by \"|\") as your language support (see https://www.unicode.org/cldr/cldr-aux/charts/34/supplemental/language_plural_rules.html)" }, "theme.blog.post.readMore": { "message": "Read More", "description": "The label used in blog post item excerpts to link to full blog posts" }, "theme.blog.post.readMoreLabel": { "message": "Read more about {title}", "description": "The ARIA label for the link to full blog posts from excerpts" }, "theme.docs.breadcrumbs.home": { "message": "Home page", "description": "The ARIA label for the home page in the breadcrumbs" }, "theme.docs.sidebar.collapseButtonTitle": { "message": "Collapse sidebar", "description": "The title attribute for collapse button of doc sidebar" }, "theme.docs.sidebar.collapseButtonAriaLabel": { "message": "Collapse sidebar", "description": "The title attribute for collapse button of doc sidebar" }, "theme.docs.sidebar.navAriaLabel": { "message": "Docs sidebar", "description": "The ARIA label for the sidebar navigation" }, "theme.docs.sidebar.closeSidebarButtonAriaLabel": { "message": "Close navigation bar", "description": "The ARIA label for close button of mobile sidebar" }, "theme.navbar.mobileSidebarSecondaryMenu.backButtonLabel": { "message": "← Back to main menu", "description": "The label of the back button to return to main menu, inside the mobile navbar sidebar secondary menu (notably used to display the docs sidebar)" }, "theme.docs.sidebar.toggleSidebarButtonAriaLabel": { "message": "Toggle navigation bar", "description": "The ARIA label for hamburger menu button of mobile navigation" }, "theme.docs.sidebar.expandButtonTitle": { "message": "Expand sidebar", "description": "The ARIA label and title attribute for expand button of doc sidebar" }, "theme.docs.sidebar.expandButtonAriaLabel": { "message": "Expand sidebar", "description": "The ARIA label and title attribute for expand button of doc sidebar" }, "theme.ErrorPageContent.tryAgain": { "message": "Try again", 
"description": "The label of the button to try again rendering when the React error boundary captures an error" }, "theme.common.skipToMainContent": { "message": "Skip to main content", "description": "The skip to content label used for accessibility, allowing to rapidly navigate to main content with keyboard tab/enter navigation" }, "theme.tags.tagsPageTitle": { "message": "Tags", "description": "The title of the tag list page" } } ================================================ FILE: i18n/en/docusaurus-plugin-content-blog/options.json ================================================ { "title": { "message": "Blog", "description": "The title for the blog used in SEO" }, "description": { "message": "Blog", "description": "The description for the blog used in SEO" }, "sidebar.title": { "message": "Recent posts", "description": "The label for the left sidebar" } } ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/api-deployment/_category_.json ================================================ { "label": "API Deployment", "position": 7, "link": { "type": "generated-index" } } ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/api-deployment/seldon-children.md ================================================ --- title : "6. Multi Models" description: "" sidebar_position: 6 contributors: ["Jongseob Jeon"] --- Previously, the methods explained were all targeted at a single model. On this page, we will look at how to connect multiple models. First, we will create a pipeline that creates two models. We will add a StandardScaler to the SVC model we used before and store it. 
```python from functools import partial import kfp from kfp.components import InputPath, OutputPath, create_component_from_func @partial( create_component_from_func, packages_to_install=["pandas", "scikit-learn"], ) def load_iris_data( data_path: OutputPath("csv"), target_path: OutputPath("csv"), ): import pandas as pd from sklearn.datasets import load_iris iris = load_iris() data = pd.DataFrame(iris["data"], columns=iris["feature_names"]) target = pd.DataFrame(iris["target"], columns=["target"]) data.to_csv(data_path, index=False) target.to_csv(target_path, index=False) @partial( create_component_from_func, packages_to_install=["dill", "pandas", "scikit-learn", "mlflow"], ) def train_scaler_from_csv( data_path: InputPath("csv"), scaled_data_path: OutputPath("csv"), model_path: OutputPath("dill"), input_example_path: OutputPath("dill"), signature_path: OutputPath("dill"), conda_env_path: OutputPath("dill"), ): import dill import pandas as pd from sklearn.preprocessing import StandardScaler from mlflow.models.signature import infer_signature from mlflow.utils.environment import _mlflow_conda_env data = pd.read_csv(data_path) scaler = StandardScaler() scaled_data = scaler.fit_transform(data) scaled_data = pd.DataFrame(scaled_data, columns=data.columns, index=data.index) scaled_data.to_csv(scaled_data_path, index=False) with open(model_path, mode="wb") as file_writer: dill.dump(scaler, file_writer) input_example = data.sample(1) with open(input_example_path, "wb") as file_writer: dill.dump(input_example, file_writer) signature = infer_signature(data, scaler.transform(data)) with open(signature_path, "wb") as file_writer: dill.dump(signature, file_writer) conda_env = _mlflow_conda_env( additional_pip_deps=["scikit-learn"], install_mlflow=False ) with open(conda_env_path, "wb") as file_writer: dill.dump(conda_env, file_writer) @partial( create_component_from_func, packages_to_install=["dill", "pandas", "scikit-learn", "mlflow"], ) def train_svc_from_csv( 
train_data_path: InputPath("csv"), train_target_path: InputPath("csv"), model_path: OutputPath("dill"), input_example_path: OutputPath("dill"), signature_path: OutputPath("dill"), conda_env_path: OutputPath("dill"), kernel: str, ): import dill import pandas as pd from sklearn.svm import SVC from mlflow.models.signature import infer_signature from mlflow.utils.environment import _mlflow_conda_env train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) input_example = train_data.sample(1) with open(input_example_path, "wb") as file_writer: dill.dump(input_example, file_writer) signature = infer_signature(train_data, clf.predict(train_data)) with open(signature_path, "wb") as file_writer: dill.dump(signature, file_writer) conda_env = _mlflow_conda_env( additional_pip_deps=["scikit-learn"], install_mlflow=False ) with open(conda_env_path, "wb") as file_writer: dill.dump(conda_env, file_writer) @partial( create_component_from_func, packages_to_install=["dill", "pandas", "scikit-learn", "mlflow", "boto3"], ) def upload_sklearn_model_to_mlflow( model_name: str, model_path: InputPath("dill"), input_example_path: InputPath("dill"), signature_path: InputPath("dill"), conda_env_path: InputPath("dill"), ): import os import dill from mlflow.sklearn import save_model from mlflow.tracking.client import MlflowClient os.environ["MLFLOW_S3_ENDPOINT_URL"] = "http://minio-service.kubeflow.svc:9000" os.environ["AWS_ACCESS_KEY_ID"] = "minio" os.environ["AWS_SECRET_ACCESS_KEY"] = "minio123" client = MlflowClient("http://mlflow-server-service.mlflow-system.svc:5000") with open(model_path, mode="rb") as file_reader: clf = dill.load(file_reader) with open(input_example_path, "rb") as file_reader: input_example = dill.load(file_reader) with open(signature_path, "rb") as file_reader: signature = dill.load(file_reader) with 
open(conda_env_path, "rb") as file_reader: conda_env = dill.load(file_reader) save_model( sk_model=clf, path=model_name, serialization_format="cloudpickle", conda_env=conda_env, signature=signature, input_example=input_example, ) run = client.create_run(experiment_id="0") client.log_artifact(run.info.run_id, model_name) from kfp.dsl import pipeline @pipeline(name="multi_model_pipeline") def multi_model_pipeline(kernel: str = "rbf"): iris_data = load_iris_data() scaled_data = train_scaler_from_csv(data=iris_data.outputs["data"]) _ = upload_sklearn_model_to_mlflow( model_name="scaler", model=scaled_data.outputs["model"], input_example=scaled_data.outputs["input_example"], signature=scaled_data.outputs["signature"], conda_env=scaled_data.outputs["conda_env"], ) model = train_svc_from_csv( train_data=scaled_data.outputs["scaled_data"], train_target=iris_data.outputs["target"], kernel=kernel, ) _ = upload_sklearn_model_to_mlflow( model_name="svc", model=model.outputs["model"], input_example=model.outputs["input_example"], signature=model.outputs["signature"], conda_env=model.outputs["conda_env"], ) if __name__ == "__main__": kfp.compiler.Compiler().compile(multi_model_pipeline, "multi_model_pipeline.yaml") ``` If you upload the pipeline, it will look like this. ![children-kubeflow.png](./img/children-kubeflow.png) When you check the MLflow dashboard, two models will be generated, as shown below. ![children-mlflow.png](./img/children-mlflow.png) After checking the run_id of each one, define the SeldonDeployment spec as follows. 
```bash apiVersion: machinelearning.seldon.io/v1 kind: SeldonDeployment metadata: name: multi-model-example namespace: kubeflow-user-example-com spec: name: model predictors: - name: model componentSpecs: - spec: volumes: - name: model-provision-location emptyDir: {} initContainers: - name: scaler-initializer image: gcr.io/kfserving/storage-initializer:v0.4.0 args: - "s3://mlflow/mlflow/artifacts/0/7f445015a0e94519b003d316478766ef/artifacts/scaler" - "/mnt/models" volumeMounts: - mountPath: /mnt/models name: model-provision-location envFrom: - secretRef: name: seldon-init-container-secret - name: svc-initializer image: gcr.io/kfserving/storage-initializer:v0.4.0 args: - "s3://mlflow/mlflow/artifacts/0/87eb168e76264b39a24b0e5ca0fe922b/artifacts/svc" - "/mnt/models" volumeMounts: - mountPath: /mnt/models name: model-provision-location envFrom: - secretRef: name: seldon-init-container-secret containers: - name: scaler image: seldonio/mlflowserver:1.8.0-dev volumeMounts: - mountPath: /mnt/models name: model-provision-location readOnly: true securityContext: privileged: true runAsUser: 0 runAsGroup: 0 - name: svc image: seldonio/mlflowserver:1.8.0-dev volumeMounts: - mountPath: /mnt/models name: model-provision-location readOnly: true securityContext: privileged: true runAsUser: 0 runAsGroup: 0 graph: name: scaler type: MODEL parameters: - name: model_uri type: STRING value: "/mnt/models" - name: predict_method type: STRING value: "transform" children: - name: svc type: MODEL parameters: - name: model_uri type: STRING value: "/mnt/models" ``` Two models have been created so each model's initContainer and container must be defined. This field takes input as an array and the order does not matter. The order in which the models are executed is defined in the graph. 
```bash graph: name: scaler type: MODEL parameters: - name: model_uri type: STRING value: "/mnt/models" - name: predict_method type: STRING value: "transform" children: - name: svc type: MODEL parameters: - name: model_uri type: STRING value: "/mnt/models" ``` The operation of the graph is to convert the initial value received into a predefined predict_method and then pass it to the model defined as children. In this case, the data is passed from scaler -> svc. Now let's create the above specifications in a yaml file. ```bash cat <<EOF > multi-model.yaml apiVersion: machinelearning.seldon.io/v1 kind: SeldonDeployment metadata: name: multi-model-example namespace: kubeflow-user-example-com spec: name: model predictors: - name: model componentSpecs: - spec: volumes: - name: model-provision-location emptyDir: {} initContainers: - name: scaler-initializer image: gcr.io/kfserving/storage-initializer:v0.4.0 args: - "s3://mlflow/mlflow/artifacts/0/7f445015a0e94519b003d316478766ef/artifacts/scaler" - "/mnt/models" volumeMounts: - mountPath: /mnt/models name: model-provision-location envFrom: - secretRef: name: seldon-init-container-secret - name: svc-initializer image: gcr.io/kfserving/storage-initializer:v0.4.0 args: - "s3://mlflow/mlflow/artifacts/0/87eb168e76264b39a24b0e5ca0fe922b/artifacts/svc" - "/mnt/models" volumeMounts: - mountPath: /mnt/models name: model-provision-location envFrom: - secretRef: name: seldon-init-container-secret containers: - name: scaler image: ghcr.io/mlops-for-all/mlflowserver volumeMounts: - mountPath: /mnt/models name: model-provision-location readOnly: true securityContext: privileged: true runAsUser: 0 runAsGroup: 0 - name: svc image: ghcr.io/mlops-for-all/mlflowserver volumeMounts: - mountPath: /mnt/models name: model-provision-location readOnly: true securityContext: privileged: true runAsUser: 0 runAsGroup: 0 graph: name: scaler type: MODEL parameters: - name: model_uri type: STRING value: "/mnt/models" - name: predict_method type: STRING 
value: "transform" children: - name: svc type: MODEL parameters: - name: model_uri type: STRING value: "/mnt/models" EOF ``` Create an API through the following command. ```bash kubectl apply -f multi-model.yaml ``` If properly performed, it will be outputted as follows. ```bash seldondeployment.machinelearning.seldon.io/multi-model-example created ``` Check to see if it has been generated normally. ```bash kubectl get po -n kubeflow-user-example-com | grep multi-model-example ``` If it is created normally, a similar pod will be created. ```bash multi-model-example-model-0-scaler-svc-9955fb795-n9ffw 4/4 Running 0 2m30s ``` ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/api-deployment/seldon-fields.md ================================================ --- title : "4. Seldon Fields" description: "" sidebar_position: 4 contributors: ["Jongseob Jeon"] --- Summary of how Seldon Core creates an API server: 1. initContainer downloads the required model from the model repository. 2. The downloaded model is passed to the container. 3. The container runs an API server enclosing the model. 4. The API can be requested at the generated API server address to receive the inference values from the model. 
The yaml file defining the custom resource, SeldonDeployment, which is most commonly used when using Seldon Core is as follows: ```bash apiVersion: machinelearning.seldon.io/v1 kind: SeldonDeployment metadata: name: seldon-example namespace: kubeflow-user-example-com spec: name: model predictors: - name: model componentSpecs: - spec: volumes: - name: model-provision-location emptyDir: {} initContainers: - name: model-initializer image: gcr.io/kfserving/storage-initializer:v0.4.0 args: - "gs://seldon-models/v1.12.0-dev/sklearn/iris" - "/mnt/models" volumeMounts: - mountPath: /mnt/models name: model-provision-location containers: - name: model image: seldonio/sklearnserver:1.8.0-dev volumeMounts: - mountPath: /mnt/models name: model-provision-location readOnly: true securityContext: privileged: true runAsUser: 0 runAsGroup: 0 graph: name: model type: MODEL parameters: - name: model_uri type: STRING value: "/mnt/models" children: [] ``` The `name` and `predictors` fields of SeldonDeployment are required fields. `name` is mainly used as a name to differentiate pods in Kubernetes and does not have a major effect. `predictors` must be a single array consisting of `name`, `componentSpecs` and `graph` defined. Here also, `name` is mainly used as a name to differentiate pods in Kubernetes and does not have a major effect. Now let's take a look at the fields that need to be defined in `componentSpecs` and `graph`. ## componentSpecs `componentSpecs` must be a single array consisting of the `spec` key. The `spec` must have the fields `volumes`, `initContainers` and `containers` defined. ### volumes ```bash volumes: - name: model-provision-location emptyDir: {} ``` `Volumes` refer to the space used to store the models downloaded from the initContainer, which is received as an array with the components `name` and `emptyDir`. These values are used only once when downloading and moving the models, so they do not need to be modified significantly. 
### initContainers ```bash - name: model-initializer image: gcr.io/kfserving/storage-initializer:v0.4.0 args: - "gs://seldon-models/v1.12.0-dev/sklearn/iris" - "/mnt/models" volumeMounts: - mountPath: /mnt/models name: model-provision-location ``` The `args` field contains the system arguments necessary to download the model from the model repository and move it to the specified model path. It provides the required parameters for the initContainer to perform the downloading and storage operations. initContainer is responsible for downloading the model to be used from the API, so the fields used determine the information needed to download data from the model registry. The value of initContainer consists of n arrays, and each model needs to be specified separately. #### name `name` is the name of the pod in Kubernetes, and it is recommended to use `{model_name}-initializer` for debugging. #### image `image` is the name of the image used to download the model, and there are two images recommended by Seldon Core: - gcr.io/kfserving/storage-initializer:v0.4.0 - seldonio/rclone-storage-initializer:1.13.0-dev For more detailed information, please refer to the following resources: - [kfserving](https://docs.seldon.io/projects/seldon-core/en/latest/servers/kfserving-storage-initializer.html) - [rclone](https://github.com/SeldonIO/seldon-core/tree/master/components/rclone-storage-initializer) In MLOps for ALL, we use kfserving for downloading and storing models. #### args ```bash args: - "gs://seldon-models/v1.12.0-dev/sklearn/iris" - "/mnt/models" ``` When the gcr.io/kfserving/storage-initializer:v0.4.0 Docker image is run (`run`), it takes an argument in the form of an array. The first array value is the address of the model to be downloaded. The second array value is the address where the downloaded model will be stored (Seldon Core usually stores it in `/mnt/models`). 
### volumeMounts ```bash volumeMounts: - mountPath: /mnt/models name: model-provision-location ``` `volumeMounts` is a field that mounts the volume into the container to share `/mnt/models` as described in volumes. For more information, refer to the [Kubernetes Volume](https://kubernetes.io/docs/concepts/storage/volumes/) documentation. ### container ```bash containers: - name: model image: seldonio/sklearnserver:1.8.0-dev volumeMounts: - mountPath: /mnt/models name: model-provision-location readOnly: true securityContext: privileged: true runAsUser: 0 runAsGroup: 0 ``` Container defines the fields that determine the configuration when the model is run in an API form. #### name The `name` field refers to the name of the pod in Kubernetes. It should be the name of the model being used. #### image The `image` field represents the image used to convert the model into an API. The image should have all the necessary packages installed when the model is loaded. Seldon Core provides official images for different types of models, including: - seldonio/sklearnserver - seldonio/mlflowserver - seldonio/xgboostserver - seldonio/tfserving You can choose the appropriate image based on the type of model you are using. #### volumeMounts ```bash volumeMounts: - mountPath: /mnt/models name: model-provision-location readOnly: true ``` This is a field that tells the path where the data downloaded from initContainer is located. Here, to prevent the model from being modified, `readOnly: true` will also be given. #### securityContext ```bash securityContext: privileged: true runAsUser: 0 runAsGroup: 0 ``` When installing necessary packages, pod may not be able to perform the package installation due to lack of permission. To address this, root permission is granted (although this could cause security issues when in actual service). 
## graph ```bash graph: name: model type: MODEL parameters: - name: model_uri type: STRING value: "/mnt/models" children: [] ``` This is a field that defines the order in which the model operates. ### name The `name` field refers to the name of the model graph. It should match the name defined in the container. ### type The `type` field can have four different values: 1. TRANSFORMER 2. MODEL 3. OUTPUT_TRANSFORMER 4. ROUTER For detailed explanations of each type, you can refer to the [Seldon Core Complex Graphs Metadata Example](https://docs.seldon.io/projects/seldon-core/en/latest/examples/graph-metadata.html). ### parameters The `parameters` field contains values used in the class init. For the sklearnserver, you can find the required values in the [following file](https://github.com/SeldonIO/seldon-core/blob/master/servers/sklearnserver/sklearnserver/SKLearnServer.py). ```python class SKLearnServer(SeldonComponent): def __init__(self, model_uri: str = None, method: str = "predict_proba"): ``` If you look at the code, you can define `model_uri` and `method`. ### children The `children` field is used when creating the sequence diagram. More details about this field will be explained on the following page. ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/api-deployment/seldon-iris.md ================================================ --- title : "2. Deploy SeldonDeployment" description: "" sidebar_position: 2 date: 2021-12-22 lastmod: 2021-12-22 contributors: ["Youngcheol Jang", "SeungTae Kim"] --- ## Deploy with SeldonDeployment Let's deploy our trained model as an API using SeldonDeployment. SeldonDeployment is a custom resource definition (CRD) defined to deploy models as REST/gRPC servers on Kubernetes. #### 1. Prerequisites We will conduct the SeldonDeployment related practice in a new namespace called seldon-deploy. After creating the namespace, set seldon-deploy as the current namespace. 
```bash kubectl create namespace seldon-deploy kubectl config set-context --current --namespace=seldon-deploy ``` ### 2. Define Spec Generate a yaml file to deploy SeldonDeployment. In this page, we will use a publicly available iris model. Because this iris model is trained through the sklearn framework, we use SKLEARN_SERVER. ```bash cat <<EOF > iris-sdep.yaml apiVersion: machinelearning.seldon.io/v1alpha2 kind: SeldonDeployment metadata: name: sklearn namespace: seldon-deploy spec: name: iris predictors: - graph: children: [] implementation: SKLEARN_SERVER modelUri: gs://seldon-models/v1.12.0-dev/sklearn/iris name: classifier name: default replicas: 1 EOF ``` Deploy yaml file. ```bash kubectl apply -f iris-sdep.yaml ``` Check if the deployment was successful through the following command. ```bash kubectl get pods --selector seldon-app=sklearn-default -n seldon-deploy ``` If everything runs normally, similar results will be printed. ```bash NAME READY STATUS RESTARTS AGE sklearn-default-0-classifier-5fdfd7bb77-ls9tr 2/2 Running 0 5m ``` ## Ingress URL Now, send an inference request to the deployed model to get the inference result. The API created by the SeldonDeployment follows the following rule: `http://{NODE_IP}:{NODE_PORT}/seldon/{namespace}/{seldon-deployment-name}/api/v1.0/{method-name}/` ### NODE_IP / NODE_PORT [Since Seldon Core was installed with Ambassador as the Ingress Controller](../setup-components/install-components-seldon.md), all APIs created by SeldonDeployment can be requested through the Ambassador Ingress gateway. Therefore, first set the url of the Ambassador Ingress Gateway as an environment variable. ```bash export NODE_IP=$(kubectl get nodes -o jsonpath='{ $.items[*].status.addresses[?(@.type=="InternalIP")].address }') export NODE_PORT=$(kubectl get service ambassador -n seldon-system -o jsonpath="{.spec.ports[0].nodePort}") ``` Check the set url. 
```bash echo "NODE_IP"=$NODE_IP echo "NODE_PORT"=$NODE_PORT ``` It should be outputted similarly as follows, and if set through the cloud, you can check that internal IP address is set. ```bash NODE_IP=192.168.0.19 NODE_PORT=30486 ``` ### namespace / seldon-deployment-name This refers to the `namespace` and `seldon-deployment-name` where the SeldonDeployment is deployed and used to define the values defined in the metadata when defining the spec. ```bash metadata: name: sklearn namespace: seldon-deploy ``` In the example above, `namespace` is seldon-deploy, `seldon-deployment-name` is sklearn. ### method-name In SeldonDeployment, the commonly used `method-name` has two options: 1. doc 2. predictions The detailed usage of each method is explained below. ## Using Swagger First, let's explore how to use the doc method, which allows access to the Swagger generated by Seldon. ### 1. Accessing Swagger According to the provided ingress URL rules, you can access the Swagger documentation using the following URL: `http://192.168.0.19:30486/seldon/seldon-deploy/sklearn/api/v1.0/doc/` ![iris-swagger1.png](./img/iris-swagger1.png) ### 2. Selecting Swagger Predictions In the Swagger UI, select the `/seldon/seldon-deploy/sklearn/api/v1.0/predictions` endpoint. ![iris-swagger2.png](./img/iris-swagger2.png) ### 3. Choosing *Try it out* ![iris-swagger3.png](./img/iris-swagger3.png) ### 4. Inputting data in the Request body ![iris-swagger4.png](./img/iris-swagger4.png) Enter the following data into the Request body. ```bash { "data": { "ndarray":[[1.0, 2.0, 5.0, 6.0]] } } ``` ### 5. Check the inference results You can click the `Execute` button to obtain the inference result. ![iris-swagger5.png](./img/iris-swagger5.png) If everything is executed successfully, you will obtain the following inference result. 
```bash { "data": { "names": [ "t:0", "t:1", "t:2" ], "ndarray": [ [ 9.912315378486697e-7, 0.0007015931307746079, 0.9992974156376876 ] ] }, "meta": { "requestPath": { "classifier": "seldonio/sklearnserver:1.11.2" } } } ``` ## Using CLI Also, you can use http client CLI tools such as curl to make API requests. For example, requesting `/predictions` as follows ```bash curl -X POST http://$NODE_IP:$NODE_PORT/seldon/seldon-deploy/sklearn/api/v1.0/predictions \ -H 'Content-Type: application/json' \ -d '{ "data": { "ndarray": [[1,2,3,4]] } }' ``` You can confirm that the following response is outputted normally. ```bash {"data":{"names":["t:0","t:1","t:2"],"ndarray":[[0.0006985194531162835,0.00366803903943666,0.995633441507447]]},"meta":{"requestPath":{"classifier":"seldonio/sklearnserver:1.11.2"}}} ``` ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/api-deployment/seldon-mlflow.md ================================================ --- title : "5. Model from MLflow" description: "" sidebar_position: 5 contributors: ["Jongseob Jeon"] --- ## Model from MLflow On this page, we will learn how to create an API using a model saved in the [MLflow Component](../kubeflow/advanced-mlflow.md). ## Secret The initContainer needs credentials to access minio and download the model. The credentials for access to minio are as follows. ```bash apiVersion: v1 type: Opaque kind: Secret metadata: name: seldon-init-container-secret namespace: kubeflow-user-example-com data: AWS_ACCESS_KEY_ID: bWluaW8K= AWS_SECRET_ACCESS_KEY: bWluaW8xMjM= AWS_ENDPOINT_URL: aHR0cDovL21pbmlvLm1ha2luYXJvY2tzLmFp USE_SSL: ZmFsc2U= ``` The input value for `AWS_ACCESS_KEY_ID` is `minio`. However, since the input value for the secret must be an encoded value, the value that is actually entered must be the value that comes out after performing the following. The values that need to be entered in data are as follows. 
- AWS_ACCESS_KEY_ID: minio - AWS_SECRET_ACCESS_KEY: minio123 - AWS_ENDPOINT_URL: http://minio-service.kubeflow.svc:9000 - USE_SSL: false The encoding can be done using the following command. ```bash echo -n minio | base64 ``` Then the following values will be output. ```bash bWluaW8= ``` If you do the encoding for all of the values, they will look like this: - AWS_ACCESS_KEY_ID: bWluaW8= - AWS_SECRET_ACCESS_KEY: bWluaW8xMjM= - AWS_ENDPOINT_URL: aHR0cDovL21pbmlvLXNlcnZpY2Uua3ViZWZsb3cuc3ZjOjkwMDA= - USE_SSL: ZmFsc2U= You can generate a yaml file through the following command to create the secret. ```bash cat << EOF > seldon-init-container-secret.yaml apiVersion: v1 kind: Secret metadata: name: seldon-init-container-secret namespace: kubeflow-user-example-com type: Opaque data: AWS_ACCESS_KEY_ID: bWluaW8= AWS_SECRET_ACCESS_KEY: bWluaW8xMjM= AWS_ENDPOINT_URL: aHR0cDovL21pbmlvLXNlcnZpY2Uua3ViZWZsb3cuc3ZjOjkwMDA= USE_SSL: ZmFsc2U= EOF ``` Create the secret through the following command. ```bash kubectl apply -f seldon-init-container-secret.yaml ``` If performed normally, it will be output as follows. ```bash secret/seldon-init-container-secret created ``` ## Seldon Core yaml Now let's write the yaml file to create Seldon Core.
```bash apiVersion: machinelearning.seldon.io/v1 kind: SeldonDeployment metadata: name: seldon-example namespace: kubeflow-user-example-com spec: name: model predictors: - name: model componentSpecs: - spec: volumes: - name: model-provision-location emptyDir: {} initContainers: - name: model-initializer image: gcr.io/kfserving/storage-initializer:v0.4.0 args: - "s3://mlflow/mlflow/artifacts/0/74ba8e33994144f599e50b3be176cdb0/artifacts/svc" - "/mnt/models" volumeMounts: - mountPath: /mnt/models name: model-provision-location envFrom: - secretRef: name: seldon-init-container-secret containers: - name: model image: ghcr.io/mlops-for-all/mlflowserver volumeMounts: - mountPath: /mnt/models name: model-provision-location readOnly: true securityContext: privileged: true runAsUser: 0 runAsGroup: 0 graph: name: model type: MODEL parameters: - name: model_uri type: STRING value: "/mnt/models" children: [] ``` There are two major changes compared to the previously created [Seldon Fields](../api-deployment/seldon-fields.md): 1. The `envFrom` field is added to the initContainer. 2. The address in the args has been changed to `s3://mlflow/mlflow/artifacts/0/74ba8e33994144f599e50b3be176cdb0/artifacts/svc`. ### args Previously, we mentioned that the first element of the args array is the path to the model we want to download. So, how can we determine the path of the model stored in MLflow? To find the path, go back to MLflow and click on the run, then click on the model, as shown below: ![seldon-mlflow-0.png](./img/seldon-mlflow-0.png) You can use the path obtained from there. ### envFrom This process involves providing the environment variables required to access MinIO and download the model. We will use the `seldon-init-container-secret` created earlier. ## API Creation First, let's generate the YAML file based on the specification defined above. 
```bash cat << EOF > seldon-mlflow.yaml apiVersion: machinelearning.seldon.io/v1 kind: SeldonDeployment metadata: name: seldon-example namespace: kubeflow-user-example-com spec: name: model predictors: - name: model componentSpecs: - spec: volumes: - name: model-provision-location emptyDir: {} initContainers: - name: model-initializer image: gcr.io/kfserving/storage-initializer:v0.4.0 args: - "s3://mlflow/mlflow/artifacts/0/74ba8e33994144f599e50b3be176cdb0/artifacts/svc" - "/mnt/models" volumeMounts: - mountPath: /mnt/models name: model-provision-location envFrom: - secretRef: name: seldon-init-container-secret containers: - name: model image: ghcr.io/mlops-for-all/mlflowserver volumeMounts: - mountPath: /mnt/models name: model-provision-location readOnly: true securityContext: privileged: true runAsUser: 0 runAsGroup: 0 graph: name: model type: MODEL parameters: - name: model_uri type: STRING value: "/mnt/models" - name: xtype type: STRING value: "dataframe" children: [] EOF ``` Create a seldon pod. ```bash kubectl apply -f seldon-mlflow.yaml ``` If it is performed normally, it will be outputted as follows. ```bash seldondeployment.machinelearning.seldon.io/seldon-example created ``` Now we wait until the pod is up and running properly. ```bash kubectl get po -n kubeflow-user-example-com | grep seldon ``` If it is outputted similarly to the following, the API has been created normally. ```bash seldon-example-model-0-model-5c949bd894-c5f28 3/3 Running 0 69s ``` You can confirm the execution through the following request on the API created through the CLI. ```bash curl -X POST http://$NODE_IP:$NODE_PORT/seldon/kubeflow-user-example-com/seldon-example/api/v1.0/predictions \ -H 'Content-Type: application/json' \ -d '{ "data": { "ndarray": [ [ 143.0, 0.0, 30.0, 30.0 ] ], "names": [ "sepal length (cm)", "sepal width (cm)", "petal length (cm)", "petal width (cm)" ] } }' ``` If executed normally, you can get the following results.
```bash {"data":{"names":[],"ndarray":["Virginica"]},"meta":{"requestPath":{"model":"ghcr.io/mlops-for-all/mlflowserver:e141f57"}}} ``` ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/api-deployment/seldon-pg.md ================================================ --- title : "3. Seldon Monitoring" description: "Checking Prometheus & Grafana" sidebar_position: 3 date: 2021-12-24 lastmod: 2021-12-24 contributors: ["Jongseob Jeon"] --- ## Grafana & Prometheus Now, let's perform repeated API requests with the SeldonDeployment we created on the [previous page](../api-deployment/seldon-iris.md) and check if the dashboard changes. ### Dashboard [Forward the dashboard created earlier](../setup-components/install-components-pg.md). ```bash kubectl port-forward svc/seldon-core-analytics-grafana -n seldon-system 8090:80 ``` ### Request API Send **repeated** requests to the [previously created Seldon Deployment](../api-deployment/seldon-iris.md#using-cli). ```bash curl -X POST http://$NODE_IP:$NODE_PORT/seldon/seldon-deploy/sklearn/api/v1.0/predictions \ -H 'Content-Type: application/json' \ -d '{ "data": { "ndarray": [[1,2,3,4]] } }' ``` Furthermore, when checking the Grafana dashboard, you can observe that the Global Request Rate increases momentarily from `0 ops`. ![repeat-raise.png](./img/repeat-raise.png) This confirms that Prometheus and Grafana have been successfully installed and configured. ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/api-deployment/what-is-api-deployment.md ================================================ --- title : "1. What is API Deployment?" description: "" sidebar_position: 1 date: 2021-12-22 lastmod: 2021-12-22 contributors: ["Youngcheol Jang"] --- ## What is API Deployment? After training a machine learning model, how should it be used?
When training a machine learning model, you expect a model with higher performance to come out, but when you infer with the trained model, you want to get the inference results quickly and easily. When you want to check the inference results of the model, you can load the trained model and infer through a Jupyter notebook or a Python script. However, this method becomes inefficient as the model gets bigger, and you can only use the model in the environment where the trained model exists and cannot be used by many people. Therefore, when machine learning is used in actual services, it uses an API to use the trained model. The model is loaded only once in the environment where the API server is running, and you can easily get the inference results using DNS, and you can also link it with other services. However, there is a lot of ancillary work necessary to make the model into an API. In order to make it easier to make an API, machine learning frameworks such as Tensorflow have developed inference engines. Using inference engines, we can create APIs (REST or gRPC) that can load and infer from machine learning models developed and trained in the corresponding frameworks. When we send a request with the data we want to infer to an API server built using these inference engines, the engine performs the inference and sends back the results in the response. Some well-known open-source inference engines include: - [Tensorflow: Tensorflow Serving](https://github.com/tensorflow/serving) - [PyTorch: Torchserve](https://github.com/pytorch/serve) - [ONNX: ONNX Runtime](https://github.com/microsoft/onnxruntime) While not officially supported in open-source, there are also inference engines developed for popular frameworks like sklearn and XGBoost. Deploying and serving the model's inference results through an API is called **API deployment**. ## Serving Framework I introduced the fact that various inference engines have been developed. 
Now, if we want to deploy these inference engines in a Kubernetes environment for API deployment, what steps are involved? We need to deploy various Kubernetes resources such as Deployments for the inference engines, Services to create endpoints for sending inference requests, and Ingress to forward external inference requests to the inference engines. Additionally, we may need to handle requirements such as scaling out when there is a high volume of inference requests, monitoring the status of the inference engines, and updating the version when an improved model is available. There are many considerations when operating an inference engine, and it goes beyond just a few tasks. To address these requirements, serving frameworks have been developed to further abstract the deployment of inference engines in a Kubernetes environment. Some popular serving frameworks include: - [Seldon Core](https://github.com/SeldonIO/seldon-core) - [Kserve](https://github.com/kserve) - [BentoML](https://github.com/bentoml/BentoML) In *MLOps for ALL*, we use Seldon Core to demonstrate the process of API deployment. ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/appendix/_category_.json ================================================ { "label": "Appendix", "position": 9, "link": { "type": "generated-index" } } ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/appendix/metallb.md ================================================ --- title: "2. Install load balancer metallb for Bare Metal Cluster" sidebar_position: 2 --- ## What is MetalLB? ## Installing MetalLB When using Kubernetes on cloud platforms such as AWS, GCP, and Azure, they provide their own load balancers. However, for on-premises clusters, an additional module needs to be installed to enable load balancing. 
[MetalLB](https://metallb.universe.tf/) is an open-source project that provides a load balancer for bare metal environments. ## Requirements | Requirement | Version and Details | | ----------------------------------------------------------- | ------------------------------------------------------------ | | Kubernetes | Version >= v1.13.0 without built-in load balancing | | [Compatible Network CNI](https://metallb.universe.tf/installation/network-addons/) | Calico, Canal, Cilium, Flannel, Kube-ovn, Kube-router, Weave Net | | IPv4 addresses | Used for MetalLB deployment | | BGP mode | One or more routers that support BGP functionality | | TCP/UDP port 7946 open between nodes | Memberlist requirement | ### MetalLB Installation #### Preparation If you are using kube-proxy in IPVS mode, starting from Kubernetes v1.14.2, you need to enable strict ARP mode. By default, Kube-router enables strict ARP, so this feature is not required if you are using Kube-router as a service proxy. Before applying strict ARP mode, check the current mode. ```bash # see what changes would be made, returns nonzero returncode if different kubectl get configmap kube-proxy -n kube-system -o yaml | \ grep strictARP ``` ```bash strictARP: false ``` If strictARP: false is outputted, run the following to change it to strictARP: true. (If strictARP: true is already outputted, you do not need to execute the following command). ```bash # actually apply the changes, returns nonzero returncode on errors only kubectl get configmap kube-proxy -n kube-system -o yaml | \ sed -e "s/strictARP: false/strictARP: true/" | \ kubectl apply -f - -n kube-system ``` If performed normally, it will be output as follows. ```bash Warning: resource configmaps/kube-proxy is missing the kubectl.kubernetes.io/last-applied-configuration annotation which is required by kubectl apply. kubectl apply should only be used on resources created declaratively by either kubectl create --save-config or kubectl apply. 
The missing annotation will be patched automatically. configmap/kube-proxy configured ``` ### Installation - Manifest #### 1. Install MetalLB. ```bash kubectl apply -f https://raw.githubusercontent.com/metallb/metallb/v0.11.0/manifests/namespace.yaml kubectl apply -f https://raw.githubusercontent.com/metallb/metallb/v0.11.0/manifests/metallb.yaml ``` #### 2. Check installation. Wait until both pods in the metallb-system namespace are Running. ```bash kubectl get pod -n metallb-system ``` When everything is Running, similar results will be output. ```bash NAME READY STATUS RESTARTS AGE controller-7dcc8764f4-8n92q 1/1 Running 1 1m speaker-fnf8l 1/1 Running 1 1m ``` The components of the manifest are as follows: - metallb-system/controller - Deployed as a deployment, responsible for assigning external IP addresses for load balancing. - metallb-system/speaker - Deployed as a daemonset, responsible for configuring network communication to connect external traffic and services. The service includes RBAC permissions which are necessary for the controller and speaker components to operate. ## Configuration Setting up the load balancing policy of MetalLB can be done by deploying a configmap containing the related configuration information. There are two modes that can be configured in MetalLB: 1. [Layer 2 Mode](https://metallb.universe.tf/concepts/layer2/) 2. [BGP Mode](https://metallb.universe.tf/concepts/bgp/) Here we will proceed with Layer 2 mode. ### Layer 2 Configuration In Layer 2 mode, it is enough to simply set the range of IP addresses to be used. When using Layer 2 mode, it is not necessary to bind IP to the network interface of the worker node, because it operates in a way that it responds directly to the ARP request of the local network and provides the computer's MAC address to the client.
The following `metallb_config.yaml` file is the configuration for MetalLB to provide control over the IP range of 192.168.35.100 ~ 192.168.35.110, and to configure Layer 2 mode. In case the cluster node and the client node are separated, the range of 192.168.35.100 ~ 192.168.35.110 must be accessible by both the client node and the cluster node. #### metallb_config.yaml ```bash apiVersion: v1 kind: ConfigMap metadata: namespace: metallb-system name: config data: config: | address-pools: - name: default protocol: layer2 addresses: - 192.168.35.100-192.168.35.110 # IP address range ``` Apply the above settings. ```bash kubectl apply -f metallb_config.yaml ``` If deployed normally, it will output as follows. ```bash configmap/config created ``` ## Using MetalLB ### Kubeflow Dashboard First, before getting the load-balancing feature from MetalLB, check the current status by changing the type of the istio-ingressgateway service in the istio-system namespace to `LoadBalancer` to provide the Kubeflow Dashboard. ```bash kubectl get svc/istio-ingressgateway -n istio-system ``` The type of this service is ClusterIP and you can see that the External-IP value is `none`. ```bash NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE istio-ingressgateway ClusterIP 10.103.72.5 15021/TCP,80/TCP,443/TCP,31400/TCP,15443/TCP 4h21m ``` Change the type to LoadBalancer and if you want to input a desired IP address, add the loadBalancerIP item. If you do not add it, IP addresses will be assigned sequentially from the IP address pool set above.
```bash kubectl edit svc/istio-ingressgateway -n istio-system ``` ```bash spec: clusterIP: 10.103.72.5 clusterIPs: - 10.103.72.5 ipFamilies: - IPv4 ipFamilyPolicy: SingleStack ports: - name: status-port port: 15021 protocol: TCP targetPort: 15021 - name: http2 port: 80 protocol: TCP targetPort: 8080 - name: https port: 443 protocol: TCP targetPort: 8443 - name: tcp port: 31400 protocol: TCP targetPort: 31400 - name: tls port: 15443 protocol: TCP targetPort: 15443 selector: app: istio-ingressgateway istio: ingressgateway sessionAffinity: None type: LoadBalancer # Change ClusterIP to LoadBalancer loadBalancerIP: 192.168.35.100 # Add IP status: loadBalancer: {} ``` If you check again, you will see that the External-IP value is `192.168.35.100`. ```bash kubectl get svc/istio-ingressgateway -n istio-system ``` ```bash NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE istio-ingressgateway LoadBalancer 10.103.72.5 192.168.35.100 15021:31054/TCP,80:30853/TCP,443:30443/TCP,31400:30012/TCP,15443:31650/TCP 5h1m ``` Open a web browser and connect to [http://192.168.35.100](http://192.168.35.100) to verify the following screen is output. ![login-after-istio-ingressgateway-setting.png](./img/login-after-istio-ingressgateway-setting.png) ### minio Dashboard First, we check the current status before changing the type of minio-service, which provides the Dashboard of minio, in the kubeflow namespace to LoadBalancer to receive the load balancing function from MetalLB. ```bash kubectl get svc/minio-service -n kubeflow ``` The type of this service is ClusterIP and you can confirm that the External-IP value is `none`. ```bash NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE minio-service ClusterIP 10.109.209.87 9000/TCP 5h14m ``` Change the type to LoadBalancer and if you want to enter an IP address, add the loadBalancerIP item. If you do not add, the IP address will be assigned sequentially from the IP address pool set above. 
```bash kubectl edit svc/minio-service -n kubeflow ``` ```bash apiVersion: v1 kind: Service metadata: annotations: kubectl.kubernetes.io/last-applied-configuration: | {"apiVersion":"v1","kind":"Service","metadata":{"annotations":{},"labels":{"application-crd-id":"kubeflow-pipelines"},"name":"minio-ser> creationTimestamp: "2022-01-05T08:44:23Z" labels: application-crd-id: kubeflow-pipelines name: minio-service namespace: kubeflow resourceVersion: "21120" uid: 0053ee28-4f87-47bb-ad6b-7ad68aa29a48 spec: clusterIP: 10.109.209.87 clusterIPs: - 10.109.209.87 ipFamilies: - IPv4 ipFamilyPolicy: SingleStack ports: - name: http port: 9000 protocol: TCP targetPort: 9000 selector: app: minio application-crd-id: kubeflow-pipelines sessionAffinity: None type: LoadBalancer # Change ClusterIP to LoadBalancer loadBalancerIP: 192.168.35.101 # Add IP status: loadBalancer: {} ``` If we check again, we can see that the External-IP value is `192.168.35.101`. ```bash kubectl get svc/minio-service -n kubeflow ``` ```bash NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE minio-service LoadBalancer 10.109.209.87 192.168.35.101 9000:31371/TCP 5h21m ``` Open a web browser and connect to [http://192.168.35.101:9000](http://192.168.35.101:9000) to confirm the following screen is printed. ![login-after-minio-setting.png](./img/login-after-minio-setting.png) ### mlflow Dashboard First, we check the current status before changing the type of mlflow-server-service service in the mlflow-system namespace that provides the mlflow Dashboard to LoadBalancer to receive load balancing function from MetalLB. ```bash kubectl get svc/mlflow-server-service -n mlflow-system ``` The type of this service is ClusterIP and you can confirm that the External-IP value is `none`. ```bash NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE mlflow-server-service ClusterIP 10.111.173.209 5000/TCP 4m50s ``` Change the type to LoadBalancer and if you want to input the desired IP address, add the loadBalancerIP item. 
If you do not add it, the IP address will be assigned sequentially from the IP address pool set above. ```bash kubectl edit svc/mlflow-server-service -n mlflow-system ``` ```bash apiVersion: v1 kind: Service metadata: annotations: meta.helm.sh/release-name: mlflow-server meta.helm.sh/release-namespace: mlflow-system creationTimestamp: "2022-01-07T04:00:19Z" labels: app.kubernetes.io/managed-by: Helm name: mlflow-server-service namespace: mlflow-system resourceVersion: "276246" uid: e5d39fb7-ad98-47e7-b512-f9c673055356 spec: clusterIP: 10.111.173.209 clusterIPs: - 10.111.173.209 ipFamilies: - IPv4 ipFamilyPolicy: SingleStack ports: - port: 5000 protocol: TCP targetPort: 5000 selector: app.kubernetes.io/name: mlflow-server sessionAffinity: None type: LoadBalancer # Change ClusterIP to LoadBalancer loadBalancerIP: 192.168.35.102 # Add IP status: loadBalancer: {} ``` If we check again, we can see that the External-IP value is `192.168.35.102`. ```bash kubectl get svc/mlflow-server-service -n mlflow-system ``` ```bash NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE mlflow-server-service LoadBalancer 10.111.173.209 192.168.35.102 5000:32287/TCP 6m11s ``` Open the web browser and connect to [http://192.168.35.102:5000](http://192.168.35.102:5000) to confirm the following screen is displayed. ![login-after-mlflow-setting.png](./img/login-after-mlflow-setting.png) ### Grafana Dashboard First, check the current status before changing the type of seldon-core-analytics-grafana service in the seldon-system namespace which provides Grafana's Dashboard to receive Load Balancing function from MetalLB. ```bash kubectl get svc/seldon-core-analytics-grafana -n seldon-system ``` The type of the corresponding service is ClusterIP, and you can see that the External-IP value is `none`. 
```bash NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE seldon-core-analytics-grafana ClusterIP 10.109.20.161 80/TCP 94s ``` Change the type to LoadBalancer and if you want to enter an IP address, add the loadBalancerIP item. If not, an IP address will be assigned sequentially from the IP address pool set above. ```bash kubectl edit svc/seldon-core-analytics-grafana -n seldon-system ``` ```bash apiVersion: v1 kind: Service metadata: annotations: meta.helm.sh/release-name: seldon-core-analytics meta.helm.sh/release-namespace: seldon-system creationTimestamp: "2022-01-07T04:16:47Z" labels: app.kubernetes.io/instance: seldon-core-analytics app.kubernetes.io/managed-by: Helm app.kubernetes.io/name: grafana app.kubernetes.io/version: 7.0.3 helm.sh/chart: grafana-5.1.4 name: seldon-core-analytics-grafana namespace: seldon-system resourceVersion: "280605" uid: 75073b78-92ec-472c-b0d5-240038ea8fa5 spec: clusterIP: 10.109.20.161 clusterIPs: - 10.109.20.161 ipFamilies: - IPv4 ipFamilyPolicy: SingleStack ports: - name: service port: 80 protocol: TCP targetPort: 3000 selector: app.kubernetes.io/instance: seldon-core-analytics app.kubernetes.io/name: grafana sessionAffinity: None type: LoadBalancer # Change ClusterIP to LoadBalancer loadBalancerIP: 192.168.35.103 # Add IP status: loadBalancer: {} ``` If you check again, you can see that the External-IP value is `192.168.35.103`. ```bash kubectl get svc/seldon-core-analytics-grafana -n seldon-system ``` ```bash NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE seldon-core-analytics-grafana LoadBalancer 10.109.20.161 192.168.35.103 80:31191/TCP 5m14s ``` Open the Web Browser and connect to http://192.168.35.103:80 to confirm that the following screen is displayed. ![login-after-grafana-setting.png](./img/login-after-grafana-setting.png) ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/appendix/pyenv.md ================================================ --- title: "1. 
Install Python virtual environment" sidebar_position: 1 --- ## Python virtual environment When working with Python, there may be cases where you want to use multiple versions of Python environments or manage package versions separately for different projects. To easily manage Python environments or Python package environments in a virtualized manner, there are tools available such as pyenv, conda, virtualenv, and venv. Among these, *MLOps for ALL* covers the installation of [pyenv](https://github.com/pyenv/pyenv) and [pyenv-virtualenv](https://github.com/pyenv/pyenv-virtualenv). pyenv helps manage Python versions, while pyenv-virtualenv is a plugin for pyenv that helps manage Python package environments. ## Installing pyenv ### Prerequisites Prerequisites vary depending on the operating system. Please refer to the [following page](https://github.com/pyenv/pyenv/wiki#suggested-build-environment) and install the required packages accordingly. ### Installation - macOS 1. Install pyenv, pyenv-virtualenv ```bash brew update brew install pyenv brew install pyenv-virtualenv ``` 2. Set pyenv For macOS, assuming the use of zsh since the default shell has changed to zsh in Catalina version and later, setting up pyenv. ```bash echo 'eval "$(pyenv init -)"' >> ~/.zshrc echo 'eval "$(pyenv virtualenv-init -)"' >> ~/.zshrc source ~/.zshrc ``` Check if the pyenv command is executed properly. 
```bash pyenv --help ``` ```bash $ pyenv --help Usage: pyenv [] Some useful pyenv commands are: --version Display the version of pyenv activate Activate virtual environment commands List all available pyenv commands deactivate Deactivate virtual environment exec Run an executable with the selected Python version global Set or show the global Python version(s) help Display help for a command hooks List hook scripts for a given pyenv command init Configure the shell environment for pyenv install Install a Python version using python-build local Set or show the local application-specific Python version(s) prefix Display prefix for a Python version rehash Rehash pyenv shims (run this after installing executables) root Display the root directory where versions and shims are kept shell Set or show the shell-specific Python version shims List existing pyenv shims uninstall Uninstall a specific Python version version Show the current Python version(s) and its origin version-file Detect the file that sets the current pyenv version version-name Show the current Python version version-origin Explain how the current Python version is set versions List all Python versions available to pyenv virtualenv Create a Python virtualenv using the pyenv-virtualenv plugin virtualenv-delete Uninstall a specific Python virtualenv virtualenv-init Configure the shell environment for pyenv-virtualenv virtualenv-prefix Display real_prefix for a Python virtualenv version virtualenvs List all Python virtualenvs found in `$PYENV_ROOT/versions/*'. whence List all Python versions that contain the given executable which Display the full path to an executable See `pyenv help ' for information on a specific command. For full documentation, see: https://github.com/pyenv/pyenv#readme ``` ### Installation - Ubuntu 1. Install pyenv and pyenv-virtualenv ```bash curl https://pyenv.run | bash ``` If the following content is output, it means that the installation is successful. 
```bash % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 0 0 0 0 0 0 0 0 --:--:-- --:--:-- 0 0 0 0 0 0 0 0 --:--:-- --:--:-- 100 270 100 270 0 0 239 0 0:00:01 0:00:01 --:--:-- 239 Cloning into '/home/mlops/.pyenv'... r ... Skip... ... remote: Enumerating objects: 10, done. remote: Counting objects: 100% (10/10), done. remote: Compressing objects: 100% (6/6), done. remote: Total 10 (delta 1), reused 6 (delta 0), pack-reused 0 Unpacking objects: 100% (10/10), 2.92 KiB | 2.92 MiB/s, done. WARNING: seems you still have not added 'pyenv' to the load path. # See the README for instructions on how to set up # your shell environment for Pyenv. # Load pyenv-virtualenv automatically by adding # the following to ~/.bashrc: eval "$(pyenv virtualenv-init -)" ``` 2. Set pyenv Assuming the use of bash shell as the default shell, configure pyenv and pyenv-virtualenv to be used in bash. ```bash sudo vi ~/.bashrc ``` Enter the following string and save it. ```bash export PATH="$HOME/.pyenv/bin:$PATH" eval "$(pyenv init -)" eval "$(pyenv virtualenv-init -)" ``` Restart the shell. ```bash exec $SHELL ``` Check if the pyenv command is executed properly. ```bash pyenv --help ``` If the following message is displayed, it means that the settings have been configured correctly. ```bash $ pyenv pyenv 2.2.2 Usage: pyenv [] Some useful pyenv commands are: --version Display the version of pyenv activate Activate virtual environment commands List all available pyenv commands deactivate Deactivate virtual environment doctor Verify pyenv installation and development tools to build pythons. 
exec Run an executable with the selected Python version global Set or show the global Python version(s) help Display help for a command hooks List hook scripts for a given pyenv command init Configure the shell environment for pyenv install Install a Python version using python-build local Set or show the local application-specific Python version(s) prefix Display prefix for a Python version rehash Rehash pyenv shims (run this after installing executables) root Display the root directory where versions and shims are kept shell Set or show the shell-specific Python version shims List existing pyenv shims uninstall Uninstall a specific Python version version Show the current Python version(s) and its origin version-file Detect the file that sets the current pyenv version version-name Show the current Python version version-origin Explain how the current Python version is set versions List all Python versions available to pyenv virtualenv Create a Python virtualenv using the pyenv-virtualenv plugin virtualenv-delete Uninstall a specific Python virtualenv virtualenv-init Configure the shell environment for pyenv-virtualenv virtualenv-prefix Display real_prefix for a Python virtualenv version virtualenvs List all Python virtualenvs found in `$PYENV_ROOT/versions/*'. whence List all Python versions that contain the given executable which Display the full path to an executable See `pyenv help ' for information on a specific command. For full documentation, see: https://github.com/pyenv/pyenv#readme ``` ## Using pyenv ### Install python version Using the `pyenv install ` command, you can install the desired Python version. In this page, we will install the Python 3.7.12 version that is used by Kubeflow by default as an example. ```bash pyenv install 3.7.12 ``` If installed normally, the following message will be printed. ```bash $ pyenv install 3.7.12 Downloading Python-3.7.12.tar.xz... 
-> https://www.python.org/ftp/python/3.7.12/Python-3.7.12.tar.xz Installing Python-3.7.12... patching file Doc/library/ctypes.rst patching file Lib/test/test_unicode.py patching file Modules/_ctypes/_ctypes.c patching file Modules/_ctypes/callproc.c patching file Modules/_ctypes/ctypes.h patching file setup.py patching file 'Misc/NEWS.d/next/Core and Builtins/2020-06-30-04-44-29.bpo-41100.PJwA6F.rst' patching file Modules/_decimal/libmpdec/mpdecimal.h Installed Python-3.7.12 to /home/mlops/.pyenv/versions/3.7.12 ``` ### Create python virtual environment Use the `pyenv virtualenv <python-version> <env-name>` command to create a Python virtual environment with the desired Python version. For example, let's create a Python virtual environment called `demo` with Python 3.7.12 version. ```bash pyenv virtualenv 3.7.12 demo ``` ```bash $ pyenv virtualenv 3.7.12 demo Looking in links: /tmp/tmpffqys0gv Requirement already satisfied: setuptools in /home/mlops/.pyenv/versions/3.7.12/envs/demo/lib/python3.7/site-packages (47.1.0) Requirement already satisfied: pip in /home/mlops/.pyenv/versions/3.7.12/envs/demo/lib/python3.7/site-packages (20.1.1) ``` ### Activating python virtual environment Use the `pyenv activate <env-name>` command to use the virtual environment created in this way. For example, we will use a Python virtual environment called `demo`. ```bash pyenv activate demo ``` You can see that the information of the current virtual environment is printed at the front of the shell. Before ```bash mlops@ubuntu:~$ pyenv activate demo ``` After ```bash pyenv-virtualenv: prompt changing will be removed from future release. configure `export PYENV_VIRTUALENV_DISABLE_PROMPT=1' to simulate the behavior. (demo) mlops@ubuntu:~$ ``` ### Deactivating python virtual environment You can deactivate the currently active virtualenv by using the command `source deactivate`. 
```bash source deactivate ``` Before ```bash (demo) mlops@ubuntu:~$ source deactivate ``` After ```bash mlops@ubuntu:~$ ``` ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/further-readings/_category_.json ================================================ { "label": "Further Readings", "position": 8, "link": { "type": "generated-index" } } ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/further-readings/info.md ================================================ --- title: "Further Readings" date: 2021-12-21 lastmod: 2021-12-21 --- ## MLOps Component From the components covered in [MLOps Concepts](../introduction/component.md), the following diagram illustrates them. ![open-stacks-0.png](./img/open-stacks-0.png) The technology stacks covered in *Everyone's MLOps* are as follows. ![open-stacks-1.png](./img/open-stacks-1.png) | | Storage | [Minio](https://min.io/) | | | Data Processing | [Apache Spark](https://spark.apache.org/) | | | Data Visualization | [Tableau](https://www.tableau.com/) | | Workflow Mgmt. | Orchestration | [Airflow](https://airflow.apache.org/) | | | Scheduling | [Kubernetes](https://kubernetes.io/) | | Security & Compliance | Authentication & Authorization | [Ldap](https://www.openldap.org/) | | | Data Encryption & Tokenization | [Vault](https://www.vaultproject.io/) | | | Governance & Auditing | [Open Policy Agent](https://www.openpolicyagent.org/) | As you can see, there are still many MLOps components that we have not covered yet. We could not cover them all this time due to time constraints, but if you need it, it might be a good idea to refer to the following open source projects first. ![open-stacks-2.png](./img/open-stacks-2.png) For details: | Mgmt. | Component | Open Source | | -------------------------- | --------------------------- | ------------------------------------- | | Data Mgmt. 
| Collection | [Kafka](https://kafka.apache.org/) | | | Validation | [Beam](https://beam.apache.org/) | | | Feature Store | [Flink](https://flink.apache.org/) | | ML Model Dev. & Experiment | Modeling | [Jupyter](https://jupyter.org/) | | | Analysis & Experiment Mgmt. | [MLflow](https://mlflow.org/) | | | HPO Tuning & AutoML | [Katib](https://github.com/kubeflow/katib) | | Deploy Mgmt. | Serving Framework | [Seldon Core](https://docs.seldon.io/projects/seldon-core/en/latest/index.html) | | | A/B Test | [Iter8](https://iter8.tools/) | | | Monitoring | [Grafana](https://grafana.com/oss/grafana/), [Prometheus](https://prometheus.io/) | | Process Mgmt. | pipeline | [Kubeflow](https://www.kubeflow.org/) | | | CI/CD | [Github Action](https://docs.github.com/en/actions) | | | Continuous Training | [Argo Events](https://argoproj.github.io/events/) | | Platform Mgmt. | Configuration Mgmt. | [Consul](https://www.consul.io/) | | | Code Version Mgmt. | [Github](https://github.com/), [Minio](https://min.io/) | | | Logging | (EFK) [Elastic Search](https://www.elastic.co/kr/elasticsearch/), [Fluentd](https://www.fluentd.org/), [Kibana](https://www.elastic.co/kr/kibana/) | | | Resource Mgmt. | [Kubernetes](https://kubernetes.io/) | ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/introduction/_category_.json ================================================ { "label": "Introduction", "position": 1, "link": { "type": "generated-index" } } ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/introduction/component.md ================================================ --- title : "3. 
Components of MLOps" description: "Describe MLOps Components" sidebar_position: 3 date: 2021-12-03 lastmod: 2021-12-10 contributors: ["Youngcheol Jang"] --- ## Practitioners guide to MLOps Google's white paper [Practitioners guide to MLOps: A framework for continuous delivery and automation of machine learning] published in May 2021 mentions the following core functionalities of MLOps: ![mlops-component](./img/mlops-component.png) Let's look at what each feature does. ### 1. Experimentation Experimentation provides machine learning engineers with the following capabilities for data analysis, prototyping model development, and implementing training functionality: - Integration with version control tools like Git and a notebook (Jupyter Notebook) environment - Experiment tracking capabilities including data used, hyperparameters, and evaluation metrics - Data and model analysis and visualization capabilities ### 2. Data Processing Data Processing enables working with large volumes of data during the stages of model development, continuous training, and API deployment by providing the following functionalities: - Data connectors compatible with various data sources and services - Data encoders and decoders compatible with different data formats - Data transformation and feature engineering capabilities for different data types - Scalable batch and streaming data processing capabilities for training and serving ### 3. Model Training Model Training offers functionalities to efficiently execute algorithms for model training: - Environment provisioning for ML framework execution - Distributed training environment for multiple GPUs and distributed training - Hyperparameter tuning and optimization capabilities ### 4. 
Model Evaluation Model evaluation provides the following capabilities to observe the performance of models in both experimental and production environments: - Model performance evaluation on evaluation datasets - Tracking prediction performance across different continuous training runs - Comparison and visualization of performance between different models - Model output interpretation using interpretable AI techniques ### 5. Model Serving Model serving offers functionalities to deploy and serve models in production environments: - Low-latency and high-availability inference capabilities - Support for various ML model serving frameworks (TensorFlow Serving, TorchServe, NVIDIA Triton, Scikit-learn, XGBoost, etc.) - Advanced inference routines, such as preprocessing or postprocessing, and multi-model ensembling for final results - Autoscaling capabilities to handle spiking inference requests - Logging of inference requests and results ### 6. Online Experimentation Online experimentation provides capabilities to validate the performance of newly generated models when deployed. This functionality should be integrated with a Model Registry to coordinate the deployment of new models. - Canary and shadow deployment features - A/B testing capabilities - Multi-armed bandit testing functionality ### 7. Model Monitoring Model monitoring enables the monitoring of deployed models in production environments to ensure proper functioning and provides information on model performance degradation and the need for updates. ### 8. ML Pipeline ML Pipeline offers the following functionalities to configure, control, and automate complex ML training and inference workflows in production environments: - Pipeline execution through various event sources - ML metadata tracking and integration for pipeline parameter and artifact management - Support for built-in components for common ML tasks and user-defined components - Provisioning of different execution environments ### 9. 
Model Registry The Model Registry provides the capability to manage the lifecycle of machine learning models in a centralized repository. - Registration, tracking, and versioning of trained and deployed models - Storage of information about the required data and runtime packages for deployment ### 10. Dataset and Feature Repository - Sharing, search, reuse, and versioning capabilities for datasets - Real-time processing and low-latency serving capabilities for event streaming and online inference tasks - Support for various types of data, such as images, text, and tabular data ### 11. ML Metadata and Artifact Tracking In each stage of MLOps, various artifacts are generated. ML metadata refers to the information about these artifacts. ML metadata and artifact management provide the following functionalities to manage the location, type, attributes, and associations with experiments: - History management for ML artifacts - Tracking and sharing of experiments and pipeline parameter configurations - Storage, access, visualization, and download capabilities for ML artifacts - Integration with other MLOps functionalities ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/introduction/intro.md ================================================ --- title : "1. What is MLOps?" description: "Introduction to MLOps" sidebar_position: 1 date: 2021-12-03 lastmod: 2022-03-05 contributors: ["Jongseob Jeon"] --- ## Machine Learning Project Since 2012, when Alexnet was introduced, Machine Learning and Deep Learning have been introduced in any domain where data exists, such as Computer Vision and Natural Language Processing. Deep Learning and Machine Learning were referred to collectively as AI, and the need for AI was shouted from many media. And many companies conducted numerous projects using Machine Learning and Deep Learning. But what was the result? 
Byungchan Eum, the Head of North East Asia at Element AI, said “If 10 companies start an AI project, 9 of them will only be able to do concept validation (POC)”. In this way, in many projects, Machine Learning and Deep Learning only showed the possibility that they could solve this problem and then disappeared. And around this time, the outlook that [AI Winter was coming again](https://www.aifutures.org/2021/ai-winter-is-coming/) also began to emerge. Why did most projects end at the concept validation (POC) stage? Because it is impossible to operate an actual service with only Machine Learning and Deep Learning code. At the actual service stage, the portion taken up by machine learning and deep learning code is not as large as one would think, so one must consider many other aspects besides simply the performance of the model. Google has pointed out this problem in their 2015 paper [Hidden Technical Debt in Machine Learning Systems](https://proceedings.neurips.cc/paper/2015/file/86df7dcfd896fcaf2674f757a2463eba-Paper.pdf). However, at the time this paper was released, many ML engineers were busy proving the potential of deep learning and machine learning, so the points made in the paper were not given much attention. And after a few years, machine learning and deep learning had proven their potential and people were now looking to apply it to actual services. However, soon many people realized that actual services were not as easy as they thought. ## Devops MLOps is not a new concept, but rather a term derived from the development methodology called DevOps. Therefore, understanding DevOps can help in understanding MLOps. ### DevOps DevOps is a portmanteau of "Development" and "Operations," referring to a development and operations methodology that emphasizes communication, collaboration, and integration between software developers and IT professionals. 
It encompasses both the development and operation phases of software, aiming to achieve a symbiotic relationship between the two. The primary goal of DevOps is to enable organizations to develop and deploy software products and services rapidly by fostering close collaboration and interdependence between development and operations teams. ### Silo Effect Let's explore why DevOps is necessary through a simple scenario. In the early stages of a service, there are fewer supported features, and the team or company is relatively small. At this point, there may not be a clear distinction between development and operations, or the teams may be small. The key point here is the small scale. In such cases, there are many points of contact for effective communication, and with a limited number of services to focus on, it is possible to rapidly improve the service. However, as the service scales up, the development and operations teams tend to separate, and the physical limitations of communication channels become apparent. For example, in meetings involving multiple teams, only team leaders or a small number of seniors may attend, rather than the entire team. These limitations in communication channels inevitably lead to a lack of communication. Consequently, the development team continues to develop new features, while the operations team faces issues during deployment caused by the features developed by the development team. When such situations are repeated, it can lead to organizational silos, a phenomenon known as silo mentality. ![silo](./img/silo.png) > Indeed, the term "silo" originally refers to a tall, cylindrical structure used for storing grain or livestock feed. Silos are designed to keep the stored materials separate and prevent them from mixing. 
> In the context of organizations, the "silo effect" or "organizational silos effect" refers to a phenomenon where departments or teams within an organization operate independently and prioritize their own interests without effective collaboration. It reflects a mentality where individual departments focus on building their own "silos" and solely pursue their own interests. The silo effect can lead to a decline in service quality and hinder organizational performance. To address this issue, DevOps emerged as a solution. DevOps emphasizes collaboration, communication, and integration between development and operations teams, breaking down the barriers and fostering a culture of shared responsibility and collaboration. By promoting cross-functional teamwork and streamlining processes, DevOps aims to overcome silos and improve the efficiency and effectiveness of software development and operations. ### CI/CD Continuous Integration (CI) and Continuous Delivery (CD) are concrete methods to break down the barriers between development teams and operations teams. ![cicd](./img/cicd.png) Through this method, the development team can understand the operational environment and check whether the features being developed can be seamlessly deployed. The operations team can deploy validated features or improved products more often to increase customer product experience. In summary, DevOps is a methodology to solve the problem between development teams and operations teams. ## MLOps ### 1) ML + Ops DevOps is a methodology that addresses the challenges between development and operations teams, promoting collaboration and effective communication. By applying DevOps principles, development teams gain a better understanding of the operational environment, and the developed features can be seamlessly integrated and deployed. On the other hand, operations teams can deploy validated features or improved products more frequently, enhancing the overall customer experience. 
MLOps, which stands for Machine Learning Operations, extends the DevOps principles and practices specifically to the field of machine learning. In MLOps, the "Dev" in DevOps is replaced with "ML" to emphasize the unique challenges and considerations related to machine learning. MLOps aims to address the issues that arise between machine learning teams and operations teams. To understand these issues, let's consider an example using a recommendation system. #### Rule-Based Approach In the initial stages of building a recommendation system, a simple rule-based approach may be used. For example, items could be recommended based on the highest sales volume in the past week. With this approach, there is no need for model updates unless there are specific reasons for modification. #### Machine Learning Approach As the scale of the service grows and more log data accumulates, machine learning models can be developed based on item-based or user-based recommendations. In this case, the models are periodically retrained and redeployed. #### Deep Learning Approach When there is a greater demand for personalized recommendations and a need for models that deliver higher performance, deep learning models are developed. Similar to machine learning, these models are periodically retrained and redeployed. By considering these examples, it becomes evident that challenges can arise between the machine learning team and the operations team. MLOps aims to address these challenges and provide a methodology and set of practices to facilitate the development, deployment, and operation of machine learning models in a collaborative and efficient manner. ![graph](./img/graph.png) If we represent the concepts explained earlier on a graph, with model complexity on the x-axis and model performance on the y-axis, we can observe an upward trend where the model performance improves as the complexity increases. 
This often leads to the emergence of separate machine learning teams specializing in transitioning from traditional machine learning to deep learning. If there are only a few models to manage, collaboration between teams can be sufficient to address the challenges. However, as the number of models to develop increases, silos similar to those observed in DevOps can emerge. Considering the goals of DevOps, we can understand the goals of MLOps as ensuring that the developed models can be deployed successfully. While DevOps focuses on verifying that the features developed by the development team can be deployed correctly, MLOps focuses on verifying that the models developed by the machine learning team can be deployed effectively. ### 2) ML -> Ops However, recent MLOps-related products and explanations indicate that the goals are not limited to what was previously described. In some cases, the goal is to enable the machine learning team to directly operate and manage the models they develop. This need arises from the process of ongoing machine learning projects. In the case of recommendation systems, it was possible to start with simple models in operations. However, in domains such as natural language processing and image analysis, it is common to perform verification (POC) to determine if deep learning models can solve the given tasks. Once the verification is complete, the focus shifts to developing the operational environment for serving the models. However, it may not be easy for the machine learning team to handle this challenge with their internal capabilities alone. This is where MLOps becomes necessary. ### 3) Conclusion In summary, MLOps has two main goals. The earlier explanation of MLOps focused on ML+Ops, aiming to enhance productivity and collaboration between the two teams. On the other hand, the latter explanation focused on ML -> Ops, aiming to enable the machine learning team to directly operate and manage their models. 
================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/introduction/levels.md ================================================ --- title : "2. Levels of MLOps" description: "Levels of MLOps" sidebar_position: 2 date: 2021-12-03 lastmod: 2022-03-05 contributors: ["Jongseob Jeon", "Chanmin Cho"] --- This page will look at the steps of MLOps outlined by Google and explore what the core features of MLOps are. ## Hidden Technical Debt in ML System Google has been talking about the need for MLOps since as far back as 2015. The paper Hidden Technical Debt in Machine Learning Systems encapsulates this idea from Google. ![paper](./img/paper.png) The key takeaway from this paper is that the machine learning code is only a small part of the entire system when it comes to building products with machine learning. Google developed MLOps by evolving this paper and expanding the term. More details can be found on the [Google Cloud homepage](https://cloud.google.com/architecture/mlops-continuous-delivery-and-automation-pipelines-in-machine-learning). In this post, we will try to explain what Google means by MLOps. Google divided the evolution of MLOps into three (0-2) stages. Before explaining each stage, let's review some of the concepts described in the previous post. In order to operate a machine learning model, there is a machine learning team responsible for developing the model and an operations team responsible for deployment and operations. MLOps is needed for the successful collaboration of these two teams. We have previously said that it can be done simply through Continuous Integration (CI) / Continuous Deployment (CD), so let us see how to do CI / CD. ## Level 0: Manual Process ![level-0](./img/level-0.png) At the 0th stage, two teams communicate through a "model". The machine learning team trains the model with accumulated data and delivers the trained model to the operation team. 
The operation team then deploys the model delivered in this way. ![toon](./img/toon.png) Initial machine learning models are deployed through this "model" centered communication. However, there are several problems with this distribution method. For example, if some functions use Python 3.7 and some use Python 3.8, we often see the following situation. The reason for this situation lies in the characteristics of the machine learning model. Three things are needed for the trained machine learning model to work: 1. Python code 2. Trained weights 3. Environment (Packages, versions) If any of these three aspects is communicated incorrectly, the model may fail to function or make unexpected predictions. However, in many cases, models fail to work due to environmental mismatches. Machine learning relies on various open-source libraries, and due to the nature of open-source, even the same function can produce different results depending on the version used. In the early stages of a service, when there are not many models to manage, these issues can be resolved quickly. However, as the number of managed features increases and communication becomes more challenging, it becomes difficult to deploy models with better performance quickly. ## Level 1: Automated ML Pipeline ### Pipeline ![level-1-pipeline](./img/level-1-pipeline.png) So, in MLOps, "pipeline" is used to prevent such problems. The MLOps pipeline ensures that the model operates in the same environment as the one used by the machine learning engineer during model development, using containers like Docker. This helps prevent situations where the model doesn't work due to differences in the environment. However, the term "pipeline" is used in a broader context and in various tasks. What is the role of the pipeline that machine learning engineers create? The pipeline created by machine learning engineers produces trained models. 
Therefore, it would be more accurate to refer to it as a training pipeline rather than just a pipeline. ### Continuous Training ![level-1-ct.png](./img/level-1-ct.png) And the concept of Continuous Training (CT) is added. So why is CT necessary? #### Auto Retrain In the real world, data exhibits a characteristic called "Data Shift," where the data distribution keeps changing over time. As a result, models trained in the past may experience performance degradation over time. The simplest and most effective solution to this problem is to retrain the model using recent data. By retraining the model according to the changed data distribution, it can regain its performance. #### Auto Deploy However, in industries such as manufacturing, where multiple recipes are processed in a single factory, it may not always be desirable to retrain the model unconditionally. One common example is the blind spot. For example, in an automotive production line, a model A was created and used for predictions. If an entirely different model B is introduced, it represents unseen data patterns, and a new model is trained for model B. Now, the model will make predictions for model B. However, if the data switches back to model A, what should be done? If there are only retraining rules, a new model for model A will be trained again. However, machine learning models require a sufficient amount of data to demonstrate satisfactory performance. The term "blind spot" refers to a period in which the model does not work while gathering enough data. There is a simple solution to address this blind spot. It involves checking whether there was a previous model for model A and, if so, using the previous model for prediction instead of immediately training a new model. This way, using meta-data associated with the model to automatically switch models is known as Auto Deploy. To summarize, for Continuous Training (CT), both Auto Retrain and Auto Deploy are necessary. 
They complement each other's weaknesses and enable the model's performance to be maintained continuously. ### Model Serving ![level-1-modelserving](./img/level-1-modelserving.png) Machine learning pipelines in production continuously deploy the latest models based on new data to your prediction service. This process involves automatically deploying trained and validated models to online prediction services. ## Level 2: Automating the CI/CD Pipeline ![level-2](./img/level-2.png) The title of Step 2 is the automation of CI and CD. In DevOps, the focus of CI/CD is on source code. So what is the focus of CI/CD in MLOps? In MLOps, the focus of CI/CD is also on source code, but more specifically, it can be seen as the training pipeline. Therefore, when it comes to training models, it is important to verify whether the model is trained correctly (CI) and whether the trained model functions properly (CD) in response to relevant changes that can impact the training process. Hence, CI/CD should be performed when there are direct modifications to the code used for training. In addition to code, the versions of the packages used and changes in the Python version are also part of CI/CD. In many cases, machine learning utilizes open-source packages. However, open-source packages can have changes in the internal logic of functions when their versions are updated. Although notifications may be provided when there are certain version updates, significant changes in versions can go unnoticed. Therefore, when the versions of the packages used change, it is important to perform CI/CD to ensure that the model is trained and functions correctly. In summary, in MLOps, CI/CD focuses on the source code, particularly the training pipeline, to verify that the model is trained correctly and functions properly. This includes checking for direct code modifications and changes in package versions or Python versions to ensure the integrity of the training and functioning processes of the model. 
================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/introduction/why_kubernetes.md ================================================ --- title : "4. Why Kubernetes?" description: "Reason for using k8s in MLOps" sidebar_position: 4 date: 2021-12-03 lastmod: 2021-12-10 contributors: ["Jaeyeon Kim"] --- ## MLOps & Kubernetes When talking about MLOps, why is the word Kubernetes always heard together? To build a successful MLOps system, various components are needed as described in [Components of MLOps](../introduction/component.md), but to operate them organically at the infrastructure level, there are many issues to be solved. For example, simply running a large number of machine learning model requests in order, ensuring the same execution environment in other workspaces, and responding quickly when a deployed service has a failure. The need for containers and container orchestration systems appears here. With the introduction of container orchestration systems such as Kubernetes, efficient isolation and management of execution environments can be achieved. By introducing a container orchestration system, it is possible to prevent situations such as *'Is anyone using cluster 1?', 'Who killed my process that was using GPU?', 'Who updated the x package on the cluster?'* when developing and deploying machine learning models while a few developers share a small number of clusters. ## Container What is a container then? Microsoft defines a container as [follows](https://azure.microsoft.com/en-us/overview/what-is-a-container/). > Container: Standardized, portable packaging of an application's code, libraries, and configuration files But why is a container needed for machine learning? Machine learning models can behave differently depending on the operating system, Python execution environment, package version, etc. 
To prevent this, the technology used to share and execute the entire dependent execution environment with the source code used in machine learning is called containerization technology. This packaged form is called a container image, and by sharing the container image, users can ensure the same execution results on any system. In other words, by sharing not just the Jupyter Notebook file or the source code and requirements.txt file of the model, but the entire container image with the execution environment, you can avoid situations such as *"It works on my notebook, why not yours?"*. One translation of the Korean sentence to English is: "One of the common misunderstandings that people who are new to containers often make is to assume that "container == Docker". Docker is not a concept that has the same meaning as containers; rather, it is a tool that provides features to make it easier and more flexible to use containers, such as launching containers and creating and sharing container images. In summary, container is a virtualization technology, and Docker is an implementation of virtualization technology. However, Docker has become the mainstream quickly due to its easy usability and high efficiency among various container virtualization tools, so when people think of containers, they often think of Docker automatically. There are various reasons why the container and Docker ecosystem have become the mainstream, but for technical reasons, I won't go into that detail since it is outside the scope of Everybody's MLOps. ## Container Orchestration System Then what is a container orchestration system? As inferred from the word "orchestration," it can be compared to a system that coordinates the operation of numerous containers to work together harmoniously. In container-based systems, services are provided to users in the form of containers. If the number of containers to be managed is small, a single operator can sufficiently handle all situations. 
However, if there are hundreds of containers running in dozens of clusters and they need to function continuously without causing any failures, it becomes nearly impossible for a single operator to monitor the proper functioning of all services and respond to issues. For example, continuous monitoring is required to ensure that all services are functioning properly. If a specific service experiences a failure, the operator needs to investigate the problem by examining the logs of multiple containers. Additionally, they need to handle various tasks such as scheduling and load balancing to prevent work overload on specific clusters or containers, as well as scaling operations. A container orchestration system is software that provides functionality to manage and operate the states of numerous containers continuously and automatically, making the process of managing and operating a large number of containers somewhat easier. How can it be used in machine learning? For example, a container that packages deep learning training code that requires a GPU can be executed on a cluster with available GPUs. A container that packages data preprocessing code requiring a large amount of memory can be executed on a cluster with ample memory. If there is an issue with the cluster during training, the system can automatically move the same container to a different cluster and continue the training, eliminating the need for manual intervention. Developing such a system that automates management without requiring manual intervention is the goal. As of the writing of this text in 2022, Kubernetes is considered the de facto standard for container orchestration systems. According to the [survey](https://www.cncf.io/blog/2018/08/29/cncf-survey-use-of-cloud-native-technologies-in-production-has-grown-over-200-percent/) released by CNCF in 2018, Kubernetes was already showing its prominence. 
The [survey](https://www.cncf.io/wp-content/uploads/2020/08/CNCF_Survey_Report.pdf) published in 2019 indicates that 78% of respondents were using Kubernetes at a production level. ![k8s-graph](./img/k8s-graph.png) The growth of the Kubernetes ecosystem can be attributed to various reasons. However, similar to Docker, Kubernetes is not exclusively limited to machine learning-based services. Since delving into detailed technical content would require a substantial amount of discussion, this edition of "MLOps for ALL" will omit the detailed explanation of Kubernetes. ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/kubeflow/_category_.json ================================================ { "label": "Kubeflow", "position": 6, "link": { "type": "generated-index" } } ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/kubeflow/advanced-component.md ================================================ --- title : "8. Component - InputPath/OutputPath" description: "" sidebar_position: 8 contributors: ["Jongseob Jeon", "SeungTae Kim"] --- ## Complex Outputs On this page, we will write the code example from [Kubeflow Concepts](../kubeflow/kubeflow-concepts.md#component-contents) as a component. ## Component Contents Below is the component content used in [Kubeflow Concepts](../kubeflow/kubeflow-concepts.md#component-contents). ```python import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) ``` ## Component Wrapper ### Define a standalone Python function With the necessary Configs for the Component Wrapper, it will look like this. 
```python def train_from_csv( train_data_path: str, train_target_path: str, model_path: str, kernel: str, ): import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) ``` In the [Basic Usage Component]](../kubeflow/basic-component), we explained that you should provide type hints for input and output when describing. But what about complex objects such as dataframes, models, that cannot be used in json? When passing values between functions in Python, objects can be returned and their value will be stored in the host's memory, so the same object can be used in the next function. However, in Kubeflow, components are running independently on each container, that is, they are not sharing the same memory, so you cannot pass objects in the same way as in a normal Python function. The only information that can be passed between components is in `json` format. Therefore, objects of types that cannot be converted into json format such as Model or DataFrame must be passed in some other way. Kubeflow solves this by storing the data in a file instead of memory, and then using the file to pass information. Since the path of the stored file is a string, it can be passed between components. However, in Kubeflow, the user does not know the path of the file before the execution. For this, Kubeflow provides a magic related to the input and output paths, `InputPath` and `OutputPath`. `InputPath` literally means the input path, and `OutputPath` literally means the output path. For example, in a component that generates and returns data, `data_path: OutputPath()` is created as an argument. And in a component that receives data, `data_path: InputPath()` is created as an argument. 
Once these are created, when connecting them in a pipeline, Kubeflow automatically generates and inputs the necessary paths. Therefore, users no longer need to worry about the paths and only need to consider the relationships between components. Based on this information, when rewriting the component wrapper, it would look like the following. ```python from kfp.components import InputPath, OutputPath def train_from_csv( train_data_path: InputPath("csv"), train_target_path: InputPath("csv"), model_path: OutputPath("dill"), kernel: str, ): import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) ``` InputPath or OutputPath can accept a string. This string is the format of the file to be input or output. However, it does not necessarily mean that the file has to be stored in this format. It just serves as a helper for type checking when compiling the pipeline. If the file format is not fixed, then no input is needed (it serves the role of something like `Any` in type hints). ### Convert to Kubeflow Format Convert the written component into a format that can be used in Kubeflow. ```python from kfp.components import InputPath, OutputPath, create_component_from_func @create_component_from_func def train_from_csv( train_data_path: InputPath("csv"), train_target_path: InputPath("csv"), model_path: OutputPath("dill"), kernel: str, ): import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) ``` ## Rule for using InputPath/OutputPath There are rules to follow when using InputPath or OutputPath arguments in pipeline. 
### Load Data Component To execute the previously written component, a component that generates data is created since data is required. ```python from functools import partial from kfp.components import InputPath, OutputPath, create_component_from_func @create_component_from_func def load_iris_data( data_path: OutputPath("csv"), target_path: OutputPath("csv"), ): import pandas as pd from sklearn.datasets import load_iris iris = load_iris() data = pd.DataFrame(iris["data"], columns=iris["feature_names"]) target = pd.DataFrame(iris["target"], columns=["target"]) data.to_csv(data_path, index=False) target.to_csv(target_path, index=False) ``` ### Write Pipeline Now let's write the pipeline. ```python from kfp.dsl import pipeline @pipeline(name="complex_pipeline") def complex_pipeline(kernel: str): iris_data = load_iris_data() model = train_from_csv( train_data=iris_data.outputs["data"], train_target=iris_data.outputs["target"], kernel=kernel, ) ``` Have you noticed something strange? All the `_path` suffixes have disappeared from the arguments received in the input and output. We can see that instead of accessing `iris_data.outputs["data_path"]`, we are accessing `iris_data.outputs["data"]`. This happens because Kubeflow has a rule that paths created with `InputPath` and `OutputPath` can be accessed without the `_path` suffix when accessed from the pipeline. However, if you upload the pipeline just written, it will not run. The reason is explained on the next page. ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/kubeflow/advanced-environment.md ================================================ --- title : "9. Component - Environment" description: "" sidebar_position: 9 contributors: ["Jongseob Jeon"] --- ## Component Environment When we run the pipeline written in [8. Component - InputPath/OutputPath](../kubeflow/advanced-component.md), it fails. Let's find out why it fails and modify it so that it can run properly. 
### Convert to Kubeflow Format Let's convert the component written [earlier](../kubeflow/advanced-component.md#convert-to-kubeflow-format) into a yaml file. ```python from kfp.components import InputPath, OutputPath, create_component_from_func @create_component_from_func def train_from_csv( train_data_path: InputPath("csv"), train_target_path: InputPath("csv"), model_path: OutputPath("dill"), kernel: str, ): import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) if __name__ == "__main__": train_from_csv.component_spec.save("train_from_csv.yaml") ``` If you run the script above, you will get a `train_from_csv.yaml` file like the one below. ```bash name: Train from csv inputs: - {name: train_data, type: csv} - {name: train_target, type: csv} - {name: model, type: dill} - {name: kernel, type: String} implementation: container: image: python:3.7 command: - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def train_from_csv( train_data_path, train_target_path, model_path, kernel, ): import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) import argparse _parser = argparse.ArgumentParser(prog='Train from csv', description='') _parser.add_argument("--train-data", dest="train_data_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--train-target", dest="train_target_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--model", dest="model_path", type=str, required=True, default=argparse.SUPPRESS) 
_parser.add_argument("--kernel", dest="kernel", type=str, required=True, default=argparse.SUPPRESS) _parsed_args = vars(_parser.parse_args()) _outputs = train_from_csv(**_parsed_args) args: - --train-data - {inputPath: train_data} - --train-target - {inputPath: train_target} - --model - {inputPath: model} - --kernel - {inputValue: kernel} ``` According to the content explained in the [Basic Usage Component](../kubeflow/basic-component.md#convert-to-kubeflow-format) previously mentioned, this component will be executed as follows: 1. `docker pull python:3.7` 2. run `command` However, when running the component created above, an error will occur. The reason is in the way the component wrapper is executed. Kubeflow uses Kubernetes, so the component wrapper runs the component content on its own separate container. In detail, the image specified in the generated `train_from_csv.yaml` is `image: python:3.7`. There may be some people who notice why it is not running for some reason. The `python:3.7` image does not have the packages we want to use, such as `dill`, `pandas`, and `sklearn`, installed. Therefore, when executing, it fails with an error indicating that the packages are not found. So, how can we add the packages? ## Adding packages During the process of converting Kubeflow, there are two ways to add packages: 1. Using `base_image` 2. Using `package_to_install` Let's check what arguments the function `create_component_from_func` used to compile the components can receive. ```bash def create_component_from_func( func: Callable, output_component_file: Optional[str] = None, base_image: Optional[str] = None, packages_to_install: List[str] = None, annotations: Optional[Mapping[str, str]] = None, ): ``` - `func`: Function that creates the component wrapper to be made into a component. - `base_image`: Image that the component wrapper will run on. - `packages_to_install`: Additional packages that need to be installed for the component to use. ### 1. 
base_image Take a closer look at the sequence in which the component is executed and it will be as follows: 1. `docker pull base_image` 2. `pip install packages_to_install` 3. run `command` If the base_image used by the component already has all the packages installed, you can use it without installing additional packages. For example, on this page we are going to write a Dockerfile like this: ```dockerfile FROM python:3.7 RUN pip install dill pandas scikit-learn ``` Let's build the image using the Dockerfile above. The Docker hub we will use for the practice is ghcr. You can choose a Docker hub according to your environment and upload it. ```bash docker build . -f Dockerfile -t ghcr.io/mlops-for-all/base-image docker push ghcr.io/mlops-for-all/base-image ``` Now let's try inputting the base image. ```python from functools import partial from kfp.components import InputPath, OutputPath, create_component_from_func @partial( create_component_from_func, base_image="ghcr.io/mlops-for-all/base-image:latest", ) def train_from_csv( train_data_path: InputPath("csv"), train_target_path: InputPath("csv"), model_path: OutputPath("dill"), kernel: str, ): import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) if __name__ == "__main__": train_from_csv.component_spec.save("train_from_csv.yaml") ``` If you compile the generated component, it will appear as follows. 
```bash name: Train from csv inputs: - {name: train_data, type: csv} - {name: train_target, type: csv} - {name: kernel, type: String} outputs: - {name: model, type: dill} implementation: container: image: ghcr.io/mlops-for-all/base-image:latest command: - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def _make_parent_dirs_and_return_path(file_path: str): import os os.makedirs(os.path.dirname(file_path), exist_ok=True) return file_path def train_from_csv( train_data_path, train_target_path, model_path, kernel, ): import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) import argparse _parser = argparse.ArgumentParser(prog='Train from csv', description='') _parser.add_argument("--train-data", dest="train_data_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--train-target", dest="train_target_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--kernel", dest="kernel", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--model", dest="model_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) _parsed_args = vars(_parser.parse_args()) _outputs = train_from_csv(**_parsed_args) args: - --train-data - {inputPath: train_data} - --train-target - {inputPath: train_target} - --kernel - {inputValue: kernel} - --model - {outputPath: model} ``` We can confirm that the base_image has been changed to the value we have set. ### 2. packages_to_install However, when packages are added, it takes a lot of time to create a new Docker image. In this case, we can use the `packages_to_install` argument to easily add packages to the container. 
```python from functools import partial from kfp.components import InputPath, OutputPath, create_component_from_func @partial( create_component_from_func, packages_to_install=["dill==0.3.4", "pandas==1.3.4", "scikit-learn==1.0.1"], ) def train_from_csv( train_data_path: InputPath("csv"), train_target_path: InputPath("csv"), model_path: OutputPath("dill"), kernel: str, ): import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) if __name__ == "__main__": train_from_csv.component_spec.save("train_from_csv.yaml") ``` If you execute the script, the `train_from_csv.yaml` file will be generated. ```bash name: Train from csv inputs: - {name: train_data, type: csv} - {name: train_target, type: csv} - {name: kernel, type: String} outputs: - {name: model, type: dill} implementation: container: image: python:3.7 command: - sh - -c - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'dill==0.3.4' 'pandas==1.3.4' 'scikit-learn==1.0.1' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'dill==0.3.4' 'pandas==1.3.4' 'scikit-learn==1.0.1' --user) && "$0" "$@" - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def _make_parent_dirs_and_return_path(file_path: str): import os os.makedirs(os.path.dirname(file_path), exist_ok=True) return file_path def train_from_csv( train_data_path, train_target_path, model_path, kernel, ): import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) import argparse 
_parser = argparse.ArgumentParser(prog='Train from csv', description='') _parser.add_argument("--train-data", dest="train_data_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--train-target", dest="train_target_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--kernel", dest="kernel", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--model", dest="model_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) _parsed_args = vars(_parser.parse_args()) _outputs = train_from_csv(**_parsed_args) args: - --train-data - {inputPath: train_data} - --train-target - {inputPath: train_target} - --kernel - {inputValue: kernel} - --model - {outputPath: model} ``` If we take a closer look at the order in which the components written above are executed, it looks like this: 1. `docker pull python:3.7` 2. `pip install dill==0.3.4 pandas==1.3.4 scikit-learn==1.0.1` 3. run `command` When the generated yaml file is closely examined, the following lines are automatically added, so that the necessary packages are installed and the program runs smoothly without errors. ```bash command: - sh - -c - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'dill==0.3.4' 'pandas==1.3.4' 'scikit-learn==1.0.1' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'dill==0.3.4' 'pandas==1.3.4' 'scikit-learn==1.0.1' --user) && "$0" "$@" ``` ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/kubeflow/advanced-mlflow.md ================================================ --- title : "12. 
Component - MLFlow" description: "" sidebar_position: 12 date: 2021-12-13 lastmod: 2021-12-20 contributors: ["Jongseob Jeon", "SeungTae Kim"] --- ## MLFlow Component In this page, we will explain the process of writing a component to store the model in MLFlow so that the model trained in [Advanced Usage Component](../kubeflow/advanced-component.md) can be linked to API deployment. ## MLFlow in Local In order to store the model in MLFlow and use it in serving, the following items are needed. - model - signature - input_example - conda_env We will look into the process of saving a model to MLFlow through Python code. ### 1. Train model The following steps involve training an SVC model using the iris dataset. ```python import pandas as pd from sklearn.datasets import load_iris from sklearn.svm import SVC iris = load_iris() data = pd.DataFrame(iris["data"], columns=iris["feature_names"]) target = pd.DataFrame(iris["target"], columns=["target"]) clf = SVC(kernel="rbf") clf.fit(data, target) ``` ### 2. MLFLow Infos This process creates the necessary information for MLFlow. ```python from mlflow.models.signature import infer_signature from mlflow.utils.environment import _mlflow_conda_env input_example = data.sample(1) signature = infer_signature(data, clf.predict(data)) conda_env = _mlflow_conda_env(additional_pip_deps=["dill", "pandas", "scikit-learn"]) ``` Each variable's content is as follows. - `input_example` | sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) | | --- | --- | --- | --- | | 6.5 | 6.7 | 3.1 | 4.4 | - `signature` ```python inputs: ['sepal length (cm)': double, 'sepal width (cm)': double, 'petal length (cm)': double, 'petal width (cm)': double] outputs: [Tensor('int64', (-1,))] ``` - `conda_env` ```python {'name': 'mlflow-env', 'channels': ['conda-forge'], 'dependencies': ['python=3.8.10', 'pip', {'pip': ['mlflow', 'dill', 'pandas', 'scikit-learn']}]} ``` ### 3. 
Save MLFlow Infos Next, we save
```python import mlflow with mlflow.start_run(): mlflow.log_artifact("svc/") ``` Save and open the `mlruns` directory generated path with `mlflow ui` command to launch mlflow server and dashboard. Access the mlflow dashboard, click the generated run to view it as below. ![mlflow-0.png](./img/mlflow-0.png) (This screen may vary depending on the version of mlflow.) ## MLFlow Component Now, let's write a reusable component in Kubeflow. The ways of writing components that can be reused are broadly divided into three categories. 1. After saving the necessary environment in the component responsible for model training, the MLflow component is only responsible for the upload. ![mlflow-1.png](./img/mlflow-1.png) 2. Pass the trained model and data to the MLflow component, which is responsible for saving and uploading. ![mlflow-2.png](./img/mlflow-2.png) 3. The component responsible for model training handles both saving and uploading. ![mlflow-3.png](./img/mlflow-3.png) We are trying to manage the model through the first approach. The reason is that we don't need to write the code to upload the MLFlow model every time like three times for each component written. Reusing components is possible by the methods 1 and 2. However, in the case of 2, it is necessary to deliver the trained image and packages to the component, so ultimately additional information about the component must be delivered. In order to proceed with the method 1, the learning component must also be changed. Code that stores the environment needed to save the model must be added. 
```python from functools import partial from kfp.components import InputPath, OutputPath, create_component_from_func @partial( create_component_from_func, packages_to_install=["dill", "pandas", "scikit-learn", "mlflow"], ) def train_from_csv( train_data_path: InputPath("csv"), train_target_path: InputPath("csv"), model_path: OutputPath("dill"), input_example_path: OutputPath("dill"), signature_path: OutputPath("dill"), conda_env_path: OutputPath("dill"), kernel: str, ): import dill import pandas as pd from sklearn.svm import SVC from mlflow.models.signature import infer_signature from mlflow.utils.environment import _mlflow_conda_env train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) input_example = train_data.sample(1) with open(input_example_path, "wb") as file_writer: dill.dump(input_example, file_writer) signature = infer_signature(train_data, clf.predict(train_data)) with open(signature_path, "wb") as file_writer: dill.dump(signature, file_writer) conda_env = _mlflow_conda_env( additional_pip_deps=["dill", "pandas", "scikit-learn"] ) with open(conda_env_path, "wb") as file_writer: dill.dump(conda_env, file_writer) ``` Write a component to upload to MLFlow. At this time, configure the uploaded MLFlow endpoint to be connected to the [mlflow service](../setup-components/install-components-mlflow.md) that we installed. In this case, use the Kubernetes Service DNS Name of the Minio installed at the time of MLFlow Server installation. As this service is created in the Kubeflow namespace with the name minio-service, set it to `http://minio-service.kubeflow.svc:9000`. Similarly, for the tracking_uri address, use the Kubernetes Service DNS Name of the MLFlow server and set it to `http://mlflow-server-service.mlflow-system.svc:5000`. 
```python from functools import partial from kfp.components import InputPath, create_component_from_func @partial( create_component_from_func, packages_to_install=["dill", "pandas", "scikit-learn", "mlflow", "boto3"], ) def upload_sklearn_model_to_mlflow( model_name: str, model_path: InputPath("dill"), input_example_path: InputPath("dill"), signature_path: InputPath("dill"), conda_env_path: InputPath("dill"), ): import os import dill from mlflow.sklearn import save_model from mlflow.tracking.client import MlflowClient os.environ["MLFLOW_S3_ENDPOINT_URL"] = "http://minio-service.kubeflow.svc:9000" os.environ["AWS_ACCESS_KEY_ID"] = "minio" os.environ["AWS_SECRET_ACCESS_KEY"] = "minio123" client = MlflowClient("http://mlflow-server-service.mlflow-system.svc:5000") with open(model_path, mode="rb") as file_reader: clf = dill.load(file_reader) with open(input_example_path, "rb") as file_reader: input_example = dill.load(file_reader) with open(signature_path, "rb") as file_reader: signature = dill.load(file_reader) with open(conda_env_path, "rb") as file_reader: conda_env = dill.load(file_reader) save_model( sk_model=clf, path=model_name, serialization_format="cloudpickle", conda_env=conda_env, signature=signature, input_example=input_example, ) run = client.create_run(experiment_id="0") client.log_artifact(run.info.run_id, model_name) ``` ## MLFlow Pipeline Now let's connect the components we have written and create a pipeline. ### Data Component The data we will use to train the model is sklearn's iris. We will write a component to generate the data. 
```python from functools import partial from kfp.components import InputPath, OutputPath, create_component_from_func @partial( create_component_from_func, packages_to_install=["pandas", "scikit-learn"], ) def load_iris_data( data_path: OutputPath("csv"), target_path: OutputPath("csv"), ): import pandas as pd from sklearn.datasets import load_iris iris = load_iris() data = pd.DataFrame(iris["data"], columns=iris["feature_names"]) target = pd.DataFrame(iris["target"], columns=["target"]) data.to_csv(data_path, index=False) target.to_csv(target_path, index=False) ``` ### Pipeline The pipeline code can be written as follows. ```python from kfp.dsl import pipeline @pipeline(name="mlflow_pipeline") def mlflow_pipeline(kernel: str, model_name: str): iris_data = load_iris_data() model = train_from_csv( train_data=iris_data.outputs["data"], train_target=iris_data.outputs["target"], kernel=kernel, ) _ = upload_sklearn_model_to_mlflow( model_name=model_name, model=model.outputs["model"], input_example=model.outputs["input_example"], signature=model.outputs["signature"], conda_env=model.outputs["conda_env"], ) ``` ### Run If you organize the components and pipelines written above into a single Python file, it would look like this. 
```python from functools import partial import kfp from kfp.components import InputPath, OutputPath, create_component_from_func from kfp.dsl import pipeline @partial( create_component_from_func, packages_to_install=["pandas", "scikit-learn"], ) def load_iris_data( data_path: OutputPath("csv"), target_path: OutputPath("csv"), ): import pandas as pd from sklearn.datasets import load_iris iris = load_iris() data = pd.DataFrame(iris["data"], columns=iris["feature_names"]) target = pd.DataFrame(iris["target"], columns=["target"]) data.to_csv(data_path, index=False) target.to_csv(target_path, index=False) @partial( create_component_from_func, packages_to_install=["dill", "pandas", "scikit-learn", "mlflow"], ) def train_from_csv( train_data_path: InputPath("csv"), train_target_path: InputPath("csv"), model_path: OutputPath("dill"), input_example_path: OutputPath("dill"), signature_path: OutputPath("dill"), conda_env_path: OutputPath("dill"), kernel: str, ): import dill import pandas as pd from sklearn.svm import SVC from mlflow.models.signature import infer_signature from mlflow.utils.environment import _mlflow_conda_env train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) input_example = train_data.sample(1) with open(input_example_path, "wb") as file_writer: dill.dump(input_example, file_writer) signature = infer_signature(train_data, clf.predict(train_data)) with open(signature_path, "wb") as file_writer: dill.dump(signature, file_writer) conda_env = _mlflow_conda_env( additional_pip_deps=["dill", "pandas", "scikit-learn"] ) with open(conda_env_path, "wb") as file_writer: dill.dump(conda_env, file_writer) @partial( create_component_from_func, packages_to_install=["dill", "pandas", "scikit-learn", "mlflow", "boto3"], ) def upload_sklearn_model_to_mlflow( model_name: str, model_path: InputPath("dill"), 
input_example_path: InputPath("dill"), signature_path: InputPath("dill"), conda_env_path: InputPath("dill"), ): import os import dill from mlflow.sklearn import save_model from mlflow.tracking.client import MlflowClient os.environ["MLFLOW_S3_ENDPOINT_URL"] = "http://minio-service.kubeflow.svc:9000" os.environ["AWS_ACCESS_KEY_ID"] = "minio" os.environ["AWS_SECRET_ACCESS_KEY"] = "minio123" client = MlflowClient("http://mlflow-server-service.mlflow-system.svc:5000") with open(model_path, mode="rb") as file_reader: clf = dill.load(file_reader) with open(input_example_path, "rb") as file_reader: input_example = dill.load(file_reader) with open(signature_path, "rb") as file_reader: signature = dill.load(file_reader) with open(conda_env_path, "rb") as file_reader: conda_env = dill.load(file_reader) save_model( sk_model=clf, path=model_name, serialization_format="cloudpickle", conda_env=conda_env, signature=signature, input_example=input_example, ) run = client.create_run(experiment_id="0") client.log_artifact(run.info.run_id, model_name) @pipeline(name="mlflow_pipeline") def mlflow_pipeline(kernel: str, model_name: str): iris_data = load_iris_data() model = train_from_csv( train_data=iris_data.outputs["data"], train_target=iris_data.outputs["target"], kernel=kernel, ) _ = upload_sklearn_model_to_mlflow( model_name=model_name, model=model.outputs["model"], input_example=model.outputs["input_example"], signature=model.outputs["signature"], conda_env=model.outputs["conda_env"], ) if __name__ == "__main__": kfp.compiler.Compiler().compile(mlflow_pipeline, "mlflow_pipeline.yaml") ```

mlflow_pipeline.yaml ```bash apiVersion: argoproj.io/v1alpha1 kind: Workflow metadata: generateName: mlflow-pipeline- annotations: {pipelines.kubeflow.org/kfp_sdk_version: 1.8.10, pipelines.kubeflow.org/pipeline_compilation_time: '2022-01-19T14:14:11.999807', pipelines.kubeflow.org/pipeline_spec: '{"inputs": [{"name": "kernel", "type": "String"}, {"name": "model_name", "type": "String"}], "name": "mlflow_pipeline"}'} labels: {pipelines.kubeflow.org/kfp_sdk_version: 1.8.10} spec: entrypoint: mlflow-pipeline templates: - name: load-iris-data container: args: [--data, /tmp/outputs/data/data, --target, /tmp/outputs/target/data] command: - sh - -c - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'pandas' 'scikit-learn' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'pandas' 'scikit-learn' --user) && "$0" "$@" - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def _make_parent_dirs_and_return_path(file_path: str): import os os.makedirs(os.path.dirname(file_path), exist_ok=True) return file_path def load_iris_data( data_path, target_path, ): import pandas as pd from sklearn.datasets import load_iris iris = load_iris() data = pd.DataFrame(iris["data"], columns=iris["feature_names"]) target = pd.DataFrame(iris["target"], columns=["target"]) data.to_csv(data_path, index=False) target.to_csv(target_path, index=False) import argparse _parser = argparse.ArgumentParser(prog='Load iris data', description='') _parser.add_argument("--data", dest="data_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) _parser.add_argument("--target", dest="target_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) _parsed_args = vars(_parser.parse_args()) _outputs = load_iris_data(**_parsed_args) image: python:3.7 outputs: artifacts: - {name: load-iris-data-data, path: 
/tmp/outputs/data/data} - {name: load-iris-data-target, path: /tmp/outputs/target/data} metadata: labels: pipelines.kubeflow.org/kfp_sdk_version: 1.8.10 pipelines.kubeflow.org/pipeline-sdk-type: kfp pipelines.kubeflow.org/enable_caching: "true" annotations: {pipelines.kubeflow.org/component_spec: '{"implementation": {"container": {"args": ["--data", {"outputPath": "data"}, "--target", {"outputPath": "target"}], "command": ["sh", "-c", "(PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location ''pandas'' ''scikit-learn'' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location ''pandas'' ''scikit-learn'' --user) && \"$0\" \"$@\"", "sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def _make_parent_dirs_and_return_path(file_path: str):\n import os\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\n return file_path\n\ndef load_iris_data(\n data_path,\n target_path,\n):\n import pandas as pd\n from sklearn.datasets import load_iris\n\n iris = load_iris()\n\n data = pd.DataFrame(iris[\"data\"], columns=iris[\"feature_names\"])\n target = pd.DataFrame(iris[\"target\"], columns=[\"target\"])\n\n data.to_csv(data_path, index=False)\n target.to_csv(target_path, index=False)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Load iris data'', description='''')\n_parser.add_argument(\"--data\", dest=\"data_path\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--target\", dest=\"target_path\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs = load_iris_data(**_parsed_args)\n"], "image": "python:3.7"}}, "name": "Load iris data", "outputs": [{"name": "data", "type": "csv"}, {"name": "target", "type": "csv"}]}', pipelines.kubeflow.org/component_ref: '{}'} - name: mlflow-pipeline 
inputs: parameters: - {name: kernel} - {name: model_name} dag: tasks: - {name: load-iris-data, template: load-iris-data} - name: train-from-csv template: train-from-csv dependencies: [load-iris-data] arguments: parameters: - {name: kernel, value: '{{inputs.parameters.kernel}}'} artifacts: - {name: load-iris-data-data, from: '{{tasks.load-iris-data.outputs.artifacts.load-iris-data-data}}'} - {name: load-iris-data-target, from: '{{tasks.load-iris-data.outputs.artifacts.load-iris-data-target}}'} - name: upload-sklearn-model-to-mlflow template: upload-sklearn-model-to-mlflow dependencies: [train-from-csv] arguments: parameters: - {name: model_name, value: '{{inputs.parameters.model_name}}'} artifacts: - {name: train-from-csv-conda_env, from: '{{tasks.train-from-csv.outputs.artifacts.train-from-csv-conda_env}}'} - {name: train-from-csv-input_example, from: '{{tasks.train-from-csv.outputs.artifacts.train-from-csv-input_example}}'} - {name: train-from-csv-model, from: '{{tasks.train-from-csv.outputs.artifacts.train-from-csv-model}}'} - {name: train-from-csv-signature, from: '{{tasks.train-from-csv.outputs.artifacts.train-from-csv-signature}}'} - name: train-from-csv container: args: [--train-data, /tmp/inputs/train_data/data, --train-target, /tmp/inputs/train_target/data, --kernel, '{{inputs.parameters.kernel}}', --model, /tmp/outputs/model/data, --input-example, /tmp/outputs/input_example/data, --signature, /tmp/outputs/signature/data, --conda-env, /tmp/outputs/conda_env/data] command: - sh - -c - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'dill' 'pandas' 'scikit-learn' 'mlflow' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'dill' 'pandas' 'scikit-learn' 'mlflow' --user) && "$0" "$@" - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def _make_parent_dirs_and_return_path(file_path: str): import os 
os.makedirs(os.path.dirname(file_path), exist_ok=True) return file_path def train_from_csv( train_data_path, train_target_path, model_path, input_example_path, signature_path, conda_env_path, kernel, ): import dill import pandas as pd from sklearn.svm import SVC from mlflow.models.signature import infer_signature from mlflow.utils.environment import _mlflow_conda_env train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) input_example = train_data.sample(1) with open(input_example_path, "wb") as file_writer: dill.dump(input_example, file_writer) signature = infer_signature(train_data, clf.predict(train_data)) with open(signature_path, "wb") as file_writer: dill.dump(signature, file_writer) conda_env = _mlflow_conda_env( additional_pip_deps=["dill", "pandas", "scikit-learn"] ) with open(conda_env_path, "wb") as file_writer: dill.dump(conda_env, file_writer) import argparse _parser = argparse.ArgumentParser(prog='Train from csv', description='') _parser.add_argument("--train-data", dest="train_data_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--train-target", dest="train_target_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--kernel", dest="kernel", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--model", dest="model_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) _parser.add_argument("--input-example", dest="input_example_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) _parser.add_argument("--signature", dest="signature_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) _parser.add_argument("--conda-env", dest="conda_env_path", type=_make_parent_dirs_and_return_path, required=True, 
default=argparse.SUPPRESS) _parsed_args = vars(_parser.parse_args()) _outputs = train_from_csv(**_parsed_args) image: python:3.7 inputs: parameters: - {name: kernel} artifacts: - {name: load-iris-data-data, path: /tmp/inputs/train_data/data} - {name: load-iris-data-target, path: /tmp/inputs/train_target/data} outputs: artifacts: - {name: train-from-csv-conda_env, path: /tmp/outputs/conda_env/data} - {name: train-from-csv-input_example, path: /tmp/outputs/input_example/data} - {name: train-from-csv-model, path: /tmp/outputs/model/data} - {name: train-from-csv-signature, path: /tmp/outputs/signature/data} metadata: labels: pipelines.kubeflow.org/kfp_sdk_version: 1.8.10 pipelines.kubeflow.org/pipeline-sdk-type: kfp pipelines.kubeflow.org/enable_caching: "true" annotations: {pipelines.kubeflow.org/component_spec: '{"implementation": {"container": {"args": ["--train-data", {"inputPath": "train_data"}, "--train-target", {"inputPath": "train_target"}, "--kernel", {"inputValue": "kernel"}, "--model", {"outputPath": "model"}, "--input-example", {"outputPath": "input_example"}, "--signature", {"outputPath": "signature"}, "--conda-env", {"outputPath": "conda_env"}], "command": ["sh", "-c", "(PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location ''dill'' ''pandas'' ''scikit-learn'' ''mlflow'' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location ''dill'' ''pandas'' ''scikit-learn'' ''mlflow'' --user) && \"$0\" \"$@\"", "sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def _make_parent_dirs_and_return_path(file_path: str):\n import os\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\n return file_path\n\ndef train_from_csv(\n train_data_path,\n train_target_path,\n model_path,\n input_example_path,\n signature_path,\n conda_env_path,\n kernel,\n):\n import dill\n import pandas as pd\n from sklearn.svm import SVC\n\n from 
mlflow.models.signature import infer_signature\n from mlflow.utils.environment import _mlflow_conda_env\n\n train_data = pd.read_csv(train_data_path)\n train_target = pd.read_csv(train_target_path)\n\n clf = SVC(kernel=kernel)\n clf.fit(train_data, train_target)\n\n with open(model_path, mode=\"wb\") as file_writer:\n dill.dump(clf, file_writer)\n\n input_example = train_data.sample(1)\n with open(input_example_path, \"wb\") as file_writer:\n dill.dump(input_example, file_writer)\n\n signature = infer_signature(train_data, clf.predict(train_data))\n with open(signature_path, \"wb\") as file_writer:\n dill.dump(signature, file_writer)\n\n conda_env = _mlflow_conda_env(\n additional_pip_deps=[\"dill\", \"pandas\", \"scikit-learn\"]\n )\n with open(conda_env_path, \"wb\") as file_writer:\n dill.dump(conda_env, file_writer)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Train from csv'', description='''')\n_parser.add_argument(\"--train-data\", dest=\"train_data_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--train-target\", dest=\"train_target_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--kernel\", dest=\"kernel\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--model\", dest=\"model_path\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--input-example\", dest=\"input_example_path\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--signature\", dest=\"signature_path\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--conda-env\", dest=\"conda_env_path\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs = train_from_csv(**_parsed_args)\n"], "image": "python:3.7"}}, "inputs": [{"name": 
"train_data", "type": "csv"}, {"name": "train_target", "type": "csv"}, {"name": "kernel", "type": "String"}], "name": "Train from csv", "outputs": [{"name": "model", "type": "dill"}, {"name": "input_example", "type": "dill"}, {"name": "signature", "type": "dill"}, {"name": "conda_env", "type": "dill"}]}', pipelines.kubeflow.org/component_ref: '{}', pipelines.kubeflow.org/arguments.parameters: '{"kernel": "{{inputs.parameters.kernel}}"}'} - name: upload-sklearn-model-to-mlflow container: args: [--model-name, '{{inputs.parameters.model_name}}', --model, /tmp/inputs/model/data, --input-example, /tmp/inputs/input_example/data, --signature, /tmp/inputs/signature/data, --conda-env, /tmp/inputs/conda_env/data] command: - sh - -c - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'dill' 'pandas' 'scikit-learn' 'mlflow' 'boto3' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'dill' 'pandas' 'scikit-learn' 'mlflow' 'boto3' --user) && "$0" "$@" - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def upload_sklearn_model_to_mlflow( model_name, model_path, input_example_path, signature_path, conda_env_path, ): import os import dill from mlflow.sklearn import save_model from mlflow.tracking.client import MlflowClient os.environ["MLFLOW_S3_ENDPOINT_URL"] = "http://minio-service.kubeflow.svc:9000" os.environ["AWS_ACCESS_KEY_ID"] = "minio" os.environ["AWS_SECRET_ACCESS_KEY"] = "minio123" client = MlflowClient("http://mlflow-server-service.mlflow-system.svc:5000") with open(model_path, mode="rb") as file_reader: clf = dill.load(file_reader) with open(input_example_path, "rb") as file_reader: input_example = dill.load(file_reader) with open(signature_path, "rb") as file_reader: signature = dill.load(file_reader) with open(conda_env_path, "rb") as file_reader: conda_env = dill.load(file_reader) save_model( sk_model=clf, path=model_name, 
serialization_format="cloudpickle", conda_env=conda_env, signature=signature, input_example=input_example, ) run = client.create_run(experiment_id="0") client.log_artifact(run.info.run_id, model_name) import argparse _parser = argparse.ArgumentParser(prog='Upload sklearn model to mlflow', description='') _parser.add_argument("--model-name", dest="model_name", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--model", dest="model_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--input-example", dest="input_example_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--signature", dest="signature_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--conda-env", dest="conda_env_path", type=str, required=True, default=argparse.SUPPRESS) _parsed_args = vars(_parser.parse_args()) _outputs = upload_sklearn_model_to_mlflow(**_parsed_args) image: python:3.7 inputs: parameters: - {name: model_name} artifacts: - {name: train-from-csv-conda_env, path: /tmp/inputs/conda_env/data} - {name: train-from-csv-input_example, path: /tmp/inputs/input_example/data} - {name: train-from-csv-model, path: /tmp/inputs/model/data} - {name: train-from-csv-signature, path: /tmp/inputs/signature/data} metadata: labels: pipelines.kubeflow.org/kfp_sdk_version: 1.8.10 pipelines.kubeflow.org/pipeline-sdk-type: kfp pipelines.kubeflow.org/enable_caching: "true" annotations: {pipelines.kubeflow.org/component_spec: '{"implementation": {"container": {"args": ["--model-name", {"inputValue": "model_name"}, "--model", {"inputPath": "model"}, "--input-example", {"inputPath": "input_example"}, "--signature", {"inputPath": "signature"}, "--conda-env", {"inputPath": "conda_env"}], "command": ["sh", "-c", "(PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location ''dill'' ''pandas'' ''scikit-learn'' ''mlflow'' ''boto3'' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip 
install --quiet --no-warn-script-location ''dill'' ''pandas'' ''scikit-learn'' ''mlflow'' ''boto3'' --user) && \"$0\" \"$@\"", "sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def upload_sklearn_model_to_mlflow(\n model_name,\n model_path,\n input_example_path,\n signature_path,\n conda_env_path,\n):\n import os\n import dill\n from mlflow.sklearn import save_model\n\n from mlflow.tracking.client import MlflowClient\n\n os.environ[\"MLFLOW_S3_ENDPOINT_URL\"] = \"http://minio-service.kubeflow.svc:9000\"\n os.environ[\"AWS_ACCESS_KEY_ID\"] = \"minio\"\n os.environ[\"AWS_SECRET_ACCESS_KEY\"] = \"minio123\"\n\n client = MlflowClient(\"http://mlflow-server-service.mlflow-system.svc:5000\")\n\n with open(model_path, mode=\"rb\") as file_reader:\n clf = dill.load(file_reader)\n\n with open(input_example_path, \"rb\") as file_reader:\n input_example = dill.load(file_reader)\n\n with open(signature_path, \"rb\") as file_reader:\n signature = dill.load(file_reader)\n\n with open(conda_env_path, \"rb\") as file_reader:\n conda_env = dill.load(file_reader)\n\n save_model(\n sk_model=clf,\n path=model_name,\n serialization_format=\"cloudpickle\",\n conda_env=conda_env,\n signature=signature,\n input_example=input_example,\n )\n run = client.create_run(experiment_id=\"0\")\n client.log_artifact(run.info.run_id, model_name)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Upload sklearn model to mlflow'', description='''')\n_parser.add_argument(\"--model-name\", dest=\"model_name\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--model\", dest=\"model_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--input-example\", dest=\"input_example_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--signature\", dest=\"signature_path\", type=str, required=True, 
default=argparse.SUPPRESS)\n_parser.add_argument(\"--conda-env\", dest=\"conda_env_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs = upload_sklearn_model_to_mlflow(**_parsed_args)\n"], "image": "python:3.7"}}, "inputs": [{"name": "model_name", "type": "String"}, {"name": "model", "type": "dill"}, {"name": "input_example", "type": "dill"}, {"name": "signature", "type": "dill"}, {"name": "conda_env", "type": "dill"}], "name": "Upload sklearn model to mlflow"}', pipelines.kubeflow.org/component_ref: '{}', pipelines.kubeflow.org/arguments.parameters: '{"model_name": "{{inputs.parameters.model_name}}"}'} arguments: parameters: - {name: kernel} - {name: model_name} serviceAccountName: pipeline-runner ```

After running the script to generate the mlflow_pipeline.yaml file, upload the pipeline and execute it to check the results of the run.

![mlflow-svc-0](./img/mlflow-svc-0.png)

Port-forward the mlflow service to access the MLflow UI.

```bash
kubectl port-forward svc/mlflow-server-service -n mlflow-system 5000:5000
```

Open a web browser and connect to localhost:5000. You will then be able to see that the run has been created as follows.

![mlflow-svc-1](./img/mlflow-svc-1.png)

Click on the run to verify that the trained model file is present.

![mlflow-svc-2](./img/mlflow-svc-2.png)

================================================
FILE: i18n/en/docusaurus-plugin-content-docs/current/kubeflow/advanced-pipeline.md
================================================
---
title : "10. Pipeline - Setting"
description: ""
sidebar_position: 10
contributors: ["Jongseob Jeon"]
---

## Pipeline Setting

In this page, we will look at values that can be set in the pipeline.

## Display Name

Components created within the pipeline have two names:

- task_name: the function name when writing the component
- display_name: the name that appears in the Kubeflow UI

For example, in the case where both components are set to Print and return number, it is difficult to tell which component is 1 or 2.

![run-7](./img/run-7.png)

### set_display_name

The solution for this is the display_name. We can set the display_name in the pipeline by using the set_display_name [method](https://kubeflow-pipelines.readthedocs.io/en/latest/source/kfp.dsl.html#kfp.dsl.ContainerOp.set_display_name) of the component.
```python import kfp from kfp.components import create_component_from_func from kfp.dsl import pipeline @create_component_from_func def print_and_return_number(number: int) -> int: print(number) return number @create_component_from_func def sum_and_print_numbers(number_1: int, number_2: int): print(number_1 + number_2) @pipeline(name="example_pipeline") def example_pipeline(number_1: int, number_2: int): number_1_result = print_and_return_number(number_1).set_display_name("This is number 1") number_2_result = print_and_return_number(number_2).set_display_name("This is number 2") sum_result = sum_and_print_numbers( number_1=number_1_result.output, number_2=number_2_result.output ).set_display_name("This is sum of number 1 and number 2") if __name__ == "__main__": kfp.compiler.Compiler().compile(example_pipeline, "example_pipeline.yaml") ``` If you run this script and check the resulting `example_pipeline.yaml`, it would be like this.

example_pipeline.yaml ```bash apiVersion: argoproj.io/v1alpha1 kind: Workflow metadata: generateName: example-pipeline- annotations: {pipelines.kubeflow.org/kfp_sdk_version: 1.8.9, pipelines.kubeflow.org/pipeline_compilation_time: '2021-12-09T18:11:43.193190', pipelines.kubeflow.org/pipeline_spec: '{"inputs": [{"name": "number_1", "type": "Integer"}, {"name": "number_2", "type": "Integer"}], "name": "example_pipeline"}'} labels: {pipelines.kubeflow.org/kfp_sdk_version: 1.8.9} spec: entrypoint: example-pipeline templates: - name: example-pipeline inputs: parameters: - {name: number_1} - {name: number_2} dag: tasks: - name: print-and-return-number template: print-and-return-number arguments: parameters: - {name: number_1, value: '{{inputs.parameters.number_1}}'} - name: print-and-return-number-2 template: print-and-return-number-2 arguments: parameters: - {name: number_2, value: '{{inputs.parameters.number_2}}'} - name: sum-and-print-numbers template: sum-and-print-numbers dependencies: [print-and-return-number, print-and-return-number-2] arguments: parameters: - {name: print-and-return-number-2-Output, value: '{{tasks.print-and-return-number-2.outputs.parameters.print-and-return-number-2-Output}}'} - {name: print-and-return-number-Output, value: '{{tasks.print-and-return-number.outputs.parameters.print-and-return-number-Output}}'} - name: print-and-return-number container: args: [--number, '{{inputs.parameters.number_1}}', '----output-paths', /tmp/outputs/Output/data] command: - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def print_and_return_number(number): print(number) return number def _serialize_int(int_value: int) -> str: if isinstance(int_value, str): return int_value if not isinstance(int_value, int): raise TypeError('Value "{}" has type "{}" instead of int.'.format( str(int_value), str(type(int_value)))) return str(int_value) import argparse _parser = argparse.ArgumentParser(prog='Print and 
return number', description='') _parser.add_argument("--number", dest="number", type=int, required=True, default=argparse.SUPPRESS) _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1) _parsed_args = vars(_parser.parse_args()) _output_files = _parsed_args.pop("_output_paths", []) _outputs = print_and_return_number(**_parsed_args) _outputs = [_outputs] _output_serializers = [ _serialize_int, ] import os for idx, output_file in enumerate(_output_files): try: os.makedirs(os.path.dirname(output_file)) except OSError: pass with open(output_file, 'w') as f: f.write(_output_serializers[idx](_outputs[idx])) image: python:3.7 inputs: parameters: - {name: number_1} outputs: parameters: - name: print-and-return-number-Output valueFrom: {path: /tmp/outputs/Output/data} artifacts: - {name: print-and-return-number-Output, path: /tmp/outputs/Output/data} metadata: annotations: {pipelines.kubeflow.org/task_display_name: This is number 1, pipelines.kubeflow.org/component_spec: '{"implementation": {"container": {"args": ["--number", {"inputValue": "number"}, "----output-paths", {"outputPath": "Output"}], "command": ["sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def print_and_return_number(number):\n print(number)\n return number\n\ndef _serialize_int(int_value: int) -> str:\n if isinstance(int_value, str):\n return int_value\n if not isinstance(int_value, int):\n raise TypeError(''Value \"{}\" has type \"{}\" instead of int.''.format(\n str(int_value), str(type(int_value))))\n return str(int_value)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Print and return number'', description='''')\n_parser.add_argument(\"--number\", dest=\"number\", type=int, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"----output-paths\", dest=\"_output_paths\", type=str, nargs=1)\n_parsed_args = vars(_parser.parse_args())\n_output_files = 
_parsed_args.pop(\"_output_paths\", [])\n\n_outputs = print_and_return_number(**_parsed_args)\n\n_outputs = [_outputs]\n\n_output_serializers = [\n _serialize_int,\n\n]\n\nimport os\nfor idx, output_file in enumerate(_output_files):\n try:\n os.makedirs(os.path.dirname(output_file))\n except OSError:\n pass\n with open(output_file, ''w'') as f:\n f.write(_output_serializers[idx](_outputs[idx]))\n"], "image": "python:3.7"}}, "inputs": [{"name": "number", "type": "Integer"}], "name": "Print and return number", "outputs": [{"name": "Output", "type": "Integer"}]}', pipelines.kubeflow.org/component_ref: '{}', pipelines.kubeflow.org/arguments.parameters: '{"number": "{{inputs.parameters.number_1}}"}'} labels: pipelines.kubeflow.org/kfp_sdk_version: 1.8.9 pipelines.kubeflow.org/pipeline-sdk-type: kfp pipelines.kubeflow.org/enable_caching: "true" - name: print-and-return-number-2 container: args: [--number, '{{inputs.parameters.number_2}}', '----output-paths', /tmp/outputs/Output/data] command: - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def print_and_return_number(number): print(number) return number def _serialize_int(int_value: int) -> str: if isinstance(int_value, str): return int_value if not isinstance(int_value, int): raise TypeError('Value "{}" has type "{}" instead of int.'.format( str(int_value), str(type(int_value)))) return str(int_value) import argparse _parser = argparse.ArgumentParser(prog='Print and return number', description='') _parser.add_argument("--number", dest="number", type=int, required=True, default=argparse.SUPPRESS) _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1) _parsed_args = vars(_parser.parse_args()) _output_files = _parsed_args.pop("_output_paths", []) _outputs = print_and_return_number(**_parsed_args) _outputs = [_outputs] _output_serializers = [ _serialize_int, ] import os for idx, output_file in enumerate(_output_files): try: 
os.makedirs(os.path.dirname(output_file)) except OSError: pass with open(output_file, 'w') as f: f.write(_output_serializers[idx](_outputs[idx])) image: python:3.7 inputs: parameters: - {name: number_2} outputs: parameters: - name: print-and-return-number-2-Output valueFrom: {path: /tmp/outputs/Output/data} artifacts: - {name: print-and-return-number-2-Output, path: /tmp/outputs/Output/data} metadata: annotations: {pipelines.kubeflow.org/task_display_name: This is number 2, pipelines.kubeflow.org/component_spec: '{"implementation": {"container": {"args": ["--number", {"inputValue": "number"}, "----output-paths", {"outputPath": "Output"}], "command": ["sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def print_and_return_number(number):\n print(number)\n return number\n\ndef _serialize_int(int_value: int) -> str:\n if isinstance(int_value, str):\n return int_value\n if not isinstance(int_value, int):\n raise TypeError(''Value \"{}\" has type \"{}\" instead of int.''.format(\n str(int_value), str(type(int_value))))\n return str(int_value)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Print and return number'', description='''')\n_parser.add_argument(\"--number\", dest=\"number\", type=int, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"----output-paths\", dest=\"_output_paths\", type=str, nargs=1)\n_parsed_args = vars(_parser.parse_args())\n_output_files = _parsed_args.pop(\"_output_paths\", [])\n\n_outputs = print_and_return_number(**_parsed_args)\n\n_outputs = [_outputs]\n\n_output_serializers = [\n _serialize_int,\n\n]\n\nimport os\nfor idx, output_file in enumerate(_output_files):\n try:\n os.makedirs(os.path.dirname(output_file))\n except OSError:\n pass\n with open(output_file, ''w'') as f:\n f.write(_output_serializers[idx](_outputs[idx]))\n"], "image": "python:3.7"}}, "inputs": [{"name": "number", "type": "Integer"}], "name": "Print and return number", 
"outputs": [{"name": "Output", "type": "Integer"}]}', pipelines.kubeflow.org/component_ref: '{}', pipelines.kubeflow.org/arguments.parameters: '{"number": "{{inputs.parameters.number_2}}"}'} labels: pipelines.kubeflow.org/kfp_sdk_version: 1.8.9 pipelines.kubeflow.org/pipeline-sdk-type: kfp pipelines.kubeflow.org/enable_caching: "true" - name: sum-and-print-numbers container: args: [--number-1, '{{inputs.parameters.print-and-return-number-Output}}', --number-2, '{{inputs.parameters.print-and-return-number-2-Output}}'] command: - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def sum_and_print_numbers(number_1, number_2): print(number_1 + number_2) import argparse _parser = argparse.ArgumentParser(prog='Sum and print numbers', description='') _parser.add_argument("--number-1", dest="number_1", type=int, required=True, default=argparse.SUPPRESS) _parser.add_argument("--number-2", dest="number_2", type=int, required=True, default=argparse.SUPPRESS) _parsed_args = vars(_parser.parse_args()) _outputs = sum_and_print_numbers(**_parsed_args) image: python:3.7 inputs: parameters: - {name: print-and-return-number-2-Output} - {name: print-and-return-number-Output} metadata: annotations: {pipelines.kubeflow.org/task_display_name: This is sum of number 1 and number 2, pipelines.kubeflow.org/component_spec: '{"implementation": {"container": {"args": ["--number-1", {"inputValue": "number_1"}, "--number-2", {"inputValue": "number_2"}], "command": ["sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def sum_and_print_numbers(number_1, number_2):\n print(number_1 + number_2)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Sum and print numbers'', description='''')\n_parser.add_argument(\"--number-1\", dest=\"number_1\", type=int, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--number-2\", dest=\"number_2\", type=int, 
required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs = sum_and_print_numbers(**_parsed_args)\n"], "image": "python:3.7"}}, "inputs": [{"name": "number_1", "type": "Integer"}, {"name": "number_2", "type": "Integer"}], "name": "Sum and print numbers"}', pipelines.kubeflow.org/component_ref: '{}', pipelines.kubeflow.org/arguments.parameters: '{"number_1": "{{inputs.parameters.print-and-return-number-Output}}", "number_2": "{{inputs.parameters.print-and-return-number-2-Output}}"}'} labels: pipelines.kubeflow.org/kfp_sdk_version: 1.8.9 pipelines.kubeflow.org/pipeline-sdk-type: kfp pipelines.kubeflow.org/enable_caching: "true" arguments: parameters: - {name: number_1} - {name: number_2} serviceAccountName: pipeline-runner ```

If compared with the previous file, the **`pipelines.kubeflow.org/task_display_name`** key has been newly created. ### UI in Kubeflow We will upload the version of the previously created [pipeline](../kubeflow/basic-pipeline-upload.md#upload-pipeline-version) using the files we created earlier. ![adv-pipeline-0.png](./img/adv-pipeline-0.png) As you can see, the configured name is displayed as shown above. ## Resources ### GPU By default, when the pipeline runs components as Kubernetes pods, it uses the default resource specifications. If you need to train a model using a GPU and the Kubernetes environment doesn't allocate a GPU, the training may not be performed correctly. To address this, you can use the `set_gpu_limit()` [attribute](https://kubeflow-pipelines.readthedocs.io/en/latest/source/kfp.dsl.html?highlight=set_gpu_limit#kfp.dsl.UserContainer.set_gpu_limit) to set the GPU limit. ```python import kfp from kfp.components import create_component_from_func from kfp.dsl import pipeline @create_component_from_func def print_and_return_number(number: int) -> int: print(number) return number @create_component_from_func def sum_and_print_numbers(number_1: int, number_2: int): print(number_1 + number_2) @pipeline(name="example_pipeline") def example_pipeline(number_1: int, number_2: int): number_1_result = print_and_return_number(number_1).set_display_name("This is number 1") number_2_result = print_and_return_number(number_2).set_display_name("This is number 2") sum_result = sum_and_print_numbers( number_1=number_1_result.output, number_2=number_2_result.output ).set_display_name("This is sum of number 1 and number 2").set_gpu_limit(1) if __name__ == "__main__": kfp.compiler.Compiler().compile(example_pipeline, "example_pipeline.yaml") ``` If you execute the above script, you can see that the resources has been added with `{nvidia.com/gpu: 1}` in the generated file when you look closely at `sum-and-print-numbers`. Through this, you can allocate a GPU. 
```bash - name: sum-and-print-numbers container: args: [--number-1, '{{inputs.parameters.print-and-return-number-Output}}', --number-2, '{{inputs.parameters.print-and-return-number-2-Output}}'] command: - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def sum_and_print_numbers(number_1, number_2): print(number_1 + number_2) import argparse _parser = argparse.ArgumentParser(prog='Sum and print numbers', description='') _parser.add_argument("--number-1", dest="number_1", type=int, required=True, default=argparse.SUPPRESS) _parser.add_argument("--number-2", dest="number_2", type=int, required=True, default=argparse.SUPPRESS) _parsed_args = vars(_parser.parse_args()) _outputs = sum_and_print_numbers(**_parsed_args) image: python:3.7 resources: limits: {nvidia.com/gpu: 1} ``` ### CPU The number of CPUs can be set using the `.set_cpu_limit()` [attribute](https://kubeflow-pipelines.readthedocs.io/en/latest/source/kfp.dsl.html?highlight=set_gpu_limit#kfp.dsl.Sidecar.set_cpu_limit). The difference from GPUs is that the input must be a string, not an int. 
```python import kfp from kfp.components import create_component_from_func from kfp.dsl import pipeline @create_component_from_func def print_and_return_number(number: int) -> int: print(number) return number @create_component_from_func def sum_and_print_numbers(number_1: int, number_2: int): print(number_1 + number_2) @pipeline(name="example_pipeline") def example_pipeline(number_1: int, number_2: int): number_1_result = print_and_return_number(number_1).set_display_name("This is number 1") number_2_result = print_and_return_number(number_2).set_display_name("This is number 2") sum_result = sum_and_print_numbers( number_1=number_1_result.output, number_2=number_2_result.output ).set_display_name("This is sum of number 1 and number 2").set_gpu_limit(1).set_cpu_limit("16") if __name__ == "__main__": kfp.compiler.Compiler().compile(example_pipeline, "example_pipeline.yaml") ``` The changed part only can be confirmed as follows. ```bash resources: limits: {nvidia.com/gpu: 1, cpu: '16'} ``` ### Memory Memory can be set using the `.set_memory_limit()` [attribute](https://kubeflow-pipelines.readthedocs.io/en/latest/source/kfp.dsl.html?highlight=set_gpu_limit#kfp.dsl.Sidecar.set_memory_limit). 
```python import kfp from kfp.components import create_component_from_func from kfp.dsl import pipeline @create_component_from_func def print_and_return_number(number: int) -> int: print(number) return number @create_component_from_func def sum_and_print_numbers(number_1: int, number_2: int): print(number_1 + number_2) @pipeline(name="example_pipeline") def example_pipeline(number_1: int, number_2: int): number_1_result = print_and_return_number(number_1).set_display_name("This is number 1") number_2_result = print_and_return_number(number_2).set_display_name("This is number 2") sum_result = sum_and_print_numbers( number_1=number_1_result.output, number_2=number_2_result.output ).set_display_name("This is sum of number 1 and number 2").set_gpu_limit(1).set_memory_limit("1G") if __name__ == "__main__": kfp.compiler.Compiler().compile(example_pipeline, "example_pipeline.yaml") ``` The changed parts are as follows if checked. ```bash resources: limits: {nvidia.com/gpu: 1, memory: 1G} ``` ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/kubeflow/advanced-run.md ================================================ --- title : "11. Pipeline - Run Result" description: "" sidebar_position: 11 contributors: ["Jongseob Jeon", "SeungTae Kim"] --- ## Run Result Click Run Result and you will see three tabs: Graph, Run Output, and Config. ![advanced-run-0.png](./img/advanced-run-0.png) ## Graph ![advanced-run-1.png](./img/advanced-run-1.png) In the graph, if you click on the run component, you can check the running information of the component. ### Input/Output The Input/Output tab allows you to view and download the Configurations, Input, and Output Artifacts used in the components. ### Logs In the Logs tab, you can view all the stdout output generated during the execution of the Python code. However, pods are deleted after a certain period of time, so you may not be able to view them in this tab after a certain time. 
In that case, you can check them in the main-logs section of the Output artifacts. ### Visualizations The Visualizations tab displays plots generated by the components. To generate a plot, you can save the desired values as an argument using `mlpipeline_ui_metadata: OutputPath("UI_Metadata")`. The plot should be in HTML format. The conversion process is as follows. ```python @partial( create_component_from_func, packages_to_install=["matplotlib"], ) def plot_linear( mlpipeline_ui_metadata: OutputPath("UI_Metadata") ): import base64 import json from io import BytesIO import matplotlib.pyplot as plt plt.plot([1, 2, 3], [1, 2, 3]) tmpfile = BytesIO() plt.savefig(tmpfile, format="png") encoded = base64.b64encode(tmpfile.getvalue()).decode("utf-8") html = f"<img src='data:image/png;base64,{encoded}'>" metadata = { "outputs": [ { "type": "web-app", "storage": "inline", "source": html, }, ], } with open(mlpipeline_ui_metadata, "w") as html_writer: json.dump(metadata, html_writer) ``` If written in pipeline, it will be like this. ```python from functools import partial import kfp from kfp.components import create_component_from_func, OutputPath from kfp.dsl import pipeline @partial( create_component_from_func, packages_to_install=["matplotlib"], ) def plot_linear(mlpipeline_ui_metadata: OutputPath("UI_Metadata")): import base64 import json from io import BytesIO import matplotlib.pyplot as plt plt.plot([1, 2, 3], [1, 2, 3]) tmpfile = BytesIO() plt.savefig(tmpfile, format="png") encoded = base64.b64encode(tmpfile.getvalue()).decode("utf-8") html = f"<img src='data:image/png;base64,{encoded}'>" metadata = { "outputs": [ { "type": "web-app", "storage": "inline", "source": html, }, ], } with open(mlpipeline_ui_metadata, "w") as html_writer: json.dump(metadata, html_writer) @pipeline(name="plot_pipeline") def plot_pipeline(): plot_linear() if __name__ == "__main__": kfp.compiler.Compiler().compile(plot_pipeline, "plot_pipeline.yaml") ``` If you run this script and check the resulting `plot_pipeline.yaml`, you will see the following.

plot_pipeline.yaml ```bash apiVersion: argoproj.io/v1alpha1 kind: Workflow metadata: generateName: plot-pipeline- annotations: {pipelines.kubeflow.org/kfp_sdk_version: 1.8.9, pipelines.kubeflow.org/pipeline_compilation_time: '2 022-01-17T13:31:32.963214', pipelines.kubeflow.org/pipeline_spec: '{"name": "plot_pipeline"}'} labels: {pipelines.kubeflow.org/kfp_sdk_version: 1.8.9} spec: entrypoint: plot-pipeline templates: - name: plot-linear container: args: [--mlpipeline-ui-metadata, /tmp/outputs/mlpipeline_ui_metadata/data] command: - sh - -c - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'matplotlib' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'matplotlib' --user) && "$0" "$@" - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def _make_parent_dirs_and_return_path(file_path: str): import os os.makedirs(os.path.dirname(file_path), exist_ok=True) return file_path def plot_linear(mlpipeline_ui_metadata): import base64 import json from io import BytesIO import matplotlib.pyplot as plt plt.plot([1, 2, 3], [1, 2, 3]) tmpfile = BytesIO() plt.savefig(tmpfile, format="png") encoded = base64.b64encode(tmpfile.getvalue()).decode("utf-8") html = f"" metadata = { "outputs": [ { "type": "web-app", "storage": "inline", "source": html, }, ], } with open(mlpipeline_ui_metadata, "w") as html_writer: json.dump(metadata, html_writer) import argparse _parser = argparse.ArgumentParser(prog='Plot linear', description='') _parser.add_argument("--mlpipeline-ui-metadata", dest="mlpipeline_ui_metadata", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) _parsed_args = vars(_parser.parse_args()) _outputs = plot_linear(**_parsed_args) image: python:3.7 outputs: artifacts: - {name: mlpipeline-ui-metadata, path: /tmp/outputs/mlpipeline_ui_metadata/data} metadata: labels: pipelines.kubeflow.org/kfp_sdk_version: 1.8.9 
pipelines.kubeflow.org/pipeline-sdk-type: kfp pipelines.kubeflow.org/enable_caching: "true" annotations: {pipelines.kubeflow.org/component_spec: '{"implementation": {"container": {"args": ["--mlpipeline-ui-metadata", {"outputPath": "mlpipeline_ui_metadata"}], "command": ["sh", "-c", "(PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location ''matplotlib'' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location ''matplotlib'' --user) && \"$0\" \"$@\"", "sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def _make_parent_dirs_and_return_path(file_path: str):\n import os\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\n return file_path\n\ndef plot_linear(mlpipeline_ui_metadata):\n import base64\n import json\n from io import BytesIO\n\n import matplotlib.pyplot as plt\n\n plt.plot([1, 2, 3], [1, 2, 3])\n\n tmpfile = BytesIO()\n plt.savefig(tmpfile, format=\"png\")\n encoded = base64.b64encode(tmpfile.getvalue()).decode(\"utf-8\")\n\n html = f\"\"\n metadata = {\n \"outputs\": [\n {\n \"type\": \"web-app\",\n \"storage\": \"inline\",\n \"source\": html,\n },\n ],\n }\n with open(mlpipeline_ui_metadata, \"w\") as html_writer:\n json.dump(metadata, html_writer)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Plot linear'', description='''')\n_parser.add_argument(\"--mlpipeline-ui-metadata\", dest=\"mlpipeline_ui_metadata\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs = plot_linear(**_parsed_args)\n"], "image": "python:3.7"}}, "name": "Plot linear", "outputs": [{"name": "mlpipeline_ui_metadata", "type": "UI_Metadata"}]}', pipelines.kubeflow.org/component_ref: '{}'} - name: plot-pipeline dag: tasks: - {name: plot-linear, template: plot-linear} arguments: parameters: [] serviceAccountName: pipeline-runner ```

After running, click Visualization. ![advanced-run-5.png](./img/advanced-run-5.png) ## Run output ![advanced-run-2.png](./img/advanced-run-2.png) Run output is where Kubeflow gathers the Artifacts generated in the specified form and shows the evaluation index (Metric). To show the evaluation index (Metric), you can save the name and value you want to show in the `mlpipeline_metrics_path: OutputPath("Metrics")` argument in json format. For example, you can write it like this. ```python @create_component_from_func def show_metric_of_sum( number: int, mlpipeline_metrics_path: OutputPath("Metrics"), ): import json metrics = { "metrics": [ { "name": "sum_value", "numberValue": number, }, ], } with open(mlpipeline_metrics_path, "w") as f: json.dump(metrics, f) ``` We will add a component to generate evaluation metrics to the pipeline created in the [Pipeline](../kubeflow/basic-pipeline.md) and execute it. The whole pipeline is as follows. ```python import kfp from kfp.components import create_component_from_func, OutputPath from kfp.dsl import pipeline @create_component_from_func def print_and_return_number(number: int) -> int: print(number) return number @create_component_from_func def sum_and_print_numbers(number_1: int, number_2: int) -> int: sum_number = number_1 + number_2 print(sum_number) return sum_number @create_component_from_func def show_metric_of_sum( number: int, mlpipeline_metrics_path: OutputPath("Metrics"), ): import json metrics = { "metrics": [ { "name": "sum_value", "numberValue": number, }, ], } with open(mlpipeline_metrics_path, "w") as f: json.dump(metrics, f) @pipeline(name="example_pipeline") def example_pipeline(number_1: int, number_2: int): number_1_result = print_and_return_number(number_1) number_2_result = print_and_return_number(number_2) sum_result = sum_and_print_numbers( number_1=number_1_result.output, number_2=number_2_result.output ) show_metric_of_sum(sum_result.output) if __name__ == "__main__": 
kfp.compiler.Compiler().compile(example_pipeline, "example_pipeline.yaml") ``` After execution, click Run Output and it will show like this. ![advanced-run-4.png](./img/advanced-run-4.png) ## Config ![advanced-run-3.png](./img/advanced-run-3.png) In the Config tab, you can view all the values received as pipeline configurations. ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/kubeflow/basic-component.md ================================================ --- title : "4. Component - Write" description: "" sidebar_position: 4 contributors: ["Jongseob Jeon"] --- ## Component In order to write a component, the following must be written: 1. Writing Component Contents 2. Writing Component Wrapper Now, let's look at each process. ## Component Contents Component Contents are no different from the Python code we commonly write. For example, let's try writing a component that takes a number as input, prints it, and then returns it. We can write it in Python code like this. ```python print(number) ``` However, when this code is run, an error occurs and it does not work because the `number` that should be printed is not defined. As we saw in [Kubeflow Concepts](../kubeflow/kubeflow-concepts.md), values like `number` that are required in component content are defined in **Config**. In order to execute component content, the necessary Configs must be passed from the component wrapper. ## Component Wrapper ### Define a standalone Python function Now we need to create a component wrapper to be able to pass the required Configs. Without a separate Config, it will be like this when wrapped with a component wrapper. ```python def print_and_return_number(): print(number) return number ``` Now we add the required Config for the content as an argument to the wrapper. However, it is not just writing the argument but also writing the type hint of the argument. 
When Kubeflow converts the pipeline into the Kubeflow format, it checks if the specified input and output types are matched in the connection between the components. If the format of the input required by the component does not match the output received from another component, the pipeline cannot be created. Now we complete the component wrapper by writing down the argument, its type and the type to be returned as follows. ```python def print_and_return_number(number: int) -> int: print(number) return number ``` In Kubeflow, you can only use types that can be expressed in json as return values. The most commonly used and recommended types are as follows: - int - float - str If you want to return multiple values instead of a single value, you must use `collections.namedtuple`. For more details, please refer to the Kubeflow official documentation [Kubeflow Official Documentation](https://www.kubeflow.org/docs/components/pipelines/sdk/python-function-components/#passing-parameters-by-value). For example, if you want to write a component that returns the quotient and remainder of a number when divided by 2, it should be written as follows. ```python from typing import NamedTuple def divide_and_return_number( number: int, ) -> NamedTuple("DivideOutputs", [("quotient", int), ("remainder", int)]): from collections import namedtuple quotient, remainder = divmod(number, 2) print("quotient is", quotient) print("remainder is", remainder) divide_outputs = namedtuple( "DivideOutputs", [ "quotient", "remainder", ], ) return divide_outputs(quotient, remainder) ``` ### Convert to Kubeflow Format Now you have to convert the written component into a format that can be used in Kubeflow. The conversion can be done through `kfp.components.create_component_from_func`. This converted form can be imported as a function in Python and used in the pipeline. 
```python from kfp.components import create_component_from_func @create_component_from_func def print_and_return_number(number: int) -> int: print(number) return number ``` ### Share component with yaml file If it is not possible to share with Python code, you can share components with a YAML file and use them. To do this, first convert the component to a YAML file and then use it in the pipeline with `kfp.components.load_component_from_file`. First, let's explain the process of converting the written component to a YAML file. ```python from kfp.components import create_component_from_func @create_component_from_func def print_and_return_number(number: int) -> int: print(number) return number if __name__ == "__main__": print_and_return_number.component_spec.save("print_and_return_number.yaml") ``` If you run the Python code you wrote, a file called `print_and_return_number.yaml` will be created. When you check the file, it will be as follows. ```bash name: Print and return number inputs: - {name: number, type: Integer} outputs: - {name: Output, type: Integer} implementation: container: image: python:3.7 command: - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def print_and_return_number(number): print(number) return number def _serialize_int(int_value: int) -> str: if isinstance(int_value, str): return int_value if not isinstance(int_value, int): raise TypeError('Value "{}" has type "{}" instead of int.'.format(str(int_value), str(type(int_value)))) return str(int_value) import argparse _parser = argparse.ArgumentParser(prog='Print and return number', description='') _parser.add_argument("--number", dest="number", type=int, required=True, default=argparse.SUPPRESS) _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1) _parsed_args = vars(_parser.parse_args()) _output_files = _parsed_args.pop("_output_paths", []) _outputs = print_and_return_number(**_parsed_args) _outputs = 
[_outputs] _output_serializers = [ _serialize_int, ] import os for idx, output_file in enumerate(_output_files): try: os.makedirs(os.path.dirname(output_file)) except OSError: pass with open(output_file, 'w') as f: f.write(_output_serializers[idx](_outputs[idx])) args: - --number - {inputValue: number} - '----output-paths' - {outputPath: Output} ``` Now the generated file can be shared and used in the pipeline as follows. ```python from kfp.components import load_component_from_file print_and_return_number = load_component_from_file("print_and_return_number.yaml") ``` ## How Kubeflow executes component In Kubeflow, the execution order of components is as follows: 1. `docker pull `: Pull the image containing the execution environment information of the defined component. 2. Run `command`: Execute the component's content within the pulled image. Taking `print_and_return_number.yaml` as an example, the default image in `@create_component_from_func` is `python:3.7`, so the component's content will be executed based on that image. 1. `docker pull python:3.7` 2. `print(number)` ## References: - [Getting Started With Python function based components](https://www.kubeflow.org/docs/components/pipelines/sdk/python-function-components/#getting-started-with-python-function-based-components) ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/kubeflow/basic-pipeline-upload.md ================================================ --- title : "6. Pipeline - Upload" description: "" sidebar_position: 6 contributors: ["Jongseob Jeon"] --- ## Upload Pipeline Now, let's upload the pipeline we created directly to kubeflow. Pipeline uploads can be done through the kubeflow dashboard UI. Use the method used in [Install Kubeflow](../setup-components/install-components-kf.md) to do port forwarding. 
```bash kubectl port-forward svc/istio-ingressgateway -n istio-system 8080:80 ``` Access [http://localhost:8080](http://localhost:8080) to open the dashboard. ### 1. Click Pipelines Tab ![pipeline-gui-0.png](./img/pipeline-gui-0.png) ### 2. Click Upload Pipeline ![pipeline-gui-1.png](./img/pipeline-gui-1.png) ### 3. Click Choose file ![pipeline-gui-2.png](./img/pipeline-gui-2.png) ### 4. Upload created yaml file ![pipeline-gui-3.png](./img/pipeline-gui-3.png) ### 5. Create ![pipeline-gui-4.png](./img/pipeline-gui-4.png) ## Upload Pipeline Version The uploaded pipeline allows you to manage versions through uploads. However, it serves the role of gathering pipelines with the same name rather than version management at the code level, such as Github. In the example above, clicking on example_pipeline will bring up the following screen. ![pipeline-gui-5.png](./img/pipeline-gui-5.png) If you click it, the following screen shows. ![pipeline-gui-4.png](./img/pipeline-gui-4.png) If you click Upload Version, a screen appears where you can upload the pipeline. ![pipeline-gui-6.png](./img/pipeline-gui-6.png) Now, upload your pipeline. ![pipeline-gui-7.png](./img/pipeline-gui-7.png) Once uploaded, you can check the pipeline version as follows. ![pipeline-gui-8.png](./img/pipeline-gui-8.png) ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/kubeflow/basic-pipeline.md ================================================ --- title : "5. Pipeline - Write" description: "" sidebar_position: 5 contributors: ["Jongseob Jeon"] --- ## Pipeline Components do not run independently but rather as components of a pipeline. Therefore, in order to run a component, a pipeline must be written. And in order to write a pipeline, a set of components and the order of execution of those components is necessary. 
On this page, we will create a pipeline with a component that takes a number as input and outputs it, and a component that takes two numbers from two components and outputs the sum. ## Component Set First, let's create the components that will be used in the pipeline. 1. `print_and_return_number` This component prints and returns the input number. Since the component returns the input value, we specify `int` as the return type hint. ```python @create_component_from_func def print_and_return_number(number: int) -> int: print(number) return number ``` 2. `sum_and_print_numbers` This component calculates the sum of two input numbers and prints it. Similarly, since the component returns the sum, we specify `int` as the return type hint. ```python @create_component_from_func def sum_and_print_numbers(number_1: int, number_2: int) -> int: sum_num = number_1 + number_2 print(sum_num) return sum_num ``` ## Component Order ### Define Order If you have created the necessary set of components, the next step is to define their sequence. The diagram below represents the order of the pipeline components to be created on this page. ![pipeline-0.png](./img/pipeline-0.png) ### Single Output Now let's translate this sequence into code. First, writing `print_and_return_number_1` and `print_and_return_number_2` from the picture above would look like this. ```python def example_pipeline(): number_1_result = print_and_return_number(number_1) number_2_result = print_and_return_number(number_2) ``` Run the component and store the return values in `number_1_result` and `number_2_result`, respectively. The return value of the stored `number_1_result` can be used through `number_1_result.output`. ### Multi Output In the example above, the components return a single value, so it can be directly used with `output`. However, if there are multiple return values, they will be stored in `outputs` as a dictionary. You can use the keys to access the desired return values. 
Let's consider an example with a component that returns multiple values, like the one mentioned in the [component](../kubeflow/basic-component.md#define-a-standalone-python-function) definition. The `divide_and_return_number` component returns `quotient` and `remainder`. Here's an example of passing these two values to `print_and_return_number`: ```python def multi_pipeline(): divided_result = divide_and_return_number(number) num_1_result = print_and_return_number(divided_result.outputs["quotient"]) num_2_result = print_and_return_number(divided_result.outputs["remainder"]) ``` Store the result of `divide_and_return_number` in `divided_result` and you can get the values of each by `divided_result.outputs["quotient"]` and `divided_result.outputs["remainder"]`. ### Write to python code Now, let's get back to the main topic and pass the result of these two values to `sum_and_print_numbers`. ```python def example_pipeline(): number_1_result = print_and_return_number(number_1) number_2_result = print_and_return_number(number_2) sum_result = sum_and_print_numbers( number_1=number_1_result.output, number_2=number_2_result.output ) ``` Next, gather the necessary Configs for each component and define it as a pipeline Config. ```python def example_pipeline(number_1: int, number_2:int): number_1_result = print_and_return_number(number_1) number_2_result = print_and_return_number(number_2) sum_result = sum_and_print_numbers( number_1=number_1_result.output, number_2=number_2_result.output ) ``` ## Convert to Kubeflow Format Finally, convert it into a format that can be used in Kubeflow. The conversion can be done using the `kfp.dsl.pipeline` function. 
```python from kfp.dsl import pipeline @pipeline(name="example_pipeline") def example_pipeline(number_1: int, number_2: int): number_1_result = print_and_return_number(number_1) number_2_result = print_and_return_number(number_2) sum_result = sum_and_print_numbers( number_1=number_1_result.output, number_2=number_2_result.output ) ``` In order to run a pipeline in Kubeflow, it needs to be compiled into the designated yaml format as only yaml format is possible, so the created pipeline needs to be compiled into a specific yaml format. Compilation can be done using the following command. ```python if __name__ == "__main__": import kfp kfp.compiler.Compiler().compile(example_pipeline, "example_pipeline.yaml") ``` ## Conclusion As explained earlier, if we gather the content into a Python code, it will look like this. ```python import kfp from kfp.components import create_component_from_func from kfp.dsl import pipeline @create_component_from_func def print_and_return_number(number: int) -> int: print(number) return number @create_component_from_func def sum_and_print_numbers(number_1: int, number_2: int): print(number_1 + number_2) @pipeline(name="example_pipeline") def example_pipeline(number_1: int, number_2: int): number_1_result = print_and_return_number(number_1) number_2_result = print_and_return_number(number_2) sum_result = sum_and_print_numbers( number_1=number_1_result.output, number_2=number_2_result.output ) if __name__ == "__main__": kfp.compiler.Compiler().compile(example_pipeline, "example_pipeline.yaml") ``` The compiled result is as follows.
example_pipeline.yaml ```bash apiVersion: argoproj.io/v1alpha1 kind: Workflow metadata: generateName: example-pipeline- annotations: {pipelines.kubeflow.org/kfp_sdk_version: 1.6.3, pipelines.kubeflow.org/pipeline_compilation_time: '2021-12-05T13:38:51.566777', pipelines.kubeflow.org/pipeline_spec: '{"inputs": [{"name": "number_1", "type": "Integer"}, {"name": "number_2", "type": "Integer"}], "name": "example_pipeline"}'} labels: {pipelines.kubeflow.org/kfp_sdk_version: 1.6.3} spec: entrypoint: example-pipeline templates: - name: example-pipeline inputs: parameters: - {name: number_1} - {name: number_2} dag: tasks: - name: print-and-return-number template: print-and-return-number arguments: parameters: - {name: number_1, value: '{{inputs.parameters.number_1}}'} - name: print-and-return-number-2 template: print-and-return-number-2 arguments: parameters: - {name: number_2, value: '{{inputs.parameters.number_2}}'} - name: sum-and-print-numbers template: sum-and-print-numbers dependencies: [print-and-return-number, print-and-return-number-2] arguments: parameters: - {name: print-and-return-number-2-Output, value: '{{tasks.print-and-return-number-2.outputs.parameters.print-and-return-number-2-Output}}'} - {name: print-and-return-number-Output, value: '{{tasks.print-and-return-number.outputs.parameters.print-and-return-number-Output}}'} - name: print-and-return-number container: args: [--number, '{{inputs.parameters.number_1}}', '----output-paths', /tmp/outputs/Output/data] command: - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def print_and_return_number(number): print(number) return number def _serialize_int(int_value: int) -> str: if isinstance(int_value, str): return int_value if not isinstance(int_value, int): raise TypeError('Value "{}" has type "{}" instead of int.'.format(str(int_value), str(type(int_value)))) return str(int_value) import argparse _parser = argparse.ArgumentParser(prog='Print and 
return number', description='') _parser.add_argument("--number", dest="number", type=int, required=True, default=argparse.SUPPRESS) _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1) _parsed_args = vars(_parser.parse_args()) _output_files = _parsed_args.pop("_output_paths", []) _outputs = print_and_return_number(**_parsed_args) _outputs = [_outputs] _output_serializers = [ _serialize_int, ] import os for idx, output_file in enumerate(_output_files): try: os.makedirs(os.path.dirname(output_file)) except OSError: pass with open(output_file, 'w') as f: f.write(_output_serializers[idx](_outputs[idx])) image: python:3.7 inputs: parameters: - {name: number_1} outputs: parameters: - name: print-and-return-number-Output valueFrom: {path: /tmp/outputs/Output/data} artifacts: - {name: print-and-return-number-Output, path: /tmp/outputs/Output/data} metadata: labels: {pipelines.kubeflow.org/kfp_sdk_version: 1.6.3, pipelines.kubeflow.org/pipeline-sdk-type: kfp} annotations: {pipelines.kubeflow.org/component_spec: '{"implementation": {"container": {"args": ["--number", {"inputValue": "number"}, "----output-paths", {"outputPath": "Output"}], "command": ["sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def print_and_return_number(number):\n print(number)\n return number\n\ndef _serialize_int(int_value: int) -> str:\n if isinstance(int_value, str):\n return int_value\n if not isinstance(int_value, int):\n raise TypeError(''Value \"{}\" has type \"{}\" instead of int.''.format(str(int_value), str(type(int_value))))\n return str(int_value)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Print and return number'', description='''')\n_parser.add_argument(\"--number\", dest=\"number\", type=int, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"----output-paths\", dest=\"_output_paths\", type=str, nargs=1)\n_parsed_args = 
vars(_parser.parse_args())\n_output_files = _parsed_args.pop(\"_output_paths\", [])\n\n_outputs = print_and_return_number(**_parsed_args)\n\n_outputs = [_outputs]\n\n_output_serializers = [\n _serialize_int,\n\n]\n\nimport os\nfor idx, output_file in enumerate(_output_files):\n try:\n os.makedirs(os.path.dirname(output_file))\n except OSError:\n pass\n with open(output_file, ''w'') as f:\n f.write(_output_serializers[idx](_outputs[idx]))\n"], "image": "python:3.7"}}, "inputs": [{"name": "number", "type": "Integer"}], "name": "Print and return number", "outputs": [{"name": "Output", "type": "Integer"}]}', pipelines.kubeflow.org/component_ref: '{}', pipelines.kubeflow.org/arguments.parameters: '{"number": "{{inputs.parameters.number_1}}"}'} - name: print-and-return-number-2 container: args: [--number, '{{inputs.parameters.number_2}}', '----output-paths', /tmp/outputs/Output/data] command: - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def print_and_return_number(number): print(number) return number def _serialize_int(int_value: int) -> str: if isinstance(int_value, str): return int_value if not isinstance(int_value, int): raise TypeError('Value "{}" has type "{}" instead of int.'.format(str(int_value), str(type(int_value)))) return str(int_value) import argparse _parser = argparse.ArgumentParser(prog='Print and return number', description='') _parser.add_argument("--number", dest="number", type=int, required=True, default=argparse.SUPPRESS) _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1) _parsed_args = vars(_parser.parse_args()) _output_files = _parsed_args.pop("_output_paths", []) _outputs = print_and_return_number(**_parsed_args) _outputs = [_outputs] _output_serializers = [ _serialize_int, ] import os for idx, output_file in enumerate(_output_files): try: os.makedirs(os.path.dirname(output_file)) except OSError: pass with open(output_file, 'w') as f: 
f.write(_output_serializers[idx](_outputs[idx])) image: python:3.7 inputs: parameters: - {name: number_2} outputs: parameters: - name: print-and-return-number-2-Output valueFrom: {path: /tmp/outputs/Output/data} artifacts: - {name: print-and-return-number-2-Output, path: /tmp/outputs/Output/data} metadata: labels: {pipelines.kubeflow.org/kfp_sdk_version: 1.6.3, pipelines.kubeflow.org/pipeline-sdk-type: kfp} annotations: {pipelines.kubeflow.org/component_spec: '{"implementation": {"container": {"args": ["--number", {"inputValue": "number"}, "----output-paths", {"outputPath": "Output"}], "command": ["sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def print_and_return_number(number):\n print(number)\n return number\n\ndef _serialize_int(int_value: int) -> str:\n if isinstance(int_value, str):\n return int_value\n if not isinstance(int_value, int):\n raise TypeError(''Value \"{}\" has type \"{}\" instead of int.''.format(str(int_value), str(type(int_value))))\n return str(int_value)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Print and return number'', description='''')\n_parser.add_argument(\"--number\", dest=\"number\", type=int, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"----output-paths\", dest=\"_output_paths\", type=str, nargs=1)\n_parsed_args = vars(_parser.parse_args())\n_output_files = _parsed_args.pop(\"_output_paths\", [])\n\n_outputs = print_and_return_number(**_parsed_args)\n\n_outputs = [_outputs]\n\n_output_serializers = [\n _serialize_int,\n\n]\n\nimport os\nfor idx, output_file in enumerate(_output_files):\n try:\n os.makedirs(os.path.dirname(output_file))\n except OSError:\n pass\n with open(output_file, ''w'') as f:\n f.write(_output_serializers[idx](_outputs[idx]))\n"], "image": "python:3.7"}}, "inputs": [{"name": "number", "type": "Integer"}], "name": "Print and return number", "outputs": [{"name": "Output", "type": "Integer"}]}', 
pipelines.kubeflow.org/component_ref: '{}', pipelines.kubeflow.org/arguments.parameters: '{"number": "{{inputs.parameters.number_2}}"}'} - name: sum-and-print-numbers container: args: [--number-1, '{{inputs.parameters.print-and-return-number-Output}}', --number-2, '{{inputs.parameters.print-and-return-number-2-Output}}'] command: - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def sum_and_print_numbers(number_1, number_2): print(number_1 + number_2) import argparse _parser = argparse.ArgumentParser(prog='Sum and print numbers', description='') _parser.add_argument("--number-1", dest="number_1", type=int, required=True, default=argparse.SUPPRESS) _parser.add_argument("--number-2", dest="number_2", type=int, required=True, default=argparse.SUPPRESS) _parsed_args = vars(_parser.parse_args()) _outputs = sum_and_print_numbers(**_parsed_args) image: python:3.7 inputs: parameters: - {name: print-and-return-number-2-Output} - {name: print-and-return-number-Output} metadata: labels: {pipelines.kubeflow.org/kfp_sdk_version: 1.6.3, pipelines.kubeflow.org/pipeline-sdk-type: kfp} annotations: {pipelines.kubeflow.org/component_spec: '{"implementation": {"container": {"args": ["--number-1", {"inputValue": "number_1"}, "--number-2", {"inputValue": "number_2"}], "command": ["sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def sum_and_print_numbers(number_1, number_2):\n print(number_1 + number_2)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Sum and print numbers'', description='''')\n_parser.add_argument(\"--number-1\", dest=\"number_1\", type=int, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--number-2\", dest=\"number_2\", type=int, required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs = sum_and_print_numbers(**_parsed_args)\n"], "image": "python:3.7"}}, "inputs": 
[{"name": "number_1", "type": "Integer"}, {"name": "number_2", "type": "Integer"}], "name": "Sum and print numbers"}', pipelines.kubeflow.org/component_ref: '{}', pipelines.kubeflow.org/arguments.parameters: '{"number_1": "{{inputs.parameters.print-and-return-number-Output}}", "number_2": "{{inputs.parameters.print-and-return-number-2-Output}}"}'} arguments: parameters: - {name: number_1} - {name: number_2} serviceAccountName: pipeline-runner ```
================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/kubeflow/basic-requirements.md ================================================ --- title : "3. Install Requirements" description: "" sidebar_position: 3 contributors: ["Jongseob Jeon"] --- The recommended Python version for practice is python>=3.7. For those unfamiliar with the Python environment, please refer to [Appendix 1. Python Virtual Environment](../appendix/pyenv) and install the packages on the **client node**. The packages and versions required for the practice are as follows: - requirements.txt ```bash kfp==1.8.9 scikit-learn==1.0.1 mlflow==1.21.0 pandas==1.3.4 dill==0.3.4 ``` Activate the [Python virtual environment](../appendix/pyenv.md#python-가상환경-생성) created in the previous section. ```bash pyenv activate demo ``` We are proceeding with the package installation. ```bash pip3 install -U pip pip3 install kfp==1.8.9 scikit-learn==1.0.1 mlflow==1.21.0 pandas==1.3.4 dill==0.3.4 ``` ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/kubeflow/basic-run.md ================================================ --- title : "7. Pipeline - Run" description: "" sidebar_position: 7 contributors: ["Jongseob Jeon"] --- ## Run Pipeline Now we will run the uploaded pipeline. ## Before Run ### 1. Create Experiment Experiments in Kubeflow are units that logically manage runs executed within them. When you first enter the namespace in Kubeflow, there are no Experiments created. Therefore, you must create an Experiment beforehand in order to run the pipeline. If an Experiment already exists, you can go to [Run Pipeline](../kubeflow/basic-run.md#run-pipeline-1). Experiments can be created via the Create Experiment button. ![run-0.png](./img/run-0.png) ### 2. Name 입력 ![run-1.png](./img/run-1.png) ## Run Pipeline ### 1. Select Create Run ![run-2.png](./img/run-2.png) ### 2. 
Select Experiment ![run-9.png](./img/run-9.png) ![run-10.png](./img/run-10.png) ### 3. Enter Pipeline Config Fill in the values of the Config provided when creating the pipeline. The uploaded pipeline requires input values for `number_1` and `number_2`. ![run-3.png](./img/run-3.png) ### 4. Start Click the Start button after entering the values. The pipeline will start running. ![run-4.png](./img/run-4.png) ## Run Result The executed pipelines can be viewed in the Runs tab. Clicking on a run provides detailed information related to the executed pipeline. ![run-5.png](./img/run-5.png) Upon clicking, the following screen appears. Components that have not yet executed are displayed in gray. ![run-6.png](./img/run-6.png) When a component has completed execution, it is marked with a green checkmark. ![run-7.png](./img/run-7.png) If we look at the last component, we can see that it has outputted the sum of the input values, which in this case is 8 (the sum of 3 and 5). ![run-8.png](./img/run-8.png) ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/kubeflow/how-to-debug.md ================================================ --- title : "13. Component - Debugging" description: "" sidebar_position: 13 contributors: ["Jongseob Jeon"] --- ## Debugging Pipeline This page covers how to debug Kubeflow components. ## Failed Component We will modify a pipeline used in [Component - MLFlow](../kubeflow/advanced-mlflow.md#mlflow-pipeline) in this page. First, let's modify the pipeline so that the component fails. 
```python from functools import partial import kfp from kfp.components import InputPath, OutputPath, create_component_from_func from kfp.dsl import pipeline @partial( create_component_from_func, packages_to_install=["pandas", "scikit-learn"], ) def load_iris_data( data_path: OutputPath("csv"), target_path: OutputPath("csv"), ): import pandas as pd from sklearn.datasets import load_iris iris = load_iris() data = pd.DataFrame(iris["data"], columns=iris["feature_names"]) target = pd.DataFrame(iris["target"], columns=["target"]) data["sepal length (cm)"] = None data.to_csv(data_path, index=False) target.to_csv(target_path, index=False) @partial( create_component_from_func, packages_to_install=["pandas"], ) def drop_na_from_csv( data_path: InputPath("csv"), output_path: OutputPath("csv"), ): import pandas as pd data = pd.read_csv(data_path) data = data.dropna() data.to_csv(output_path, index=False) @partial( create_component_from_func, packages_to_install=["dill", "pandas", "scikit-learn", "mlflow"], ) def train_from_csv( train_data_path: InputPath("csv"), train_target_path: InputPath("csv"), model_path: OutputPath("dill"), input_example_path: OutputPath("dill"), signature_path: OutputPath("dill"), conda_env_path: OutputPath("dill"), kernel: str, ): import dill import pandas as pd from sklearn.svm import SVC from mlflow.models.signature import infer_signature from mlflow.utils.environment import _mlflow_conda_env train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) input_example = train_data.sample(1) with open(input_example_path, "wb") as file_writer: dill.dump(input_example, file_writer) signature = infer_signature(train_data, clf.predict(train_data)) with open(signature_path, "wb") as file_writer: dill.dump(signature, file_writer) conda_env = _mlflow_conda_env( additional_pip_deps=["dill", "pandas", 
"scikit-learn"] ) with open(conda_env_path, "wb") as file_writer: dill.dump(conda_env, file_writer) @pipeline(name="debugging_pipeline") def debugging_pipeline(kernel: str): iris_data = load_iris_data() drop_data = drop_na_from_csv(data=iris_data.outputs["data"]) model = train_from_csv( train_data=drop_data.outputs["output"], train_target=iris_data.outputs["target"], kernel=kernel, ) if __name__ == "__main__": kfp.compiler.Compiler().compile(debugging_pipeline, "debugging_pipeline.yaml") ``` The modifications are as follows: 1. In the `load_iris_data` component for loading data, `None` was injected into the `sepal length (cm)` feature. 2. In the `drop_na_from_csv` component, use the `drop_na()` function to remove rows with na values. Now let's upload and run the pipeline. After running, if you press Run you will see that it has failed in the `Train from csv` component. ![debug-0.png](./img/debug-0.png) Click on the failed component and check the log to see the reason for the failure. ![debug-2.png](./img/debug-2.png) If the log shows that the data count is 0 and the component did not run, there may be an issue with the input data. Let's investigate what might be the problem. First, click on the component and go to the Input/Output tab to download the input data. You can click on the link indicated by the red square to download the data. ![debug-5.png](./img/debug-5.png) Download both files to the same location. Then navigate to the specified path and check the downloaded files. ```bash ls ``` There are two files as follows. ```bash drop-na-from-csv-output.tgz load-iris-data-target.tgz ``` I will try to unzip it. ```bash tar -xzvf load-iris-data-target.tgz ; mv data target.csv tar -xzvf drop-na-from-csv-output.tgz ; mv data data.csv ``` And then run the component code using a Jupyter notebook. ![debug-3.png](./img/debug-3.png) Debugging revealed that dropping the data was based on rows instead of columns, resulting in all the data being removed. 
Now that we know the cause of the problem, we can modify the component to drop based on columns. ```python @partial( create_component_from_func, packages_to_install=["pandas"], ) def drop_na_from_csv( data_path: InputPath("csv"), output_path: OutputPath("csv"), ): import pandas as pd data = pd.read_csv(data_path) data = data.dropna(axis="columns") data.to_csv(output_path, index=False) ``` After modifying, upload the pipeline again and run it to confirm that it is running normally as follows. ![debug-6.png](./img/debug-6.png) ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/kubeflow/kubeflow-concepts.md ================================================ --- title : "2. Kubeflow Concepts" description: "" sidebar_position: 2 contributors: ["Jongseob Jeon"] --- ## Component A component is composed of Component contents and a Component wrapper. A single component is delivered to Kubeflow through a Component wrapper and the delivered component executes the defined Component contents and produces artifacts. ![concept-0.png](./img/concept-0.png) ### Component Contents There are three components that make up the component contents: ![concept-1.png](./img/concept-1.png) 1. Environment 2. Python code w/ Config 3. Generates Artifacts Let's explore each component with an example. Here is a Python code that loads data, trains an SVC (Support Vector Classifier) model, and saves the SVC model. ```python import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target= pd.read_csv(train_target_path) clf= SVC( kernel=kernel ) clf.fit(train_data) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) ``` The above Python code can be divided into components contents as follows. ![concept-2.png](./img/concept-2.png) Environment is the part of the Python code where the packages used in the code are imported. 
Next, Python Code w/ Config is where
In Kubeflow, an executed pipeline is called a "Run." ![concept-8.png](./img/concept-8.png) When a pipeline is executed, each component generates artifacts. Kubeflow pipeline assigns a unique ID to each Run, and all artifacts generated during the Run are stored. ![concept-9.png](./img/concept-9.png) Now, let's learn how to write components and pipelines. ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/kubeflow/kubeflow-intro.md ================================================ --- title : "1. Kubeflow Introduction" description: "" sidebar_position: 1 contributors: ["Jongseob Jeon"] --- To use Kubeflow, you need to write components and pipelines. The approach described in *MLOps for ALL* differs slightly from the method described on the [Kubeflow Pipeline official website](https://www.kubeflow.org/docs/components/pipelines/overview/quickstart/). Here, Kubeflow Pipeline is used as one of the components in the [elements that make up MLOps](../kubeflow/kubeflow-concepts.md#component-contents) rather than a standalone workflow. Now, let's understand what components and pipelines are and how to write them. ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/kubeflow-dashboard-guide/_category_.json ================================================ { "label": "Kubeflow UI Guide", "position": 5, "link": { "type": "generated-index" } } ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/kubeflow-dashboard-guide/experiments-and-others.md ================================================ --- title : "6. Kubeflow Pipeline Relates" description: "" sidebar_position: 6 contributors: ["Jaeyeon Kim"] --- In the left tabs of the Central Dashboard (KFP Experiments, Pipelines, Runs, Recurring Runs, Artifacts, Executions) you can manage Kubeflow Pipelines and the results of Pipeline execution and Pipeline Runs. 
![left-tabs](./img/left-tabs.png) Kubeflow Pipelines are the main reason for using Kubeflow in *MLOps for ALL*, and details on how to create, execute, and check the results of Kubeflow Pipelines can be found in [3.Kubeflow](../kubeflow/kubeflow-intro). ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/kubeflow-dashboard-guide/experiments.md ================================================ --- title : "5. Experiments(AutoML)" description: "" sidebar_position: 5 contributors: ["Jaeyeon Kim"] --- Next, we will click the Experiments(AutoML) tab on the left of the Central Dashboard. ![left-tabs](./img/left-tabs.png) ![automl](./img/automl.png) The Experiments(AutoML) page is where you can manage [Katib](https://www.kubeflow.org/docs/components/katib/overview/), which is responsible for AutoML through Hyperparameter Tuning and Neural Architecture Search in Kubeflow. The usage of Katib and Experiments(AutoML) is not covered in *MLOps for Everyone* v1.0, and will be added in v2.0. ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/kubeflow-dashboard-guide/intro.md ================================================ --- title : "1. Central Dashboard" description: "" sidebar_position: 1 contributors: ["Jaeyeon Kim", "SeungTae Kim"] --- Once you have completed [Kubeflow installation](../setup-components/install-components-kf.md), you can access the dashboard through the following command. ```bash kubectl port-forward --address 0.0.0.0 svc/istio-ingressgateway -n istio-system 8080:80 ``` ![after-login](./img/after-login.png) The Central Dashboard is a UI that integrates all the features provided by Kubeflow. 
The features provided by the Central Dashboard can be divided based on the tabs on the left side ![left-tabs](./img/left-tabs.png) - Home - Notebooks - Tensorboards - Volumes - Models - Experiments(AutoML) - Experiments(KFP) - Pipelines - Runs - Recurring Runs - Artifacts - Executions Let's now look at the simple usage of each feature. ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/kubeflow-dashboard-guide/notebooks.md ================================================ --- title : "2. Notebooks" description: "" sidebar_position: 2 contributors: ["Jaeyeon Kim"] --- ## Launch Notebook Server Click on the Notebooks tab on the left side of the Central Dashboard. ![left-tabs](./img/left-tabs.png) You will see a similar screen. The Notebooks tab is a page where users can independently create and access jupyter notebook and code server environments (hereinafter referred to as a notebook server). ![notebook-home](./img/notebook-home.png) Click the "+ NEW NOTEBOOK" button at the top right. ![new-notebook](./img/new-notebook.png) When the screen shown below appears, now specify the spec (Spec) of the notebook server to be created. ![create](./img/create.png)
The details for each spec field are as follows:
If you have installed the NVIDIA GPU plugin by following the [Setup GPU (Optional)](../setup-kubernetes/setup-nvidia-gpu.md) guide, select NVIDIA as the GPU Vendor.
![left-tabs](./img/left-tabs.png) We can see the following screen. ![tensorboard](./img/tensorboard.png) The TensorBoard server created in this way can be used just like a regular remote TensorBoard server, or it can be used for the purpose of storing data directly from a Kubeflow Pipeline run for visualization purposes. You can refer to the [TensorBoard documentation](https://www.kubeflow.org/docs/components/pipelines/sdk/output-viewer/#tensorboard) for more information on using TensorBoard with Kubeflow Pipeline runs. There are various ways to visualize the results of Kubeflow Pipeline runs, and in *MLOps for ALL*, we will utilize the Visualization feature of Kubeflow components and the visualization capabilities of MLflow to enable more general use cases. Therefore, detailed explanations of the TensorBoards page will be omitted in this context. ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/kubeflow-dashboard-guide/volumes.md ================================================ --- title : "4. Volumes" description: "" sidebar_position: 4 contributors: ["Jaeyeon Kim"] --- ## Volumes Next, let's click on the Volumes tab in the left of the Central Dashboard. ![left-tabs](./img/left-tabs.png) You will see the following screen. ![volumes](./img/volumes.png) Volumes tab provides the functionality to manage the Persistent Volume Claims (PVC) belonging to the current user's namespace in Kubernetes' Volume (Volume). By looking at the screenshot, you can see the information of the Volume created on the [1. Notebooks](../kubeflow-dashboard-guide/notebooks) page. It can be seen that the Storage Class of the Volume is set to local-path, which is the Default Storage Class installed at the time of Kubernetes cluster installation. In addition, the Volumes page can be used if you want to create, view, or delete a new Volume in the user namespace. 
--- ## Creating a Volume By clicking the `+ NEW VOLUME` button at the top right, you can see the following screen. ![new-volume](./img/new-volume.png) You can create a volume by specifying its name, size, storage class, and access mode. When you specify the desired resource specs to create a volume, its Status will be shown as Pending on this page. When you hover over the Status icon, you will see a message that this *(This volume will be bound when its first consumer is created.)* This is according to the volume creation policy of the [StorageClass](https://kubernetes.io/ko/docs/concepts/storage/storage-classes/) used in the lab, which is local-path. **This is not a problem situation.** When the Status is shown as Pending on this page, you can still specify the name of the volume in the notebook server or pod that you want to use the volume and the volume creation will be triggered at that time. ![creating-volume](./img/creating-volume.png) ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/prerequisites/_category_.json ================================================ { "label": "Prerequisites", "position": 1, "link": { "type": "generated-index" } } ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/prerequisites/docker/_category_.json ================================================ { "label": "Docker", "position": 1, "link": { "type": "generated-index" } } ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/prerequisites/docker/advanced.md ================================================ --- title : "[Practice] Docker Advanced" description: "Practice to use docker more advanced way." 
sidebar_position: 6 contributors: ["Jongseob Jeon", "Jaeyeon Kim"] --- ## Making a good Docker image ### Considerations to make Docker image: When creating a Docker image using a Dockerfile, the **order** of the commands is important. This is because Docker images are composed of many Read-Only layers and when building the image, existing layers are **cached** and reused, so if you structure your Dockerfile with this in mind, you can **reduce the build time**. Each of the `RUN`, `ADD`, `COPY` commands in a Dockerfile are stored as one layer. For example, if we have the following `Dockerfile`: ```docker # Layer 1 FROM ubuntu:latest # Layer 2 RUN apt-get update && apt-get install python3 pip3 -y # Layer 3 RUN pip3 install -U pip && pip3 install torch # Layer 4 COPY src/ src/ # Layer 5 CMD python src/app.py ``` If you run the image built with the above `Dockerfile` with the command `docker run -it app:latest /bin/bash`, it can be represented in the following layers. ![layers.png](./img/layers.png) The topmost R/W layer does not affect the image. In other words, any changes made inside the container are volatile. When a lower layer is changed, all the layers above it need to be rebuilt. Therefore, the order of Dockerfile instructions is important. It is recommended to place the parts that are frequently changed towards the end. (e.g., `COPY src/ app/src/`) Conversely, parts that are unlikely to change should be placed towards the beginning. If there are parts that are rarely changed but used in multiple places, they can be consolidated. It is advisable to create a separate image for those common parts in advance and use it as a base image. 
For example, if you want to create separate images for an environment that uses `tensorflow-cpu` and another environment that uses `tensorflow-gpu`, you can do the following: Create a base image [`ghcr.io/makinarocks/python:3.8-base`](http://ghcr.io/makinarocks/python:3.8-base-cpu) that includes Python and other basic packages installed. Then, when creating the images with the CPU and GPU versions of TensorFlow, you can use the base image as the `FROM` instruction and write the separate instructions for installing TensorFlow in each Dockerfile. Managing two Dockerfiles in this way improves readability and reduces build time. Combining layers had performance benefits in older versions of Docker. However, since you cannot guarantee the Docker version in which your Docker containers will run, it is recommended to combine layers for readability purposes. It is best to combine layers that can be combined appropriately. Here is an example of a Dockerfile: ```docker # Bad Case RUN apt-get update RUN apt-get install build-essential -y RUN apt-get install curl -y RUN apt-get install jq -y RUN apt-get install git -y ``` This can be written by combining it as follows. ```docker # Better Case RUN apt-get update && \ apt-get install -y \ build-essential \ curl \ jq \ git ``` For convenience, it is better to use `.dockerignore`. `.dockerignore` is similar to `.gitignore` in the sense that it can be excluded when doing a `docker build` just like when doing a `git add`. More information can be found in the [Docker Official Documentation](https://docs.docker.com/develop/develop-images/dockerfile_best-practices/). ### ENTRYPOINT vs CMD `ENTRYPOINT` and `CMD` are both used when you want to execute a command at the runtime of the container. One of them must be present in the Dockerfile. 
- **Difference** - `CMD`: Easily modifiable when running `docker run` command - `ENTRYPOINT`: Requires the use of `--entrypoint` to modify When `ENTRYPOINT` and `CMD` are used together, `CMD` typically represents the arguments (parameters) for the command specified in `ENTRYPOINT`. For example, consider the following Dockerfile: ```docker FROM ubuntu:latest # 아래 4 가지 option 을 바꿔가며 직접 테스트해보시면 이해하기 편합니다. # 단, NO ENTRYPOINT 옵션은 base image 인 ubuntu:latest 에 이미 있어서 테스트해볼 수는 없고 나머지 v2, 3, 5, 6, 8, 9, 11, 12 를 테스트해볼 수 있습니다. # ENTRYPOINT echo "Hello ENTRYPOINT" # ENTRYPOINT ["echo", "Hello ENTRYPOINT"] # CMD echo "Hello CMD" # CMD ["echo", "Hello CMD"] ``` If you build and run the above `Dockerfile` with the parts marked as comments deactivated, you can get the following results: | | No ENTRYPOINT | ENTRYPOINT a b | ENTRYPOINT ["a", "b"] | | ------------------ | -------------- | -------------- | --------------------- | | **NO CMD** | Error! | /bin/sh -c a b | a b | | **CMD ["x", "y"]** | x y | /bin/sh -c a b | a b x y | | **CMD x y** | /bin/sh -c x y | /bin/sh -c a b | a b /bin/sh -c x y | - In Kubernetes pod, - `ENTRYPOINT` corresponds to the command - `CMD` corresponds to the arguments ### Naming docker tag Recommend not using "latest" as a tag for a Docker image, as it is the default tag name and can be easily overwritten unintentionally. It is important to ensure uniqueness of one image with one tag for the sake of collaboration and debugging in the production stage. Using the same tag for different contents can lead to dangling images, which are not shown in the `docker images` but still take up storage space. ### ETC 1. Logs and other information are stored separately from the container, not inside it. This is because data written from within the container can be lost at any time. 2. Secrets and environment-dependent information should not be written directly into the Dockerfile but should be passed in via environment variables or a .env config file. 3. 
There is a **linter** for Dockerfiles, so it is useful to use it when collaborating. [https://github.com/hadolint/hadolint](https://github.com/hadolint/hadolint) ## Several options for docker run When using Docker containers, there are some inconveniences. Specifically, Docker does not store any of the work done within the Docker container by default. This is because Docker containers use isolated file systems. Therefore, it is difficult to share data between multiple Docker containers. To solve this problem, there are two approaches offered by Docker. ![storage.png](./img/storage.png) #### Docker volume - Use the Docker CLI to directly manage a resource called `volume`. - Create a specific directory under the Docker area (`/var/lib/docker`) on the host and mount that path to a Docker container. #### Bind mount - Mount a specific path on the host to a Docker container. #### How to use? The usage is through the same interface, using the `-v` option. However, when using volumes, you need to manage them directly by performing commands like `docker volume create`, `docker volume ls`, `docker volume rm`, etc. - Docker volume ```bash docker run \ -v my_volume:/app \ nginx:latest ``` - Bind mount ```bash docker run \ -v /home/user/some/path:/app \ nginx:latest ``` When developing locally, bind mount can be convenient, but if you want to maintain a clean environment, using Docker volume and explicitly performing create and rm operations can be another approach. The way storage is provided in Kubernetes ultimately relies on Docker's bind mount as well. ### Docker run with resource limit Basically, docker containers can **fully utilize the CPU and memory resources of the host OS**. However, when using this, depending on the resource situation of the host OS, docker containers may abnormally terminate due to issues such as **OOM**. 
To address this problem, docker provides the `-m` [option](https://docs.docker.com/config/containers/resource_constraints/#limit-a-containers-access-to-memory) which allows you to **limit the usage of CPU and memory** when running the docker container. ```bash docker run -d -m 512m --memory-reservation=256m --name 512-limit ubuntu sleep 3600 docker run -d -m 1g --memory-reservation=256m --name 1g-limit ubuntu sleep 3600 ``` After running the Docker above, you can check the usage through the 'docker stats' command. ```bash CONTAINER ID NAME CPU % MEM USAGE / LIMIT MEM % NET I/O BLOCK I/O PIDS 4ea1258e2e09 1g-limit 0.00% 300KiB / 1GiB 0.03% 1kB / 0B 0B / 0B 1 4edf94b9a3e5 512-limit 0.00% 296KiB / 512MiB 0.06% 1.11kB / 0B 0B / 0B 1 ``` In Kubernetes, when you limit the CPU and memory resources of a pod resource, it is provided using this technique. ### docker run with restart policy If there is a need to keep a particular container running continuously, the `--restart=always` option is provided to try to re-create the container immediately after it is terminated. After entering the option, run the docker. ```bash docker run --restart=always ubuntu ``` Run `watch -n1 docker ps` to check if it is restarting. If it is running normally, `Restarting (0)` will be printed in STATUS. ```bash CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES a911850276e8 ubuntu "bash" 35 seconds ago Restarting (0) 6 seconds ago hungry_vaughan ``` - [https://docs.docker.com/engine/reference/commandline/run/#restart-policies---restart](https://docs.docker.com/engine/reference/commandline/run/#restart-policies---restart) - Provides options such as "on-failure with max retries" and "always" When specifying the restart option for a job resource in Kubernetes, this approach is used. ### Running docker run as a background process By default, when running a Docker container, it is executed as a foreground process. 
This means that the terminal that launched the container is automatically attached to it, preventing you from running other commands. Let's try an example. Open two terminals, and in one terminal, continuously monitor `docker ps`, while in the other terminal, execute the following commands one by one and observe the behavior. #### First Practice ```bash docker run -it ubuntu sleep 10 ``` You must remain stopped for 10 seconds and you cannot perform any other commands from that container. After 10 seconds, you can check in docker ps that the container has terminated. #### Second Practice ```bash docker run -it ubuntu sleep 10 ``` After that, press `ctrl + p` -> `ctrl + q`. Now you can perform other commands in that terminal, and you can also see that the container is still alive for up to 10 seconds with `docker ps`. This situation, where you exit from the Docker container, is called "detached". Docker provides an option to run containers in detached mode, which allows you to run the container in the background while executing the `run` command. #### Third Practice ```bash docker run -d ubuntu sleep 10 ``` In detached mode, you can perform other actions in the terminal that executed the command. It is good to use detached mode appropriately according to the situation. For example, when developing a backend API server that communicates with the DB, the backend API server needs to be constantly checked with hot-loading while changing the source code, but the DB does not need to be monitored, so it can be executed as follows. Run the DB container in detached mode, and run the backend API server in attached mode to follow the logs. 
## References - [https://towardsdatascience.com/docker-storage-598e385f4efe](https://towardsdatascience.com/docker-storage-598e385f4efe) - [https://vsupalov.com/docker-latest-tag/](https://vsupalov.com/docker-latest-tag/) - [https://docs.microsoft.com/ko-kr/azure/container-registry/container-registry-image-tag-version](https://docs.microsoft.com/ko-kr/azure/container-registry/container-registry-image-tag-version) - [https://stevelasker.blog/2018/03/01/docker-tagging-best-practices-for-tagging-and-versioning-docker-images/](https://stevelasker.blog/2018/03/01/docker-tagging-best-practices-for-tagging-and-versioning-docker-images/) ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/prerequisites/docker/command.md ================================================ --- title : "[Practice] Docker command" description: "Practice to use docker command." sidebar_position: 4 contributors: ["Jongseob Jeon", "Jaeyeon Kim"] --- ## 1. Normal installation confirmation ```bash docker run hello-world ``` If installed correctly, you should be able to see the following message. ```bash Hello from Docker! This message shows that your installation appears to be working correctly. .... ``` **(For ubuntu)** If you want to use without sudo, please refer to the following site. - [https://docs.docker.com/engine/install/linux-postinstall/#manage-docker-as-a-non-root-user](https://docs.docker.com/engine/install/linux-postinstall/#manage-docker-as-a-non-root-user) ## 2. Docker Pull Docker pull is a command to download Docker images from a Docker image registry (a repository where Docker images are stored and shared). You can check the arguments available in docker pull using the command below. ```bash docker pull --help ``` If performed normally, it prints out as follows. 
```bash Usage: docker pull [OPTIONS] NAME[:TAG|@DIGEST] Pull an image or a repository from a registry Options: -a, --all-tags Download all tagged images in the repository --disable-content-trust Skip image verification (default true) --platform string Set platform if server is multi-platform capable -q, --quiet Suppress verbose output ``` It can be seen here that docker pull takes two types of arguments. 1. `[OPTIONS]` 2. `NAME[:TAG|@DIGEST]` In order to use the `-a` and `-q` options from help, they must be used before the NAME. Let's try and pull the `ubuntu:18.04` image directly. ```bash docker pull ubuntu:18.04 ``` If interpreted correctly, the command means to pull an image with the tag `18.04` from an image named `ubuntu`. If performed successfully, it will produce an output similar to the following. ```bash 18.04: Pulling from library/ubuntu 20d796c36622: Pull complete Digest: sha256:42cd9143b6060261187a72716906187294b8b66653b50d70bc7a90ccade5c984 Status: Downloaded newer image for ubuntu:18.04 docker.io/library/ubuntu:18.04 ``` If you perform the above command, you will download the image called 'ubuntu:18.04' from a registry named [docker.io/library](http://docker.io/library/) to your laptop. - Note that - in the future, if you need to get a docker image from a certain **private** registry instead of docker.io or public docker hub, you can use [`docker login`](https://docs.docker.com/engine/reference/commandline/login/) to point to the certain registry, then use `docker pull`. Alternatively, you can set up an [insecure registry](https://stackoverflow.com/questions/42211380/add-insecure-registry-to-docker). - Also note that [`docker save`](https://docs.docker.com/engine/reference/commandline/save/) and [`docker load`](https://docs.docker.com/engine/reference/commandline/load/) commands are available to store and share docker images in the form of `.tar` file in an intranet. ## 3. Docker images This is the command to list the Docker images that exist locally. 
```bash docker images --help ``` The arguments available for use in docker images are as follows. ```bash Usage: docker images [OPTIONS] [REPOSITORY[:TAG]] List images Options: -a, --all Show all images (default hides intermediate images) --digests Show digests -f, --filter filter Filter output based on conditions provided --format string Pretty-print images using a Go template --no-trunc Don't truncate output -q, --quiet Only show image IDs ``` Let's try executing the command below directly. ```bash docker images ``` If you install Docker and proceed with this practice, it will output something similar to this. ```bash REPOSITORY TAG IMAGE ID CREATED SIZE ubuntu 18.04 29e70752d7b2 2 days ago 56.7MB ``` If you use the `-q` argument among the possible arguments, only the `IMAGE ID` will be printed. ```bash docker images -q ``` ```bash 29e70752d7b2 ``` ## 4. Docker ps Command to output the list of currently running Docker containers. ```bash docker ps --help ``` The following arguments can be used with `docker ps`: ```bash Usage: docker ps [OPTIONS] List containers Options: -a, --all Show all containers (default shows just running) -f, --filter filter Filter output based on conditions provided --format string Pretty-print containers using a Go template -n, --last int Show n last created containers (includes all states) (default -1) -l, --latest Show the latest created container (includes all states) --no-trunc Don't truncate output -q, --quiet Only display container IDs -s, --size Display total file sizes ``` Let's try running the command below directly. ```bash docker ps ``` If there are no currently running containers, it will be as follows. ```bash CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES ``` If there is a container running, it will look similar to this. ```bash CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES c1e8f5e89d8d ubuntu "sleep 3600" 13 seconds ago Up 12 seconds trusting_newton ``` ## 5. Docker run Command to run a Docker container. 
```bash docker run --help ``` The command to run docker run is as follows. ```bash Usage: docker run [OPTIONS] IMAGE [COMMAND] [ARG...] Run a command in a new container ``` What we need to confirm here is that the docker run command takes three types of arguments. 1. `[OPTIONS]` 2. `[COMMAND]` 3. `[ARG...]` Let's try running a docker container ourselves. ```bash ## Usage: docker run [OPTIONS] IMAGE [COMMAND] [ARG...] docker run -it --name demo1 ubuntu:18.04 /bin/bash ``` - `-it`: Combination of `-i` and `-t` options - Runs the container and connects it to an interactive terminal - `--name`: Assigns a name to the container for easier identification instead of using the container ID - `/bin/bash`: Specifies the command to be executed in the container upon startup, where `/bin/bash` opens a bash shell. After running the command, you can exit the container by using the `exit` command. When you enter the previously learned `docker ps` command, the following output will be displayed. ```bash CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES ``` It was said that the container being executed was coming out, but for some reason the container that was just executed does not appear. The reason is that `docker ps` shows the currently running containers by default. If you want to see the stopped containers too, you must give the `-a` option. ```bash docker ps -a ``` Then the list of terminated containers will also be displayed. ```bash CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES 4c1aa74a382a ubuntu:18.04 "/bin/bash" 2 minutes ago Exited (0) 2 minutes ago demo1 ``` ## 6. Docker exec Docker exec is a command that is used to issue commands or access the inside of a Docker container. ```bash docker exec --help ``` For example, let's try running the following command. 
```bash docker run -d --name demo2 ubuntu:18.04 sleep 3600 ``` Here, the `-d` option allows the Docker container to run in the background, so that the container keeps running even after the terminal connection to it ends. Use `docker ps` to check if it is currently running. It can be confirmed that it is running as follows. ```bash CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES fc88a83e90f0 ubuntu:18.04 "sleep 3600" 4 seconds ago Up 3 seconds demo2 ``` Now let's connect to the running docker container through the `docker exec` command. ```bash docker exec -it demo2 /bin/bash ``` This is the same as the previous `docker run` command, allowing you to access the inside of the container. You can exit using `exit`. ## 7. Docker logs ```bash docker logs --help ``` Let's run the following container. ```bash docker run --name demo3 -d busybox sh -c "while true; do $(echo date); sleep 1; done" ``` With the above command, we have run a busybox container named `demo3` in the background that prints the current time once every second. Now let's check the log with the command below. ```bash docker logs demo3 ``` If performed normally, it will be similar to below. ```bash Sun Mar 6 11:06:49 UTC 2022 Sun Mar 6 11:06:50 UTC 2022 Sun Mar 6 11:06:51 UTC 2022 Sun Mar 6 11:06:52 UTC 2022 Sun Mar 6 11:06:53 UTC 2022 Sun Mar 6 11:06:54 UTC 2022 ``` However, if used this way, you can only check the logs taken so far. In this case, you can use the `-f` option to keep watching and outputting. ```bash docker logs demo3 -f ``` ## 8. Docker stop Command to stop a running Docker container. ```bash docker stop --help ``` Through `docker ps`, you can check the containers currently running, as follows. 
```bash CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES 730391669c39 busybox "sh -c 'while true; …" About a minute ago Up About a minute demo3 fc88a83e90f0 ubuntu:18.04 "sleep 3600" 4 minutes ago Up 4 minutes demo2 ``` Now let's try to stop a container with `docker stop`. ```bash docker stop demo2 ``` After executing, type `docker ps` again. ```bash CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES 730391669c39 busybox "sh -c 'while true; …" 2 minutes ago Up 2 minutes demo3 ``` Comparing with the above result, you can see that the demo2 container has disappeared from the list of currently running containers. Let's stop the remaining container as well. ```bash docker stop demo3 ``` ## 9. Docker rm Command to delete a Docker container. ```bash docker rm --help ``` Docker containers are not deleted when they stop; they remain in a stopped state by default. That's why you can see stopped containers using `docker ps -a`. But why do we have to delete the stopped containers? Even when stopped, the data used in the container remains inside it, so you can start the container again later. However, these stopped containers still take up disk space. So in order to delete the containers that are not used at all, we should use the `docker rm` command. First, let's check the current containers. ```bash docker ps -a ``` There are three containers as follows. ```bash CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES 730391669c39 busybox "sh -c 'while true; …" 4 minutes ago Exited (137) About a minute ago demo3 fc88a83e90f0 ubuntu:18.04 "sleep 3600" 7 minutes ago Exited (137) 2 minutes ago demo2 4c1aa74a382a ubuntu:18.04 "/bin/bash" 10 minutes ago Exited (0) 10 minutes ago demo1 ``` Let's try to delete the `demo3` container through the following command. ```bash docker rm demo3 ``` Running `docker ps -a` again shows the list reduced to two containers, as follows. 
```bash CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES fc88a83e90f0 ubuntu:18.04 "sleep 3600" 13 minutes ago Exited (137) 8 minutes ago demo2 4c1aa74a382a ubuntu:18.04 "/bin/bash" 16 minutes ago Exited (0) 16 minutes ago demo1 ``` Delete the remaining containers as well. ```bash docker rm demo2 docker rm demo1 ``` ## 10. Docker rmi Command to delete a Docker image. ```bash docker rmi --help ``` Use the following commands to check which images are currently on the local. ```bash docker images ``` The following is output. ```bash REPOSITORY TAG IMAGE ID CREATED SIZE busybox latest a8440bba1bc0 32 hours ago 1.41MB ubuntu 18.04 29e70752d7b2 2 days ago 56.7MB ``` I will try to delete the `busybox` image. ```bash docker rmi busybox ``` If you type `docker images` again, the following will appear. ```bash REPOSITORY TAG IMAGE ID CREATED SIZE ubuntu 18.04 29e70752d7b2 2 days ago 56.7MB ``` ## References - [https://docs.github.com/en/packages/working-with-a-github-packages-registry/working-with-the-container-registry](https://docs.github.com/en/packages/working-with-a-github-packages-registry/working-with-the-container-registry) ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/prerequisites/docker/docker.md ================================================ --- title : "What is Docker?" description: "Introduction to Docker." sidebar_position: 3 contributors: ["Jongseob Jeon", "Jaeyeon Kim"] --- ## Container - Containerization: - A technology that allows applications to be executed uniformly anywhere. - Container Image: - A collection of all the files required to run an application. - → Similar to a mold for making fish-shaped bread (Bungeoppang). - Container: - A single process that is executed based on a container image. - → A fish-shaped bread (Bungeoppang) produced using a mold. ## Docker Docker is a platform that allows you to manage and use containers. 
Its slogan is "Build Once, Run Anywhere," guaranteeing the same execution results anywhere. In the Docker, the resources for the container are separated and the lifecycle is controlled by Linux kernel's cgroups, etc. However, it is too difficult to use these interfaces directly, so an abstraction layer is created. ![docker-layer.png](./img/docker-layer.png) Through this, users can easily control containers with just the user-friendly API **Docker CLI**. - Users can easily control containers using the user-friendly API called **Docker CLI**. ## Interpretation of Layer The roles of the layers mentioned above are as follows: 1. runC: Utilizes the functionality of the Linux kernel to isolate namespaces, CPUs, memory, filesystems, etc., for a container, which is a single process. 2. containerd: Acts as an abstraction layer to communicate with runC (OCI layer) and uses the standardized interface (OCI). 3. dockerd: Solely responsible for issuing commands to containerd. 4. Docker CLI: Users only need to issue commands to dockerd (Docker daemon) using Docker CLI. - During this communication process, Unix socket is used, so sometimes Docker-related errors occur, such as "the /var/run/docker.sock is in use" or "insufficient permissions" error messages. Although Docker encompasses many stages, when the term "Docker" is used, it can refer to Docker CLI, Dockerd (Docker daemon), or even a single Docker container, which can lead to confusion. In the upcoming text, the term "Docker" may be used in various contexts. ## For ML Engineer ML engineers use Docker for the following reasons: 1. ML training/inference code needs to be independent of the underlying operating system, Python version, Python environment, and specific versions of Python packages. 2. Therefore, the goal is to bundle not only the code but also all the dependent packages, environment variables, folder names, etc., into a single package. Containerization technology enables this. 3. 
Docker is one of the software tools that makes it easy to use and manage this technology, and the packaged units are referred to as Docker images. ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/prerequisites/docker/images.md ================================================ --- title : "[Practice] Docker images" description: "Practice to use docker image." sidebar_position: 5 contributors: ["Jongseob Jeon", "Jaeyeon Kim"] --- - `docker commit` - a way to create a docker image from a running container - `docker commit -m "message" -a "author" <container-name> <image-name>` - Using `docker commit`, you can create a docker image without manually writing a Dockerfile. ``` touch Dockerfile ``` 3. Move to the docker-practice folder. 4. Create an empty file called Dockerfile. 1. Which command installs a specific package into the image? Answer: `RUN` Let's look at the basic commands that can be used in Dockerfile one by one. FROM is a command that specifies which image to use as a base image for Dockerfile. When creating a Docker image, instead of creating the environment I intend from scratch, I can use a pre-made image such as `python:3.9`, `python-3.9-alpine`, etc. as the base and install pytorch and add my source code. ```docker FROM <image>[:<tag>] [AS <name>] # Example FROM ubuntu FROM ubuntu:18.04 FROM nginx:latest AS ngx ``` The command to copy files or directories from the `<src>` path on the host (local) to the `<dest>` path inside the container. ```docker COPY <src>... <dest> # Example COPY a.txt /some-directory/b.txt COPY my-directory /some-directory-2 ``` ADD is similar to COPY but it has additional features. ```docker # 1 - Can copy a compressed file on the host into the container while extracting it ADD scripts.tar.gz /tmp # 2 - Can specify a file at a remote URL as the source path ADD http://www.example.com/script.sh /tmp # It is recommended to use ADD instead of COPY only when you need these two features ``` RUN is the command to run the specified command inside a Docker container. Docker images maintain the state in which the commands are executed. 
```docker RUN RUN ["executable-command", "parameter1", "parameter2"] # 예시 RUN pip install torch RUN pip install -r requirements.txt ``` CMD specifies a command that the Docker container will **run when it starts**. There is a similar command called **ENTRYPOINT**. The difference between them will be discussed **later**. Note that only one **CMD** can be run in one Docker image, which is different from **RUN** command. ```docker CMD CMD ["executable-command", "parameter1", "parameter2"] CMD ["parameter1", "parameter2"] # ENTRYPOINT 와 함께 사용될 때 # 예시 CMD python main.py ``` WORKDIR is a command that specifies which directory inside the container to perform future additional commands. If the directory does not exist, it will be created. ```docker WORKDIR /path/to/workdir # 예시 WORKDIR /home/demo RUN pwd # /home/demo 가 출력됨 ``` This is a command to set the value of environment variables that will be used continuously inside the container. ```docker ENV ENV = # 예시 # default 언어 설정 RUN locale-gen ko_KR.UTF-8 ENV LANG ko_KR.UTF-8 ENV LANGUAGE ko_KR.UTF-8 ENV LC_ALL ko_KR.UTF-8 ``` You can specify the port/protocol to be opened from the container. If `` is not specified, TCP is set as the default. ```docker EXPOSE EXPOSE / # 예시 EXPOSE 8080 ``` Write a simple Dockerfile by using `vim Dockerfile` or an editor like vscode and write the following: ```docker # base image 를 ubuntu 18.04 로 설정합니다. FROM ubuntu:18.04 # apt-get update 명령을 실행합니다. RUN apt-get update # TEST env var의 값을 hello 로 지정합니다. ENV TEST hello # DOCKER CONTAINER 가 시작될 때, 환경변수 TEST 의 값을 출력합니다. CMD echo $TEST ``` Use the `docker build` command to create a Docker Image from a Dockerfile. ```bash docker build --help ``` Run the following command from the path where the Dockerfile is located. ```bash docker build -t my-image:v1.0.0 . ``` The command above means to build an image with the name "my-image" and the tag "v1.0.0" from the Dockerfile in the current path. Let's check if the image was built successfully. 
```bash # grep : my-image 가 있는지를 잡아내는 (grep) 하는 명령어 docker images | grep my-image ``` If performed normally, it will output as follows. ```bash my-image v1.0.0 143114710b2d 3 seconds ago 87.9MB ``` Let's now **run** a docker container with the `my-image:v1.0.0` image that we just built. ```bash docker run my-image:v1.0.0 ``` If performed normally, it will result in the following. ```bash hello ``` Let's run a docker container and change the value of the `TEST` env var at the time of running the `my-image:v1.0.0` image we just built. ```bash docker run -e TEST=bye my-image:v1.0.0 ``` If performed normally, it will be as follows. ```bash bye ``` ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/prerequisites/docker/install.md ================================================ --- title : "Install Docker" description: "Install docker to start." sidebar_position: 1 contributors: ["Jongseob Jeon", "Jaeyeon Kim"] --- ## Docker To practice Docker, you need to install Docker. The Docker installation varies depending on which OS you are using. Please refer to the official website for the Docker installation that fits your environment: - [ubuntu](https://docs.docker.com/engine/install/ubuntu/) - [mac](https://docs.docker.com/desktop/mac/install/) - [windows](https://docs.docker.com/desktop/windows/install/) ## Check Installation Check installation requires an OS, terminal environment where `docker run hello-world` runs correctly. | OS | Docker Engine | Terminal | | ------- | -------------- | ------------------ | | MacOS | Docker Desktop | zsh | | Windows | Docker Desktop | Powershell | | Windows | Docker Desktop | WSL2 | | Ubuntu | Docker Engine | bash | ## Before diving in.. It is possible that many metaphors and examples will be focused towards MLOps as they explain the necessary Docker usage to use MLOps. 
================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/prerequisites/docker/introduction.md ================================================ --- title : "Why Docker & Kubernetes ?" description: "Introduction to Docker." sidebar_position: 2 contributors: ["Jongseob Jeon", "Jaeyeon Kim"] --- ## Why Kubernetes ? To operationalize machine learning models, additional functionalities beyond model development are required. 1. Training Phase - Schedule management for model training commands - Ensuring reproducibility of trained models 2. Deployment Phase - Traffic distribution - Monitoring service failures - Troubleshooting in case of failures Fortunately, the software development field has already put a lot of thought and effort into addressing these needs. Therefore, when deploying machine learning models, leveraging the outcomes of these considerations can be highly beneficial. Docker and Kubernetes are two prominent software products widely used in MLOps to address these needs. ## Docker & Kubernetes ### Not a software but a product Docker and Kubernetes are representative software (products) that provide containerization and container orchestration functions respectively. #### Docker Docker was the mainstream in the past, but its usage has been decreasing gradually with the addition of various paid policy. However, as of March 2022, it is still the most commonly used container virtualization software. ![sysdig-2019.png](./img/sysdig-2019.png)
[from sysdig 2019]
![sysdig-2021.png](./img/sysdig-2021.png)
[from sysdig 2021]
#### Kubernetes Kubernetes: Kubernetes is a product that has almost no comparison so far. ![cncf-survey.png](./img/cncf-survey.png)
[from cncf survey]
![t4-ai.png](./img/t4-ai.png)
[from t4.ai]
### History of Open source #### Initial Docker & Kubernetes At the beginning of Docker development, **one package** called Docker Engine contained multiple features such as API, CLI, networking, storage, etc., but it began to be **divided one by one** according to the philosophy of **MSA**. However, the initial Kubernetes included Docker Engine for container virtualization. Therefore, whenever the Docker version was updated, the interface of Docker Engine changed and Kubernetes was greatly affected. #### Open Container Initiative In order to alleviate such inconveniences, many groups interested in container technology such as Google have come together to start the Open Container Initiative (OCI) project to set standards for containers. Docker further separated its interface and developed Containerd, a Container Runtime that adheres to the OCI standard, and added an abstraction layer so that dockerd calls the API of Containerd. In accordance with this flow, Kubernetes also now supports not only Docker, but any Container Runtime that adheres to the OCI standard and the specified specifications with the Container Runtime Interface (CRI) specification, starting from version 1.5. #### CRI-O CRI-O is a container runtime developed by Red Hat, Intel, SUSE, and IBM, which adheres to the OCI standard + CRI specifications, specifically for Kubernetes. #### Current docker & kubernetes Currently, Docker and Kubernetes have been using Docker Engine as the default container runtime, but since Docker's API did not match the CRI specification (*OCI follows*), Kubernetes developed and supported a **dockershim** to make Docker's API compatible with CRI, (*it was a huge burden for Kubernetes, not for Docker*). This was **deprecated from Kubernetes v1.20 and abandoned from v1.23**. - v1.23 will be released in December 2021 So from Kubernetes v1.23, you can no longer use Docker natively. 
However, **users are not much affected by this change** because Docker images created through Docker Engine comply with the OCI standard, so they can be used regardless of what container runtime Kubernetes is made of. ### References - [*https://www.linkedin.com/pulse/containerd는-무엇이고-왜-중요할까-sean-lee/?originalSubdomain=kr*](https://www.linkedin.com/pulse/containerd%EB%8A%94-%EB%AC%B4%EC%97%87%EC%9D%B4%EA%B3%A0-%EC%99%9C-%EC%A4%91%EC%9A%94%ED%95%A0%EA%B9%8C-sean-lee/?originalSubdomain=kr) - [https://kubernetes.io/blog/2021/12/07/kubernetes-1-23-release-announcement/](https://kubernetes.io/blog/2021/12/07/kubernetes-1-23-release-announcement/) - [https://kubernetes.io/blog/2020/12/02/dockershim-faq/](https://kubernetes.io/blog/2020/12/02/dockershim-faq/) - [https://kubernetes.io/blog/2020/12/02/dont-panic-kubernetes-and-docker/](https://kubernetes.io/blog/2020/12/02/dont-panic-kubernetes-and-docker/) - [https://kubernetes.io/ko/blog/2020/12/02/dont-panic-kubernetes-and-docker/](https://kubernetes.io/ko/blog/2020/12/02/dont-panic-kubernetes-and-docker/) ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/setup-components/_category_.json ================================================ { "label": "Setup Components", "position": 3, "link": { "type": "generated-index" } } ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/setup-components/install-components-kf.md ================================================ --- title : "1. Kubeflow" description: "구성요소 설치 - Kubeflow" sidebar_position: 1 date: 2021-12-13 lastmod: 2021-12-20 contributors: ["Jaeyeon Kim", "SeungTae Kim"] --- ## Prepare the installation file Prepare the installation files for installing Kubeflow **v1.4.0** Clone the [kubeflow/manifests Repository](https://github.com/kubeflow/manifests) with the **v1.4.0** tag, and move to the corresponding folder. 
```bash git clone -b v1.4.0 https://github.com/kubeflow/manifests.git cd manifests ``` ## Install each components The kubeflow/manifests repository provides installation commands for each component, but it often lacks information on potential issues that may arise during installation or how to verify if the installation was successful. This can make it challenging for first-time users. Therefore, in this document, we will provide instructions on how to verify the successful installation of each component. Please note that this document will not cover the installation of components that are not covered in *MLOps for ALL*, such as Knative, KFServing, and MPI Operator, as we prioritize efficient resource usage. ### Cert-manager 1. Install cert-manager. ```bash kustomize build common/cert-manager/cert-manager/base | kubectl apply -f - ``` If the installation is successful, you should see output similar to the following: ```bash namespace/cert-manager created customresourcedefinition.apiextensions.k8s.io/certificaterequests.cert-manager.io created customresourcedefinition.apiextensions.k8s.io/certificates.cert-manager.io created customresourcedefinition.apiextensions.k8s.io/challenges.acme.cert-manager.io created customresourcedefinition.apiextensions.k8s.io/clusterissuers.cert-manager.io created customresourcedefinition.apiextensions.k8s.io/issuers.cert-manager.io created customresourcedefinition.apiextensions.k8s.io/orders.acme.cert-manager.io created serviceaccount/cert-manager created serviceaccount/cert-manager-cainjector created serviceaccount/cert-manager-webhook created role.rbac.authorization.k8s.io/cert-manager-webhook:dynamic-serving created role.rbac.authorization.k8s.io/cert-manager-cainjector:leaderelection created role.rbac.authorization.k8s.io/cert-manager:leaderelection created clusterrole.rbac.authorization.k8s.io/cert-manager-cainjector created clusterrole.rbac.authorization.k8s.io/cert-manager-controller-approve:cert-manager-io created 
clusterrole.rbac.authorization.k8s.io/cert-manager-controller-certificates created clusterrole.rbac.authorization.k8s.io/cert-manager-controller-challenges created clusterrole.rbac.authorization.k8s.io/cert-manager-controller-clusterissuers created clusterrole.rbac.authorization.k8s.io/cert-manager-controller-ingress-shim created clusterrole.rbac.authorization.k8s.io/cert-manager-controller-issuers created clusterrole.rbac.authorization.k8s.io/cert-manager-controller-orders created clusterrole.rbac.authorization.k8s.io/cert-manager-edit created clusterrole.rbac.authorization.k8s.io/cert-manager-view created clusterrole.rbac.authorization.k8s.io/cert-manager-webhook:subjectaccessreviews created rolebinding.rbac.authorization.k8s.io/cert-manager-webhook:dynamic-serving created rolebinding.rbac.authorization.k8s.io/cert-manager-cainjector:leaderelection created rolebinding.rbac.authorization.k8s.io/cert-manager:leaderelection created clusterrolebinding.rbac.authorization.k8s.io/cert-manager-cainjector created clusterrolebinding.rbac.authorization.k8s.io/cert-manager-controller-approve:cert-manager-io created clusterrolebinding.rbac.authorization.k8s.io/cert-manager-controller-certificates created clusterrolebinding.rbac.authorization.k8s.io/cert-manager-controller-challenges created clusterrolebinding.rbac.authorization.k8s.io/cert-manager-controller-clusterissuers created clusterrolebinding.rbac.authorization.k8s.io/cert-manager-controller-ingress-shim created clusterrolebinding.rbac.authorization.k8s.io/cert-manager-controller-issuers created clusterrolebinding.rbac.authorization.k8s.io/cert-manager-controller-orders created clusterrolebinding.rbac.authorization.k8s.io/cert-manager-webhook:subjectaccessreviews created service/cert-manager created service/cert-manager-webhook created deployment.apps/cert-manager created deployment.apps/cert-manager-cainjector created deployment.apps/cert-manager-webhook created 
mutatingwebhookconfiguration.admissionregistration.k8s.io/cert-manager-webhook created validatingwebhookconfiguration.admissionregistration.k8s.io/cert-manager-webhook created ``` Wait for all 3 pods in the cert-manager namespace to become Running: ```bash kubectl get pod -n cert-manager ``` Once all the pods are Running, you should see output similar to the following: ```bash NAME READY STATUS RESTARTS AGE cert-manager-7dd5854bb4-7nmpd 1/1 Running 0 2m10s cert-manager-cainjector-64c949654c-2scxr 1/1 Running 0 2m10s cert-manager-webhook-6b57b9b886-7q6g2 1/1 Running 0 2m10s ``` 2. To install `kubeflow-issuer`, run the following command: ```bash kustomize build common/cert-manager/kubeflow-issuer/base | kubectl apply -f - ``` If the installation is successful, you should see the following output: ```bash clusterissuer.cert-manager.io/kubeflow-self-signing-issuer created ``` Note: If the `cert-manager-webhook` deployment is not in the Running state, you may encounter an error similar to the one below, and the `kubeflow-issuer` may not be installed. In this case, please ensure that all 3 pods of cert-manager are Running before retrying the command. If you encounter the below error, make sure that the `cert-manager` deployment and all its pods are running properly before proceeding. ```bash Error from server: error when retrieving current configuration of: Resource: "cert-manager.io/v1alpha2, Resource=clusterissuers", GroupVersionKind: "cert-manager.io/v1alpha2, Kind=ClusterIssuer" Name: "kubeflow-self-signing-issuer", Namespace: "" from server for: "STDIN": conversion webhook for cert-manager.io/v1, Kind=ClusterIssuer failed: Post "https://cert-manager-webhook.cert-manager.svc:443/convert?timeout=30s": dial tcp 10.101.177.157:443: connect: connection refused ``` ### Istio 1. Install Custom Resource Definition(CRD) for istio. 
```bash kustomize build common/istio-1-9/istio-crds/base | kubectl apply -f - ``` If run properly, you should see the following output: ```bash customresourcedefinition.apiextensions.k8s.io/authorizationpolicies.security.istio.io created customresourcedefinition.apiextensions.k8s.io/destinationrules.networking.istio.io created customresourcedefinition.apiextensions.k8s.io/envoyfilters.networking.istio.io created customresourcedefinition.apiextensions.k8s.io/gateways.networking.istio.io created customresourcedefinition.apiextensions.k8s.io/istiooperators.install.istio.io created customresourcedefinition.apiextensions.k8s.io/peerauthentications.security.istio.io created customresourcedefinition.apiextensions.k8s.io/requestauthentications.security.istio.io created customresourcedefinition.apiextensions.k8s.io/serviceentries.networking.istio.io created customresourcedefinition.apiextensions.k8s.io/sidecars.networking.istio.io created customresourcedefinition.apiextensions.k8s.io/virtualservices.networking.istio.io created customresourcedefinition.apiextensions.k8s.io/workloadentries.networking.istio.io created customresourcedefinition.apiextensions.k8s.io/workloadgroups.networking.istio.io created ``` 2. Install istio namespace. ```bash kustomize build common/istio-1-9/istio-namespace/base | kubectl apply -f - ``` If run properly, you should see the following output: ```bash namespace/istio-system created ``` 3. Install istio. 
```bash kustomize build common/istio-1-9/istio-install/base | kubectl apply -f - ``` if run properly, you should see the following output: ```bash serviceaccount/istio-ingressgateway-service-account created serviceaccount/istio-reader-service-account created serviceaccount/istiod-service-account created role.rbac.authorization.k8s.io/istio-ingressgateway-sds created role.rbac.authorization.k8s.io/istiod-istio-system created clusterrole.rbac.authorization.k8s.io/istio-reader-istio-system created clusterrole.rbac.authorization.k8s.io/istiod-istio-system created rolebinding.rbac.authorization.k8s.io/istio-ingressgateway-sds created rolebinding.rbac.authorization.k8s.io/istiod-istio-system created clusterrolebinding.rbac.authorization.k8s.io/istio-reader-istio-system created clusterrolebinding.rbac.authorization.k8s.io/istiod-istio-system created configmap/istio created configmap/istio-sidecar-injector created service/istio-ingressgateway created service/istiod created deployment.apps/istio-ingressgateway created deployment.apps/istiod created envoyfilter.networking.istio.io/metadata-exchange-1.8 created envoyfilter.networking.istio.io/metadata-exchange-1.9 created envoyfilter.networking.istio.io/stats-filter-1.8 created envoyfilter.networking.istio.io/stats-filter-1.9 created envoyfilter.networking.istio.io/tcp-metadata-exchange-1.8 created envoyfilter.networking.istio.io/tcp-metadata-exchange-1.9 created envoyfilter.networking.istio.io/tcp-stats-filter-1.8 created envoyfilter.networking.istio.io/tcp-stats-filter-1.9 created envoyfilter.networking.istio.io/x-forwarded-host created gateway.networking.istio.io/istio-ingressgateway created authorizationpolicy.security.istio.io/global-deny-all created authorizationpolicy.security.istio.io/istio-ingressgateway created mutatingwebhookconfiguration.admissionregistration.k8s.io/istio-sidecar-injector created validatingwebhookconfiguration.admissionregistration.k8s.io/istiod-istio-system created ``` Wait for all 2 pods in the 
istio-system namespace to become Running: ```bash kubectl get po -n istio-system ``` Once all the pods are Running, you should see output similar to the following: ```bash NAME READY STATUS RESTARTS AGE istio-ingressgateway-79b665c95-xm22l 1/1 Running 0 16s istiod-86457659bb-5h58w 1/1 Running 0 16s ``` ### Dex Now, let's install dex. ```bash kustomize build common/dex/overlays/istio | kubectl apply -f - ``` If performed normally, it will be printed as follows: ```bash namespace/auth created customresourcedefinition.apiextensions.k8s.io/authcodes.dex.coreos.com created serviceaccount/dex created clusterrole.rbac.authorization.k8s.io/dex created clusterrolebinding.rbac.authorization.k8s.io/dex created configmap/dex created secret/dex-oidc-client created service/dex created deployment.apps/dex created virtualservice.networking.istio.io/dex created ``` Wait until the single dex pod in the auth namespace is Running. ```bash kubectl get po -n auth ``` Once the pod is Running, similar results will be printed. ```bash NAME READY STATUS RESTARTS AGE dex-5ddf47d88d-458cs 1/1 Running 1 12s ``` Install OIDC AuthService. ```bash kustomize build common/oidc-authservice/base | kubectl apply -f - ``` If performed normally, it will be printed as follows. ```bash configmap/oidc-authservice-parameters created secret/oidc-authservice-client created service/authservice created persistentvolumeclaim/authservice-pvc created statefulset.apps/authservice created envoyfilter.networking.istio.io/authn-filter created ``` Wait until the authservice-0 pod in the istio-system namespace is Running. ```bash kubectl get po -n istio-system -w ``` Once all pods are Running, a similar result will be printed. ```bash NAME READY STATUS RESTARTS AGE authservice-0 1/1 Running 0 14s istio-ingressgateway-79b665c95-xm22l 1/1 Running 0 2m37s istiod-86457659bb-5h58w 1/1 Running 0 2m37s ``` Create a Kubeflow Namespace. 
```bash kustomize build common/kubeflow-namespace/base | kubectl apply -f - ``` If performed normally, it will be outputted as follows. ```bash namespace/kubeflow created ``` Retrieve the Kubeflow namespace. ```bash kubectl get ns kubeflow ``` If generated normally, similar results will be output. ```bash NAME STATUS AGE kubeflow Active 8s ``` Install kubeflow-roles. ```bash kustomize build common/kubeflow-roles/base | kubectl apply -f - ``` If properly performed, it will output as follows. ```bash clusterrole.rbac.authorization.k8s.io/kubeflow-admin created clusterrole.rbac.authorization.k8s.io/kubeflow-edit created clusterrole.rbac.authorization.k8s.io/kubeflow-kubernetes-admin created clusterrole.rbac.authorization.k8s.io/kubeflow-kubernetes-edit created clusterrole.rbac.authorization.k8s.io/kubeflow-kubernetes-view created clusterrole.rbac.authorization.k8s.io/kubeflow-view created ``` Retrieve the kubeflow roles just created. ```bash kubectl get clusterrole | grep kubeflow ``` The following 6 clusterroles will be output. ```bash kubeflow-admin 2021-12-03T08:51:36Z kubeflow-edit 2021-12-03T08:51:36Z kubeflow-kubernetes-admin 2021-12-03T08:51:36Z kubeflow-kubernetes-edit 2021-12-03T08:51:36Z kubeflow-kubernetes-view 2021-12-03T08:51:36Z kubeflow-view 2021-12-03T08:51:36Z ``` Install Kubeflow Istio Resources. ```bash kustomize build common/istio-1-9/kubeflow-istio-resources/base | kubectl apply -f - ``` If performed normally, it will be output as follows. ```bash clusterrole.rbac.authorization.k8s.io/kubeflow-istio-admin created clusterrole.rbac.authorization.k8s.io/kubeflow-istio-edit created clusterrole.rbac.authorization.k8s.io/kubeflow-istio-view created gateway.networking.istio.io/kubeflow-gateway created ``` Retrieve the Kubeflow roles just created. ```bash kubectl get clusterrole | grep kubeflow-istio ``` The following three clusterroles are output. 
```bash kubeflow-istio-admin 2021-12-03T08:53:17Z kubeflow-istio-edit 2021-12-03T08:53:17Z kubeflow-istio-view 2021-12-03T08:53:17Z ``` Check if the gateway is properly installed in the Kubeflow namespace. ```bash kubectl get gateway -n kubeflow ``` If generated normally, a result similar to the following will be output. ```bash NAME AGE kubeflow-gateway 31s ``` Installing Kubeflow Pipelines. ```bash kustomize build apps/pipeline/upstream/env/platform-agnostic-multi-user | kubectl apply -f - ``` If performed normally, it will be output as follows. ```bash customresourcedefinition.apiextensions.k8s.io/clusterworkflowtemplates.argoproj.io created customresourcedefinition.apiextensions.k8s.io/cronworkflows.argoproj.io created customresourcedefinition.apiextensions.k8s.io/workfloweventbindings.argoproj.io created ...(생략) authorizationpolicy.security.istio.io/ml-pipeline-visualizationserver created authorizationpolicy.security.istio.io/mysql created authorizationpolicy.security.istio.io/service-cache-server created ``` This command is installing multiple resources at once, but there are resources with dependencies on the installation order. Therefore, depending on the time, a similar error may occur. ```bash "error: unable to recognize "STDIN": no matches for kind "CompositeController" in version "metacontroller.k8s.io/v1alpha1"" ``` If a similar error occurs, wait about 10 seconds and then try the command above again. ```bash kustomize build apps/pipeline/upstream/env/platform-agnostic-multi-user | kubectl apply -f - ``` Check to see if it has been installed correctly. ```bash kubectl get po -n kubeflow ``` Wait until all 16 pods are running as follows. 
```bash NAME READY STATUS RESTARTS AGE cache-deployer-deployment-79fdf9c5c9-bjnbg 2/2 Running 1 5m3s cache-server-5bdf4f4457-48gbp 2/2 Running 0 5m3s kubeflow-pipelines-profile-controller-7b947f4748-8d26b 1/1 Running 0 5m3s metacontroller-0 1/1 Running 0 5m3s metadata-envoy-deployment-5b4856dd5-xtlkd 1/1 Running 0 5m3s metadata-grpc-deployment-6b5685488-kwvv7 2/2 Running 3 5m3s metadata-writer-548bd879bb-zjkcn 2/2 Running 1 5m3s minio-5b65df66c9-k5gzg 2/2 Running 0 5m3s ml-pipeline-8c4b99589-85jw6 2/2 Running 1 5m3s ml-pipeline-persistenceagent-d6bdc77bd-ssxrv 2/2 Running 0 5m3s ml-pipeline-scheduledworkflow-5db54d75c5-zk2cw 2/2 Running 0 5m2s ml-pipeline-ui-5bd8d6dc84-j7wqr 2/2 Running 0 5m2s ml-pipeline-viewer-crd-68fb5f4d58-mbcbg 2/2 Running 1 5m2s ml-pipeline-visualizationserver-8476b5c645-wljfm 2/2 Running 0 5m2s mysql-f7b9b7dd4-xfnw4 2/2 Running 0 5m2s workflow-controller-5cbbb49bd8-5zrwx 2/2 Running 1 5m2s ``` Additionally, please check if the ml-pipeline UI is connected properly. ```bash kubectl port-forward svc/ml-pipeline-ui -n kubeflow 8888:80 ``` Open the web browser and connect to the path [http://localhost:8888/#/pipelines/](http://localhost:8888/#/pipelines/). Confirm that the following screen is displayed. If you get the error "Connection refused on localhost", you can access it through the command line by setting the address, as long as there are no security issues. To check if the ml-pipeline UI connects normally, open the bind of all addresses with 0.0.0.0. ```bash kubectl port-forward --address 0.0.0.0 svc/ml-pipeline-ui -n kubeflow 8888:80 ``` Despite running with the above options, if connection refusal issues still occur, add access permission by allowing all TCP protocol ports in the firewall settings or by adding access permission to port 8888. When you open the web browser and access the path `http://:8888/#/pipelines/`, you can see the ml-pipeline UI screen. 
When accessing the other ports path that is being processed in the bottom, run the command in the same way as above and add the port number to the firewall to run it. English: We will install Katib. ```bash kustomize build apps/katib/upstream/installs/katib-with-kubeflow | kubectl apply -f - ``` If performed normally, it will be output as follows. ```bash customresourcedefinition.apiextensions.k8s.io/experiments.kubeflow.org created customresourcedefinition.apiextensions.k8s.io/suggestions.kubeflow.org created customresourcedefinition.apiextensions.k8s.io/trials.kubeflow.org created serviceaccount/katib-controller created serviceaccount/katib-ui created clusterrole.rbac.authorization.k8s.io/katib-controller created clusterrole.rbac.authorization.k8s.io/katib-ui created clusterrole.rbac.authorization.k8s.io/kubeflow-katib-admin created clusterrole.rbac.authorization.k8s.io/kubeflow-katib-edit created clusterrole.rbac.authorization.k8s.io/kubeflow-katib-view created clusterrolebinding.rbac.authorization.k8s.io/katib-controller created clusterrolebinding.rbac.authorization.k8s.io/katib-ui created configmap/katib-config created configmap/trial-templates created secret/katib-mysql-secrets created service/katib-controller created service/katib-db-manager created service/katib-mysql created service/katib-ui created persistentvolumeclaim/katib-mysql created deployment.apps/katib-controller created deployment.apps/katib-db-manager created deployment.apps/katib-mysql created deployment.apps/katib-ui created certificate.cert-manager.io/katib-webhook-cert created issuer.cert-manager.io/katib-selfsigned-issuer created virtualservice.networking.istio.io/katib-ui created mutatingwebhookconfiguration.admissionregistration.k8s.io/katib.kubeflow.org created validatingwebhookconfiguration.admissionregistration.k8s.io/katib.kubeflow.org created ``` Confirm if it has been installed properly. 
```bash kubectl get po -n kubeflow | grep katib ``` Wait until four pods are Running, like this. ```bash katib-controller-68c47fbf8b-b985z 1/1 Running 0 82s katib-db-manager-6c948b6b76-2d9gr 1/1 Running 0 82s katib-mysql-7894994f88-scs62 1/1 Running 0 82s katib-ui-64bb96d5bf-d89kp 1/1 Running 0 82s ``` Additionally, we will confirm that the Katib UI is connected normally. ```bash kubectl port-forward svc/katib-ui -n kubeflow 8081:80 ``` Open the web browser and access the path [http://localhost:8081/katib/](http://localhost:8081/katib/) to confirm the following screen is displayed. ```bash kustomize build apps/centraldashboard/upstream/overlays/istio | kubectl apply -f - ``` If performed normally, it will be output as follows. ```bash serviceaccount/centraldashboard created role.rbac.authorization.k8s.io/centraldashboard created clusterrole.rbac.authorization.k8s.io/centraldashboard created rolebinding.rbac.authorization.k8s.io/centraldashboard created clusterrolebinding.rbac.authorization.k8s.io/centraldashboard created configmap/centraldashboard-config created configmap/centraldashboard-parameters created service/centraldashboard created deployment.apps/centraldashboard created virtualservice.networking.istio.io/centraldashboard created ``` Check to see if it has been installed normally. ```bash kubectl get po -n kubeflow | grep centraldashboard ``` Wait until one pod related to centraldashboard in the kubeflow namespace becomes Running. ```bash centraldashboard-8fc7d8cc-xl7ts 1/1 Running 0 52s ``` Additionally, we will check if the Central Dashboard UI is connected properly. ```bash kubectl port-forward svc/centraldashboard -n kubeflow 8082:80 ``` Open the web browser to connect to the path [http://localhost:8082/](http://localhost:8082/) and check that the following screen is displayed. ```bash kustomize build apps/admission-webhook/upstream/overlays/cert-manager | kubectl apply -f - ``` If performed normally, it will be output as follows. 
```bash customresourcedefinition.apiextensions.k8s.io/poddefaults.kubeflow.org created serviceaccount/admission-webhook-service-account created clusterrole.rbac.authorization.k8s.io/admission-webhook-cluster-role created clusterrole.rbac.authorization.k8s.io/admission-webhook-kubeflow-poddefaults-admin created clusterrole.rbac.authorization.k8s.io/admission-webhook-kubeflow-poddefaults-edit created clusterrole.rbac.authorization.k8s.io/admission-webhook-kubeflow-poddefaults-view created clusterrolebinding.rbac.authorization.k8s.io/admission-webhook-cluster-role-binding created service/admission-webhook-service created deployment.apps/admission-webhook-deployment created certificate.cert-manager.io/admission-webhook-cert created issuer.cert-manager.io/admission-webhook-selfsigned-issuer created mutatingwebhookconfiguration.admissionregistration.k8s.io/admission-webhook-mutating-webhook-configuration created ``` Check if it is installed normally. ```bash kubectl get po -n kubeflow | grep admission-webhook ``` Wait until one pod is running. ```bash admission-webhook-deployment-667bd68d94-2hhrx 1/1 Running 0 11s ``` Install the Notebook controller. If done successfully, it will output as follows. 
deployment.apps/notebook-controller created ``` A CustomResourceDefinition.apiextensions.k8s.io/notebooks.kubeflow.org, ServiceAccount/notebook-controller-service-account, Role.rbac.authorization.k8s.io/notebook-controller-leader-election-role, ClusterRole.rbac.authorization.k8s.io/notebook-controller-kubeflow-notebooks-admin, ClusterRole.rbac.authorization.k8s.io/notebook-controller-kubeflow-notebooks-edit, ClusterRole.rbac.authorization.k8s.io/notebook-controller-kubeflow-notebooks-view, ClusterRole.rbac.authorization.k8s.io/notebook-controller-role, RoleBinding.rbac.authorization.k8s.io/notebook-controller-leader-election-rolebinding, ClusterRoleBinding.rbac.authorization.k8s.io/notebook-controller-role-binding, ConfigMap/notebook-controller-config-m Translation: Check if the installation was successful. Wait until one pod is running with the following command: kubectl get po -n kubeflow | grep notebook-controller. Translation: Install Jupyter Web App. If performed correctly, the following will be output. ``` Confirm that the installation was successful: configmap/jupyter-web-app-config-76844k4cd7 created configmap/jupyter-web-app-logos created configmap/jupyter-web-app-parameters-chmg88cm48 created service/jupyter-web-app-service created deployment.apps/jupyter-web-app-deployment created virtualservice.networking.istio.io/jupyter-web-app-jupyter-web-app created Wait until one pod is Running. English: We will install the Profile Controller. ```bash kustomize build apps/profiles/upstream/overlays/kubeflow | kubectl apply -f - ``` If performed normally, it will be outputted as follows. 
```bash customresourcedefinition.apiextensions.k8s.io/profiles.kubeflow.org created serviceaccount/profiles-controller-service-account created role.rbac.authorization.k8s.io/profiles-leader-election-role created rolebinding.rbac.authorization.k8s.io/profiles-leader-election-rolebinding created clusterrolebinding.rbac.authorization.k8s.io/profiles-cluster-role-binding created configmap/namespace-labels-data-48h7kd55mc created configmap/profiles-config-46c7tgh6fd created service/profiles-kfam created deployment.apps/profiles-deployment created virtualservice.networking.istio.io/profiles-kfam created ``` Check to see if it is installed normally. ```bash kubectl get po -n kubeflow | grep profiles-deployment ``` Wait until one pod is running. ```bash profiles-deployment-89f7d88b-qsnrd 2/2 Running 0 42s ``` Install the Volumes Web App. ```bash kustomize build apps/volumes-web-app/upstream/overlays/istio | kubectl apply -f - ``` If performed normally, it will be output as follows. ```bash serviceaccount/volumes-web-app-service-account created clusterrole.rbac.authorization.k8s.io/volumes-web-app-cluster-role created clusterrole.rbac.authorization.k8s.io/volumes-web-app-kubeflow-volume-ui-admin created clusterrole.rbac.authorization.k8s.io/volumes-web-app-kubeflow-volume-ui-edit created clusterrole.rbac.authorization.k8s.io/volumes-web-app-kubeflow-volume-ui-view created clusterrolebinding.rbac.authorization.k8s.io/volumes-web-app-cluster-role-binding created configmap/volumes-web-app-parameters-4gg8cm2gmk created service/volumes-web-app-service created deployment.apps/volumes-web-app-deployment created virtualservice.networking.istio.io/volumes-web-app-volumes-web-app created ``` Check if it is installed normally. ```bash kubectl get po -n kubeflow | grep volumes-web-app ``` Wait until one pod is running. ```bash volumes-web-app-deployment-8589d664cc-62svl 1/1 Running 0 27s ``` ```bash Install Tensorboard Web App. 
Service account/tensorboards-web-app-service-account created, Cluster role.rbac.authorization.k8s.io/tensorboards-web-app-cluster-role created, Cluster role.rbac.authorization.k8s.io/tensorboards-web-app-kubeflow-tensorboard-ui-admin created, Cluster role.rbac.authorization.k8s.io/tensorboards-web-app-kubeflow-tensorboard-ui-edit created, Cluster role.rbac.authorization.k8s.io/tensorboards-web-app-kubeflow-tensorboard-ui-view created, Cluster role binding.rbac.authorization.k8s.io/tensorboards-web-app-cluster-role-binding created, Config map/tensorboards-web-app-parameters-g28fbd6cch created, Service/tensorboards-web-app-service created, Deployment.apps/tensorboards-web-app-deployment created, and Virtual service.networking.istio.io/t Check if it is installed correctly. ```bash Deployment "tensorboard-web-app-deployment-6ff79b7f44-qbzmw" created deployment.apps/tensorboard-controller-controller-manager created ``` A custom resource definition for 'tensorboards.tensorboard.kubeflow.org' was created, along with a service account, roles, role bindings, a config map, and a deployment for the controller manager metrics service. Check if the deployment.apps/tensorboard-controller-controller-manager was installed correctly. Wait for 1 pod to be Running. Translation: Installing Training Operator. ```bash kustomize build apps/training-operator/upstream/overlays/kubeflow | kubectl apply -f - ``` If performed normally, it will be output as follows. 
```bash customresourcedefinition.apiextensions.k8s.io/mxjobs.kubeflow.org created customresourcedefinition.apiextensions.k8s.io/pytorchjobs.kubeflow.org created customresourcedefinition.apiextensions.k8s.io/tfjobs.kubeflow.org created customresourcedefinition.apiextensions.k8s.io/xgboostjobs.kubeflow.org created serviceaccount/training-operator created clusterrole.rbac.authorization.k8s.io/kubeflow-training-admin created clusterrole.rbac.authorization.k8s.io/kubeflow-training-edit created clusterrole.rbac.authorization.k8s.io/kubeflow-training-view created clusterrole.rbac.authorization.k8s.io/training-operator created clusterrolebinding.rbac.authorization.k8s.io/training-operator created service/training-operator created deployment.apps/training-operator created ``` Check to see if it has been installed normally. ```bash kubectl get po -n kubeflow | grep training-operator ``` Wait until one pod is up and running. ```bash training-operator-7d98f9dd88-6887f 1/1 Running 0 28s ``` ### User Namespace For using Kubeflow, create a Kubeflow Profile for the User to be used. ```bash kustomize build common/user-namespace/base | kubectl apply -f - ``` If performed normally, it will be outputted as follows. ```bash configmap/default-install-config-9h2h2b6hbk created profile.kubeflow.org/kubeflow-user-example-com created ``` Confirm that the kubeflow-user-example-com profile has been created. ```bash kubectl get profile ``` ```bash kubeflow-user-example-com 37s ``` ## Check installation Confirm successful installation by port forwarding to access Kubeflow central dashboard with web browser. ```bash kubectl port-forward svc/istio-ingressgateway -n istio-system 8080:80 ``` Open a web browser and connect to [http://localhost:8080](http://localhost:8080) to confirm that the following screen is displayed. ![login-ui](./img/login-after-install.png) Enter the following connection information to connect. 
- Email Address: `user@example.com` - Password: `12341234` ![central-dashboard](./img/after-login.png) ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/setup-components/install-components-mlflow.md ================================================ --- title : "2. MLflow Tracking Server" description: "구성요소 설치 - MLflow" sidebar_position: 2 date: 2021-12-13 lastmod: 2021-12-13 contributors: ["Jaeyeon Kim"] --- ## Install MLflow Tracking Server MLflow is a popular open-source ML experiment management tool. In addition to [experiment management](https://mlflow.org/docs/latest/tracking.html#tracking), MLflow provides functionalities for ML [model packaging](https://mlflow.org/docs/latest/projects.html#projects), [deployment management](https://mlflow.org/docs/latest/models.html#models), and [model storage](https://mlflow.org/docs/latest/model-registry.html#registry). In *MLOps for ALL*, we will be using MLflow for experiment management purposes. To store the data managed by MLflow and provide a user interface, we will deploy the MLflow Tracking Server on the Kubernetes cluster. ## Before Install MLflow Tracking Server ### Install PostgreSQL DB MLflow Tracking Server deploys a PostgreSQL DB for use as a Backend Store to a Kubernetes cluster. First, create a namespace called `mlflow-system`. ```bash kubectl create ns mlflow-system ``` If the following message is output, it means that it has been generated normally. ```bash namespace/mlflow-system created ``` Create a Postgresql DB in the `mlflow-system` namespace. ```bash kubectl -n mlflow-system apply -f https://raw.githubusercontent.com/mlops-for-all/helm-charts/b94b5fe4133f769c04b25068b98ccfa7a505aa60/mlflow/manifests/postgres.yaml ``` If performed normally, it will be outputted as follows. 
```bash service/postgresql-mlflow-service created deployment.apps/postgresql-mlflow created persistentvolumeclaim/postgresql-mlflow-pvc created ``` Wait until one postgresql related pod is running in the mlflow-system namespace. ```bash kubectl get pod -n mlflow-system | grep postgresql ``` If it is output similar to the following, it has executed normally. ```bash postgresql-mlflow-7b9bc8c79f-srkh7 1/1 Running 0 38s ``` ### Setup Minio We will utilize the Minio that was installed in the previous Kubeflow installation step. However, in order to separate it for kubeflow and mlflow purposes, we will create a mlflow-specific bucket. First, port-forward the minio-service to access Minio and create the bucket. ```bash kubectl port-forward svc/minio-service -n kubeflow 9000:9000 ``` Open a web browser and connect to [localhost:9000](http://localhost:9000) to display the following screen. ![minio-install](./img/minio-install.png) Enter the following credentials to log in: - Username: `minio` - Password: `minio123` Click the **`+`** button on the right side bottom, then click `Create Bucket`. ![create-bucket](./img/create-bucket.png) Enter `mlflow` in `Bucket Name` to create the bucket. If successfully created, you will see a bucket named `mlflow` on the left. ![mlflow-bucket](./img/mlflow-bucket.png) --- ## Let's Install MLflow Tracking Server ### Add Helm Repository ```bash helm repo add mlops-for-all https://mlops-for-all.github.io/helm-charts ``` If the following message is displayed, it means it has been added successfully. ```bash "mlops-for-all" has been added to your repositories ``` ### Update Helm Repository ```bash helm repo update ``` If the following message is displayed, it means that the update has been successfully completed. ```bash Hang tight while we grab the latest from your chart repositories... ...Successfully got an update from the "mlops-for-all" chart repository Update Complete. 
⎈Happy Helming!⎈ ``` ### Helm Install Install mlflow-server Helm Chart version 0.2.0. ```bash helm install mlflow-server mlops-for-all/mlflow-server \ --namespace mlflow-system \ --version 0.2.0 ``` - The above Helm chart installs MLflow with the connection information for its backend store and artifacts store set to the default minio created during the Kubeflow installation process and the postgresql information created from the [Install PostgreSQL DB](#install-postgresql-db) section above. - If you want to use a separate DB or object storage, please refer to the [Helm Chart Repo](https://github.com/mlops-for-all/helm-charts/tree/main/mlflow/chart) and set the values separately during helm install. The following message should be displayed: ```bash NAME: mlflow-server LAST DEPLOYED: Sat Dec 18 22:02:13 2021 NAMESPACE: mlflow-system STATUS: deployed REVISION: 1 TEST SUITE: None ``` Check to see if it was installed normally. ```bash kubectl get pod -n mlflow-system | grep mlflow-server ``` Wait until one mlflow-server related pod is running in the mlflow-system namespace. If it is output similar to the following, then it has been successfully executed. ```bash mlflow-server-ffd66d858-6hm62 1/1 Running 0 74s ``` ### Check installation Let's now check if we can successfully connect to the MLflow Server. First, we will perform port forwarding in order to connect from the client node. ```bash kubectl port-forward svc/mlflow-server-service -n mlflow-system 5000:5000 ``` Open a web browser and connect to [localhost:5000](http://localhost:5000) and the following screen will be output. ![mlflow-install](./img/mlflow-install.png) ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/setup-components/install-components-pg.md ================================================ --- title : "4. 
Prometheus & Grafana" description: "구성요소 설치 - Prometheus & Grafana" sidebar_position: 4 date: 2021-12-13 lastmod: 2021-12-13 contributors: ["Jaeyeon Kim"] --- ## Prometheus & Grafana Prometheus and Grafana are tools for monitoring. For stable service operation, it is necessary to continuously observe the status of the service and infrastructure where the service is operating, and to respond quickly based on the observed metrics when a problem arises. Among the many tools to efficiently perform such monitoring, *MLOps for ALL* will use open source Prometheus and Grafana. For more information, please refer to the [Prometheus Official Documentation](https://prometheus.io/docs/introduction/overview/) and [Grafana Official Documentation](https://grafana.com/docs/). Prometheus is a tool to collect metrics from various targets, and Grafana is a tool to help visualize the gathered data. Although there is no dependency between them, they are often used together, complementing each other. In this page, we will install Prometheus and Grafana on a Kubernetes cluster, then send API requests to a SeldonDeployment created with Seldon-Core and check if metrics are collected successfully. We also install a dashboard to efficiently monitor the metrics of the SeldonDeployment created in Seldon-Core using Helm Chart version 1.12.0 from seldonio/seldon-core-analytics Helm Repository. ### Add Helm Repository ```bash helm repo add seldonio https://storage.googleapis.com/seldon-charts ``` If the following message is output, it means that it has been added successfully. ```bash "seldonio" has been added to your repositories ``` ### Update Helm Repository ```bash helm repo update ``` If the following message is displayed, it means that the update was successful. ```bash Hang tight while we grab the latest from your chart repositories... ...Successfully got an update from the "seldonio" chart repository ...Successfully got an update from the "datawire" chart repository Update Complete. 
⎈Happy Helming!⎈ ``` ### Helm Install Install version 1.12.0 of the seldon-core-analytics Helm Chart. ```bash helm install seldon-core-analytics seldonio/seldon-core-analytics \ --namespace seldon-system \ --version 1.12.0 ``` The following message should be output. ```bash Skip... NAME: seldon-core-analytics LAST DEPLOYED: Tue Dec 14 18:29:38 2021 NAMESPACE: seldon-system STATUS: deployed REVISION: 1 ``` Check to see if it was installed normally. ```bash kubectl get pod -n seldon-system | grep seldon-core-analytics ``` Wait until 6 seldon-core-analytics related pods are Running in the seldon-system namespace. ```bash seldon-core-analytics-grafana-657c956c88-ng8wn 2/2 Running 0 114s seldon-core-analytics-kube-state-metrics-94bb6cb9-svs82 1/1 Running 0 114s seldon-core-analytics-prometheus-alertmanager-64cf7b8f5-nxbl8 2/2 Running 0 114s seldon-core-analytics-prometheus-node-exporter-5rrj5 1/1 Running 0 114s seldon-core-analytics-prometheus-pushgateway-8476474cff-sr4n6 1/1 Running 0 114s seldon-core-analytics-prometheus-seldon-685c664894-7cr45 2/2 Running 0 114s ``` ### Check installation Let's now check if we can connect to Grafana normally. First, we will port forward to connect to the client node. ```bash kubectl port-forward svc/seldon-core-analytics-grafana -n seldon-system 8090:80 ``` Open the web browser and connect to [localhost:8090](http://localhost:8090), then the following screen will be displayed. ![grafana-install](./img/grafana-install.png) Enter the following connection information to connect. - Email or username: `admin` - Password: `password` When you log in, the following screen will be displayed. ![grafana-login](./img/grafana-login.png) Click the dashboard icon on the left and click the `Manage` button. ![dashboard-click](./img/dashboard-click.png) You can see that the basic Grafana dashboard is included. Click the `Prediction Analytics` dashboard among them. 
![dashboard](./img/dashboard.png) The Seldon Core API Dashboard is visible and can be confirmed with the following output. ![seldon-dashboard](./img/seldon-dashboard.png) ## References - [Seldon-Core-Analytics Helm Chart](https://github.com/SeldonIO/seldon-core/tree/master/helm-charts/seldon-core-analytics) ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/setup-components/install-components-seldon.md ================================================ --- title : "3. Seldon-Core" description: "구성요소 설치 - Seldon-Core" sidebar_position: 3 date: 2021-12-13 lastmod: 2021-12-13 contributors: ["Jaeyeon Kim"] --- ## Seldon-Core Seldon-Core is one of the open source frameworks that can deploy and manage numerous machine learning models in Kubernetes environments. For more details, please refer to the official [product description page](https://www.seldon.io/tech/products/core/) and [GitHub](https://github.com/SeldonIO/seldon-core) of Seldon-Core and API Deployment part. ## Installing Seldon-Core In order to use Seldon-Core, modules such as Ambassador, which is responsible for Ingress of Kubernetes, and Istio are required (see [here](https://docs.seldon.io/projects/seldon-core/en/latest/workflow/install.html)). Seldon-Core officially supports only Ambassador and Istio, and *MLOps for ALL* will use Ambassador with Seldon-Core, so we will install Ambassador. ### Adding Ambassador to the Helm Repository ```bash helm repo add datawire https://www.getambassador.io ``` If the following message is displayed, it means it has been added normally. ```bash "datawire" has been added to your repositories ``` ### Update Ambassador - Helm Repository ```bash helm repo update ``` If the following message is output, it means that the update has been completed normally. ```bash Hang tight while we grab the latest from your chart repositories... ...Successfully got an update from the "datawire" chart repository Update Complete. 
⎈Happy Helming!⎈ ``` ### Ambassador - Helm Install Install version 6.9.3 of the Ambassador Chart. ```bash helm install ambassador datawire/ambassador \ --namespace seldon-system \ --create-namespace \ --set image.repository=quay.io/datawire/ambassador \ --set enableAES=false \ --set crds.keep=false \ --version 6.9.3 ``` The following message should be displayed. ```bash 생략... W1206 17:01:36.026326 26635 warnings.go:70] rbac.authorization.k8s.io/v1beta1 Role is deprecated in v1.17+, unavailable in v1.22+; use rbac.authorization.k8s.io/v1 Role W1206 17:01:36.029764 26635 warnings.go:70] rbac.authorization.k8s.io/v1beta1 RoleBinding is deprecated in v1.17+, unavailable in v1.22+; use rbac.authorization.k8s.io/v1 RoleBinding NAME: ambassador LAST DEPLOYED: Mon Dec 6 17:01:34 2021 NAMESPACE: seldon-system STATUS: deployed REVISION: 1 NOTES: ------------------------------------------------------------------------------- Congratulations! You've successfully installed Ambassador! ------------------------------------------------------------------------------- To get the IP address of Ambassador, run the following commands: NOTE: It may take a few minutes for the LoadBalancer IP to be available. You can watch the status of by running 'kubectl get svc -w --namespace seldon-system ambassador' On GKE/Azure: export SERVICE_IP=$(kubectl get svc --namespace seldon-system ambassador -o jsonpath='{.status.loadBalancer.ingress[0].ip}') On AWS: export SERVICE_IP=$(kubectl get svc --namespace seldon-system ambassador -o jsonpath='{.status.loadBalancer.ingress[0].hostname}') echo http://$SERVICE_IP: For help, visit our Slack at http://a8r.io/Slack or view the documentation online at https://www.getambassador.io. ``` Wait until four pods become running in the seldon-system. 
```bash kubectl get pod -n seldon-system ``` ```bash ambassador-7f596c8b57-4s9xh 1/1 Running 0 7m15s ambassador-7f596c8b57-dt6lr 1/1 Running 0 7m15s ambassador-7f596c8b57-h5l6f 1/1 Running 0 7m15s ambassador-agent-77bccdfcd5-d5jxj 1/1 Running 0 7m15s ``` ### Seldon-Core - Helm Install Install version 1.11.2 of the seldon-core-operator Chart. ```bash helm install seldon-core seldon-core-operator \ --repo https://storage.googleapis.com/seldon-charts \ --namespace seldon-system \ --set usageMetrics.enabled=true \ --set ambassador.enabled=true \ --version 1.11.2 ``` The following message should be displayed. ```bash Skip... W1206 17:05:38.336391 28181 warnings.go:70] admissionregistration.k8s.io/v1beta1 ValidatingWebhookConfiguration is deprecated in v1.16+, unavailable in v1.22+; use admissionregistration.k8s.io/v1 ValidatingWebhookConfiguration NAME: seldon-core LAST DEPLOYED: Mon Dec 6 17:05:34 2021 NAMESPACE: seldon-system STATUS: deployed REVISION: 1 TEST SUITE: None ``` Wait until one seldon-controller-manager pod is Running in the seldon-system namespace. ```bash kubectl get pod -n seldon-system | grep seldon-controller ``` ```bash seldon-controller-manager-8457b8b5c7-r2frm 1/1 Running 0 2m22s ``` ## References - [Example Model Servers with Seldon](https://docs.seldon.io/projects/seldon-core/en/latest/examples/server_examples.html#examples-server-examples--page-root) ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/setup-kubernetes/_category_.json ================================================ { "label": "Setup Kubernetes", "position": 2, "link": { "type": "generated-index" } } ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/setup-kubernetes/install-kubernetes/_category_.json ================================================ { "label": "4. 
Install Kubernetes", "position": 4, "link": { "type": "generated-index" } } ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/setup-kubernetes/install-kubernetes/kubernetes-with-k3s.md ================================================ --- title: "4.1. K3s" description: "" sidebar_position: 1 date: 2021-12-13 lastmod: 2021-12-20 draft: false weight: 221 contributors: ["Jongseob Jeon"] menu: docs: parent: "../setup-kubernetes" images: [] --- ## 1. Prerequisite Before setting up a Kubernetes cluster, install the necessary components on the **cluster**. Please refer to [Install Prerequisite](../../setup-kubernetes/install-prerequisite.md) to install the necessary components on the **cluster** before installing Kubernetes. k3s uses containerd as the backend by default. However, we need to use docker as the backend to use GPU, so we will install the backend with the `--docker` option. ```bash curl -sfL https://get.k3s.io | INSTALL_K3S_VERSION=v1.21.7+k3s1 sh -s - server --disable traefik --disable servicelb --disable local-storage --docker ``` After installing k3s, check the k3s config. ```bash sudo cat /etc/rancher/k3s/k3s.yaml ``` If installed correctly, the following items will be output. (Security related keys are hidden with <...>.) ```bash apiVersion: v1 clusters: - cluster: certificate-authority-data: <...> server: https://127.0.0.1:6443 name: default contexts: - context: cluster: default user: default name: default current-context: default kind: Config preferences: {} users: - name: default user: client-certificate-data: <...> client-key-data: <...> ``` ## 2. Setup Kubernetes Cluster Set up the Kubernetes cluster by copying the k3s config to be used as the cluster’s kubeconfig. ```bash mkdir .kube sudo cp /etc/rancher/k3s/k3s.yaml .kube/config ``` Grant user access permission to the copied config file. ```bash sudo chown $USER:$USER .kube/config ``` ## 3. 
Setup Kubernetes Client Now move the kubeconfig configured in the cluster to the local. Set the path to `~/.kube/config` on the local. The config file copied at first has the server ip set to `https://127.0.0.1:6443`. Modify this value to match the ip of the cluster. (We modified it to `https://192.168.0.19:6443` to match the ip of the cluster used in this page.) ```bash apiVersion: v1 clusters: - cluster: certificate-authority-data: <...> server: https://192.168.0.19:6443 name: default contexts: - context: cluster: default user: default name: default current-context: default kind: Config preferences: {} users: - name: default user: client-certificate-data: <...> client-key-data: <...> ``` ## 4. Install Kubernetes Default Modules Please refer to [Setup Kubernetes Modules](../../setup-kubernetes/install-kubernetes-module.md) to install the following components: - helm - kustomize - CSI plugin - [Optional] nvidia-docker, nvidia-device-plugin ## 5. Verify Successful Installation Finally, check if the nodes are Ready and verify the OS, Docker, and Kubernetes versions. ```bash kubectl get nodes -o wide ``` If you see the following message, it means that the installation was successful. ```bash NAME STATUS ROLES AGE VERSION INTERNAL-IP EXTERNAL-IP OS-IMAGE KERNEL-VERSION CONTAINER-RUNTIME ubuntu Ready control-plane,master 11m v1.21.7+k3s1 192.168.0.19 Ubuntu 20.04.3 LTS 5.4.0-91-generic docker://20.10.11 ``` ## 6. References - [https://rancher.com/docs/k3s/latest/en/installation/install-options/](https://rancher.com/docs/k3s/latest/en/installation/install-options/) ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/setup-kubernetes/install-kubernetes/kubernetes-with-kubeadm.md ================================================ --- title: "4.3. Kubeadm" description: "" sidebar_position: 3 date: 2021-12-13 lastmod: 2021-12-20 contributors: ["Youngcheol Jang"] --- ## 1. 
Prerequisite Before building a Kubernetes cluster, install the necessary components to the **cluster**. Please refer to [Install Prerequisite](../../setup-kubernetes/install-prerequisite.md) and install the necessary components to the **cluster**. Change the configuration of the network for Kubernetes. ```bash sudo modprobe br_netfilter cat < Ubuntu 20.04.3 LTS 5.4.0-91-generic docker://20.10.11 ``` ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/setup-kubernetes/install-kubernetes-module.md ================================================ --- title: "5. Install Kubernetes Modules" description: "Install Helm, Kustomize" sidebar_position: 5 date: 2021-12-13 lastmod: 2021-12-20 contributors: ["Jaeyeon Kim"] --- ## Setup Kubernetes Modules On this page, we will explain how to install the modules that will be used on the cluster from the client nodes. All the processes introduced here will be done on the **client nodes**. ## Helm Helm is one of the package management tools that helps to deploy and manage resources related to Kubernetes packages at once. 1. Download Helm version 3.7.1 into the current folder. - For Linux amd64 ```bash wget https://get.helm.sh/helm-v3.7.1-linux-amd64.tar.gz ``` - Other OS refer to the [official website](https://github.com/helm/helm/releases/tag/v3.7.1) for the download path of the binary that matches the OS and CPU of your client node. 2. Unzip the file to use helm and move the file to its desired location. ```bash tar -zxvf helm-v3.7.1-linux-amd64.tar.gz sudo mv linux-amd64/helm /usr/local/bin/helm ``` 3. Check to see if the installation was successful: ```bash helm help ``` If you see the following message, it means that it has been installed normally. 
```bash The Kubernetes package manager Common actions for Helm: - helm search: search for charts - helm pull: download a chart to your local directory to view - helm install: upload the chart to Kubernetes - helm list: list releases of charts Environment variables: | Name | Description | |--------------------------|---------------------------------------------------------------------| | $HELM_CACHE_HOME | set an alternative location for storing cached files. | | $HELM_CONFIG_HOME | set an alternative location for storing Helm configuration. | | $HELM_DATA_HOME | set an alternative location for storing Helm data. | ... ``` ## Kustomize Kustomize is one of the package management tools that helps to deploy and manage multiple Kubernetes resources at once. 1. Download the binary version of kustomize v3.10.0 in the current folder. - For Linux amd64 ```bash wget https://github.com/kubernetes-sigs/kustomize/releases/download/kustomize%2Fv3.10.0/kustomize_v3.10.0_linux_amd64.tar.gz ``` - Other OS can be downloaded from [kustomize/v3.10.0](https://github.com/kubernetes-sigs/kustomize/releases/tag/kustomize%2Fv3.10.0) after checking. 2. Unzip to use kustomize, and change the file location. ```bash tar -zxvf kustomize_v3.10.0_linux_amd64.tar.gz sudo mv kustomize /usr/local/bin/kustomize ``` 3. Check if it is installed correctly. ```bash kustomize help ``` If you see the following message, it means that it has been installed normally. ```bash Manages declarative configuration of Kubernetes. See https://sigs.k8s.io/kustomize Usage: kustomize [command] Available Commands: build Print configuration per contents of kustomization.yaml cfg Commands for reading and writing configuration. completion Generate shell completion script create Create a new kustomization in the current directory edit Edits a kustomization file fn Commands for running functions against configuration. ... ``` ## CSI Plugin : Local Path Provisioner 1. 
The CSI Plugin is a module that is responsible for storage within Kubernetes. Install the CSI Plugin, Local Path Provisioner, which is easy to use in single node clusters. ```bash kubectl apply -f https://raw.githubusercontent.com/rancher/local-path-provisioner/v0.0.20/deploy/local-path-storage.yaml ``` If you see the following messages, it means that the installation was successful: ```bash namespace/local-path-storage created serviceaccount/local-path-provisioner-service-account created clusterrole.rbac.authorization.k8s.io/local-path-provisioner-role created clusterrolebinding.rbac.authorization.k8s.io/local-path-provisioner-bind created deployment.apps/local-path-provisioner created storageclass.storage.k8s.io/local-path created configmap/local-path-config created ``` 2. Also, check if the provisioner pod in the local-path-storage namespace is Running by executing the following command: ```bash kubectl -n local-path-storage get pod ``` If successful, it will display the following output: ```bash NAME READY STATUS RESTARTS AGE local-path-provisioner-d744ccf98-xfcbk 1/1 Running 0 7m ``` 3. Execute the following command to change the default storage class: ```bash kubectl patch storageclass local-path -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}' ``` If the command is successful, the following output will be displayed: ```bash storageclass.storage.k8s.io/local-path patched ``` 4. Verify that the default storage class has been set: ```bash kubectl get sc ``` Check if there is a storage class with the name `local-path (default)` in the NAME column: ```bash NAME PROVISIONER RECLAIMPOLICY VOLUMEBINDINGMODE ALLOWVOLUMEEXPANSION AGE local-path (default) rancher.io/local-path Delete WaitForFirstConsumer false 2h ``` ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/setup-kubernetes/install-prerequisite.md ================================================ --- title: "3. 
Install Prerequisite" description: "Install docker" sidebar_position: 3 date: 2021-12-13 lastmod: 2021-12-20 contributors: ["Jaeyeon Kim", "Jongsun Shinn", "Sangwoo Shim"] --- On this page, we describe the components that need to be installed or configured on the **Cluster** and **Client** prior to installing Kubernetes. ## Install apt packages In order to enable smooth communication between the Client and the Cluster, Port-Forwarding needs to be performed. To enable Port-Forwarding, the following packages need to be installed on the **Cluster**. ```bash sudo apt-get update sudo apt-get install -y socat ``` ## Install Docker 1. Install apt packages for docker. ```bash sudo apt-get update && sudo apt-get install -y ca-certificates curl gnupg lsb-release ``` 2. add docker official GPG key. ```bash curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg ``` 3. When installing Docker using the apt package manager, configure it to retrieve from the stable repository: ```bash echo \ "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu \ $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null ``` 4. Check the currently available Docker versions for installation: ```bash sudo apt-get update && apt-cache madison docker-ce ``` Verify if the version `5:20.10.11~3-0~ubuntu-focal` is listed among the output: ```bash apt-cache madison docker-ce | grep 5:20.10.11~3-0~ubuntu-focal ``` If the addition was successful, the following output will be displayed: ```bash docker-ce | 5:20.10.11~3-0~ubuntu-focal | https://download.docker.com/linux/ubuntu focal/stable amd64 Packages ``` 5. Install Docker version `5:20.10.11~3-0~ubuntu-focal`: ```bash sudo apt-get install -y containerd.io docker-ce=5:20.10.11~3-0~ubuntu-focal docker-ce-cli=5:20.10.11~3-0~ubuntu-focal ``` 6. Check docker is installed. 
```bash sudo docker run hello-world ``` If added successfully, it will output as follows: ```bash mlops@ubuntu:~$ sudo docker run hello-world Hello from Docker! This message shows that your installation appears to be working correctly. To generate this message, Docker took the following steps: 1. The Docker client contacted the Docker daemon. 2. The Docker daemon pulled the "hello-world" image from the Docker Hub. (amd64) 3. The Docker daemon created a new container from that image which runs the executable that produces the output you are currently reading. 4. The Docker daemon streamed that output to the Docker client, which sent it to your terminal. To try something more ambitious, you can run an Ubuntu container with: $ docker run -it ubuntu bash Share images, automate workflows, and more with a free Docker ID: https://hub.docker.com/ For more examples and ideas, visit: https://docs.docker.com/get-started/ ``` 7. Add permissions to use Docker commands without the `sudo` keyword by executing the following commands: ```bash sudo groupadd docker sudo usermod -aG docker $USER newgrp docker ``` 8. To verify that you can now use Docker commands without `sudo`, run the `docker run` command again: ```bash docker run hello-world ``` If you see the following message after executing the command, it means that the permissions have been successfully added: ```bash mlops@ubuntu:~$ docker run hello-world Hello from Docker! This message shows that your installation appears to be working correctly. To generate this message, Docker took the following steps: 1. The Docker client contacted the Docker daemon. 2. The Docker daemon pulled the "hello-world" image from the Docker Hub. (amd64) 3. The Docker daemon created a new container from that image which runs the executable that produces the output you are currently reading. 4. The Docker daemon streamed that output to the Docker client, which sent it to your terminal. 
To try something more ambitious, you can run an Ubuntu container with: $ docker run -it ubuntu bash Share images, automate workflows, and more with a free Docker ID: https://hub.docker.com/ For more examples and ideas, visit: https://docs.docker.com/get-started/ ``` ## Turn off Swap Memory In order for kubelet to work properly, **cluster** nodes must turn off the virtual memory called swap. The following command turns off the swap. **(When using cluster and client on the same desktop, turning off swap memory may result in a slowdown in speed)** ```bash sudo sed -i '/ swap / s/^\(.*\)$/#\1/g' /etc/fstab sudo swapoff -a ``` ## Install Kubectl kubectl is a client tool used to make API requests to a Kubernetes cluster. It needs to be installed on the client node. 1. Download kubectl version v1.21.7 to the current folder: ```bash curl -LO https://dl.k8s.io/release/v1.21.7/bin/linux/amd64/kubectl ``` 2. Change the file permissions and move it to the appropriate location to make kubectl executable: ```bash sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl ``` 3. Verify that kubectl is installed correctly: ```bash kubectl version --client ``` If you see the following message, it means that kubectl is installed successfully: ```bash Client Version: version.Info{Major:"1", Minor:"21", GitVersion:"v1.21.7", GitCommit:"1f86634ff08f37e54e8bfcd86bc90b61c98f84d4", GitTreeState:"clean", BuildDate:"2021-11-17T14:41:19Z", GoVersion:"go1.16.10", Compiler:"gc", Platform:"linux/amd64"} ``` 4. 
If you work with multiple Kubernetes clusters and need to manage multiple kubeconfig files or kube-contexts efficiently, you can refer to the following resources: - [Configuring Multiple kubeconfig on Your Machine](https://dev.to/aabiseverywhere/configuring-multiple-kubeconfig-on-your-machine-59eo) - [kubectx - Switch between Kubernetes contexts easily](https://github.com/ahmetb/kubectx) ## References - [Install Docker Engine on Ubuntu](https://docs.docker.com/engine/install/ubuntu/) - [Install and Set Up kubectl on Linux](https://kubernetes.io/docs/tasks/tools/install-kubectl-linux/) ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/setup-kubernetes/intro.md ================================================ --- title: "1. Introduction" description: "Setup Introduction" sidebar_position: 1 date: 2021-12-13 lastmod: 2021-12-13 contributors: ["Jaeyeon Kim", "Jongsun Shinn", "Youngdon Tae", "SeungTae Kim"] --- ## Build MLOps System The biggest barrier when studying MLOps is the difficulty of setting up and using an MLOps system. Using public cloud platforms like AWS or GCP, or commercial tools like Weights & Biases or neptune.ai, can be costly, and starting from scratch to build the entire environment can be overwhelming and confusing. To address these challenges and help those who haven't been able to start with MLOps, *MLOps for ALL* will guide you on how to build and use an MLOps system from scratch, requiring only a desktop with Ubuntu installed. For those who cannot prepare a Ubuntu desktop environment, use virtual machines to set up the environment. > If you are using Windows or an Intel-based Mac for the *MLOps for ALL* practical exercises, you can prepare an Ubuntu desktop environment using virtual machine software such as VirtualBox or VMware. Please make sure to meet the recommended specifications when creating the virtual machine. 
> However, for those using an M1 Mac, as of the date of writing (February 2022), VirtualBox and VMware are not available. ([Check if macOS apps are optimized for M1 Apple Silicon Mac](https://isapplesiliconready.com/kr)) > Therefore, if you are not using a cloud environment, you can install UTM, Virtual machines for Mac, to use virtual machines. > (Purchasing and downloading software from the App Store is a form of donation-based payment. The free version is sufficient as it only differs in automatic updates.) > This virtual machine software supports the *Ubuntu 20.04.3 LTS* practice operating system, enabling you to perform the exercises on an M1 Mac. However, since it is not possible to use all the elements described in the [Components of MLOps](../introduction/component.md), *MLOps for ALL* will mainly focus on installing the representative open source software and connecting them to each other. It is not meant that installing open source software in *MLOps for ALL* is a standard, and we recommend choosing the appropriate tool that fits your situation. ## Components The components of the MLOps system that we will make in this article and each version have been verified in the following environment. To facilitate smooth testing, I will explain the setup of the **Cluster** and **Client** as separate entities. The **Cluster** refers to a single desktop with Ubuntu installed. The **Client** is recommended to be a different desktop, such as a laptop or another desktop with access to the Cluster or Kubernetes installation. However, if you only have one machine available, you can use the same desktop for both Cluster and Client purposes. ### Cluster #### 1. Software Below is the list of software that needs to be installed on the Cluster: | Software | Version | | --------------- | ----------- | | Ubuntu | 20.04.3 LTS | | Docker (Server) | 20.10.11 | | NVIDIA Driver | 470.86 | | Kubernetes | v1.21.7 | | Kubeflow | v1.4.0 | | MLFlow | v1.21.0 | #### 2. 
Helm Chart Below is the list of third-party software that needs to be installed using Helm: | Helm Chart Repo Name | Version | | ----------------------------- | ------- | | datawire/ambassador | 6.9.3 | | seldonio/seldon-core-operator | 1.11.2 | ### Client The Client has been validated on MacOS (Intel CPU) and Ubuntu 20.04. | Software | Version | | --------------- | ----------| | kubectl | v1.21.7 | | helm | v3.7.1 | | kustomize | v3.10.0 | ### Minimum System Requirements It is recommended that the Cluster meet the following specifications, which are dependent on the recommended specifications for Kubernetes and Kubeflow: - CPU: 6 cores - RAM: 12GB - DISK: 50GB - GPU: NVIDIA GPU (optional) ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/setup-kubernetes/kubernetes.md ================================================ --- title : "2. Setup Kubernetes" description: "Setup Kubernetes" sidebar_position: 2 date: 2021-12-13 lastmod: 2021-12-13 contributors: ["Jaeyeon Kim"] --- ## Setup Kubernetes Cluster For those learning Kubernetes for the first time, the first barrier to entry is setting up a Kubernetes practice environment. The official tool that supports building a production-level Kubernetes cluster is kubeadm, but there are also tools such as kubespray and kops that help users set up more easily, and tools such as k3s, minikube, microk8s, and kind that help you set up a compact Kubernetes cluster easily for learning purposes. Each tool has its own advantages and disadvantages, so considering the preferences of each user, this article will use three tools: kubeadm, k3s, and minikube to set up a Kubernetes cluster. For detailed comparisons of each tool, please refer to the official Kubernetes [documentation](https://kubernetes.io/ko/docs/tasks/tools/). *MLOps for ALL* recommends **k3s** as a tool that is easy to use when setting up a Kubernetes cluster. 
If you want to use all the features of Kubernetes and configure the nodes, we recommend **kubeadm**. **minikube** has the advantage of being able to easily install other Kubernetes in an add-on format, in addition to the components we describe. In this *MLOps for ALL*, in order to use the components that will be built for MLOps smoothly, there are additional settings that must be configured when building the Kubernetes cluster using each of the tools. The scope of this **Setup Kubernetes** section is to build a k8s cluster on a desktop that already has Ubuntu OS installed and to confirm that external client nodes can access the Kubernetes cluster. The detailed setup procedure is composed of the following flow, as each of the three tools has its own setup procedure. ```bash 3. Setup Prerequisite 4. Setup Kubernetes 4.1. with k3s 4.2. with minikube 4.3. with kubeadm 5. Setup Kubernetes Modules ``` Let's now build a Kubernetes cluster by using each of the tools. You don't have to use all the tools, and you can use the tools that you are familiar with. ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current/setup-kubernetes/setup-nvidia-gpu.md ================================================ --- title: "6. (Optional) Setup GPU" description: "Install nvidia docker, nvidia device plugin" sidebar_position: 6 date: 2021-12-13 lastmod: 2021-12-13 contributors: ["Jaeyeon Kim"] --- For using GPU in Kubernetes and Kubeflow, the following tasks are required. ## 1. Install NVIDIA Driver If the following screen is output when executing `nvidia-smi`, please omit this step. ```bash mlops@ubuntu:~$ nvidia-smi +-----------------------------------------------------------------------------+ | NVIDIA-SMI 470.86 Driver Version: 470.86 CUDA Version: 11.4 | |-------------------------------+----------------------+----------------------+ | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. 
ECC | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |===============================+======================+======================| | 0 NVIDIA GeForce ... Off | 00000000:01:00.0 Off | N/A | | 25% 32C P8 4W / 120W | 211MiB / 6078MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ | 1 NVIDIA GeForce ... Off | 00000000:02:00.0 Off | N/A | | 0% 34C P8 7W / 175W | 5MiB / 7982MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ +-----------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=============================================================================| | 0 N/A N/A 1644 G /usr/lib/xorg/Xorg 198MiB | | 0 N/A N/A 1893 G /usr/bin/gnome-shell 10MiB | | 1 N/A N/A 1644 G /usr/lib/xorg/Xorg 4MiB | +-----------------------------------------------------------------------------+ ``` If the output of nvidia-smi is not as above, please install the nvidia driver that fits your installed GPU. If you are not familiar with the installation of nvidia drivers, please install it through the following command. ```bash sudo add-apt-repository ppa:graphics-drivers/ppa sudo apt update && sudo apt install -y ubuntu-drivers-common sudo ubuntu-drivers autoinstall sudo reboot ``` ## 2. Install NVIDIA-Docker. Let's install NVIDIA-Docker. ```bash curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | \ sudo apt-key add - distribution=$(. /etc/os-release;echo $ID$VERSION_ID) curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list sudo apt-get update sudo apt-get install -y nvidia-docker2 && sudo systemctl restart docker ``` To check if it is installed correctly, we will run the docker container using the GPU. 
```bash sudo docker run --rm --gpus all nvidia/cuda:11.0-base nvidia-smi ``` If the following message appears, it means that the installation was successful: ```bash mlops@ubuntu:~$ sudo docker run --rm --gpus all nvidia/cuda:11.0-base nvidia-smi +-----------------------------------------------------------------------------+ | NVIDIA-SMI 470.86 Driver Version: 470.86 CUDA Version: 11.4 | |-------------------------------+----------------------+----------------------+ | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |===============================+======================+======================| | 0 NVIDIA GeForce ... Off | 00000000:01:00.0 Off | N/A | | 25% 32C P8 4W / 120W | 211MiB / 6078MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ | 1 NVIDIA GeForce ... Off | 00000000:02:00.0 Off | N/A | | 0% 34C P8 6W / 175W | 5MiB / 7982MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ +-----------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=============================================================================| +-----------------------------------------------------------------------------+ ``` ## 3. Setting NVIDIA-Docker as the Default Container Runtime By default, Kubernetes uses Docker-CE as the default container runtime. To use NVIDIA GPU within Docker containers, you need to configure NVIDIA-Docker as the container runtime and modify the default runtime for creating pods. 1. Open the `/etc/docker/daemon.json` file and make the following modifications: ```bash sudo vi /etc/docker/daemon.json { "default-runtime": "nvidia", "runtimes": { "nvidia": { "path": "nvidia-container-runtime", "runtimeArgs": [] } } } ``` 2. 
After confirming the file changes, restart Docker. ```bash sudo systemctl daemon-reload sudo service docker restart ``` 3. Verify that the changes have been applied. ```bash sudo docker info | grep nvidia ``` If you see the following message, it means that the installation was successful. ```bash mlops@ubuntu:~$ docker info | grep nvidia Runtimes: io.containerd.runc.v2 io.containerd.runtime.v1.linux nvidia runc Default Runtime: nvidia ``` ## 4. Nvidia-Device-Plugin 1. Create the nvidia-device-plugin daemonset. ```bash kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.10.0/nvidia-device-plugin.yml ``` 2. Verify that the nvidia-device-plugin pod is in the RUNNING state. ```bash kubectl get pod -n kube-system | grep nvidia ``` You should see the following output: ```bash kube-system nvidia-device-plugin-daemonset-nlqh2 1/1 Running 0 1h ``` 3. Verify that the nodes have been configured to have GPUs available. ```bash kubectl get nodes "-o=custom-columns=NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu" ``` If you see the following message, it means that the configuration was successful. (*In the *MLOps for ALL* tutorial cluster, there are two GPUs, so the output is 2. If the output shows the correct number of GPUs for your cluster, it is fine.) ```bash NAME GPU ubuntu 2 ``` If it is not configured, the GPU value will be displayed as ``. 
================================================ FILE: i18n/en/docusaurus-plugin-content-docs/current.json ================================================ { "version.label": { "message": "Next", "description": "The label for version current" }, "sidebar.tutorialSidebar.category.Introduction": { "message": "Introduction", "description": "The label for category Introduction in sidebar tutorialSidebar" }, "sidebar.tutorialSidebar.category.Prerequisites": { "message": "Prerequisites", "description": "The label for category Prerequisites in sidebar tutorialSidebar" }, "sidebar.tutorialSidebar.category.Docker": { "message": "Docker", "description": "The label for category Docker in sidebar tutorialSidebar" }, "sidebar.tutorialSidebar.category.Setup Kubernetes": { "message": "Setup Kubernetes", "description": "The label for category Setup Kubernetes in sidebar tutorialSidebar" }, "sidebar.tutorialSidebar.category.4. Install Kubernetes": { "message": "4. Install Kubernetes", "description": "The label for category 4. 
Install Kubernetes in sidebar tutorialSidebar" }, "sidebar.tutorialSidebar.category.Setup Components": { "message": "Setup Components", "description": "The label for category Setup Components in sidebar tutorialSidebar" }, "sidebar.tutorialSidebar.category.Kubeflow UI Guide": { "message": "Kubeflow UI Guide", "description": "The label for category Kubeflow UI Guide in sidebar tutorialSidebar" }, "sidebar.tutorialSidebar.category.Kubeflow": { "message": "Kubeflow", "description": "The label for category Kubeflow in sidebar tutorialSidebar" }, "sidebar.tutorialSidebar.category.API Deployment": { "message": "API Deployment", "description": "The label for category API Deployment in sidebar tutorialSidebar" }, "sidebar.tutorialSidebar.category.Further Readings": { "message": "Further Readings", "description": "The label for category Further Readings in sidebar tutorialSidebar" }, "sidebar.tutorialSidebar.category.Appendix": { "message": "Appendix", "description": "The label for category Appendix in sidebar tutorialSidebar" }, "sidebar.preSidebar.category.Docker": { "message": "Docker", "description": "The label for category Docker in sidebar preSidebar" } } ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/api-deployment/_category_.json ================================================ { "label": "API Deployment", "position": 7, "link": { "type": "generated-index" } } ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/api-deployment/seldon-children.md ================================================ --- title : "6. Multi Models" description: "" sidebar_position: 6 contributors: ["Jongseob Jeon"] --- Previously, the methods explained were all targeted at a single model. On this page, we will look at how to connect multiple models. First, we will create a pipeline that creates two models. We will add a StandardScaler to the SVC model we used before and store it. 
```python from functools import partial import kfp from kfp.components import InputPath, OutputPath, create_component_from_func @partial( create_component_from_func, packages_to_install=["pandas", "scikit-learn"], ) def load_iris_data( data_path: OutputPath("csv"), target_path: OutputPath("csv"), ): import pandas as pd from sklearn.datasets import load_iris iris = load_iris() data = pd.DataFrame(iris["data"], columns=iris["feature_names"]) target = pd.DataFrame(iris["target"], columns=["target"]) data.to_csv(data_path, index=False) target.to_csv(target_path, index=False) @partial( create_component_from_func, packages_to_install=["dill", "pandas", "scikit-learn", "mlflow"], ) def train_scaler_from_csv( data_path: InputPath("csv"), scaled_data_path: OutputPath("csv"), model_path: OutputPath("dill"), input_example_path: OutputPath("dill"), signature_path: OutputPath("dill"), conda_env_path: OutputPath("dill"), ): import dill import pandas as pd from sklearn.preprocessing import StandardScaler from mlflow.models.signature import infer_signature from mlflow.utils.environment import _mlflow_conda_env data = pd.read_csv(data_path) scaler = StandardScaler() scaled_data = scaler.fit_transform(data) scaled_data = pd.DataFrame(scaled_data, columns=data.columns, index=data.index) scaled_data.to_csv(scaled_data_path, index=False) with open(model_path, mode="wb") as file_writer: dill.dump(scaler, file_writer) input_example = data.sample(1) with open(input_example_path, "wb") as file_writer: dill.dump(input_example, file_writer) signature = infer_signature(data, scaler.transform(data)) with open(signature_path, "wb") as file_writer: dill.dump(signature, file_writer) conda_env = _mlflow_conda_env( additional_pip_deps=["scikit-learn"], install_mlflow=False ) with open(conda_env_path, "wb") as file_writer: dill.dump(conda_env, file_writer) @partial( create_component_from_func, packages_to_install=["dill", "pandas", "scikit-learn", "mlflow"], ) def train_svc_from_csv( 
train_data_path: InputPath("csv"), train_target_path: InputPath("csv"), model_path: OutputPath("dill"), input_example_path: OutputPath("dill"), signature_path: OutputPath("dill"), conda_env_path: OutputPath("dill"), kernel: str, ): import dill import pandas as pd from sklearn.svm import SVC from mlflow.models.signature import infer_signature from mlflow.utils.environment import _mlflow_conda_env train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) input_example = train_data.sample(1) with open(input_example_path, "wb") as file_writer: dill.dump(input_example, file_writer) signature = infer_signature(train_data, clf.predict(train_data)) with open(signature_path, "wb") as file_writer: dill.dump(signature, file_writer) conda_env = _mlflow_conda_env( additional_pip_deps=["scikit-learn"], install_mlflow=False ) with open(conda_env_path, "wb") as file_writer: dill.dump(conda_env, file_writer) @partial( create_component_from_func, packages_to_install=["dill", "pandas", "scikit-learn", "mlflow", "boto3"], ) def upload_sklearn_model_to_mlflow( model_name: str, model_path: InputPath("dill"), input_example_path: InputPath("dill"), signature_path: InputPath("dill"), conda_env_path: InputPath("dill"), ): import os import dill from mlflow.sklearn import save_model from mlflow.tracking.client import MlflowClient os.environ["MLFLOW_S3_ENDPOINT_URL"] = "http://minio-service.kubeflow.svc:9000" os.environ["AWS_ACCESS_KEY_ID"] = "minio" os.environ["AWS_SECRET_ACCESS_KEY"] = "minio123" client = MlflowClient("http://mlflow-server-service.mlflow-system.svc:5000") with open(model_path, mode="rb") as file_reader: clf = dill.load(file_reader) with open(input_example_path, "rb") as file_reader: input_example = dill.load(file_reader) with open(signature_path, "rb") as file_reader: signature = dill.load(file_reader) with 
open(conda_env_path, "rb") as file_reader: conda_env = dill.load(file_reader) save_model( sk_model=clf, path=model_name, serialization_format="cloudpickle", conda_env=conda_env, signature=signature, input_example=input_example, ) run = client.create_run(experiment_id="0") client.log_artifact(run.info.run_id, model_name) from kfp.dsl import pipeline @pipeline(name="multi_model_pipeline") def multi_model_pipeline(kernel: str = "rbf"): iris_data = load_iris_data() scaled_data = train_scaler_from_csv(data=iris_data.outputs["data"]) _ = upload_sklearn_model_to_mlflow( model_name="scaler", model=scaled_data.outputs["model"], input_example=scaled_data.outputs["input_example"], signature=scaled_data.outputs["signature"], conda_env=scaled_data.outputs["conda_env"], ) model = train_svc_from_csv( train_data=scaled_data.outputs["scaled_data"], train_target=iris_data.outputs["target"], kernel=kernel, ) _ = upload_sklearn_model_to_mlflow( model_name="svc", model=model.outputs["model"], input_example=model.outputs["input_example"], signature=model.outputs["signature"], conda_env=model.outputs["conda_env"], ) if __name__ == "__main__": kfp.compiler.Compiler().compile(multi_model_pipeline, "multi_model_pipeline.yaml") ``` If you upload the pipeline, it will look like this. ![children-kubeflow.png](./img/children-kubeflow.png) When you check the MLflow dashboard, two models will be generated, as shown below. ![children-mlflow.png](./img/children-mlflow.png) After checking the run_id of each one, define the SeldonDeployment spec as follows. 
```bash apiVersion: machinelearning.seldon.io/v1 kind: SeldonDeployment metadata: name: multi-model-example namespace: kubeflow-user-example-com spec: name: model predictors: - name: model componentSpecs: - spec: volumes: - name: model-provision-location emptyDir: {} initContainers: - name: scaler-initializer image: gcr.io/kfserving/storage-initializer:v0.4.0 args: - "s3://mlflow/mlflow/artifacts/0/7f445015a0e94519b003d316478766ef/artifacts/scaler" - "/mnt/models" volumeMounts: - mountPath: /mnt/models name: model-provision-location envFrom: - secretRef: name: seldon-init-container-secret - name: svc-initializer image: gcr.io/kfserving/storage-initializer:v0.4.0 args: - "s3://mlflow/mlflow/artifacts/0/87eb168e76264b39a24b0e5ca0fe922b/artifacts/svc" - "/mnt/models" volumeMounts: - mountPath: /mnt/models name: model-provision-location envFrom: - secretRef: name: seldon-init-container-secret containers: - name: scaler image: seldonio/mlflowserver:1.8.0-dev volumeMounts: - mountPath: /mnt/models name: model-provision-location readOnly: true securityContext: privileged: true runAsUser: 0 runAsGroup: 0 - name: svc image: seldonio/mlflowserver:1.8.0-dev volumeMounts: - mountPath: /mnt/models name: model-provision-location readOnly: true securityContext: privileged: true runAsUser: 0 runAsGroup: 0 graph: name: scaler type: MODEL parameters: - name: model_uri type: STRING value: "/mnt/models" - name: predict_method type: STRING value: "transform" children: - name: svc type: MODEL parameters: - name: model_uri type: STRING value: "/mnt/models" ``` Two models have been created so each model's initContainer and container must be defined. This field takes input as an array and the order does not matter. The order in which the models are executed is defined in the graph. 
```bash graph: name: scaler type: MODEL parameters: - name: model_uri type: STRING value: "/mnt/models" - name: predict_method type: STRING value: "transform" children: - name: svc type: MODEL parameters: - name: model_uri type: STRING value: "/mnt/models" ``` The operation of the graph is to convert the initial value received into a predefined predict_method and then pass it to the model defined as children. In this case, the data is passed from scaler -> svc. Now let's create the above specifications in a yaml file. ```bash cat <<EOF > multi-model.yaml apiVersion: machinelearning.seldon.io/v1 kind: SeldonDeployment metadata: name: multi-model-example namespace: kubeflow-user-example-com spec: name: model predictors: - name: model componentSpecs: - spec: volumes: - name: model-provision-location emptyDir: {} initContainers: - name: scaler-initializer image: gcr.io/kfserving/storage-initializer:v0.4.0 args: - "s3://mlflow/mlflow/artifacts/0/7f445015a0e94519b003d316478766ef/artifacts/scaler" - "/mnt/models" volumeMounts: - mountPath: /mnt/models name: model-provision-location envFrom: - secretRef: name: seldon-init-container-secret - name: svc-initializer image: gcr.io/kfserving/storage-initializer:v0.4.0 args: - "s3://mlflow/mlflow/artifacts/0/87eb168e76264b39a24b0e5ca0fe922b/artifacts/svc" - "/mnt/models" volumeMounts: - mountPath: /mnt/models name: model-provision-location envFrom: - secretRef: name: seldon-init-container-secret containers: - name: scaler image: ghcr.io/mlops-for-all/mlflowserver volumeMounts: - mountPath: /mnt/models name: model-provision-location readOnly: true securityContext: privileged: true runAsUser: 0 runAsGroup: 0 - name: svc image: ghcr.io/mlops-for-all/mlflowserver volumeMounts: - mountPath: /mnt/models name: model-provision-location readOnly: true securityContext: privileged: true runAsUser: 0 runAsGroup: 0 graph: name: scaler type: MODEL parameters: - name: model_uri type: STRING value: "/mnt/models" - name: predict_method type: STRING 
value: "transform" children: - name: svc type: MODEL parameters: - name: model_uri type: STRING value: "/mnt/models" EOF ``` Create an API through the following command. ```bash kubectl apply -f multi-model.yaml ``` If properly performed, it will be outputted as follows. ```bash seldondeployment.machinelearning.seldon.io/multi-model-example created ``` Check to see if it has been generated normally. ```bash kubectl get po -n kubeflow-user-example-com | grep multi-model-example ``` If it is created normally, a similar pod will be created. ```bash multi-model-example-model-0-scaler-svc-9955fb795-n9ffw 4/4 Running 0 2m30s ``` ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/api-deployment/seldon-fields.md ================================================ --- title : "4. Seldon Fields" description: "" sidebar_position: 4 contributors: ["Jongseob Jeon"] --- Summary of how Seldon Core creates an API server: 1. initContainer downloads the required model from the model repository. 2. The downloaded model is passed to the container. 3. The container runs an API server enclosing the model. 4. The API can be requested at the generated API server address to receive the inference values from the model. 
The yaml file defining the custom resource, SeldonDeployment, which is most commonly used when using Seldon Core is as follows: ```bash apiVersion: machinelearning.seldon.io/v1 kind: SeldonDeployment metadata: name: seldon-example namespace: kubeflow-user-example-com spec: name: model predictors: - name: model componentSpecs: - spec: volumes: - name: model-provision-location emptyDir: {} initContainers: - name: model-initializer image: gcr.io/kfserving/storage-initializer:v0.4.0 args: - "gs://seldon-models/v1.12.0-dev/sklearn/iris" - "/mnt/models" volumeMounts: - mountPath: /mnt/models name: model-provision-location containers: - name: model image: seldonio/sklearnserver:1.8.0-dev volumeMounts: - mountPath: /mnt/models name: model-provision-location readOnly: true securityContext: privileged: true runAsUser: 0 runAsGroup: 0 graph: name: model type: MODEL parameters: - name: model_uri type: STRING value: "/mnt/models" children: [] ``` The `name` and `predictors` fields of SeldonDeployment are required fields. `name` is mainly used as a name to differentiate pods in Kubernetes and does not have a major effect. `predictors` must be a single array consisting of `name`, `componentSpecs` and `graph` defined. Here also, `name` is mainly used as a name to differentiate pods in Kubernetes and does not have a major effect. Now let's take a look at the fields that need to be defined in `componentSpecs` and `graph`. ## componentSpecs `componentSpecs` must be a single array consisting of the `spec` key. The `spec` must have the fields `volumes`, `initContainers` and `containers` defined. ### volumes ```bash volumes: - name: model-provision-location emptyDir: {} ``` `Volumes` refer to the space used to store the models downloaded from the initContainer, which is received as an array with the components `name` and `emptyDir`. These values are used only once when downloading and moving the models, so they do not need to be modified significantly. 
```bash - name: model-initializer image: gcr.io/kfserving/storage-initializer:v0.4.0 args: - "gs://seldon-models/v1.12.0-dev/sklearn/iris" - "/mnt/models" volumeMounts: - mountPath: /mnt/models name: model-provision-location ``` The `args` field contains the system arguments necessary to download the model from the model repository and move it to the specified model path. It provides the required parameters for the initContainer to perform the downloading and storage operations. initContainer is responsible for downloading the model to be used from the API, so the fields used determine the information needed to download data from the model registry. The value of initContainer consists of n arrays, and each model needs to be specified separately. #### name `name` is the name of the pod in Kubernetes, and it is recommended to use `{model_name}-initializer` for debugging. #### image `image` is the name of the image used to download the model, and there are two images recommended by Seldon Core: - gcr.io/kfserving/storage-initializer:v0.4.0 - seldonio/rclone-storage-initializer:1.13.0-dev For more detailed information, please refer to the following resources: - [kfserving](https://docs.seldon.io/projects/seldon-core/en/latest/servers/kfserving-storage-initializer.html) - [rclone](https://github.com/SeldonIO/seldon-core/tree/master/components/rclone-storage-initializer) In MLOps for ALL, we use kfserving for downloading and storing models. #### args ```bash args: - "gs://seldon-models/v1.12.0-dev/sklearn/iris" - "/mnt/models" ``` When the gcr.io/kfserving/storage-initializer:v0.4.0 Docker image is run (`run`), it takes an argument in the form of an array. The first array value is the address of the model to be downloaded. The second array value is the address where the downloaded model will be stored (Seldon Core usually stores it in `/mnt/models`). 
### volumeMounts ```bash volumeMounts: - mountPath: /mnt/models name: model-provision-location ``` `volumeMounts` is a field that attaches volumes to the Kubernetes pod to share `/mnt/models` as described in volumes. For more information, refer to the [Kubernetes Volume](https://kubernetes.io/docs/concepts/storage/volumes/) documentation. ### container ```bash containers: - name: model image: seldonio/sklearnserver:1.8.0-dev volumeMounts: - mountPath: /mnt/models name: model-provision-location readOnly: true securityContext: privileged: true runAsUser: 0 runAsGroup: 0 ``` Container defines the fields that determine the configuration when the model is run in an API form. #### name The `name` field refers to the name of the pod in Kubernetes. It should be the name of the model being used. #### image The `image` field represents the image used to convert the model into an API. The image should have all the necessary packages installed when the model is loaded. Seldon Core provides official images for different types of models, including: - seldonio/sklearnserver - seldonio/mlflowserver - seldonio/xgboostserver - seldonio/tfserving You can choose the appropriate image based on the type of model you are using. #### volumeMounts ```bash volumeMounts: - mountPath: /mnt/models name: model-provision-location readOnly: true ``` This is a field that tells the path where the data downloaded from initContainer is located. Here, to prevent the model from being modified, `readOnly: true` will also be given. #### securityContext ```bash securityContext: privileged: true runAsUser: 0 runAsGroup: 0 ``` When installing necessary packages, pod may not be able to perform the package installation due to lack of permission. To address this, root permission is granted (although this could cause security issues when in actual service). 
## graph ```bash graph: name: model type: MODEL parameters: - name: model_uri type: STRING value: "/mnt/models" children: [] ``` This is a field that defines the order in which the model operates. ### name The `name` field refers to the name of the model graph. It should match the name defined in the container. ### type The `type` field can have four different values: 1. TRANSFORMER 2. MODEL 3. OUTPUT_TRANSFORMER 4. ROUTER For detailed explanations of each type, you can refer to the [Seldon Core Complex Graphs Metadata Example](https://docs.seldon.io/projects/seldon-core/en/latest/examples/graph-metadata.html). ### parameters The `parameters` field contains values used in the class init. For the sklearnserver, you can find the required values in the [following file](https://github.com/SeldonIO/seldon-core/blob/master/servers/sklearnserver/sklearnserver/SKLearnServer.py). ```python class SKLearnServer(SeldonComponent): def __init__(self, model_uri: str = None, method: str = "predict_proba"): ``` If you look at the code, you can define `model_uri` and `method`. ### children The `children` field is used when creating the sequence diagram. More details about this field will be explained on the following page. ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/api-deployment/seldon-iris.md ================================================ --- title : "2. Deploy SeldonDeployment" description: "" sidebar_position: 2 date: 2021-12-22 lastmod: 2021-12-22 contributors: ["Youngcheol Jang", "SeungTae Kim"] --- ## Deploy with SeldonDeployment Let's deploy our trained model as an API using SeldonDeployment. SeldonDeployment is a custom resource definition (CRD) defined to deploy models as REST/gRPC servers on Kubernetes. #### 1. Prerequisites We will conduct the SeldonDeployment related practice in a new namespace called seldon-deploy. After creating the namespace, set seldon-deploy as the current namespace. 
```bash kubectl create namespace seldon-deploy kubectl config set-context --current --namespace=seldon-deploy ``` ### 2. Define Spec Generate a yaml file to deploy SeldonDeployment. In this page, we will use a publicly available iris model. Because this iris model is trained through the sklearn framework, we use SKLEARN_SERVER. ```bash cat <<EOF > iris-sdep.yaml apiVersion: machinelearning.seldon.io/v1alpha2 kind: SeldonDeployment metadata: name: sklearn namespace: seldon-deploy spec: name: iris predictors: - graph: children: [] implementation: SKLEARN_SERVER modelUri: gs://seldon-models/v1.12.0-dev/sklearn/iris name: classifier name: default replicas: 1 EOF ``` Deploy yaml file. ```bash kubectl apply -f iris-sdep.yaml ``` Check if the deployment was successful through the following command. ```bash kubectl get pods --selector seldon-app=sklearn-default -n seldon-deploy ``` If everything runs, similar results will be printed. ```bash NAME READY STATUS RESTARTS AGE sklearn-default-0-classifier-5fdfd7bb77-ls9tr 2/2 Running 0 5m ``` ## Ingress URL Now, send an inference request to the deployed model to get the inference result. The API created by the SeldonDeployment follows the following rule: `http://{NODE_IP}:{NODE_PORT}/seldon/{namespace}/{seldon-deployment-name}/api/v1.0/{method-name}/` ### NODE_IP / NODE_PORT [Since Seldon Core was installed with Ambassador as the Ingress Controller](../setup-components/install-components-seldon.md), all APIs created by SeldonDeployment can be requested through the Ambassador Ingress gateway. Therefore, first set the url of the Ambassador Ingress Gateway as an environment variable. ```bash export NODE_IP=$(kubectl get nodes -o jsonpath='{ $.items[*].status.addresses[?(@.type=="InternalIP")].address }') export NODE_PORT=$(kubectl get service ambassador -n seldon-system -o jsonpath="{.spec.ports[0].nodePort}") ``` Check the set url. 
```bash echo "NODE_IP"=$NODE_IP echo "NODE_PORT"=$NODE_PORT ``` It should be outputted similarly as follows, and if set through the cloud, you can check that internal IP address is set. ```bash NODE_IP=192.168.0.19 NODE_PORT=30486 ``` ### namespace / seldon-deployment-name This refers to the `namespace` and `seldon-deployment-name` where the SeldonDeployment is deployed and used to define the values defined in the metadata when defining the spec. ```bash metadata: name: sklearn namespace: seldon-deploy ``` In the example above, `namespace` is seldon-deploy, `seldon-deployment-name` is sklearn. ### method-name In SeldonDeployment, the commonly used `method-name` has two options: 1. doc 2. predictions The detailed usage of each method is explained below. ## Using Swagger First, let's explore how to use the doc method, which allows access to the Swagger generated by Seldon. ### 1. Accessing Swagger According to the provided ingress URL rules, you can access the Swagger documentation using the following URL: `http://192.168.0.19:30486/seldon/seldon-deploy/sklearn/api/v1.0/doc/` ![iris-swagger1.png](./img/iris-swagger1.png) ### 2. Selecting Swagger Predictions In the Swagger UI, select the `/seldon/seldon-deploy/sklearn/api/v1.0/predictions` endpoint. ![iris-swagger2.png](./img/iris-swagger2.png) ### 3. Choosing *Try it out* ![iris-swagger3.png](./img/iris-swagger3.png) ### 4. Inputting data in the Request body ![iris-swagger4.png](./img/iris-swagger4.png) Enter the following data into the Request body. ```bash { "data": { "ndarray":[[1.0, 2.0, 5.0, 6.0]] } } ``` ### 5. Check the inference results You can click the `Execute` button to obtain the inference result. ![iris-swagger5.png](./img/iris-swagger5.png) If everything is executed successfully, you will obtain the following inference result. 
```bash { "data": { "names": [ "t:0", "t:1", "t:2" ], "ndarray": [ [ 9.912315378486697e-7, 0.0007015931307746079, 0.9992974156376876 ] ] }, "meta": { "requestPath": { "classifier": "seldonio/sklearnserver:1.11.2" } } } ``` ## Using CLI Also, you can use http client CLI tools such as curl to make API requests. For example, requesting `/predictions` as follows ```bash curl -X POST http://$NODE_IP:$NODE_PORT/seldon/seldon-deploy/sklearn/api/v1.0/predictions \ -H 'Content-Type: application/json' \ -d '{ "data": { "ndarray": [[1,2,3,4]] } }' ``` You can confirm that the following response is outputted normally. ```bash {"data":{"names":["t:0","t:1","t:2"],"ndarray":[[0.0006985194531162835,0.00366803903943666,0.995633441507447]]},"meta":{"requestPath":{"classifier":"seldonio/sklearnserver:1.11.2"}}} ``` ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/api-deployment/seldon-mlflow.md ================================================ --- title : "5. Model from MLflow" description: "" sidebar_position: 5 contributors: ["Jongseob Jeon"] --- ## Model from MLflow On this page, we will learn how to create an API using a model saved in the [MLflow Component](../kubeflow/advanced-mlflow.md). ## Secret The initContainer needs credentials to access minio and download the model. The credentials for access to minio are as follows. ```bash apiVersion: v1 type: Opaque kind: Secret metadata: name: seldon-init-container-secret namespace: kubeflow-user-example-com data: AWS_ACCESS_KEY_ID: bWluaW8= AWS_SECRET_ACCESS_KEY: bWluaW8xMjM= AWS_ENDPOINT_URL: aHR0cDovL21pbmlvLXNlcnZpY2Uua3ViZWZsb3cuc3ZjOjkwMDA= USE_SSL: ZmFsc2U= ``` The input value for `AWS_ACCESS_KEY_ID` is `minio`. However, since the input value for the secret must be an encoded value, the value that is actually entered must be the value that comes out after performing the following. The values that need to be entered in data are as follows. 
- AWS_ACCESS_KEY_ID: minio - AWS_SECRET_ACCESS_KEY: minio123 - AWS_ENDPOINT_URL: http://minio-service.kubeflow.svc:9000 - USE_SSL: false The encoding can be done using the following command. ```bash echo -n minio | base64 ``` Then the following values will be output. ```bash bWluaW8= ``` If you do the encoding for the entire value, it will look like this: - AWS_ACCESS_KEY_ID: bWluaW8= - AWS_SECRET_ACCESS_KEY: bWluaW8xMjM= - AWS_ENDPOINT_URL: aHR0cDovL21pbmlvLXNlcnZpY2Uua3ViZWZsb3cuc3ZjOjkwMDA= - USE_SSL: ZmFsc2U= You can generate a yaml file through the following command to create the secret. ```bash cat <<EOF > seldon-init-container-secret.yaml apiVersion: v1 kind: Secret metadata: name: seldon-init-container-secret namespace: kubeflow-user-example-com type: Opaque data: AWS_ACCESS_KEY_ID: bWluaW8= AWS_SECRET_ACCESS_KEY: bWluaW8xMjM= AWS_ENDPOINT_URL: aHR0cDovL21pbmlvLXNlcnZpY2Uua3ViZWZsb3cuc3ZjOjkwMDA= USE_SSL: ZmFsc2U= EOF ``` Create the secret through the following command. ```bash kubectl apply -f seldon-init-container-secret.yaml ``` If performed normally, it will be output as follows. ```bash secret/seldon-init-container-secret created ``` ## Seldon Core yaml Now let's write the yaml file to create Seldon Core. 
```bash apiVersion: machinelearning.seldon.io/v1 kind: SeldonDeployment metadata: name: seldon-example namespace: kubeflow-user-example-com spec: name: model predictors: - name: model componentSpecs: - spec: volumes: - name: model-provision-location emptyDir: {} initContainers: - name: model-initializer image: gcr.io/kfserving/storage-initializer:v0.4.0 args: - "s3://mlflow/mlflow/artifacts/0/74ba8e33994144f599e50b3be176cdb0/artifacts/svc" - "/mnt/models" volumeMounts: - mountPath: /mnt/models name: model-provision-location envFrom: - secretRef: name: seldon-init-container-secret containers: - name: model image: ghcr.io/mlops-for-all/mlflowserver volumeMounts: - mountPath: /mnt/models name: model-provision-location readOnly: true securityContext: privileged: true runAsUser: 0 runAsGroup: 0 graph: name: model type: MODEL parameters: - name: model_uri type: STRING value: "/mnt/models" children: [] ``` There are two major changes compared to the previously created [Seldon Fields](../api-deployment/seldon-fields.md): 1. The `envFrom` field is added to the initContainer. 2. The address in the args has been changed to `s3://mlflow/mlflow/artifacts/0/74ba8e33994144f599e50b3be176cdb0/artifacts/svc`. ### args Previously, we mentioned that the first element of the args array is the path to the model we want to download. So, how can we determine the path of the model stored in MLflow? To find the path, go back to MLflow and click on the run, then click on the model, as shown below: ![seldon-mlflow-0.png](./img/seldon-mlflow-0.png) You can use the path obtained from there. ### envFrom This process involves providing the environment variables required to access MinIO and download the model. We will use the `seldon-init-container-secret` created earlier. ## API Creation First, let's generate the YAML file based on the specification defined above. 
```bash cat <<EOF > seldon-mlflow.yaml apiVersion: machinelearning.seldon.io/v1 kind: SeldonDeployment metadata: name: seldon-example namespace: kubeflow-user-example-com spec: name: model predictors: - name: model componentSpecs: - spec: volumes: - name: model-provision-location emptyDir: {} initContainers: - name: model-initializer image: gcr.io/kfserving/storage-initializer:v0.4.0 args: - "s3://mlflow/mlflow/artifacts/0/74ba8e33994144f599e50b3be176cdb0/artifacts/svc" - "/mnt/models" volumeMounts: - mountPath: /mnt/models name: model-provision-location envFrom: - secretRef: name: seldon-init-container-secret containers: - name: model image: ghcr.io/mlops-for-all/mlflowserver volumeMounts: - mountPath: /mnt/models name: model-provision-location readOnly: true securityContext: privileged: true runAsUser: 0 runAsGroup: 0 graph: name: model type: MODEL parameters: - name: model_uri type: STRING value: "/mnt/models" - name: xtype type: STRING value: "dataframe" children: [] EOF ``` Create a seldon pod. ```bash kubectl apply -f seldon-mlflow.yaml ``` If it is performed normally, it will be outputted as follows. ```bash seldondeployment.machinelearning.seldon.io/seldon-example created ``` Now we wait until the pod is up and running properly. ```bash kubectl get po -n kubeflow-user-example-com | grep seldon ``` If it is outputted similarly to the following, the API has been created normally. ```bash seldon-example-model-0-model-5c949bd894-c5f28 3/3 Running 0 69s ``` You can confirm the execution through the following request on the API created through the CLI. ```bash curl -X POST http://$NODE_IP:$NODE_PORT/seldon/kubeflow-user-example-com/seldon-example/api/v1.0/predictions \ -H 'Content-Type: application/json' \ -d '{ "data": { "ndarray": [ [ 143.0, 0.0, 30.0, 30.0 ] ], "names": [ "sepal length (cm)", "sepal width (cm)", "petal length (cm)", "petal width (cm)" ] } }' ``` If executed normally, you can get the following results. 
```bash {"data":{"names":[],"ndarray":["Virginica"]},"meta":{"requestPath":{"model":"ghcr.io/mlops-for-all/mlflowserver:e141f57"}}} ``` ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/api-deployment/seldon-pg.md ================================================ --- title : "3. Seldon Monitoring" description: "Prometheus & Grafana 확인하기" sidebar_position: 3 date: 2021-12-24 lastmod: 2021-12-24 contributors: ["Jongseob Jeon"] --- ## Grafana & Prometheus Now, let's perform repeated API requests with the SeldonDeployment we created on the [previous page](../api-deployment/seldon-iris.md) and check if the dashboard changes. ### Dashboard [Forward the dashboard created earlier](../setup-components/install-components-pg.md). ```bash kubectl port-forward svc/seldon-core-analytics-grafana -n seldon-system 8090:80 ``` ### Request API Request **repeated** to the [previously created Seldon Deployment](../api-deployment/seldon-iris.md#using-cli). ```bash curl -X POST http://$NODE_IP:$NODE_PORT/seldon/seldon-deploy/sklearn/api/v1.0/predictions \ -H 'Content-Type: application/json' \ -d '{ "data": { "ndarray": [[1,2,3,4]] } }' ``` Furthermore, when checking the Grafana dashboard, you can observe that the Global Request Rate increases momentarily from `0 ops`. ![repeat-raise.png](./img/repeat-raise.png) This confirms that Prometheus and Grafana have been successfully installed and configured. ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/api-deployment/what-is-api-deployment.md ================================================ --- title : "1. What is API Deployment?" description: "" sidebar_position: 1 date: 2021-12-22 lastmod: 2021-12-22 contributors: ["Youngcheol Jang"] --- ## What is API Deployment? After training a machine learning model, how should it be used? 
When training a machine learning model, you expect a model with higher performance to come out, but when you infer with the trained model, you want to get the inference results quickly and easily. When you want to check the inference results of the model, you can load the trained model and infer through a Jupyter notebook or a Python script. However, this method becomes inefficient as the model gets bigger, and you can only use the model in the environment where the trained model exists and cannot be used by many people. Therefore, when machine learning is used in actual services, it uses an API to use the trained model. The model is loaded only once in the environment where the API server is running, and you can easily get the inference results using DNS, and you can also link it with other services. However, there is a lot of ancillary work necessary to make the model into an API. In order to make it easier to make an API, machine learning frameworks such as Tensorflow have developed inference engines. Using inference engines, we can create APIs (REST or gRPC) that can load and infer from machine learning models developed and trained in the corresponding frameworks. When we send a request with the data we want to infer to an API server built using these inference engines, the engine performs the inference and sends back the results in the response. Some well-known open-source inference engines include: - [Tensorflow: Tensorflow Serving](https://github.com/tensorflow/serving) - [PyTorch: Torchserve](https://github.com/pytorch/serve) - [ONNX: ONNX Runtime](https://github.com/microsoft/onnxruntime) While not officially supported in open-source, there are also inference engines developed for popular frameworks like sklearn and XGBoost. Deploying and serving the model's inference results through an API is called **API deployment**. ## Serving Framework I introduced the fact that various inference engines have been developed. 
Now, if we want to deploy these inference engines in a Kubernetes environment for API deployment, what steps are involved? We need to deploy various Kubernetes resources such as Deployments for the inference engines, Services to create endpoints for sending inference requests, and Ingress to forward external inference requests to the inference engines. Additionally, we may need to handle requirements such as scaling out when there is a high volume of inference requests, monitoring the status of the inference engines, and updating the version when an improved model is available. There are many considerations when operating an inference engine, and it goes beyond just a few tasks. To address these requirements, serving frameworks have been developed to further abstract the deployment of inference engines in a Kubernetes environment. Some popular serving frameworks include: - [Seldon Core](https://github.com/SeldonIO/seldon-core) - [Kserve](https://github.com/kserve) - [BentoML](https://github.com/bentoml/BentoML) In *MLOps for ALL*, we use Seldon Core to demonstrate the process of API deployment. ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/appendix/_category_.json ================================================ { "label": "Appendix", "position": 9, "link": { "type": "generated-index" } } ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/appendix/metallb.md ================================================ --- title: "2. Install load balancer metallb for Bare Metal Cluster" sidebar_position: 2 --- ## What is MetalLB? ## Installing MetalLB When using Kubernetes on cloud platforms such as AWS, GCP, and Azure, they provide their own load balancers. However, for on-premises clusters, an additional module needs to be installed to enable load balancing. 
[MetalLB](https://metallb.universe.tf/) is an open-source project that provides a load balancer for bare metal environments. ## Requirements | Requirement | Version and Details | | ----------------------------------------------------------- | ------------------------------------------------------------ | | Kubernetes | Version >= v1.13.0 without built-in load balancing | | [Compatible Network CNI](https://metallb.universe.tf/installation/network-addons/) | Calico, Canal, Cilium, Flannel, Kube-ovn, Kube-router, Weave Net | | IPv4 addresses | Used for MetalLB deployment | | BGP mode | One or more routers that support BGP functionality | | TCP/UDP port 7946 open between nodes | Memberlist requirement | ### MetalLB Installation #### Preparation If you are using kube-proxy in IPVS mode, starting from Kubernetes v1.14.2, you need to enable strict ARP mode. By default, Kube-router enables strict ARP, so this feature is not required if you are using Kube-router as a service proxy. Before applying strict ARP mode, check the current mode. ```bash # see what changes would be made, returns nonzero returncode if different kubectl get configmap kube-proxy -n kube-system -o yaml | \ grep strictARP ``` ```bash strictARP: false ``` If strictARP: false is outputted, run the following to change it to strictARP: true. (If strictARP: true is already outputted, you do not need to execute the following command). ```bash # actually apply the changes, returns nonzero returncode on errors only kubectl get configmap kube-proxy -n kube-system -o yaml | \ sed -e "s/strictARP: false/strictARP: true/" | \ kubectl apply -f - -n kube-system ``` If performed normally, it will be output as follows. ```bash Warning: resource configmaps/kube-proxy is missing the kubectl.kubernetes.io/last-applied-configuration annotation which is required by kubectl apply. kubectl apply should only be used on resources created declaratively by either kubectl create --save-config or kubectl apply. 
The missing annotation will be patched automatically. configmap/kube-proxy configured ``` ### Installation - Manifest #### 1. Install MetalLB. ```bash kubectl apply -f https://raw.githubusercontent.com/metallb/metallb/v0.11.0/manifests/namespace.yaml kubectl apply -f https://raw.githubusercontent.com/metallb/metallb/v0.11.0/manifests/metallb.yaml ``` #### 2. Check installation. Wait until both pods in the metallb-system namespace are Running. ```bash kubectl get pod -n metallb-system ``` When everything is Running, similar results will be output. ```bash NAME READY STATUS RESTARTS AGE controller-7dcc8764f4-8n92q 1/1 Running 1 1m speaker-fnf8l 1/1 Running 1 1m ``` The components of the manifest are as follows: - metallb-system/controller - Deployed as a deployment, responsible for assigning external IP addresses for load balancing. - metallb-system/speaker - Deployed as a daemonset, responsible for configuring network communication to connect external traffic and services. The service includes RBAC permissions which are necessary for the controller and speaker components to operate. ## Configuration Setting up the load balancing policy of MetalLB can be done by deploying a configmap containing the related configuration information. There are two modes that can be configured in MetalLB: 1. [Layer 2 Mode](https://metallb.universe.tf/concepts/layer2/) 2. [BGP Mode](https://metallb.universe.tf/concepts/bgp/) Here we will proceed with Layer 2 mode. ### Layer 2 Configuration In the Layer 2 mode, it is enough to set only the range of IP addresses to be used simply. When using Layer 2 mode, it is not necessary to bind IP to the network interface of the worker node, because it operates in a way that it responds directly to the ARP request of the local network and provides the computer's MAC address to the client. 
The following `metallb_config.yaml` file is the configuration for MetalLB to provide control over the IP range of 192.168.35.100 ~ 192.168.35.110, and to configure Layer 2 mode. In case the cluster node and the client node are separated, the range of 192.168.35.100 ~ 192.168.35.110 must be accessible by both the client node and the cluster node. #### metallb_config.yaml ```bash apiVersion: v1 kind: ConfigMap metadata: namespace: metallb-system name: config data: config: | address-pools: - name: default protocol: layer2 addresses: - 192.168.35.100-192.168.35.110 # IP range ``` Apply the above settings. ```bash kubectl apply -f metallb_config.yaml ``` If deployed normally, it will output as follows. ```bash configmap/config created ``` ## Using MetalLB ### Kubeflow Dashboard First, before getting the load-balancing feature from MetalLB, check the current status by changing the type of the istio-ingressgateway service in the istio-system namespace to `LoadBalancer` to provide the Kubeflow Dashboard. ```bash kubectl get svc/istio-ingressgateway -n istio-system ``` The type of this service is ClusterIP and you can see that the External-IP value is `none`. ```bash NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE istio-ingressgateway ClusterIP 10.103.72.5 15021/TCP,80/TCP,443/TCP,31400/TCP,15443/TCP 4h21m ``` Change the type to LoadBalancer and if you want to input a desired IP address, add the loadBalancerIP item. If you do not add it, IP addresses will be assigned sequentially from the IP address pool set above. 
```bash kubectl edit svc/istio-ingressgateway -n istio-system ``` ```bash spec: clusterIP: 10.103.72.5 clusterIPs: - 10.103.72.5 ipFamilies: - IPv4 ipFamilyPolicy: SingleStack ports: - name: status-port port: 15021 protocol: TCP targetPort: 15021 - name: http2 port: 80 protocol: TCP targetPort: 8080 - name: https port: 443 protocol: TCP targetPort: 8443 - name: tcp port: 31400 protocol: TCP targetPort: 31400 - name: tls port: 15443 protocol: TCP targetPort: 15443 selector: app: istio-ingressgateway istio: ingressgateway sessionAffinity: None type: LoadBalancer # Change ClusterIP to LoadBalancer loadBalancerIP: 192.168.35.100 # Add IP status: loadBalancer: {} ``` If you check again, you will see that the External-IP value is `192.168.35.100`. ```bash kubectl get svc/istio-ingressgateway -n istio-system ``` ```bash NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE istio-ingressgateway LoadBalancer 10.103.72.5 192.168.35.100 15021:31054/TCP,80:30853/TCP,443:30443/TCP,31400:30012/TCP,15443:31650/TCP 5h1m ``` Open a web browser and connect to [http://192.168.35.100](http://192.168.35.100) to verify the following screen is output. ![login-after-istio-ingressgateway-setting.png](./img/login-after-istio-ingressgateway-setting.png) ### minio Dashboard First, we check the current status before changing the type of minio-service, which provides the Dashboard of minio, in the kubeflow namespace to LoadBalancer to receive the load balancing function from MetalLB. ```bash kubectl get svc/minio-service -n kubeflow ``` The type of this service is ClusterIP and you can confirm that the External-IP value is `none`. ```bash NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE minio-service ClusterIP 10.109.209.87 9000/TCP 5h14m ``` Change the type to LoadBalancer and if you want to enter an IP address, add the loadBalancerIP item. If you do not add, the IP address will be assigned sequentially from the IP address pool set above. 
```bash kubectl edit svc/minio-service -n kubeflow ``` ```bash apiVersion: v1 kind: Service metadata: annotations: kubectl.kubernetes.io/last-applied-configuration: | {"apiVersion":"v1","kind":"Service","metadata":{"annotations":{},"labels":{"application-crd-id":"kubeflow-pipelines"},"name":"minio-ser> creationTimestamp: "2022-01-05T08:44:23Z" labels: application-crd-id: kubeflow-pipelines name: minio-service namespace: kubeflow resourceVersion: "21120" uid: 0053ee28-4f87-47bb-ad6b-7ad68aa29a48 spec: clusterIP: 10.109.209.87 clusterIPs: - 10.109.209.87 ipFamilies: - IPv4 ipFamilyPolicy: SingleStack ports: - name: http port: 9000 protocol: TCP targetPort: 9000 selector: app: minio application-crd-id: kubeflow-pipelines sessionAffinity: None type: LoadBalancer # Change ClusterIP to LoadBalancer loadBalancerIP: 192.168.35.101 # Add IP status: loadBalancer: {} ``` If we check again, we can see that the External-IP value is `192.168.35.101`. ```bash kubectl get svc/minio-service -n kubeflow ``` ```bash NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE minio-service LoadBalancer 10.109.209.87 192.168.35.101 9000:31371/TCP 5h21m ``` Open a web browser and connect to [http://192.168.35.101:9000](http://192.168.35.101:9000) to confirm the following screen is printed. ![login-after-minio-setting.png](./img/login-after-minio-setting.png) ### mlflow Dashboard First, we check the current status before changing the type of mlflow-server-service service in the mlflow-system namespace that provides the mlflow Dashboard to LoadBalancer to receive load balancing function from MetalLB. ```bash kubectl get svc/mlflow-server-service -n mlflow-system ``` The type of this service is ClusterIP and you can confirm that the External-IP value is `none`. ```bash NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE mlflow-server-service ClusterIP 10.111.173.209 5000/TCP 4m50s ``` Change the type to LoadBalancer and if you want to input the desired IP address, add the loadBalancerIP item. 
If you do not add it, the IP address will be assigned sequentially from the IP address pool set above. ```bash kubectl edit svc/mlflow-server-service -n mlflow-system ``` ```bash apiVersion: v1 kind: Service metadata: annotations: meta.helm.sh/release-name: mlflow-server meta.helm.sh/release-namespace: mlflow-system creationTimestamp: "2022-01-07T04:00:19Z" labels: app.kubernetes.io/managed-by: Helm name: mlflow-server-service namespace: mlflow-system resourceVersion: "276246" uid: e5d39fb7-ad98-47e7-b512-f9c673055356 spec: clusterIP: 10.111.173.209 clusterIPs: - 10.111.173.209 ipFamilies: - IPv4 ipFamilyPolicy: SingleStack ports: - port: 5000 protocol: TCP targetPort: 5000 selector: app.kubernetes.io/name: mlflow-server sessionAffinity: None type: LoadBalancer # Change ClusterIP to LoadBalancer loadBalancerIP: 192.168.35.102 # Add IP status: loadBalancer: {} ``` If we check again, we can see that the External-IP value is `192.168.35.102`. ```bash kubectl get svc/mlflow-server-service -n mlflow-system ``` ```bash NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE mlflow-server-service LoadBalancer 10.111.173.209 192.168.35.102 5000:32287/TCP 6m11s ``` Open the web browser and connect to [http://192.168.35.102:5000](http://192.168.35.102:5000) to confirm the following screen is displayed. ![login-after-mlflow-setting.png](./img/login-after-mlflow-setting.png) ### Grafana Dashboard First, check the current status before changing the type of seldon-core-analytics-grafana service in the seldon-system namespace which provides Grafana's Dashboard to receive Load Balancing function from MetalLB. ```bash kubectl get svc/seldon-core-analytics-grafana -n seldon-system ``` The type of the corresponding service is ClusterIP, and you can see that the External-IP value is `none`. 
```bash NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE seldon-core-analytics-grafana ClusterIP 10.109.20.161 80/TCP 94s ``` Change the type to LoadBalancer and if you want to enter an IP address, add the loadBalancerIP item. If not, an IP address will be assigned sequentially from the IP address pool set above. ```bash kubectl edit svc/seldon-core-analytics-grafana -n seldon-system ``` ```bash apiVersion: v1 kind: Service metadata: annotations: meta.helm.sh/release-name: seldon-core-analytics meta.helm.sh/release-namespace: seldon-system creationTimestamp: "2022-01-07T04:16:47Z" labels: app.kubernetes.io/instance: seldon-core-analytics app.kubernetes.io/managed-by: Helm app.kubernetes.io/name: grafana app.kubernetes.io/version: 7.0.3 helm.sh/chart: grafana-5.1.4 name: seldon-core-analytics-grafana namespace: seldon-system resourceVersion: "280605" uid: 75073b78-92ec-472c-b0d5-240038ea8fa5 spec: clusterIP: 10.109.20.161 clusterIPs: - 10.109.20.161 ipFamilies: - IPv4 ipFamilyPolicy: SingleStack ports: - name: service port: 80 protocol: TCP targetPort: 3000 selector: app.kubernetes.io/instance: seldon-core-analytics app.kubernetes.io/name: grafana sessionAffinity: None type: LoadBalancer # Change ClusterIP to LoadBalancer loadBalancerIP: 192.168.35.103 # Add IP status: loadBalancer: {} ``` If you check again, you can see that the External-IP value is `192.168.35.103`. ```bash kubectl get svc/seldon-core-analytics-grafana -n seldon-system ``` ```bash NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE seldon-core-analytics-grafana LoadBalancer 10.109.20.161 192.168.35.103 80:31191/TCP 5m14s ``` Open the Web Browser and connect to http://192.168.35.103:80 to confirm that the following screen is displayed. ![login-after-grafana-setting.png](./img/login-after-grafana-setting.png) ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/appendix/pyenv.md ================================================ --- title: "1. 
Install Python virtual environment" sidebar_position: 1 --- ## Python virtual environment When working with Python, there may be cases where you want to use multiple versions of Python environments or manage package versions separately for different projects. To easily manage Python environments or Python package environments in a virtualized manner, there are tools available such as pyenv, conda, virtualenv, and venv. Among these, *MLOps for ALL* covers the installation of [pyenv](https://github.com/pyenv/pyenv) and [pyenv-virtualenv](https://github.com/pyenv/pyenv-virtualenv). pyenv helps manage Python versions, while pyenv-virtualenv is a plugin for pyenv that helps manage Python package environments. ## Installing pyenv ### Prerequisites Prerequisites vary depending on the operating system. Please refer to the [following page](https://github.com/pyenv/pyenv/wiki#suggested-build-environment) and install the required packages accordingly. ### Installation - macOS 1. Install pyenv, pyenv-virtualenv ```bash brew update brew install pyenv brew install pyenv-virtualenv ``` 2. Set pyenv For macOS, assuming the use of zsh since the default shell has changed to zsh in Catalina version and later, setting up pyenv. ```bash echo 'eval "$(pyenv init -)"' >> ~/.zshrc echo 'eval "$(pyenv virtualenv-init -)"' >> ~/.zshrc source ~/.zshrc ``` Check if the pyenv command is executed properly. 
```bash pyenv --help ``` ```bash $ pyenv --help Usage: pyenv [] Some useful pyenv commands are: --version Display the version of pyenv activate Activate virtual environment commands List all available pyenv commands deactivate Deactivate virtual environment exec Run an executable with the selected Python version global Set or show the global Python version(s) help Display help for a command hooks List hook scripts for a given pyenv command init Configure the shell environment for pyenv install Install a Python version using python-build local Set or show the local application-specific Python version(s) prefix Display prefix for a Python version rehash Rehash pyenv shims (run this after installing executables) root Display the root directory where versions and shims are kept shell Set or show the shell-specific Python version shims List existing pyenv shims uninstall Uninstall a specific Python version version Show the current Python version(s) and its origin version-file Detect the file that sets the current pyenv version version-name Show the current Python version version-origin Explain how the current Python version is set versions List all Python versions available to pyenv virtualenv Create a Python virtualenv using the pyenv-virtualenv plugin virtualenv-delete Uninstall a specific Python virtualenv virtualenv-init Configure the shell environment for pyenv-virtualenv virtualenv-prefix Display real_prefix for a Python virtualenv version virtualenvs List all Python virtualenvs found in `$PYENV_ROOT/versions/*'. whence List all Python versions that contain the given executable which Display the full path to an executable See `pyenv help ' for information on a specific command. For full documentation, see: https://github.com/pyenv/pyenv#readme ``` ### Installation - Ubuntu 1. Install pyenv and pyenv-virtualenv ```bash curl https://pyenv.run | bash ``` If the following content is output, it means that the installation is successful. 
```bash % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 0 0 0 0 0 0 0 0 --:--:-- --:--:-- 0 0 0 0 0 0 0 0 --:--:-- --:--:-- 100 270 100 270 0 0 239 0 0:00:01 0:00:01 --:--:-- 239 Cloning into '/home/mlops/.pyenv'... r ... Skip... ... remote: Enumerating objects: 10, done. remote: Counting objects: 100% (10/10), done. remote: Compressing objects: 100% (6/6), done. remote: Total 10 (delta 1), reused 6 (delta 0), pack-reused 0 Unpacking objects: 100% (10/10), 2.92 KiB | 2.92 MiB/s, done. WARNING: seems you still have not added 'pyenv' to the load path. # See the README for instructions on how to set up # your shell environment for Pyenv. # Load pyenv-virtualenv automatically by adding # the following to ~/.bashrc: eval "$(pyenv virtualenv-init -)" ``` 2. Set pyenv Assuming the use of bash shell as the default shell, configure pyenv and pyenv-virtualenv to be used in bash. ```bash sudo vi ~/.bashrc ``` Enter the following string and save it. ```bash export PATH="$HOME/.pyenv/bin:$PATH" eval "$(pyenv init -)" eval "$(pyenv virtualenv-init -)" ``` Restart the shell. ```bash exec $SHELL ``` Check if the pyenv command is executed properly. ```bash pyenv --help ``` If the following message is displayed, it means that the settings have been configured correctly. ```bash $ pyenv pyenv 2.2.2 Usage: pyenv [] Some useful pyenv commands are: --version Display the version of pyenv activate Activate virtual environment commands List all available pyenv commands deactivate Deactivate virtual environment doctor Verify pyenv installation and development tools to build pythons. 
exec Run an executable with the selected Python version global Set or show the global Python version(s) help Display help for a command hooks List hook scripts for a given pyenv command init Configure the shell environment for pyenv install Install a Python version using python-build local Set or show the local application-specific Python version(s) prefix Display prefix for a Python version rehash Rehash pyenv shims (run this after installing executables) root Display the root directory where versions and shims are kept shell Set or show the shell-specific Python version shims List existing pyenv shims uninstall Uninstall a specific Python version version Show the current Python version(s) and its origin version-file Detect the file that sets the current pyenv version version-name Show the current Python version version-origin Explain how the current Python version is set versions List all Python versions available to pyenv virtualenv Create a Python virtualenv using the pyenv-virtualenv plugin virtualenv-delete Uninstall a specific Python virtualenv virtualenv-init Configure the shell environment for pyenv-virtualenv virtualenv-prefix Display real_prefix for a Python virtualenv version virtualenvs List all Python virtualenvs found in `$PYENV_ROOT/versions/*'. whence List all Python versions that contain the given executable which Display the full path to an executable See `pyenv help ' for information on a specific command. For full documentation, see: https://github.com/pyenv/pyenv#readme ``` ## Using pyenv ### Install python version Using the `pyenv install ` command, you can install the desired Python version. In this page, we will install the Python 3.7.12 version that is used by Kubeflow by default as an example. ```bash pyenv install 3.7.12 ``` If installed normally, the following message will be printed. ```bash $ pyenv install 3.7.12 Downloading Python-3.7.12.tar.xz... 
-> https://www.python.org/ftp/python/3.7.12/Python-3.7.12.tar.xz Installing Python-3.7.12... patching file Doc/library/ctypes.rst patching file Lib/test/test_unicode.py patching file Modules/_ctypes/_ctypes.c patching file Modules/_ctypes/callproc.c patching file Modules/_ctypes/ctypes.h patching file setup.py patching file 'Misc/NEWS.d/next/Core and Builtins/2020-06-30-04-44-29.bpo-41100.PJwA6F.rst' patching file Modules/_decimal/libmpdec/mpdecimal.h Installed Python-3.7.12 to /home/mlops/.pyenv/versions/3.7.12 ``` ### Create python virtual environment Create a Python virtual environment with the `pyenv virtualenv ` command to create a Python virtual environment with the desired Python version. For example, let's create a Python virtual environment called `demo` with Python 3.7.12 version. ```bash pyenv virtualenv 3.7.12 demo ``` ```bash $ pyenv virtualenv 3.7.12 demo Looking in links: /tmp/tmpffqys0gv Requirement already satisfied: setuptools in /home/mlops/.pyenv/versions/3.7.12/envs/demo/lib/python3.7/site-packages (47.1.0) Requirement already satisfied: pip in /home/mlops/.pyenv/versions/3.7.12/envs/demo/lib/python3.7/site-packages (20.1.1) ``` ### Activating python virtual environment Use the `pyenv activate ` command to use the virtual environment created in this way. For example, we will use a Python virtual environment called `demo`. ```bash pyenv activate demo ``` You can see that the information of the current virtual environment is printed at the front of the shell. Before ```bash mlops@ubuntu:~$ pyenv activate demo ``` After ```bash pyenv-virtualenv: prompt changing will be removed from future release. configure `export PYENV_VIRTUALENV_DISABLE_PROMPT=1' to simulate the behavior. (demo) mlops@ubuntu:~$ ``` ### Deactivating python virtual environment You can deactivate the currently active virtualenv by using the command `source deactivate`. 
```bash source deactivate ``` Before ```bash (demo) mlops@ubuntu:~$ source deactivate ``` After ```bash mlops@ubuntu:~$ ``` ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/further-readings/_category_.json ================================================ { "label": "Further Readings", "position": 8, "link": { "type": "generated-index" } } ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/further-readings/info.md ================================================ --- title: "Further Readings" date: 2021-12-21 lastmod: 2021-12-21 --- ## MLOps Component From the components covered in [MLOps Concepts](../introduction/component.md), the following diagram illustrates them. ![open-stacks-0.png](./img/open-stacks-0.png) The technology stacks covered in *Everyone's MLOps* are as follows. ![open-stacks-1.png](./img/open-stacks-1.png) | | Storage | [Minio](https://min.io/) | | | Data Processing | [Apache Spark](https://spark.apache.org/) | | | Data Visualization | [Tableau](https://www.tableau.com/) | | Workflow Mgmt. | Orchestration | [Airflow](https://airflow.apache.org/) | | | Scheduling | [Kubernetes](https://kubernetes.io/) | | Security & Compliance | Authentication & Authorization | [Ldap](https://www.openldap.org/) | | | Data Encryption & Tokenization | [Vault](https://www.vaultproject.io/) | | | Governance & Auditing | [Open Policy Agent](https://www.openpolicyagent.org/) | As you can see, there are still many MLOps components that we have not covered yet. We could not cover them all this time due to time constraints, but if you need it, it might be a good idea to refer to the following open source projects first. ![open-stacks-2.png](./img/open-stacks-2.png) For details: | Mgmt. | Component | Open Source | | -------------------------- | --------------------------- | ------------------------------------- | | Data Mgmt. 
| Collection | [Kafka](https://kafka.apache.org/) | | | Validation | [Beam](https://beam.apache.org/) | | | Feature Store | [Flink](https://flink.apache.org/) | | ML Model Dev. & Experiment | Modeling | [Jupyter](https://jupyter.org/) | | | Analysis & Experiment Mgmt. | [MLflow](https://mlflow.org/) | | | HPO Tuning & AutoML | [Katib](https://github.com/kubeflow/katib) | | Deploy Mgmt. | Serving Framework | [Seldon Core](https://docs.seldon.io/projects/seldon-core/en/latest/index.html) | | | A/B Test | [Iter8](https://iter8.tools/) | | | Monitoring | [Grafana](https://grafana.com/oss/grafana/), [Prometheus](https://prometheus.io/) | | Process Mgmt. | pipeline | [Kubeflow](https://www.kubeflow.org/) | | | CI/CD | [Github Action](https://docs.github.com/en/actions) | | | Continuous Training | [Argo Events](https://argoproj.github.io/events/) | | Platform Mgmt. | Configuration Mgmt. | [Consul](https://www.consul.io/) | | | Code Version Mgmt. | [Github](https://github.com/), [Minio](https://min.io/) | | | Logging | (EFK) [Elastic Search](https://www.elastic.co/kr/elasticsearch/), [Fluentd](https://www.fluentd.org/), [Kibana](https://www.elastic.co/kr/kibana/) | | | Resource Mgmt. | [Kubernetes](https://kubernetes.io/) | ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/introduction/_category_.json ================================================ { "label": "Introduction", "position": 1, "link": { "type": "generated-index" } } ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/introduction/component.md ================================================ --- title : "3. 
Components of MLOps" description: "Describe MLOps Components" sidebar_position: 3 date: 2021-12-03 lastmod: 2021-12-10 contributors: ["Youngcheol Jang"] --- ## Practitioners guide to MLOps Google's white paper [Practitioners guide to MLOps: A framework for continuous delivery and automation of machine learning] published in May 2021 mentions the following core functionalities of MLOps: ![mlops-component](./img/mlops-component.png) Let's look at what each feature does. ### 1. Experimentation Experimentation provides machine learning engineers with the following capabilities for data analysis, prototyping model development, and implementing training functionality: - Integration with version control tools like Git and a notebook (Jupyter Notebook) environment - Experiment tracking capabilities including data used, hyperparameters, and evaluation metrics - Data and model analysis and visualization capabilities ### 2. Data Processing Data Processing enables working with large volumes of data during the stages of model development, continuous training, and API deployment by providing the following functionalities: - Data connectors compatible with various data sources and services - Data encoders and decoders compatible with different data formats - Data transformation and feature engineering capabilities for different data types - Scalable batch and streaming data processing capabilities for training and serving ### 3. Model Training Model Training offers functionalities to efficiently execute algorithms for model training: - Environment provisioning for ML framework execution - Distributed training environment for multiple GPUs and distributed training - Hyperparameter tuning and optimization capabilities ### 4. 
Model Evaluation Model evaluation provides the following capabilities to observe the performance of models in both experimental and production environments: - Model performance evaluation on evaluation datasets - Tracking prediction performance across different continuous training runs - Comparison and visualization of performance between different models - Model output interpretation using interpretable AI techniques ### 5. Model Serving Model serving offers functionalities to deploy and serve models in production environments: - Low-latency and high-availability inference capabilities - Support for various ML model serving frameworks (TensorFlow Serving, TorchServe, NVIDIA Triton, Scikit-learn, XGBoost, etc.) - Advanced inference routines, such as preprocessing or postprocessing, and multi-model ensembling for final results - Autoscaling capabilities to handle spiking inference requests - Logging of inference requests and results ### 6. Online Experimentation Online experimentation provides capabilities to validate the performance of newly generated models when deployed. This functionality should be integrated with a Model Registry to coordinate the deployment of new models. - Canary and shadow deployment features - A/B testing capabilities - Multi-armed bandit testing functionality ### 7. Model Monitoring Model monitoring enables the monitoring of deployed models in production environments to ensure proper functioning and provides information on model performance degradation and the need for updates. ### 8. ML Pipeline ML Pipeline offers the following functionalities to configure, control, and automate complex ML training and inference workflows in production environments: - Pipeline execution through various event sources - ML metadata tracking and integration for pipeline parameter and artifact management - Support for built-in components for common ML tasks and user-defined components - Provisioning of different execution environments ### 9. 
Model Registry The Model Registry provides the capability to manage the lifecycle of machine learning models in a centralized repository. - Registration, tracking, and versioning of trained and deployed models - Storage of information about the required data and runtime packages for deployment ### 10. Dataset and Feature Repository - Sharing, search, reuse, and versioning capabilities for datasets - Real-time processing and low-latency serving capabilities for event streaming and online inference tasks - Support for various types of data, such as images, text, and tabular data ### 11. ML Metadata and Artifact Tracking In each stage of MLOps, various artifacts are generated. ML metadata refers to the information about these artifacts. ML metadata and artifact management provide the following functionalities to manage the location, type, attributes, and associations with experiments: - History management for ML artifacts - Tracking and sharing of experiments and pipeline parameter configurations - Storage, access, visualization, and download capabilities for ML artifacts - Integration with other MLOps functionalities ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/introduction/intro.md ================================================ --- title : "1. What is MLOps?" description: "Introduction to MLOps" sidebar_position: 1 date: 2021-12-03 lastmod: 2022-03-05 contributors: ["Jongseob Jeon"] --- ## Machine Learning Project Since 2012, when Alexnet was introduced, Machine Learning and Deep Learning have been introduced in any domain where data exists, such as Computer Vision and Natural Language Processing. Deep Learning and Machine Learning were referred to collectively as AI, and the need for AI was shouted from many media. And many companies conducted numerous projects using Machine Learning and Deep Learning. But what was the result? 
Byungchan Eum, the Head of North East Asia at Element AI, said “If 10 companies start an AI project, 9 of them will only be able to do concept validation (POC)”. In this way, in many projects, Machine Learning and Deep Learning only showed the possibility that they could solve this problem and then disappeared. And around this time, the outlook that [AI Winter was coming again](https://www.aifutures.org/2021/ai-winter-is-coming/) also began to emerge. Why did most projects end at the concept validation (POC) stage? Because it is impossible to operate an actual service with only Machine Learning and Deep Learning code. At the actual service stage, the portion taken up by machine learning and deep learning code is not as large as one would think, so one must consider many other aspects besides simply the performance of the model. Google has pointed out this problem in their 2015 paper [Hidden Technical Debt in Machine Learning Systems](https://proceedings.neurips.cc/paper/2015/file/86df7dcfd896fcaf2674f757a2463eba-Paper.pdf). However, at the time this paper was released, many ML engineers were busy proving the potential of deep learning and machine learning, so the points made in the paper were not given much attention. And after a few years, machine learning and deep learning had proven their potential and people were now looking to apply it to actual services. However, soon many people realized that actual services were not as easy as they thought. ## Devops MLOps is not a new concept, but rather a term derived from the development methodology called DevOps. Therefore, understanding DevOps can help in understanding MLOps. ### DevOps DevOps is a portmanteau of "Development" and "Operations," referring to a development and operations methodology that emphasizes communication, collaboration, and integration between software developers and IT professionals. 
It encompasses both the development and operation phases of software, aiming to achieve a symbiotic relationship between the two. The primary goal of DevOps is to enable organizations to develop and deploy software products and services rapidly by fostering close collaboration and interdependence between development and operations teams. ### Silo Effect Let's explore why DevOps is necessary through a simple scenario. In the early stages of a service, there are fewer supported features, and the team or company is relatively small. At this point, there may not be a clear distinction between development and operations, or the teams may be small. The key point here is the small scale. In such cases, there are many points of contact for effective communication, and with a limited number of services to focus on, it is possible to rapidly improve the service. However, as the service scales up, the development and operations teams tend to separate, and the physical limitations of communication channels become apparent. For example, in meetings involving multiple teams, only team leaders or a small number of seniors may attend, rather than the entire team. These limitations in communication channels inevitably lead to a lack of communication. Consequently, the development team continues to develop new features, while the operations team faces issues during deployment caused by the features developed by the development team. When such situations are repeated, it can lead to organizational silos, a phenomenon known as silo mentality. ![silo](./img/silo.png) > Indeed, the term "silo" originally refers to a tall, cylindrical structure used for storing grain or livestock feed. Silos are designed to keep the stored materials separate and prevent them from mixing. 
> In the context of organizations, the "silo effect" or "organizational silos effect" refers to a phenomenon where departments or teams within an organization operate independently and prioritize their own interests without effective collaboration. It reflects a mentality where individual departments focus on building their own "silos" and solely pursue their own interests. The silo effect can lead to a decline in service quality and hinder organizational performance. To address this issue, DevOps emerged as a solution. DevOps emphasizes collaboration, communication, and integration between development and operations teams, breaking down the barriers and fostering a culture of shared responsibility and collaboration. By promoting cross-functional teamwork and streamlining processes, DevOps aims to overcome silos and improve the efficiency and effectiveness of software development and operations. ### CI/CD Continuous Integration (CI) and Continuous Delivery (CD) are concrete methods to break down the barriers between development teams and operations teams. ![cicd](./img/cicd.png) Through this method, the development team can understand the operational environment and check whether the features being developed can be seamlessly deployed. The operations team can deploy validated features or improved products more often to increase customer product experience. In summary, DevOps is a methodology to solve the problem between development teams and operations teams. ## MLOps ### 1) ML + Ops DevOps is a methodology that addresses the challenges between development and operations teams, promoting collaboration and effective communication. By applying DevOps principles, development teams gain a better understanding of the operational environment, and the developed features can be seamlessly integrated and deployed. On the other hand, operations teams can deploy validated features or improved products more frequently, enhancing the overall customer experience. 
MLOps, which stands for Machine Learning Operations, extends the DevOps principles and practices specifically to the field of machine learning. In MLOps, the "Dev" in DevOps is replaced with "ML" to emphasize the unique challenges and considerations related to machine learning. MLOps aims to address the issues that arise between machine learning teams and operations teams. To understand these issues, let's consider an example using a recommendation system. #### Rule-Based Approach In the initial stages of building a recommendation system, a simple rule-based approach may be used. For example, items could be recommended based on the highest sales volume in the past week. With this approach, there is no need for model updates unless there are specific reasons for modification. #### Machine Learning Approach As the scale of the service grows and more log data accumulates, machine learning models can be developed based on item-based or user-based recommendations. In this case, the models are periodically retrained and redeployed. #### Deep Learning Approach When there is a greater demand for personalized recommendations and a need for models that deliver higher performance, deep learning models are developed. Similar to machine learning, these models are periodically retrained and redeployed. By considering these examples, it becomes evident that challenges can arise between the machine learning team and the operations team. MLOps aims to address these challenges and provide a methodology and set of practices to facilitate the development, deployment, and operation of machine learning models in a collaborative and efficient manner. ![graph](./img/graph.png) If we represent the concepts explained earlier on a graph, with model complexity on the x-axis and model performance on the y-axis, we can observe an upward trend where the model performance improves as the complexity increases. 
This often leads to the emergence of separate machine learning teams specializing in transitioning from traditional machine learning to deep learning. If there are only a few models to manage, collaboration between teams can be sufficient to address the challenges. However, as the number of models to develop increases, silos similar to those observed in DevOps can emerge. Considering the goals of DevOps, we can understand the goals of MLOps as ensuring that the developed models can be deployed successfully. While DevOps focuses on verifying that the features developed by the development team can be deployed correctly, MLOps focuses on verifying that the models developed by the machine learning team can be deployed effectively. ### 2) ML -> Ops However, recent MLOps-related products and explanations indicate that the goals are not limited to what was previously described. In some cases, the goal is to enable the machine learning team to directly operate and manage the models they develop. This need arises from the process of ongoing machine learning projects. In the case of recommendation systems, it was possible to start with simple models in operations. However, in domains such as natural language processing and image analysis, it is common to perform verification (POC) to determine if deep learning models can solve the given tasks. Once the verification is complete, the focus shifts to developing the operational environment for serving the models. However, it may not be easy for the machine learning team to handle this challenge with their internal capabilities alone. This is where MLOps becomes necessary. ### 3) Conclusion In summary, MLOps has two main goals. The earlier explanation of MLOps focused on ML+Ops, aiming to enhance productivity and collaboration between the two teams. On the other hand, the latter explanation focused on ML -> Ops, aiming to enable the machine learning team to directly operate and manage their models. 
================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/introduction/levels.md ================================================ --- title : "2. Levels of MLOps" description: "Levels of MLOps" sidebar_position: 2 date: 2021-12-03 lastmod: 2022-03-05 contributors: ["Jongseob Jeon"] --- This page will look at the steps of MLOps outlined by Google and explore what the core features of MLOps are. ## Hidden Technical Debt in ML System Google has been talking about the need for MLOps since as far back as 2015. The paper Hidden Technical Debt in Machine Learning Systems encapsulates this idea from Google. ![paper](./img/paper.png) The key takeaway from this paper is that the machine learning code is only a small part of the entire system when it comes to building products with machine learning. Google developed MLOps by evolving this paper and expanding the term. More details can be found on the [Google Cloud homepage](https://cloud.google.com/architecture/mlops-continuous-delivery-and-automation-pipelines-in-machine-learning). In this post, we will try to explain what Google means by MLOps. Google divided the evolution of MLOps into three (0-2) stages. Before explaining each stage, let's review some of the concepts described in the previous post. In order to operate a machine learning model, there is a machine learning team responsible for developing the model and an operations team responsible for deployment and operations. MLOps is needed for the successful collaboration of these two teams. We have previously said that it can be done simply through Continuous Integration (CI) / Continuous Deployment (CD), so let us see how to do CI / CD. ## Level 0: Manual Process ![level-0](./img/level-0.png) At the 0th stage, two teams communicate through a "model". The machine learning team trains the model with accumulated data and delivers the trained model to the operation team. 
The operation team then deploys the model delivered in this way. ![toon](./img/toon.png) Initial machine learning models are deployed through this "model" centered communication. However, there are several problems with this distribution method. For example, if some functions use Python 3.7 and some use Python 3.8, we often see the following situation. The reason for this situation lies in the characteristics of the machine learning model. Three things are needed for the trained machine learning model to work: 1. Python code 2. Trained weights 3. Environment (Packages, versions) If any of these three aspects is communicated incorrectly, the model may fail to function or make unexpected predictions. However, in many cases, models fail to work due to environmental mismatches. Machine learning relies on various open-source libraries, and due to the nature of open-source, even the same function can produce different results depending on the version used. In the early stages of a service, when there are not many models to manage, these issues can be resolved quickly. However, as the number of managed features increases and communication becomes more challenging, it becomes difficult to deploy models with better performance quickly. ## Level 1: Automated ML Pipeline ### Pipeline ![level-1-pipeline](./img/level-1-pipeline.png) So, in MLOps, "pipeline" is used to prevent such problems. The MLOps pipeline ensures that the model operates in the same environment as the one used by the machine learning engineer during model development, using containers like Docker. This helps prevent situations where the model doesn't work due to differences in the environment. However, the term "pipeline" is used in a broader context and in various tasks. What is the role of the pipeline that machine learning engineers create? The pipeline created by machine learning engineers produces trained models. 
Therefore, it would be more accurate to refer to it as a training pipeline rather than just a pipeline. ### Continuous Training ![level-1-ct.png](./img/level-1-ct.png) And the concept of Continuous Training (CT) is added. So why is CT necessary? #### Auto Retrain In the real world, data exhibits a characteristic called "Data Shift," where the data distribution keeps changing over time. As a result, models trained in the past may experience performance degradation over time. The simplest and most effective solution to this problem is to retrain the model using recent data. By retraining the model according to the changed data distribution, it can regain its performance. #### Auto Deploy However, in industries such as manufacturing, where multiple recipes are processed in a single factory, it may not always be desirable to retrain the model unconditionally. One common example is the blind spot. For example, in an automotive production line, a model A was created and used for predictions. If an entirely different model B is introduced, it represents unseen data patterns, and a new model is trained for model B. Now, the model will make predictions for model B. However, if the data switches back to model A, what should be done? If there are only retraining rules, a new model for model A will be trained again. However, machine learning models require a sufficient amount of data to demonstrate satisfactory performance. The term "blind spot" refers to a period in which the model does not work while gathering enough data. There is a simple solution to address this blind spot. It involves checking whether there was a previous model for model A and, if so, using the previous model for prediction instead of immediately training a new model. This way, using meta-data associated with the model to automatically switch models is known as Auto Deploy. To summarize, for Continuous Training (CT), both Auto Retrain and Auto Deploy are necessary. 
They complement each other's weaknesses and enable the model's performance to be maintained continuously. ## Level 2: Automating the CI/CD Pipeline ![level-2](./img/level-2.png) The title of Step 2 is the automation of CI and CD. In DevOps, the focus of CI/CD is on source code. So what is the focus of CI/CD in MLOps? In MLOps, the focus of CI/CD is also on source code, but more specifically, it can be seen as the training pipeline. Therefore, when it comes to training models, it is important to verify whether the model is trained correctly (CI) and whether the trained model functions properly (CD) in response to relevant changes that can impact the training process. Hence, CI/CD should be performed when there are direct modifications to the code used for training. In addition to code, the versions of the packages used and changes in the Python version are also part of CI/CD. In many cases, machine learning utilizes open-source packages. However, open-source packages can have changes in the internal logic of functions when their versions are updated. Although notifications may be provided when there are certain version updates, significant changes in versions can go unnoticed. Therefore, when the versions of the packages used change, it is important to perform CI/CD to ensure that the model is trained and functions correctly. In summary, in MLOps, CI/CD focuses on the source code, particularly the training pipeline, to verify that the model is trained correctly and functions properly. This includes checking for direct code modifications and changes in package versions or Python versions to ensure the integrity of the training and functioning processes of the model. ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/introduction/why_kubernetes.md ================================================ --- title : "4. Why Kubernetes?" 
description: "Reason for using k8s in MLOps" sidebar_position: 4 date: 2021-12-03 lastmod: 2021-12-10 contributors: ["Jaeyeon Kim"] --- ## MLOps & Kubernetes When talking about MLOps, why is the word Kubernetes always heard together? To build a successful MLOps system, various components are needed as described in [Components of MLOps](../introduction/component.md), but to operate them organically at the infrastructure level, there are many issues to be solved. For example, simply running a large number of machine learning model requests in order, ensuring the same execution environment in other workspaces, and responding quickly when a deployed service has a failure. The need for containers and container orchestration systems appears here. With the introduction of container orchestration systems such as Kubernetes, efficient isolation and management of execution environments can be achieved. By introducing a container orchestration system, it is possible to prevent situations such as *'Is anyone using cluster 1?', 'Who killed my process that was using GPU?', 'Who updated the x package on the cluster?'* when developing and deploying machine learning models while a few developers share a small number of clusters. ## Container What is a container then? Microsoft defines a container as [follows](https://azure.microsoft.com/en-us/overview/what-is-a-container/). > Container: Standardized, portable packaging of an application's code, libraries, and configuration files But why is a container needed for machine learning? Machine learning models can behave differently depending on the operating system, Python execution environment, package version, etc. To prevent this, the technology used to share and execute the entire dependent execution environment with the source code used in machine learning is called containerization technology.
This packaged form is called a container image, and by sharing the container image, users can ensure the same execution results on any system. In other words, by sharing not just the Jupyter Notebook file or the source code and requirements.txt file of the model, but the entire container image with the execution environment, you can avoid situations such as *"It works on my notebook, why not yours?"*. One of the common misunderstandings that people who are new to containers often make is to assume that "container == Docker". Docker is not a concept that has the same meaning as containers; rather, it is a tool that provides features to make it easier and more flexible to use containers, such as launching containers and creating and sharing container images. In summary, container is a virtualization technology, and Docker is an implementation of virtualization technology. However, Docker has become the mainstream quickly due to its easy usability and high efficiency among various container virtualization tools, so when people think of containers, they often think of Docker automatically. There are various reasons why the container and Docker ecosystem have become the mainstream, but for technical reasons, I won't go into that detail since it is outside the scope of Everybody's MLOps. ## Container Orchestration System Then what is a container orchestration system? As inferred from the word "orchestration," it can be compared to a system that coordinates the operation of numerous containers to work together harmoniously. In container-based systems, services are provided to users in the form of containers. If the number of containers to be managed is small, a single operator can sufficiently handle all situations.
However, if there are hundreds of containers running in dozens of clusters and they need to function continuously without causing any failures, it becomes nearly impossible for a single operator to monitor the proper functioning of all services and respond to issues. For example, continuous monitoring is required to ensure that all services are functioning properly. If a specific service experiences a failure, the operator needs to investigate the problem by examining the logs of multiple containers. Additionally, they need to handle various tasks such as scheduling and load balancing to prevent work overload on specific clusters or containers, as well as scaling operations. A container orchestration system is software that provides functionality to manage and operate the states of numerous containers continuously and automatically, making the process of managing and operating a large number of containers somewhat easier. How can it be used in machine learning? For example, a container that packages deep learning training code that requires a GPU can be executed on a cluster with available GPUs. A container that packages data preprocessing code requiring a large amount of memory can be executed on a cluster with ample memory. If there is an issue with the cluster during training, the system can automatically move the same container to a different cluster and continue the training, eliminating the need for manual intervention. Developing such a system that automates management without requiring manual intervention is the goal. As of the writing of this text in 2022, Kubernetes is considered the de facto standard for container orchestration systems. According to the [survey](https://www.cncf.io/blog/2018/08/29/cncf-survey-use-of-cloud-native-technologies-in-production-has-grown-over-200-percent/) released by CNCF in 2018, Kubernetes was already showing its prominence. 
The [survey](https://www.cncf.io/wp-content/uploads/2020/08/CNCF_Survey_Report.pdf) published in 2019 indicates that 78% of respondents were using Kubernetes at a production level. ![k8s-graph](./img/k8s-graph.png) The growth of the Kubernetes ecosystem can be attributed to various reasons. However, similar to Docker, Kubernetes is not exclusively limited to machine learning-based services. Since delving into detailed technical content would require a substantial amount of discussion, this edition of "MLOps for ALL" will omit the detailed explanation of Kubernetes. ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/kubeflow/_category_.json ================================================ { "label": "Kubeflow", "position": 6, "link": { "type": "generated-index" } } ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/kubeflow/advanced-component.md ================================================ --- title : "8. Component - InputPath/OutputPath" description: "" sidebar_position: 8 contributors: ["Jongseob Jeon", "SeungTae Kim"] --- ## Complex Outputs On this page, we will write the code example from [Kubeflow Concepts](../kubeflow/kubeflow-concepts.md#component-contents) as a component. ## Component Contents Below is the component content used in [Kubeflow Concepts](../kubeflow/kubeflow-concepts.md#component-contents). ```python import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) ``` ## Component Wrapper ### Define a standalone Python function With the necessary Configs for the Component Wrapper, it will look like this. 
```python def train_from_csv( train_data_path: str, train_target_path: str, model_path: str, kernel: str, ): import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) ``` In the [Basic Usage Component](../kubeflow/basic-component), we explained that you should provide type hints for input and output when describing. But what about complex objects such as dataframes, models, that cannot be used in json? When passing values between functions in Python, objects can be returned and their value will be stored in the host's memory, so the same object can be used in the next function. However, in Kubeflow, components are running independently on each container, that is, they are not sharing the same memory, so you cannot pass objects in the same way as in a normal Python function. The only information that can be passed between components is in `json` format. Therefore, objects of types that cannot be converted into json format such as Model or DataFrame must be passed in some other way. Kubeflow solves this by storing the data in a file instead of memory, and then using the file to pass information. Since the path of the stored file is a string, it can be passed between components. However, in Kubeflow, the user does not know the path of the file before the execution. For this, Kubeflow provides a magic related to the input and output paths, `InputPath` and `OutputPath`. `InputPath` literally means the input path, and `OutputPath` literally means the output path. For example, in a component that generates and returns data, `data_path: OutputPath()` is created as an argument. And in a component that receives data, `data_path: InputPath()` is created as an argument.
Once these are created, when connecting them in a pipeline, Kubeflow automatically generates and inputs the necessary paths. Therefore, users no longer need to worry about the paths and only need to consider the relationships between components. Based on this information, when rewriting the component wrapper, it would look like the following. ```python from kfp.components import InputPath, OutputPath def train_from_csv( train_data_path: InputPath("csv"), train_target_path: InputPath("csv"), model_path: OutputPath("dill"), kernel: str, ): import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) ``` InputPath or OutputPath can accept a string. This string is the format of the file to be input or output. However, it does not necessarily mean that the file has to be stored in this format. It just serves as a helper for type checking when compiling the pipeline. If the file format is not fixed, then no input is needed (it serves the role of something like `Any` in type hints). ### Convert to Kubeflow Format Convert the written component into a format that can be used in Kubeflow. ```python from kfp.components import InputPath, OutputPath, create_component_from_func @create_component_from_func def train_from_csv( train_data_path: InputPath("csv"), train_target_path: InputPath("csv"), model_path: OutputPath("dill"), kernel: str, ): import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) ``` ## Rule for using InputPath/OutputPath There are rules to follow when using InputPath or OutputPath arguments in pipeline. 
### Load Data Component To execute the previously written component, a component that generates data is created since data is required. ```python from functools import partial from kfp.components import InputPath, OutputPath, create_component_from_func @create_component_from_func def load_iris_data( data_path: OutputPath("csv"), target_path: OutputPath("csv"), ): import pandas as pd from sklearn.datasets import load_iris iris = load_iris() data = pd.DataFrame(iris["data"], columns=iris["feature_names"]) target = pd.DataFrame(iris["target"], columns=["target"]) data.to_csv(data_path, index=False) target.to_csv(target_path, index=False) ``` ### Write Pipeline Now let's write the pipeline. ```python from kfp.dsl import pipeline @pipeline(name="complex_pipeline") def complex_pipeline(kernel: str): iris_data = load_iris_data() model = train_from_csv( train_data=iris_data.outputs["data"], train_target=iris_data.outputs["target"], kernel=kernel, ) ``` Have you noticed something strange? All the `_path` suffixes have disappeared from the arguments received in the input and output. We can see that instead of accessing `iris_data.outputs["data_path"]`, we are accessing `iris_data.outputs["data"]`. This happens because Kubeflow has a rule that paths created with `InputPath` and `OutputPath` can be accessed without the `_path` suffix when accessed from the pipeline. However, if you upload the pipeline just written, it will not run. The reason is explained on the next page. ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/kubeflow/advanced-environment.md ================================================ --- title : "9. Component - Environment" description: "" sidebar_position: 9 contributors: ["Jongseob Jeon"] --- ## Component Environment When we run the pipeline written in [8. Component - InputPath/OutputPath](../kubeflow/advanced-component.md), it fails. 
Let's find out why it fails and modify it so that it can run properly. ### Convert to Kubeflow Format Let's convert the component written [earlier](../kubeflow/advanced-component.md#convert-to-kubeflow-format) into a yaml file. ```python from kfp.components import InputPath, OutputPath, create_component_from_func @create_component_from_func def train_from_csv( train_data_path: InputPath("csv"), train_target_path: InputPath("csv"), model_path: OutputPath("dill"), kernel: str, ): import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) if __name__ == "__main__": train_from_csv.component_spec.save("train_from_csv.yaml") ``` If you run the script above, you will get a `train_from_csv.yaml` file like the one below. ```bash name: Train from csv inputs: - {name: train_data, type: csv} - {name: train_target, type: csv} - {name: model, type: dill} - {name: kernel, type: String} implementation: container: image: python:3.7 command: - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def train_from_csv( train_data_path, train_target_path, model_path, kernel, ): import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) import argparse _parser = argparse.ArgumentParser(prog='Train from csv', description='') _parser.add_argument("--train-data", dest="train_data_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--train-target", dest="train_target_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--model", 
dest="model_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--kernel", dest="kernel", type=str, required=True, default=argparse.SUPPRESS) _parsed_args = vars(_parser.parse_args()) _outputs = train_from_csv(**_parsed_args) args: - --train-data - {inputPath: train_data} - --train-target - {inputPath: train_target} - --model - {inputPath: model} - --kernel - {inputValue: kernel} ``` According to the content explained in the [Basic Usage Component](../kubeflow/basic-component.md#convert-to-kubeflow-format) previously mentioned, this component will be executed as follows: 1. `docker pull python:3.7` 2. run `command` However, when running the component created above, an error will occur. The reason is in the way the component wrapper is executed. Kubeflow uses Kubernetes, so the component wrapper runs the component content on its own separate container. In detail, the image specified in the generated `train_from_csv.yaml` is `image: python:3.7`. Some of you may have already noticed why it does not run. The `python:3.7` image does not have the packages we want to use, such as `dill`, `pandas`, and `sklearn`, installed. Therefore, when executing, it fails with an error indicating that the packages are not found. So, how can we add the packages? ## Adding packages During the process of converting to Kubeflow format, there are two ways to add packages: 1. Using `base_image` 2. Using `packages_to_install` Let's check what arguments the function `create_component_from_func` used to compile the components can receive. ```bash def create_component_from_func( func: Callable, output_component_file: Optional[str] = None, base_image: Optional[str] = None, packages_to_install: List[str] = None, annotations: Optional[Mapping[str, str]] = None, ): ``` - `func`: Function that creates the component wrapper to be made into a component. - `base_image`: Image that the component wrapper will run on.
- `packages_to_install`: Additional packages that need to be installed for the component to use. ### 1. base_image Take a closer look at the sequence in which the component is executed and it will be as follows: 1. `docker pull base_image` 2. `pip install packages_to_install` 3. run `command` If the base_image used by the component already has all the packages installed, you can use it without installing additional packages. For example, on this page we are going to write a Dockerfile like this: ```dockerfile FROM python:3.7 RUN pip install dill pandas scikit-learn ``` Let's build the image using the Dockerfile above. The Docker hub we will use for the practice is ghcr. You can choose a Docker hub according to your environment and upload it. ```bash docker build . -f Dockerfile -t ghcr.io/mlops-for-all/base-image docker push ghcr.io/mlops-for-all/base-image ``` Now let's try inputting the base image. ```python from functools import partial from kfp.components import InputPath, OutputPath, create_component_from_func @partial( create_component_from_func, base_image="ghcr.io/mlops-for-all/base-image:latest", ) def train_from_csv( train_data_path: InputPath("csv"), train_target_path: InputPath("csv"), model_path: OutputPath("dill"), kernel: str, ): import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) if __name__ == "__main__": train_from_csv.component_spec.save("train_from_csv.yaml") ``` If you compile the generated component, it will appear as follows. 
```bash name: Train from csv inputs: - {name: train_data, type: csv} - {name: train_target, type: csv} - {name: kernel, type: String} outputs: - {name: model, type: dill} implementation: container: image: ghcr.io/mlops-for-all/base-image:latest command: - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def _make_parent_dirs_and_return_path(file_path: str): import os os.makedirs(os.path.dirname(file_path), exist_ok=True) return file_path def train_from_csv( train_data_path, train_target_path, model_path, kernel, ): import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) import argparse _parser = argparse.ArgumentParser(prog='Train from csv', description='') _parser.add_argument("--train-data", dest="train_data_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--train-target", dest="train_target_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--kernel", dest="kernel", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--model", dest="model_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) _parsed_args = vars(_parser.parse_args()) _outputs = train_from_csv(**_parsed_args) args: - --train-data - {inputPath: train_data} - --train-target - {inputPath: train_target} - --kernel - {inputValue: kernel} - --model - {outputPath: model} ``` We can confirm that the base_image has been changed to the value we have set. ### 2. packages_to_install However, when packages are added, it takes a lot of time to create a new Docker image. In this case, we can use the `packages_to_install` argument to easily add packages to the container. 
```python from functools import partial from kfp.components import InputPath, OutputPath, create_component_from_func @partial( create_component_from_func, packages_to_install=["dill==0.3.4", "pandas==1.3.4", "scikit-learn==1.0.1"], ) def train_from_csv( train_data_path: InputPath("csv"), train_target_path: InputPath("csv"), model_path: OutputPath("dill"), kernel: str, ): import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) if __name__ == "__main__": train_from_csv.component_spec.save("train_from_csv.yaml") ``` If you execute the script, the `train_from_csv.yaml` file will be generated. ```bash name: Train from csv inputs: - {name: train_data, type: csv} - {name: train_target, type: csv} - {name: kernel, type: String} outputs: - {name: model, type: dill} implementation: container: image: python:3.7 command: - sh - -c - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'dill==0.3.4' 'pandas==1.3.4' 'scikit-learn==1.0.1' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'dill==0.3.4' 'pandas==1.3.4' 'scikit-learn==1.0.1' --user) && "$0" "$@" - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def _make_parent_dirs_and_return_path(file_path: str): import os os.makedirs(os.path.dirname(file_path), exist_ok=True) return file_path def train_from_csv( train_data_path, train_target_path, model_path, kernel, ): import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) import argparse 
_parser = argparse.ArgumentParser(prog='Train from csv', description='') _parser.add_argument("--train-data", dest="train_data_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--train-target", dest="train_target_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--kernel", dest="kernel", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--model", dest="model_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) _parsed_args = vars(_parser.parse_args()) _outputs = train_from_csv(**_parsed_args) args: - --train-data - {inputPath: train_data} - --train-target - {inputPath: train_target} - --kernel - {inputValue: kernel} - --model - {outputPath: model} ``` If we take a closer look at the order in which the components written above are executed, it looks like this: 1. `docker pull python:3.7` 2. `pip install dill==0.3.4 pandas==1.3.4 scikit-learn==1.0.1` 3. run `command` When the generated yaml file is closely examined, the following lines are automatically added, so that the necessary packages are installed and the program runs smoothly without errors. ```bash command: - sh - -c - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'dill==0.3.4' 'pandas==1.3.4' 'scikit-learn==1.0.1' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'dill==0.3.4' 'pandas==1.3.4' 'scikit-learn==1.0.1' --user) && "$0" "$@" ``` ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/kubeflow/advanced-mlflow.md ================================================ --- title : "12. 
Component - MLFlow" description: "" sidebar_position: 12 date: 2021-12-13 lastmod: 2021-12-20 contributors: ["Jongseob Jeon", "SeungTae Kim"] --- ## MLFlow Component In this page, we will explain the process of writing a component to store the model in MLFlow so that the model trained in [Advanced Usage Component](../kubeflow/advanced-component.md) can be linked to API deployment. ## MLFlow in Local In order to store the model in MLFlow and use it in serving, the following items are needed. - model - signature - input_example - conda_env We will look into the process of saving a model to MLFlow through Python code. ### 1. Train model The following steps involve training an SVC model using the iris dataset. ```python import pandas as pd from sklearn.datasets import load_iris from sklearn.svm import SVC iris = load_iris() data = pd.DataFrame(iris["data"], columns=iris["feature_names"]) target = pd.DataFrame(iris["target"], columns=["target"]) clf = SVC(kernel="rbf") clf.fit(data, target) ``` ### 2. MLFLow Infos This process creates the necessary information for MLFlow. ```python from mlflow.models.signature import infer_signature from mlflow.utils.environment import _mlflow_conda_env input_example = data.sample(1) signature = infer_signature(data, clf.predict(data)) conda_env = _mlflow_conda_env(additional_pip_deps=["dill", "pandas", "scikit-learn"]) ``` Each variable's content is as follows. - `input_example` | sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) | | --- | --- | --- | --- | | 6.7 | 3.1 | 4.4 | 1.4 | - `signature` ```python inputs: ['sepal length (cm)': double, 'sepal width (cm)': double, 'petal length (cm)': double, 'petal width (cm)': double] outputs: [Tensor('int64', (-1,))] ``` - `conda_env` ```python {'name': 'mlflow-env', 'channels': ['conda-forge'], 'dependencies': ['python=3.8.10', 'pip', {'pip': ['mlflow', 'dill', 'pandas', 'scikit-learn']}]} ``` ### 3.
Save MLFLow Infos Next, we save the learned information and the model. Since the trained model uses the sklearn package, we can easily save the model using `mlflow.sklearn`. ```python from mlflow.sklearn import save_model save_model( sk_model=clf, path="svc", serialization_format="cloudpickle", conda_env=conda_env, signature=signature, input_example=input_example, ) ``` If you work locally, a svc folder will be created and the following files will be generated. ```bash ls svc ``` If you execute the command above, you can check the following output value. ```bash MLmodel conda.yaml input_example.json model.pkl requirements.txt ``` Each file will be as follows if checked. - MLmodel ```bash flavors: python_function: env: conda.yaml loader_module: mlflow.sklearn model_path: model.pkl python_version: 3.8.10 sklearn: pickled_model: model.pkl serialization_format: cloudpickle sklearn_version: 1.0.1 saved_input_example_info: artifact_path: input_example.json pandas_orient: split type: dataframe signature: inputs: '[{"name": "sepal length (cm)", "type": "double"}, {"name": "sepal width (cm)", "type": "double"}, {"name": "petal length (cm)", "type": "double"}, {"name": "petal width (cm)", "type": "double"}]' outputs: '[{"type": "tensor", "tensor-spec": {"dtype": "int64", "shape": [-1]}}]' utc_time_created: '2021-12-06 06:52:30.612810' ``` - conda.yaml ```bash channels: - conda-forge dependencies: - python=3.8.10 - pip - pip: - mlflow - dill - pandas - scikit-learn name: mlflow-env ``` - input_example.json ```bash { "columns": [ "sepal length (cm)", "sepal width (cm)", "petal length (cm)", "petal width (cm)" ], "data": [ [6.7, 3.1, 4.4, 1.4] ] } ``` - requirements.txt ```bash mlflow dill pandas scikit-learn ``` - model.pkl ## MLFlow on Server Now, let's proceed with the task of uploading the saved model to the MLflow server. 
```python import mlflow with mlflow.start_run(): mlflow.log_artifact("svc/") ``` Run the `mlflow ui` command from the path where the `mlruns` directory was generated to launch the mlflow server and dashboard. Access the mlflow dashboard, click the generated run to view it as below. ![mlflow-0.png](./img/mlflow-0.png) (This screen may vary depending on the version of mlflow.) ## MLFlow Component Now, let's write a reusable component in Kubeflow. The ways of writing components that can be reused are broadly divided into three categories. 1. After saving the necessary environment in the component responsible for model training, the MLflow component is only responsible for the upload. ![mlflow-1.png](./img/mlflow-1.png) 2. Pass the trained model and data to the MLflow component, which is responsible for saving and uploading. ![mlflow-2.png](./img/mlflow-2.png) 3. The component responsible for model training handles both saving and uploading. ![mlflow-3.png](./img/mlflow-3.png) We will manage the model using the first approach. The reason is that, unlike approach 3, we don't have to rewrite the MLflow upload code in every training component. Both methods 1 and 2 allow components to be reused. However, in the case of 2, it is necessary to deliver the trained image and packages to the component, so ultimately additional information about the component must be delivered. In order to proceed with method 1, the training component must also be changed. Code that stores the environment needed to save the model must be added.
```python from functools import partial from kfp.components import InputPath, OutputPath, create_component_from_func @partial( create_component_from_func, packages_to_install=["dill", "pandas", "scikit-learn", "mlflow"], ) def train_from_csv( train_data_path: InputPath("csv"), train_target_path: InputPath("csv"), model_path: OutputPath("dill"), input_example_path: OutputPath("dill"), signature_path: OutputPath("dill"), conda_env_path: OutputPath("dill"), kernel: str, ): import dill import pandas as pd from sklearn.svm import SVC from mlflow.models.signature import infer_signature from mlflow.utils.environment import _mlflow_conda_env train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) input_example = train_data.sample(1) with open(input_example_path, "wb") as file_writer: dill.dump(input_example, file_writer) signature = infer_signature(train_data, clf.predict(train_data)) with open(signature_path, "wb") as file_writer: dill.dump(signature, file_writer) conda_env = _mlflow_conda_env( additional_pip_deps=["dill", "pandas", "scikit-learn"] ) with open(conda_env_path, "wb") as file_writer: dill.dump(conda_env, file_writer) ``` Write a component to upload to MLFlow. At this time, configure the uploaded MLFlow endpoint to be connected to the [mlflow service](../setup-components/install-components-mlflow.md) that we installed. In this case, use the Kubernetes Service DNS Name of the Minio installed at the time of MLFlow Server installation. As this service is created in the Kubeflow namespace with the name minio-service, set it to `http://minio-service.kubeflow.svc:9000`. Similarly, for the tracking_uri address, use the Kubernetes Service DNS Name of the MLFlow server and set it to `http://mlflow-server-service.mlflow-system.svc:5000`. 
```python from functools import partial from kfp.components import InputPath, create_component_from_func @partial( create_component_from_func, packages_to_install=["dill", "pandas", "scikit-learn", "mlflow", "boto3"], ) def upload_sklearn_model_to_mlflow( model_name: str, model_path: InputPath("dill"), input_example_path: InputPath("dill"), signature_path: InputPath("dill"), conda_env_path: InputPath("dill"), ): import os import dill from mlflow.sklearn import save_model from mlflow.tracking.client import MlflowClient os.environ["MLFLOW_S3_ENDPOINT_URL"] = "http://minio-service.kubeflow.svc:9000" os.environ["AWS_ACCESS_KEY_ID"] = "minio" os.environ["AWS_SECRET_ACCESS_KEY"] = "minio123" client = MlflowClient("http://mlflow-server-service.mlflow-system.svc:5000") with open(model_path, mode="rb") as file_reader: clf = dill.load(file_reader) with open(input_example_path, "rb") as file_reader: input_example = dill.load(file_reader) with open(signature_path, "rb") as file_reader: signature = dill.load(file_reader) with open(conda_env_path, "rb") as file_reader: conda_env = dill.load(file_reader) save_model( sk_model=clf, path=model_name, serialization_format="cloudpickle", conda_env=conda_env, signature=signature, input_example=input_example, ) run = client.create_run(experiment_id="0") client.log_artifact(run.info.run_id, model_name) ``` ## MLFlow Pipeline Now let's connect the components we have written and create a pipeline. ### Data Component The data we will use to train the model is sklearn's iris. We will write a component to generate the data. 
```python from functools import partial from kfp.components import InputPath, OutputPath, create_component_from_func @partial( create_component_from_func, packages_to_install=["pandas", "scikit-learn"], ) def load_iris_data( data_path: OutputPath("csv"), target_path: OutputPath("csv"), ): import pandas as pd from sklearn.datasets import load_iris iris = load_iris() data = pd.DataFrame(iris["data"], columns=iris["feature_names"]) target = pd.DataFrame(iris["target"], columns=["target"]) data.to_csv(data_path, index=False) target.to_csv(target_path, index=False) ``` ### Pipeline The pipeline code can be written as follows. ```python from kfp.dsl import pipeline @pipeline(name="mlflow_pipeline") def mlflow_pipeline(kernel: str, model_name: str): iris_data = load_iris_data() model = train_from_csv( train_data=iris_data.outputs["data"], train_target=iris_data.outputs["target"], kernel=kernel, ) _ = upload_sklearn_model_to_mlflow( model_name=model_name, model=model.outputs["model"], input_example=model.outputs["input_example"], signature=model.outputs["signature"], conda_env=model.outputs["conda_env"], ) ``` ### Run If you organize the components and pipelines written above into a single Python file, it would look like this. 
```python from functools import partial import kfp from kfp.components import InputPath, OutputPath, create_component_from_func from kfp.dsl import pipeline @partial( create_component_from_func, packages_to_install=["pandas", "scikit-learn"], ) def load_iris_data( data_path: OutputPath("csv"), target_path: OutputPath("csv"), ): import pandas as pd from sklearn.datasets import load_iris iris = load_iris() data = pd.DataFrame(iris["data"], columns=iris["feature_names"]) target = pd.DataFrame(iris["target"], columns=["target"]) data.to_csv(data_path, index=False) target.to_csv(target_path, index=False) @partial( create_component_from_func, packages_to_install=["dill", "pandas", "scikit-learn", "mlflow"], ) def train_from_csv( train_data_path: InputPath("csv"), train_target_path: InputPath("csv"), model_path: OutputPath("dill"), input_example_path: OutputPath("dill"), signature_path: OutputPath("dill"), conda_env_path: OutputPath("dill"), kernel: str, ): import dill import pandas as pd from sklearn.svm import SVC from mlflow.models.signature import infer_signature from mlflow.utils.environment import _mlflow_conda_env train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) input_example = train_data.sample(1) with open(input_example_path, "wb") as file_writer: dill.dump(input_example, file_writer) signature = infer_signature(train_data, clf.predict(train_data)) with open(signature_path, "wb") as file_writer: dill.dump(signature, file_writer) conda_env = _mlflow_conda_env( additional_pip_deps=["dill", "pandas", "scikit-learn"] ) with open(conda_env_path, "wb") as file_writer: dill.dump(conda_env, file_writer) @partial( create_component_from_func, packages_to_install=["dill", "pandas", "scikit-learn", "mlflow", "boto3"], ) def upload_sklearn_model_to_mlflow( model_name: str, model_path: InputPath("dill"), 
input_example_path: InputPath("dill"), signature_path: InputPath("dill"), conda_env_path: InputPath("dill"), ): import os import dill from mlflow.sklearn import save_model from mlflow.tracking.client import MlflowClient os.environ["MLFLOW_S3_ENDPOINT_URL"] = "http://minio-service.kubeflow.svc:9000" os.environ["AWS_ACCESS_KEY_ID"] = "minio" os.environ["AWS_SECRET_ACCESS_KEY"] = "minio123" client = MlflowClient("http://mlflow-server-service.mlflow-system.svc:5000") with open(model_path, mode="rb") as file_reader: clf = dill.load(file_reader) with open(input_example_path, "rb") as file_reader: input_example = dill.load(file_reader) with open(signature_path, "rb") as file_reader: signature = dill.load(file_reader) with open(conda_env_path, "rb") as file_reader: conda_env = dill.load(file_reader) save_model( sk_model=clf, path=model_name, serialization_format="cloudpickle", conda_env=conda_env, signature=signature, input_example=input_example, ) run = client.create_run(experiment_id="0") client.log_artifact(run.info.run_id, model_name) @pipeline(name="mlflow_pipeline") def mlflow_pipeline(kernel: str, model_name: str): iris_data = load_iris_data() model = train_from_csv( train_data=iris_data.outputs["data"], train_target=iris_data.outputs["target"], kernel=kernel, ) _ = upload_sklearn_model_to_mlflow( model_name=model_name, model=model.outputs["model"], input_example=model.outputs["input_example"], signature=model.outputs["signature"], conda_env=model.outputs["conda_env"], ) if __name__ == "__main__": kfp.compiler.Compiler().compile(mlflow_pipeline, "mlflow_pipeline.yaml") ```

mlflow_pipeline.yaml ```bash apiVersion: argoproj.io/v1alpha1 kind: Workflow metadata: generateName: mlflow-pipeline- annotations: {pipelines.kubeflow.org/kfp_sdk_version: 1.8.10, pipelines.kubeflow.org/pipeline_compilation_time: '2022-01-19T14:14:11.999807', pipelines.kubeflow.org/pipeline_spec: '{"inputs": [{"name": "kernel", "type": "String"}, {"name": "model_name", "type": "String"}], "name": "mlflow_pipeline"}'} labels: {pipelines.kubeflow.org/kfp_sdk_version: 1.8.10} spec: entrypoint: mlflow-pipeline templates: - name: load-iris-data container: args: [--data, /tmp/outputs/data/data, --target, /tmp/outputs/target/data] command: - sh - -c - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'pandas' 'scikit-learn' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'pandas' 'scikit-learn' --user) && "$0" "$@" - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def _make_parent_dirs_and_return_path(file_path: str): import os os.makedirs(os.path.dirname(file_path), exist_ok=True) return file_path def load_iris_data( data_path, target_path, ): import pandas as pd from sklearn.datasets import load_iris iris = load_iris() data = pd.DataFrame(iris["data"], columns=iris["feature_names"]) target = pd.DataFrame(iris["target"], columns=["target"]) data.to_csv(data_path, index=False) target.to_csv(target_path, index=False) import argparse _parser = argparse.ArgumentParser(prog='Load iris data', description='') _parser.add_argument("--data", dest="data_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) _parser.add_argument("--target", dest="target_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) _parsed_args = vars(_parser.parse_args()) _outputs = load_iris_data(**_parsed_args) image: python:3.7 outputs: artifacts: - {name: load-iris-data-data, path: 
/tmp/outputs/data/data} - {name: load-iris-data-target, path: /tmp/outputs/target/data} metadata: labels: pipelines.kubeflow.org/kfp_sdk_version: 1.8.10 pipelines.kubeflow.org/pipeline-sdk-type: kfp pipelines.kubeflow.org/enable_caching: "true" annotations: {pipelines.kubeflow.org/component_spec: '{"implementation": {"container": {"args": ["--data", {"outputPath": "data"}, "--target", {"outputPath": "target"}], "command": ["sh", "-c", "(PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location ''pandas'' ''scikit-learn'' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location ''pandas'' ''scikit-learn'' --user) && \"$0\" \"$@\"", "sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def _make_parent_dirs_and_return_path(file_path: str):\n import os\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\n return file_path\n\ndef load_iris_data(\n data_path,\n target_path,\n):\n import pandas as pd\n from sklearn.datasets import load_iris\n\n iris = load_iris()\n\n data = pd.DataFrame(iris[\"data\"], columns=iris[\"feature_names\"])\n target = pd.DataFrame(iris[\"target\"], columns=[\"target\"])\n\n data.to_csv(data_path, index=False)\n target.to_csv(target_path, index=False)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Load iris data'', description='''')\n_parser.add_argument(\"--data\", dest=\"data_path\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--target\", dest=\"target_path\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs = load_iris_data(**_parsed_args)\n"], "image": "python:3.7"}}, "name": "Load iris data", "outputs": [{"name": "data", "type": "csv"}, {"name": "target", "type": "csv"}]}', pipelines.kubeflow.org/component_ref: '{}'} - name: mlflow-pipeline 
inputs: parameters: - {name: kernel} - {name: model_name} dag: tasks: - {name: load-iris-data, template: load-iris-data} - name: train-from-csv template: train-from-csv dependencies: [load-iris-data] arguments: parameters: - {name: kernel, value: '{{inputs.parameters.kernel}}'} artifacts: - {name: load-iris-data-data, from: '{{tasks.load-iris-data.outputs.artifacts.load-iris-data-data}}'} - {name: load-iris-data-target, from: '{{tasks.load-iris-data.outputs.artifacts.load-iris-data-target}}'} - name: upload-sklearn-model-to-mlflow template: upload-sklearn-model-to-mlflow dependencies: [train-from-csv] arguments: parameters: - {name: model_name, value: '{{inputs.parameters.model_name}}'} artifacts: - {name: train-from-csv-conda_env, from: '{{tasks.train-from-csv.outputs.artifacts.train-from-csv-conda_env}}'} - {name: train-from-csv-input_example, from: '{{tasks.train-from-csv.outputs.artifacts.train-from-csv-input_example}}'} - {name: train-from-csv-model, from: '{{tasks.train-from-csv.outputs.artifacts.train-from-csv-model}}'} - {name: train-from-csv-signature, from: '{{tasks.train-from-csv.outputs.artifacts.train-from-csv-signature}}'} - name: train-from-csv container: args: [--train-data, /tmp/inputs/train_data/data, --train-target, /tmp/inputs/train_target/data, --kernel, '{{inputs.parameters.kernel}}', --model, /tmp/outputs/model/data, --input-example, /tmp/outputs/input_example/data, --signature, /tmp/outputs/signature/data, --conda-env, /tmp/outputs/conda_env/data] command: - sh - -c - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'dill' 'pandas' 'scikit-learn' 'mlflow' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'dill' 'pandas' 'scikit-learn' 'mlflow' --user) && "$0" "$@" - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def _make_parent_dirs_and_return_path(file_path: str): import os 
os.makedirs(os.path.dirname(file_path), exist_ok=True) return file_path def train_from_csv( train_data_path, train_target_path, model_path, input_example_path, signature_path, conda_env_path, kernel, ): import dill import pandas as pd from sklearn.svm import SVC from mlflow.models.signature import infer_signature from mlflow.utils.environment import _mlflow_conda_env train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) input_example = train_data.sample(1) with open(input_example_path, "wb") as file_writer: dill.dump(input_example, file_writer) signature = infer_signature(train_data, clf.predict(train_data)) with open(signature_path, "wb") as file_writer: dill.dump(signature, file_writer) conda_env = _mlflow_conda_env( additional_pip_deps=["dill", "pandas", "scikit-learn"] ) with open(conda_env_path, "wb") as file_writer: dill.dump(conda_env, file_writer) import argparse _parser = argparse.ArgumentParser(prog='Train from csv', description='') _parser.add_argument("--train-data", dest="train_data_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--train-target", dest="train_target_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--kernel", dest="kernel", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--model", dest="model_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) _parser.add_argument("--input-example", dest="input_example_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) _parser.add_argument("--signature", dest="signature_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) _parser.add_argument("--conda-env", dest="conda_env_path", type=_make_parent_dirs_and_return_path, required=True, 
default=argparse.SUPPRESS) _parsed_args = vars(_parser.parse_args()) _outputs = train_from_csv(**_parsed_args) image: python:3.7 inputs: parameters: - {name: kernel} artifacts: - {name: load-iris-data-data, path: /tmp/inputs/train_data/data} - {name: load-iris-data-target, path: /tmp/inputs/train_target/data} outputs: artifacts: - {name: train-from-csv-conda_env, path: /tmp/outputs/conda_env/data} - {name: train-from-csv-input_example, path: /tmp/outputs/input_example/data} - {name: train-from-csv-model, path: /tmp/outputs/model/data} - {name: train-from-csv-signature, path: /tmp/outputs/signature/data} metadata: labels: pipelines.kubeflow.org/kfp_sdk_version: 1.8.10 pipelines.kubeflow.org/pipeline-sdk-type: kfp pipelines.kubeflow.org/enable_caching: "true" annotations: {pipelines.kubeflow.org/component_spec: '{"implementation": {"container": {"args": ["--train-data", {"inputPath": "train_data"}, "--train-target", {"inputPath": "train_target"}, "--kernel", {"inputValue": "kernel"}, "--model", {"outputPath": "model"}, "--input-example", {"outputPath": "input_example"}, "--signature", {"outputPath": "signature"}, "--conda-env", {"outputPath": "conda_env"}], "command": ["sh", "-c", "(PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location ''dill'' ''pandas'' ''scikit-learn'' ''mlflow'' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location ''dill'' ''pandas'' ''scikit-learn'' ''mlflow'' --user) && \"$0\" \"$@\"", "sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def _make_parent_dirs_and_return_path(file_path: str):\n import os\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\n return file_path\n\ndef train_from_csv(\n train_data_path,\n train_target_path,\n model_path,\n input_example_path,\n signature_path,\n conda_env_path,\n kernel,\n):\n import dill\n import pandas as pd\n from sklearn.svm import SVC\n\n from 
mlflow.models.signature import infer_signature\n from mlflow.utils.environment import _mlflow_conda_env\n\n train_data = pd.read_csv(train_data_path)\n train_target = pd.read_csv(train_target_path)\n\n clf = SVC(kernel=kernel)\n clf.fit(train_data, train_target)\n\n with open(model_path, mode=\"wb\") as file_writer:\n dill.dump(clf, file_writer)\n\n input_example = train_data.sample(1)\n with open(input_example_path, \"wb\") as file_writer:\n dill.dump(input_example, file_writer)\n\n signature = infer_signature(train_data, clf.predict(train_data))\n with open(signature_path, \"wb\") as file_writer:\n dill.dump(signature, file_writer)\n\n conda_env = _mlflow_conda_env(\n additional_pip_deps=[\"dill\", \"pandas\", \"scikit-learn\"]\n )\n with open(conda_env_path, \"wb\") as file_writer:\n dill.dump(conda_env, file_writer)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Train from csv'', description='''')\n_parser.add_argument(\"--train-data\", dest=\"train_data_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--train-target\", dest=\"train_target_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--kernel\", dest=\"kernel\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--model\", dest=\"model_path\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--input-example\", dest=\"input_example_path\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--signature\", dest=\"signature_path\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--conda-env\", dest=\"conda_env_path\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs = train_from_csv(**_parsed_args)\n"], "image": "python:3.7"}}, "inputs": [{"name": 
"train_data", "type": "csv"}, {"name": "train_target", "type": "csv"}, {"name": "kernel", "type": "String"}], "name": "Train from csv", "outputs": [{"name": "model", "type": "dill"}, {"name": "input_example", "type": "dill"}, {"name": "signature", "type": "dill"}, {"name": "conda_env", "type": "dill"}]}', pipelines.kubeflow.org/component_ref: '{}', pipelines.kubeflow.org/arguments.parameters: '{"kernel": "{{inputs.parameters.kernel}}"}'} - name: upload-sklearn-model-to-mlflow container: args: [--model-name, '{{inputs.parameters.model_name}}', --model, /tmp/inputs/model/data, --input-example, /tmp/inputs/input_example/data, --signature, /tmp/inputs/signature/data, --conda-env, /tmp/inputs/conda_env/data] command: - sh - -c - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'dill' 'pandas' 'scikit-learn' 'mlflow' 'boto3' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'dill' 'pandas' 'scikit-learn' 'mlflow' 'boto3' --user) && "$0" "$@" - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def upload_sklearn_model_to_mlflow( model_name, model_path, input_example_path, signature_path, conda_env_path, ): import os import dill from mlflow.sklearn import save_model from mlflow.tracking.client import MlflowClient os.environ["MLFLOW_S3_ENDPOINT_URL"] = "http://minio-service.kubeflow.svc:9000" os.environ["AWS_ACCESS_KEY_ID"] = "minio" os.environ["AWS_SECRET_ACCESS_KEY"] = "minio123" client = MlflowClient("http://mlflow-server-service.mlflow-system.svc:5000") with open(model_path, mode="rb") as file_reader: clf = dill.load(file_reader) with open(input_example_path, "rb") as file_reader: input_example = dill.load(file_reader) with open(signature_path, "rb") as file_reader: signature = dill.load(file_reader) with open(conda_env_path, "rb") as file_reader: conda_env = dill.load(file_reader) save_model( sk_model=clf, path=model_name, 
serialization_format="cloudpickle", conda_env=conda_env, signature=signature, input_example=input_example, ) run = client.create_run(experiment_id="0") client.log_artifact(run.info.run_id, model_name) import argparse _parser = argparse.ArgumentParser(prog='Upload sklearn model to mlflow', description='') _parser.add_argument("--model-name", dest="model_name", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--model", dest="model_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--input-example", dest="input_example_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--signature", dest="signature_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--conda-env", dest="conda_env_path", type=str, required=True, default=argparse.SUPPRESS) _parsed_args = vars(_parser.parse_args()) _outputs = upload_sklearn_model_to_mlflow(**_parsed_args) image: python:3.7 inputs: parameters: - {name: model_name} artifacts: - {name: train-from-csv-conda_env, path: /tmp/inputs/conda_env/data} - {name: train-from-csv-input_example, path: /tmp/inputs/input_example/data} - {name: train-from-csv-model, path: /tmp/inputs/model/data} - {name: train-from-csv-signature, path: /tmp/inputs/signature/data} metadata: labels: pipelines.kubeflow.org/kfp_sdk_version: 1.8.10 pipelines.kubeflow.org/pipeline-sdk-type: kfp pipelines.kubeflow.org/enable_caching: "true" annotations: {pipelines.kubeflow.org/component_spec: '{"implementation": {"container": {"args": ["--model-name", {"inputValue": "model_name"}, "--model", {"inputPath": "model"}, "--input-example", {"inputPath": "input_example"}, "--signature", {"inputPath": "signature"}, "--conda-env", {"inputPath": "conda_env"}], "command": ["sh", "-c", "(PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location ''dill'' ''pandas'' ''scikit-learn'' ''mlflow'' ''boto3'' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip 
install --quiet --no-warn-script-location ''dill'' ''pandas'' ''scikit-learn'' ''mlflow'' ''boto3'' --user) && \"$0\" \"$@\"", "sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def upload_sklearn_model_to_mlflow(\n model_name,\n model_path,\n input_example_path,\n signature_path,\n conda_env_path,\n):\n import os\n import dill\n from mlflow.sklearn import save_model\n\n from mlflow.tracking.client import MlflowClient\n\n os.environ[\"MLFLOW_S3_ENDPOINT_URL\"] = \"http://minio-service.kubeflow.svc:9000\"\n os.environ[\"AWS_ACCESS_KEY_ID\"] = \"minio\"\n os.environ[\"AWS_SECRET_ACCESS_KEY\"] = \"minio123\"\n\n client = MlflowClient(\"http://mlflow-server-service.mlflow-system.svc:5000\")\n\n with open(model_path, mode=\"rb\") as file_reader:\n clf = dill.load(file_reader)\n\n with open(input_example_path, \"rb\") as file_reader:\n input_example = dill.load(file_reader)\n\n with open(signature_path, \"rb\") as file_reader:\n signature = dill.load(file_reader)\n\n with open(conda_env_path, \"rb\") as file_reader:\n conda_env = dill.load(file_reader)\n\n save_model(\n sk_model=clf,\n path=model_name,\n serialization_format=\"cloudpickle\",\n conda_env=conda_env,\n signature=signature,\n input_example=input_example,\n )\n run = client.create_run(experiment_id=\"0\")\n client.log_artifact(run.info.run_id, model_name)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Upload sklearn model to mlflow'', description='''')\n_parser.add_argument(\"--model-name\", dest=\"model_name\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--model\", dest=\"model_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--input-example\", dest=\"input_example_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--signature\", dest=\"signature_path\", type=str, required=True, 
default=argparse.SUPPRESS)\n_parser.add_argument(\"--conda-env\", dest=\"conda_env_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs = upload_sklearn_model_to_mlflow(**_parsed_args)\n"], "image": "python:3.7"}}, "inputs": [{"name": "model_name", "type": "String"}, {"name": "model", "type": "dill"}, {"name": "input_example", "type": "dill"}, {"name": "signature", "type": "dill"}, {"name": "conda_env", "type": "dill"}], "name": "Upload sklearn model to mlflow"}', pipelines.kubeflow.org/component_ref: '{}', pipelines.kubeflow.org/arguments.parameters: '{"model_name": "{{inputs.parameters.model_name}}"}'} arguments: parameters: - {name: kernel} - {name: model_name} serviceAccountName: pipeline-runner ```

After executing the script to generate the mlflow_pipeline.yaml file, upload the pipeline and execute it to check the results of the run. ![mlflow-svc-0](./img/mlflow-svc-0.png) Port-forward the mlflow service to access the MLflow UI. ```bash kubectl port-forward svc/mlflow-server-service -n mlflow-system 5000:5000 ``` Open a web browser and connect to localhost:5000. You will then be able to see that the run has been created as follows. ![mlflow-svc-1](./img/mlflow-svc-1.png) Click on the run to verify that the trained model file is present. ![mlflow-svc-2](./img/mlflow-svc-2.png) ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/kubeflow/advanced-pipeline.md ================================================ --- title : "10. Pipeline - Setting" description: "" sidebar_position: 10 contributors: ["Jongseob Jeon"] --- ## Pipeline Setting On this page, we will look at the values that can be set in the pipeline. ## Display Name Components created within a pipeline have two names: - task_name: the function name when writing the component - display_name: the name that appears in the kubeflow UI For example, in the case where both components are set to Print and return number, it is difficult to tell which component is 1 or 2. ![run-7](./img/run-7.png) ### set_display_name The solution for this is the display_name. We can set the display_name in the pipeline by using the set_display_name [method](https://kubeflow-pipelines.readthedocs.io/en/latest/source/kfp.dsl.html#kfp.dsl.ContainerOp.set_display_name) of the component.
```python import kfp from kfp.components import create_component_from_func from kfp.dsl import pipeline @create_component_from_func def print_and_return_number(number: int) -> int: print(number) return number @create_component_from_func def sum_and_print_numbers(number_1: int, number_2: int): print(number_1 + number_2) @pipeline(name="example_pipeline") def example_pipeline(number_1: int, number_2: int): number_1_result = print_and_return_number(number_1).set_display_name("This is number 1") number_2_result = print_and_return_number(number_2).set_display_name("This is number 2") sum_result = sum_and_print_numbers( number_1=number_1_result.output, number_2=number_2_result.output ).set_display_name("This is sum of number 1 and number 2") if __name__ == "__main__": kfp.compiler.Compiler().compile(example_pipeline, "example_pipeline.yaml") ``` If you run this script and check the resulting `example_pipeline.yaml`, it would be like this.

example_pipeline.yaml ```bash apiVersion: argoproj.io/v1alpha1 kind: Workflow metadata: generateName: example-pipeline- annotations: {pipelines.kubeflow.org/kfp_sdk_version: 1.8.9, pipelines.kubeflow.org/pipeline_compilation_time: '2021-12-09T18:11:43.193190', pipelines.kubeflow.org/pipeline_spec: '{"inputs": [{"name": "number_1", "type": "Integer"}, {"name": "number_2", "type": "Integer"}], "name": "example_pipeline"}'} labels: {pipelines.kubeflow.org/kfp_sdk_version: 1.8.9} spec: entrypoint: example-pipeline templates: - name: example-pipeline inputs: parameters: - {name: number_1} - {name: number_2} dag: tasks: - name: print-and-return-number template: print-and-return-number arguments: parameters: - {name: number_1, value: '{{inputs.parameters.number_1}}'} - name: print-and-return-number-2 template: print-and-return-number-2 arguments: parameters: - {name: number_2, value: '{{inputs.parameters.number_2}}'} - name: sum-and-print-numbers template: sum-and-print-numbers dependencies: [print-and-return-number, print-and-return-number-2] arguments: parameters: - {name: print-and-return-number-2-Output, value: '{{tasks.print-and-return-number-2.outputs.parameters.print-and-return-number-2-Output}}'} - {name: print-and-return-number-Output, value: '{{tasks.print-and-return-number.outputs.parameters.print-and-return-number-Output}}'} - name: print-and-return-number container: args: [--number, '{{inputs.parameters.number_1}}', '----output-paths', /tmp/outputs/Output/data] command: - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def print_and_return_number(number): print(number) return number def _serialize_int(int_value: int) -> str: if isinstance(int_value, str): return int_value if not isinstance(int_value, int): raise TypeError('Value "{}" has type "{}" instead of int.'.format( str(int_value), str(type(int_value)))) return str(int_value) import argparse _parser = argparse.ArgumentParser(prog='Print and 
return number', description='') _parser.add_argument("--number", dest="number", type=int, required=True, default=argparse.SUPPRESS) _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1) _parsed_args = vars(_parser.parse_args()) _output_files = _parsed_args.pop("_output_paths", []) _outputs = print_and_return_number(**_parsed_args) _outputs = [_outputs] _output_serializers = [ _serialize_int, ] import os for idx, output_file in enumerate(_output_files): try: os.makedirs(os.path.dirname(output_file)) except OSError: pass with open(output_file, 'w') as f: f.write(_output_serializers[idx](_outputs[idx])) image: python:3.7 inputs: parameters: - {name: number_1} outputs: parameters: - name: print-and-return-number-Output valueFrom: {path: /tmp/outputs/Output/data} artifacts: - {name: print-and-return-number-Output, path: /tmp/outputs/Output/data} metadata: annotations: {pipelines.kubeflow.org/task_display_name: This is number 1, pipelines.kubeflow.org/component_spec: '{"implementation": {"container": {"args": ["--number", {"inputValue": "number"}, "----output-paths", {"outputPath": "Output"}], "command": ["sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def print_and_return_number(number):\n print(number)\n return number\n\ndef _serialize_int(int_value: int) -> str:\n if isinstance(int_value, str):\n return int_value\n if not isinstance(int_value, int):\n raise TypeError(''Value \"{}\" has type \"{}\" instead of int.''.format(\n str(int_value), str(type(int_value))))\n return str(int_value)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Print and return number'', description='''')\n_parser.add_argument(\"--number\", dest=\"number\", type=int, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"----output-paths\", dest=\"_output_paths\", type=str, nargs=1)\n_parsed_args = vars(_parser.parse_args())\n_output_files = 
_parsed_args.pop(\"_output_paths\", [])\n\n_outputs = print_and_return_number(**_parsed_args)\n\n_outputs = [_outputs]\n\n_output_serializers = [\n _serialize_int,\n\n]\n\nimport os\nfor idx, output_file in enumerate(_output_files):\n try:\n os.makedirs(os.path.dirname(output_file))\n except OSError:\n pass\n with open(output_file, ''w'') as f:\n f.write(_output_serializers[idx](_outputs[idx]))\n"], "image": "python:3.7"}}, "inputs": [{"name": "number", "type": "Integer"}], "name": "Print and return number", "outputs": [{"name": "Output", "type": "Integer"}]}', pipelines.kubeflow.org/component_ref: '{}', pipelines.kubeflow.org/arguments.parameters: '{"number": "{{inputs.parameters.number_1}}"}'} labels: pipelines.kubeflow.org/kfp_sdk_version: 1.8.9 pipelines.kubeflow.org/pipeline-sdk-type: kfp pipelines.kubeflow.org/enable_caching: "true" - name: print-and-return-number-2 container: args: [--number, '{{inputs.parameters.number_2}}', '----output-paths', /tmp/outputs/Output/data] command: - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def print_and_return_number(number): print(number) return number def _serialize_int(int_value: int) -> str: if isinstance(int_value, str): return int_value if not isinstance(int_value, int): raise TypeError('Value "{}" has type "{}" instead of int.'.format( str(int_value), str(type(int_value)))) return str(int_value) import argparse _parser = argparse.ArgumentParser(prog='Print and return number', description='') _parser.add_argument("--number", dest="number", type=int, required=True, default=argparse.SUPPRESS) _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1) _parsed_args = vars(_parser.parse_args()) _output_files = _parsed_args.pop("_output_paths", []) _outputs = print_and_return_number(**_parsed_args) _outputs = [_outputs] _output_serializers = [ _serialize_int, ] import os for idx, output_file in enumerate(_output_files): try: 
os.makedirs(os.path.dirname(output_file)) except OSError: pass with open(output_file, 'w') as f: f.write(_output_serializers[idx](_outputs[idx])) image: python:3.7 inputs: parameters: - {name: number_2} outputs: parameters: - name: print-and-return-number-2-Output valueFrom: {path: /tmp/outputs/Output/data} artifacts: - {name: print-and-return-number-2-Output, path: /tmp/outputs/Output/data} metadata: annotations: {pipelines.kubeflow.org/task_display_name: This is number 2, pipelines.kubeflow.org/component_spec: '{"implementation": {"container": {"args": ["--number", {"inputValue": "number"}, "----output-paths", {"outputPath": "Output"}], "command": ["sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def print_and_return_number(number):\n print(number)\n return number\n\ndef _serialize_int(int_value: int) -> str:\n if isinstance(int_value, str):\n return int_value\n if not isinstance(int_value, int):\n raise TypeError(''Value \"{}\" has type \"{}\" instead of int.''.format(\n str(int_value), str(type(int_value))))\n return str(int_value)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Print and return number'', description='''')\n_parser.add_argument(\"--number\", dest=\"number\", type=int, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"----output-paths\", dest=\"_output_paths\", type=str, nargs=1)\n_parsed_args = vars(_parser.parse_args())\n_output_files = _parsed_args.pop(\"_output_paths\", [])\n\n_outputs = print_and_return_number(**_parsed_args)\n\n_outputs = [_outputs]\n\n_output_serializers = [\n _serialize_int,\n\n]\n\nimport os\nfor idx, output_file in enumerate(_output_files):\n try:\n os.makedirs(os.path.dirname(output_file))\n except OSError:\n pass\n with open(output_file, ''w'') as f:\n f.write(_output_serializers[idx](_outputs[idx]))\n"], "image": "python:3.7"}}, "inputs": [{"name": "number", "type": "Integer"}], "name": "Print and return number", 
"outputs": [{"name": "Output", "type": "Integer"}]}', pipelines.kubeflow.org/component_ref: '{}', pipelines.kubeflow.org/arguments.parameters: '{"number": "{{inputs.parameters.number_2}}"}'} labels: pipelines.kubeflow.org/kfp_sdk_version: 1.8.9 pipelines.kubeflow.org/pipeline-sdk-type: kfp pipelines.kubeflow.org/enable_caching: "true" - name: sum-and-print-numbers container: args: [--number-1, '{{inputs.parameters.print-and-return-number-Output}}', --number-2, '{{inputs.parameters.print-and-return-number-2-Output}}'] command: - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def sum_and_print_numbers(number_1, number_2): print(number_1 + number_2) import argparse _parser = argparse.ArgumentParser(prog='Sum and print numbers', description='') _parser.add_argument("--number-1", dest="number_1", type=int, required=True, default=argparse.SUPPRESS) _parser.add_argument("--number-2", dest="number_2", type=int, required=True, default=argparse.SUPPRESS) _parsed_args = vars(_parser.parse_args()) _outputs = sum_and_print_numbers(**_parsed_args) image: python:3.7 inputs: parameters: - {name: print-and-return-number-2-Output} - {name: print-and-return-number-Output} metadata: annotations: {pipelines.kubeflow.org/task_display_name: This is sum of number 1 and number 2, pipelines.kubeflow.org/component_spec: '{"implementation": {"container": {"args": ["--number-1", {"inputValue": "number_1"}, "--number-2", {"inputValue": "number_2"}], "command": ["sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def sum_and_print_numbers(number_1, number_2):\n print(number_1 + number_2)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Sum and print numbers'', description='''')\n_parser.add_argument(\"--number-1\", dest=\"number_1\", type=int, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--number-2\", dest=\"number_2\", type=int, 
required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs = sum_and_print_numbers(**_parsed_args)\n"], "image": "python:3.7"}}, "inputs": [{"name": "number_1", "type": "Integer"}, {"name": "number_2", "type": "Integer"}], "name": "Sum and print numbers"}', pipelines.kubeflow.org/component_ref: '{}', pipelines.kubeflow.org/arguments.parameters: '{"number_1": "{{inputs.parameters.print-and-return-number-Output}}", "number_2": "{{inputs.parameters.print-and-return-number-2-Output}}"}'} labels: pipelines.kubeflow.org/kfp_sdk_version: 1.8.9 pipelines.kubeflow.org/pipeline-sdk-type: kfp pipelines.kubeflow.org/enable_caching: "true" arguments: parameters: - {name: number_1} - {name: number_2} serviceAccountName: pipeline-runner ```

If compared with the previous file, the **`pipelines.kubeflow.org/task_display_name`** key has been newly created. ### UI in Kubeflow We will upload the version of the previously created [pipeline](../kubeflow/basic-pipeline-upload.md#upload-pipeline-version) using the files we created earlier. ![adv-pipeline-0.png](./img/adv-pipeline-0.png) As you can see, the configured name is displayed as shown above. ## Resources ### GPU By default, when the pipeline runs components as Kubernetes pods, it uses the default resource specifications. If you need to train a model using a GPU and the Kubernetes environment doesn't allocate a GPU, the training may not be performed correctly. To address this, you can use the `set_gpu_limit()` [attribute](https://kubeflow-pipelines.readthedocs.io/en/latest/source/kfp.dsl.html?highlight=set_gpu_limit#kfp.dsl.UserContainer.set_gpu_limit) to set the GPU limit. ```python import kfp from kfp.components import create_component_from_func from kfp.dsl import pipeline @create_component_from_func def print_and_return_number(number: int) -> int: print(number) return number @create_component_from_func def sum_and_print_numbers(number_1: int, number_2: int): print(number_1 + number_2) @pipeline(name="example_pipeline") def example_pipeline(number_1: int, number_2: int): number_1_result = print_and_return_number(number_1).set_display_name("This is number 1") number_2_result = print_and_return_number(number_2).set_display_name("This is number 2") sum_result = sum_and_print_numbers( number_1=number_1_result.output, number_2=number_2_result.output ).set_display_name("This is sum of number 1 and number 2").set_gpu_limit(1) if __name__ == "__main__": kfp.compiler.Compiler().compile(example_pipeline, "example_pipeline.yaml") ``` If you execute the above script, you can see that the resources has been added with `{nvidia.com/gpu: 1}` in the generated file when you look closely at `sum-and-print-numbers`. Through this, you can allocate a GPU. 
```bash - name: sum-and-print-numbers container: args: [--number-1, '{{inputs.parameters.print-and-return-number-Output}}', --number-2, '{{inputs.parameters.print-and-return-number-2-Output}}'] command: - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def sum_and_print_numbers(number_1, number_2): print(number_1 + number_2) import argparse _parser = argparse.ArgumentParser(prog='Sum and print numbers', description='') _parser.add_argument("--number-1", dest="number_1", type=int, required=True, default=argparse.SUPPRESS) _parser.add_argument("--number-2", dest="number_2", type=int, required=True, default=argparse.SUPPRESS) _parsed_args = vars(_parser.parse_args()) _outputs = sum_and_print_numbers(**_parsed_args) image: python:3.7 resources: limits: {nvidia.com/gpu: 1} ``` ### CPU The function to set the number of CPUs can be set using the `.set_cpu_limit()` attribute [attribute](https://kubeflow-pipelines.readthedocs.io/en/latest/source/kfp.dsl.html?highlight=set_gpu_limit#kfp.dsl.Sidecar.set_cpu_limit). The difference from GPUs is that the input must be a string, not an int. 
```python import kfp from kfp.components import create_component_from_func from kfp.dsl import pipeline @create_component_from_func def print_and_return_number(number: int) -> int: print(number) return number @create_component_from_func def sum_and_print_numbers(number_1: int, number_2: int): print(number_1 + number_2) @pipeline(name="example_pipeline") def example_pipeline(number_1: int, number_2: int): number_1_result = print_and_return_number(number_1).set_display_name("This is number 1") number_2_result = print_and_return_number(number_2).set_display_name("This is number 2") sum_result = sum_and_print_numbers( number_1=number_1_result.output, number_2=number_2_result.output ).set_display_name("This is sum of number 1 and number 2").set_gpu_limit(1).set_cpu_limit("16") if __name__ == "__main__": kfp.compiler.Compiler().compile(example_pipeline, "example_pipeline.yaml") ``` The changed part only can be confirmed as follows. ```bash resources: limits: {nvidia.com/gpu: 1, cpu: '16'} ``` ### Memory Memory can be set using the `.set_memory_limit()` [attribute](https://kubeflow-pipelines.readthedocs.io/en/latest/source/kfp.dsl.html?highlight=set_gpu_limit#kfp.dsl.Sidecar.set_memory_limit). 
```python import kfp from kfp.components import create_component_from_func from kfp.dsl import pipeline @create_component_from_func def print_and_return_number(number: int) -> int: print(number) return number @create_component_from_func def sum_and_print_numbers(number_1: int, number_2: int): print(number_1 + number_2) @pipeline(name="example_pipeline") def example_pipeline(number_1: int, number_2: int): number_1_result = print_and_return_number(number_1).set_display_name("This is number 1") number_2_result = print_and_return_number(number_2).set_display_name("This is number 2") sum_result = sum_and_print_numbers( number_1=number_1_result.output, number_2=number_2_result.output ).set_display_name("This is sum of number 1 and number 2").set_gpu_limit(1).set_memory_limit("1G") if __name__ == "__main__": kfp.compiler.Compiler().compile(example_pipeline, "example_pipeline.yaml") ``` The changed parts are as follows if checked. ```bash resources: limits: {nvidia.com/gpu: 1, memory: 1G} ``` ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/kubeflow/advanced-run.md ================================================ --- title : "11. Pipeline - Run Result" description: "" sidebar_position: 11 contributors: ["Jongseob Jeon", "SeungTae Kim"] --- ## Run Result Click Run Result and you will see three tabs: Graph, Run Output, and Config. ![advanced-run-0.png](./img/advanced-run-0.png) ## Graph ![advanced-run-1.png](./img/advanced-run-1.png) In the graph, if you click on the run component, you can check the running information of the component. ### Input/Output The Input/Output tab allows you to view and download the Configurations, Input, and Output Artifacts used in the components. ### Logs In the Logs tab, you can view all the stdout output generated during the execution of the Python code. However, pods are deleted after a certain period of time, so you may not be able to view them in this tab after a certain time. 
In that case, you can check them in the main-logs section of the Output artifacts. ### Visualizations The Visualizations tab displays plots generated by the components. To generate a plot, you can save the desired values as an argument using `mlpipeline_ui_metadata: OutputPath("UI_Metadata")`. The plot should be in HTML format. The conversion process is as follows. ```python @partial( create_component_from_func, packages_to_install=["matplotlib"], ) def plot_linear( mlpipeline_ui_metadata: OutputPath("UI_Metadata") ): import base64 import json from io import BytesIO import matplotlib.pyplot as plt plt.plot(x=[1, 2, 3], y=[1, 2,3]) tmpfile = BytesIO() plt.savefig(tmpfile, format="png") encoded = base64.b64encode(tmpfile.getvalue()).decode("utf-8") html = f"" metadata = { "outputs": [ { "type": "web-app", "storage": "inline", "source": html, }, ], } with open(mlpipeline_ui_metadata, "w") as html_writer: json.dump(metadata, html_writer) ``` If written in pipeline, it will be like this. ```python from functools import partial import kfp from kfp.components import create_component_from_func, OutputPath from kfp.dsl import pipeline @partial( create_component_from_func, packages_to_install=["matplotlib"], ) def plot_linear(mlpipeline_ui_metadata: OutputPath("UI_Metadata")): import base64 import json from io import BytesIO import matplotlib.pyplot as plt plt.plot([1, 2, 3], [1, 2, 3]) tmpfile = BytesIO() plt.savefig(tmpfile, format="png") encoded = base64.b64encode(tmpfile.getvalue()).decode("utf-8") html = f"" metadata = { "outputs": [ { "type": "web-app", "storage": "inline", "source": html, }, ], } with open(mlpipeline_ui_metadata, "w") as html_writer: json.dump(metadata, html_writer) @pipeline(name="plot_pipeline") def plot_pipeline(): plot_linear() if __name__ == "__main__": kfp.compiler.Compiler().compile(plot_pipeline, "plot_pipeline.yaml") ``` If you run this script and check the resulting `plot_pipeline.yaml`, you will see the following.

plot_pipeline.yaml ```bash apiVersion: argoproj.io/v1alpha1 kind: Workflow metadata: generateName: plot-pipeline- annotations: {pipelines.kubeflow.org/kfp_sdk_version: 1.8.9, pipelines.kubeflow.org/pipeline_compilation_time: '2 022-01-17T13:31:32.963214', pipelines.kubeflow.org/pipeline_spec: '{"name": "plot_pipeline"}'} labels: {pipelines.kubeflow.org/kfp_sdk_version: 1.8.9} spec: entrypoint: plot-pipeline templates: - name: plot-linear container: args: [--mlpipeline-ui-metadata, /tmp/outputs/mlpipeline_ui_metadata/data] command: - sh - -c - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'matplotlib' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'matplotlib' --user) && "$0" "$@" - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def _make_parent_dirs_and_return_path(file_path: str): import os os.makedirs(os.path.dirname(file_path), exist_ok=True) return file_path def plot_linear(mlpipeline_ui_metadata): import base64 import json from io import BytesIO import matplotlib.pyplot as plt plt.plot([1, 2, 3], [1, 2, 3]) tmpfile = BytesIO() plt.savefig(tmpfile, format="png") encoded = base64.b64encode(tmpfile.getvalue()).decode("utf-8") html = f"" metadata = { "outputs": [ { "type": "web-app", "storage": "inline", "source": html, }, ], } with open(mlpipeline_ui_metadata, "w") as html_writer: json.dump(metadata, html_writer) import argparse _parser = argparse.ArgumentParser(prog='Plot linear', description='') _parser.add_argument("--mlpipeline-ui-metadata", dest="mlpipeline_ui_metadata", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) _parsed_args = vars(_parser.parse_args()) _outputs = plot_linear(**_parsed_args) image: python:3.7 outputs: artifacts: - {name: mlpipeline-ui-metadata, path: /tmp/outputs/mlpipeline_ui_metadata/data} metadata: labels: pipelines.kubeflow.org/kfp_sdk_version: 1.8.9 
pipelines.kubeflow.org/pipeline-sdk-type: kfp pipelines.kubeflow.org/enable_caching: "true" annotations: {pipelines.kubeflow.org/component_spec: '{"implementation": {"container": {"args": ["--mlpipeline-ui-metadata", {"outputPath": "mlpipeline_ui_metadata"}], "command": ["sh", "-c", "(PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location ''matplotlib'' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location ''matplotlib'' --user) && \"$0\" \"$@\"", "sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def _make_parent_dirs_and_return_path(file_path: str):\n import os\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\n return file_path\n\ndef plot_linear(mlpipeline_ui_metadata):\n import base64\n import json\n from io import BytesIO\n\n import matplotlib.pyplot as plt\n\n plt.plot([1, 2, 3], [1, 2, 3])\n\n tmpfile = BytesIO()\n plt.savefig(tmpfile, format=\"png\")\n encoded = base64.b64encode(tmpfile.getvalue()).decode(\"utf-8\")\n\n html = f\"\"\n metadata = {\n \"outputs\": [\n {\n \"type\": \"web-app\",\n \"storage\": \"inline\",\n \"source\": html,\n },\n ],\n }\n with open(mlpipeline_ui_metadata, \"w\") as html_writer:\n json.dump(metadata, html_writer)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Plot linear'', description='''')\n_parser.add_argument(\"--mlpipeline-ui-metadata\", dest=\"mlpipeline_ui_metadata\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs = plot_linear(**_parsed_args)\n"], "image": "python:3.7"}}, "name": "Plot linear", "outputs": [{"name": "mlpipeline_ui_metadata", "type": "UI_Metadata"}]}', pipelines.kubeflow.org/component_ref: '{}'} - name: plot-pipeline dag: tasks: - {name: plot-linear, template: plot-linear} arguments: parameters: [] serviceAccountName: pipeline-runner ```

After running, click Visualization. ![advanced-run-5.png](./img/advanced-run-5.png) ## Run output ![advanced-run-2.png](./img/advanced-run-2.png) Run output is where Kubeflow gathers the Artifacts generated in the specified form and shows the evaluation index (Metric). To show the evaluation index (Metric), you can save the name and value you want to show in the `mlpipeline_metrics_path: OutputPath("Metrics")` argument in json format. For example, you can write it like this. ```python @create_component_from_func def show_metric_of_sum( number: int, mlpipeline_metrics_path: OutputPath("Metrics"), ): import json metrics = { "metrics": [ { "name": "sum_value", "numberValue": number, }, ], } with open(mlpipeline_metrics_path, "w") as f: json.dump(metrics, f) ``` We will add a component to generate evaluation metrics to the pipeline created in the [Pipeline](../kubeflow/basic-pipeline.md) and execute it. The whole pipeline is as follows. ```python import kfp from kfp.components import create_component_from_func, OutputPath from kfp.dsl import pipeline @create_component_from_func def print_and_return_number(number: int) -> int: print(number) return number @create_component_from_func def sum_and_print_numbers(number_1: int, number_2: int) -> int: sum_number = number_1 + number_2 print(sum_number) return sum_number @create_component_from_func def show_metric_of_sum( number: int, mlpipeline_metrics_path: OutputPath("Metrics"), ): import json metrics = { "metrics": [ { "name": "sum_value", "numberValue": number, }, ], } with open(mlpipeline_metrics_path, "w") as f: json.dump(metrics, f) @pipeline(name="example_pipeline") def example_pipeline(number_1: int, number_2: int): number_1_result = print_and_return_number(number_1) number_2_result = print_and_return_number(number_2) sum_result = sum_and_print_numbers( number_1=number_1_result.output, number_2=number_2_result.output ) show_metric_of_sum(sum_result.output) if __name__ == "__main__": 
kfp.compiler.Compiler().compile(example_pipeline, "example_pipeline.yaml") ``` After execution, click Run Output and it will show like this. ![advanced-run-4.png](./img/advanced-run-4.png) ## Config ![advanced-run-3.png](./img/advanced-run-3.png) In the Config tab, you can view all the values received as pipeline configurations. ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/kubeflow/basic-component.md ================================================ --- title : "4. Component - Write" description: "" sidebar_position: 4 contributors: ["Jongseob Jeon"] --- ## Component In order to write a component, the following must be written: 1. Writing Component Contents 2. Writing Component Wrapper Now, let's look at each process. ## Component Contents Component Contents are no different from the Python code we commonly write. For example, let's try writing a component that takes a number as input, prints it, and then returns it. We can write it in Python code like this. ```python print(number) ``` However, when this code is run, an error occurs and it does not work because the `number` that should be printed is not defined. As we saw in [Kubeflow Concepts](../kubeflow/kubeflow-concepts.md), values like `number` that are required in component content are defined in **Config**. In order to execute component content, the necessary Configs must be passed from the component wrapper. ## Component Wrapper ### Define a standalone Python function Now we need to create a component wrapper to be able to pass the required Configs. Without a separate Config, it will be like this when wrapped with a component wrapper. ```python def print_and_return_number(): print(number) return number ``` Now we add the required Config for the content as an argument to the wrapper. However, it is not just writing the argument but also writing the type hint of the argument. 
When Kubeflow converts the pipeline into the Kubeflow format, it checks if the specified input and output types are matched in the connection between the components. If the format of the input required by the component does not match the output received from another component, the pipeline cannot be created. Now we complete the component wrapper by writing down the argument, its type and the type to be returned as follows. ```python def print_and_return_number(number: int) -> int: print(number) return number ``` In Kubeflow, you can only use types that can be expressed in json as return values. The most commonly used and recommended types are as follows: - int - float - str If you want to return multiple values instead of a single value, you must use `collections.namedtuple`. For more details, please refer to the Kubeflow official documentation [Kubeflow Official Documentation](https://www.kubeflow.org/docs/components/pipelines/sdk/python-function-components/#passing-parameters-by-value). For example, if you want to write a component that returns the quotient and remainder of a number when divided by 2, it should be written as follows. ```python from typing import NamedTuple def divide_and_return_number( number: int, ) -> NamedTuple("DivideOutputs", [("quotient", int), ("remainder", int)]): from collections import namedtuple quotient, remainder = divmod(number, 2) print("quotient is", quotient) print("remainder is", remainder) divide_outputs = namedtuple( "DivideOutputs", [ "quotient", "remainder", ], ) return divide_outputs(quotient, remainder) ``` ### Convert to Kubeflow Format Now you have to convert the written component into a format that can be used in Kubeflow. The conversion can be done through `kfp.components.create_component_from_func`. This converted form can be imported as a function in Python and used in the pipeline. 
```python from kfp.components import create_component_from_func @create_component_from_func def print_and_return_number(number: int) -> int: print(number) return number ``` ### Share component with yaml file If it is not possible to share with Python code, you can share components with a YAML file and use them. To do this, first convert the component to a YAML file and then use it in the pipeline with `kfp.components.load_component_from_file`. First, let's explain the process of converting the written component to a YAML file. ```python from kfp.components import create_component_from_func @create_component_from_func def print_and_return_number(number: int) -> int: print(number) return number if __name__ == "__main__": print_and_return_number.component_spec.save("print_and_return_number.yaml") ``` If you run the Python code you wrote, a file called `print_and_return_number.yaml` will be created. When you check the file, it will be as follows. ```bash name: Print and return number inputs: - {name: number, type: Integer} outputs: - {name: Output, type: Integer} implementation: container: image: python:3.7 command: - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def print_and_return_number(number): print(number) return number def _serialize_int(int_value: int) -> str: if isinstance(int_value, str): return int_value if not isinstance(int_value, int): raise TypeError('Value "{}" has type "{}" instead of int.'.format(str(int_value), str(type(int_value)))) return str(int_value) import argparse _parser = argparse.ArgumentParser(prog='Print and return number', description='') _parser.add_argument("--number", dest="number", type=int, required=True, default=argparse.SUPPRESS) _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1) _parsed_args = vars(_parser.parse_args()) _output_files = _parsed_args.pop("_output_paths", []) _outputs = print_and_return_number(**_parsed_args) _outputs = 
[_outputs] _output_serializers = [ _serialize_int, ] import os for idx, output_file in enumerate(_output_files): try: os.makedirs(os.path.dirname(output_file)) except OSError: pass with open(output_file, 'w') as f: f.write(_output_serializers[idx](_outputs[idx])) args: - --number - {inputValue: number} - '----output-paths' - {outputPath: Output} ``` Now the generated file can be shared and used in the pipeline as follows. ```python from kfp.components import load_component_from_file print_and_return_number = load_component_from_file("print_and_return_number.yaml") ``` ## How Kubeflow executes component In Kubeflow, the execution order of components is as follows: 1. `docker pull `: Pull the image containing the execution environment information of the defined component. 2. Run `command`: Execute the component's content within the pulled image. Taking `print_and_return_number.yaml` as an example, the default image in `@create_component_from_func` is `python:3.7`, so the component's content will be executed based on that image. 1. `docker pull python:3.7` 2. `print(number)` ## References: - [Getting Started With Python function based components](https://www.kubeflow.org/docs/components/pipelines/sdk/python-function-components/#getting-started-with-python-function-based-components) ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/kubeflow/basic-pipeline-upload.md ================================================ --- title : "6. Pipeline - Upload" description: "" sidebar_position: 6 contributors: ["Jongseob Jeon"] --- ## Upload Pipeline Now, let's upload the pipeline we created directly to kubeflow. Pipeline uploads can be done through the kubeflow dashboard UI. Use the method used in [Install Kubeflow](../setup-components/install-components-kf.md) to do port forwarding. 
```bash kubectl port-forward svc/istio-ingressgateway -n istio-system 8080:80 ``` Access [http://localhost:8080](http://localhost:8080) to open the dashboard. ### 1. Click Pipelines Tab ![pipeline-gui-0.png](./img/pipeline-gui-0.png) ### 2. Click Upload Pipeline ![pipeline-gui-1.png](./img/pipeline-gui-1.png) ### 3. Click Choose file ![pipeline-gui-2.png](./img/pipeline-gui-2.png) ### 4. Upload created yaml file ![pipeline-gui-3.png](./img/pipeline-gui-3.png) ### 5. Create ![pipeline-gui-4.png](./img/pipeline-gui-4.png) ## Upload Pipeline Version The uploaded pipeline allows you to manage versions through uploads. However, it serves the role of gathering pipelines with the same name rather than version management at the code level, such as Github. In the example above, clicking on example_pipeline will bring up the following screen. ![pipeline-gui-5.png](./img/pipeline-gui-5.png) If you click this screen shows. ![pipeline-gui-4.png](./img/pipeline-gui-4.png) If you click Upload Version, a screen appears where you can upload the pipeline. ![pipeline-gui-6.png](./img/pipeline-gui-6.png) Now, upload your pipeline. ![pipeline-gui-7.png](./img/pipeline-gui-7.png) Once uploaded, you can check the pipeline version as follows. ![pipeline-gui-8.png](./img/pipeline-gui-8.png) ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/kubeflow/basic-pipeline.md ================================================ --- title : "5. Pipeline - Write" description: "" sidebar_position: 5 contributors: ["Jongseob Jeon"] --- ## Pipeline Components do not run independently but rather as components of a pipeline. Therefore, in order to run a component, a pipeline must be written. And in order to write a pipeline, a set of components and the order of execution of those components is necessary. 
On this page, we will create a pipeline with a component that takes a number as input and outputs it, and a component that takes two numbers from two components and outputs the sum. ## Component Set First, let's create the components that will be used in the pipeline. 1. `print_and_return_number` This component prints and returns the input number. Since the component returns the input value, we specify `int` as the return type hint. ```python @create_component_from_func def print_and_return_number(number: int) -> int: print(number) return number ``` 2. `sum_and_print_numbers` This component calculates the sum of two input numbers and prints it. Similarly, since the component returns the sum, we specify `int` as the return type hint. ```python @create_component_from_func def sum_and_print_numbers(number_1: int, number_2: int) -> int: sum_num = number_1 + number_2 print(sum_num) return sum_num ``` ## Component Order ### Define Order If you have created the necessary set of components, the next step is to define their sequence. The diagram below represents the order of the pipeline components to be created on this page. ![pipeline-0.png](./img/pipeline-0.png) ### Single Output Now let's translate this sequence into code. First, writing `print_and_return_number_1` and `print_and_return_number_2` from the picture above would look like this. ```python def example_pipeline(): number_1_result = print_and_return_number(number_1) number_2_result = print_and_return_number(number_2) ``` Run the component and store the return values in `number_1_result` and `number_2_result`, respectively. The return value of the stored `number_1_result` can be used through `number_1_resulst.output`. ### Multi Output In the example above, the components return a single value, so it can be directly used with `output`. However, if there are multiple return values, they will be stored in `outputs` as a dictionary. You can use the keys to access the desired return values. 
Let's consider an example with a component that returns multiple values, like the one mentioned in the [component](../kubeflow/basic-component.md#define-a-standalone-python-function) definition. The `divide_and_return_number` component returns `quotient` and `remainder`. Here's an example of passing these two values to `print_and_return_number`: ```python def multi_pipeline(): divided_result = divde_and_return_number(number) num_1_result = print_and_return_number(divided_result.outputs["quotient"]) num_2_result = print_and_return_number(divided_result.outputs["remainder"]) ``` Store the result of `divide_and_return_number` in `divided_result` and you can get the values of each by `divided_result.outputs["quotient"]` and `divided_result.outputs["remainder"]`. ### Write to python code Now, let's get back to the main topic and pass the result of these two values to `sum_and_print_numbers`. ```python def example_pipeline(): number_1_result = print_and_return_number(number_1) number_2_result = print_and_return_number(number_2) sum_result = sum_and_print_numbers( number_1=number_1_result.output, number_2=number_2_result.output ) ``` Next, gather the necessary Configs for each component and define it as a pipeline Config. ```python def example_pipeline(number_1: int, number_2:int): number_1_result = print_and_return_number(number_1) number_2_result = print_and_return_number(number_2) sum_result = sum_and_print_numbers( number_1=number_1_result.output, number_2=number_2_result.output ) ``` ## Convert to Kubeflow Format Finally, convert it into a format that can be used in Kubeflow. The conversion can be done using the `kfp.dsl.pipeline` function. 
```python from kfp.dsl import pipeline @pipeline(name="example_pipeline") def example_pipeline(number_1: int, number_2: int): number_1_result = print_and_return_number(number_1) number_2_result = print_and_return_number(number_2) sum_result = sum_and_print_numbers( number_1=number_1_result.output, number_2=number_2_result.output ) ``` In order to run a pipeline in Kubeflow, it needs to be compiled into the designated yaml format as only yaml format is possible, so the created pipeline needs to be compiled into a specific yaml format. Compilation can be done using the following command. ```python if __name__ == "__main__": import kfp kfp.compiler.Compiler().compile(example_pipeline, "example_pipeline.yaml") ``` ## Conclusion As explained earlier, if we gather the content into a Python code, it will look like this. ```python import kfp from kfp.components import create_component_from_func from kfp.dsl import pipeline @create_component_from_func def print_and_return_number(number: int) -> int: print(number) return number @create_component_from_func def sum_and_print_numbers(number_1: int, number_2: int): print(number_1 + number_2) @pipeline(name="example_pipeline") def example_pipeline(number_1: int, number_2: int): number_1_result = print_and_return_number(number_1) number_2_result = print_and_return_number(number_2) sum_result = sum_and_print_numbers( number_1=number_1_result.output, number_2=number_2_result.output ) if __name__ == "__main__": kfp.compiler.Compiler().compile(example_pipeline, "example_pipeline.yaml") ``` The compiled result is as follows.
example_pipeline.yaml ```bash apiVersion: argoproj.io/v1alpha1 kind: Workflow metadata: generateName: example-pipeline- annotations: {pipelines.kubeflow.org/kfp_sdk_version: 1.6.3, pipelines.kubeflow.org/pipeline_compilation_time: '2021-12-05T13:38:51.566777', pipelines.kubeflow.org/pipeline_spec: '{"inputs": [{"name": "number_1", "type": "Integer"}, {"name": "number_2", "type": "Integer"}], "name": "example_pipeline"}'} labels: {pipelines.kubeflow.org/kfp_sdk_version: 1.6.3} spec: entrypoint: example-pipeline templates: - name: example-pipeline inputs: parameters: - {name: number_1} - {name: number_2} dag: tasks: - name: print-and-return-number template: print-and-return-number arguments: parameters: - {name: number_1, value: '{{inputs.parameters.number_1}}'} - name: print-and-return-number-2 template: print-and-return-number-2 arguments: parameters: - {name: number_2, value: '{{inputs.parameters.number_2}}'} - name: sum-and-print-numbers template: sum-and-print-numbers dependencies: [print-and-return-number, print-and-return-number-2] arguments: parameters: - {name: print-and-return-number-2-Output, value: '{{tasks.print-and-return-number-2.outputs.parameters.print-and-return-number-2-Output}}'} - {name: print-and-return-number-Output, value: '{{tasks.print-and-return-number.outputs.parameters.print-and-return-number-Output}}'} - name: print-and-return-number container: args: [--number, '{{inputs.parameters.number_1}}', '----output-paths', /tmp/outputs/Output/data] command: - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def print_and_return_number(number): print(number) return number def _serialize_int(int_value: int) -> str: if isinstance(int_value, str): return int_value if not isinstance(int_value, int): raise TypeError('Value "{}" has type "{}" instead of int.'.format(str(int_value), str(type(int_value)))) return str(int_value) import argparse _parser = argparse.ArgumentParser(prog='Print and 
return number', description='') _parser.add_argument("--number", dest="number", type=int, required=True, default=argparse.SUPPRESS) _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1) _parsed_args = vars(_parser.parse_args()) _output_files = _parsed_args.pop("_output_paths", []) _outputs = print_and_return_number(**_parsed_args) _outputs = [_outputs] _output_serializers = [ _serialize_int, ] import os for idx, output_file in enumerate(_output_files): try: os.makedirs(os.path.dirname(output_file)) except OSError: pass with open(output_file, 'w') as f: f.write(_output_serializers[idx](_outputs[idx])) image: python:3.7 inputs: parameters: - {name: number_1} outputs: parameters: - name: print-and-return-number-Output valueFrom: {path: /tmp/outputs/Output/data} artifacts: - {name: print-and-return-number-Output, path: /tmp/outputs/Output/data} metadata: labels: {pipelines.kubeflow.org/kfp_sdk_version: 1.6.3, pipelines.kubeflow.org/pipeline-sdk-type: kfp} annotations: {pipelines.kubeflow.org/component_spec: '{"implementation": {"container": {"args": ["--number", {"inputValue": "number"}, "----output-paths", {"outputPath": "Output"}], "command": ["sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def print_and_return_number(number):\n print(number)\n return number\n\ndef _serialize_int(int_value: int) -> str:\n if isinstance(int_value, str):\n return int_value\n if not isinstance(int_value, int):\n raise TypeError(''Value \"{}\" has type \"{}\" instead of int.''.format(str(int_value), str(type(int_value))))\n return str(int_value)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Print and return number'', description='''')\n_parser.add_argument(\"--number\", dest=\"number\", type=int, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"----output-paths\", dest=\"_output_paths\", type=str, nargs=1)\n_parsed_args = 
vars(_parser.parse_args())\n_output_files = _parsed_args.pop(\"_output_paths\", [])\n\n_outputs = print_and_return_number(**_parsed_args)\n\n_outputs = [_outputs]\n\n_output_serializers = [\n _serialize_int,\n\n]\n\nimport os\nfor idx, output_file in enumerate(_output_files):\n try:\n os.makedirs(os.path.dirname(output_file))\n except OSError:\n pass\n with open(output_file, ''w'') as f:\n f.write(_output_serializers[idx](_outputs[idx]))\n"], "image": "python:3.7"}}, "inputs": [{"name": "number", "type": "Integer"}], "name": "Print and return number", "outputs": [{"name": "Output", "type": "Integer"}]}', pipelines.kubeflow.org/component_ref: '{}', pipelines.kubeflow.org/arguments.parameters: '{"number": "{{inputs.parameters.number_1}}"}'} - name: print-and-return-number-2 container: args: [--number, '{{inputs.parameters.number_2}}', '----output-paths', /tmp/outputs/Output/data] command: - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def print_and_return_number(number): print(number) return number def _serialize_int(int_value: int) -> str: if isinstance(int_value, str): return int_value if not isinstance(int_value, int): raise TypeError('Value "{}" has type "{}" instead of int.'.format(str(int_value), str(type(int_value)))) return str(int_value) import argparse _parser = argparse.ArgumentParser(prog='Print and return number', description='') _parser.add_argument("--number", dest="number", type=int, required=True, default=argparse.SUPPRESS) _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1) _parsed_args = vars(_parser.parse_args()) _output_files = _parsed_args.pop("_output_paths", []) _outputs = print_and_return_number(**_parsed_args) _outputs = [_outputs] _output_serializers = [ _serialize_int, ] import os for idx, output_file in enumerate(_output_files): try: os.makedirs(os.path.dirname(output_file)) except OSError: pass with open(output_file, 'w') as f: 
f.write(_output_serializers[idx](_outputs[idx])) image: python:3.7 inputs: parameters: - {name: number_2} outputs: parameters: - name: print-and-return-number-2-Output valueFrom: {path: /tmp/outputs/Output/data} artifacts: - {name: print-and-return-number-2-Output, path: /tmp/outputs/Output/data} metadata: labels: {pipelines.kubeflow.org/kfp_sdk_version: 1.6.3, pipelines.kubeflow.org/pipeline-sdk-type: kfp} annotations: {pipelines.kubeflow.org/component_spec: '{"implementation": {"container": {"args": ["--number", {"inputValue": "number"}, "----output-paths", {"outputPath": "Output"}], "command": ["sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def print_and_return_number(number):\n print(number)\n return number\n\ndef _serialize_int(int_value: int) -> str:\n if isinstance(int_value, str):\n return int_value\n if not isinstance(int_value, int):\n raise TypeError(''Value \"{}\" has type \"{}\" instead of int.''.format(str(int_value), str(type(int_value))))\n return str(int_value)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Print and return number'', description='''')\n_parser.add_argument(\"--number\", dest=\"number\", type=int, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"----output-paths\", dest=\"_output_paths\", type=str, nargs=1)\n_parsed_args = vars(_parser.parse_args())\n_output_files = _parsed_args.pop(\"_output_paths\", [])\n\n_outputs = print_and_return_number(**_parsed_args)\n\n_outputs = [_outputs]\n\n_output_serializers = [\n _serialize_int,\n\n]\n\nimport os\nfor idx, output_file in enumerate(_output_files):\n try:\n os.makedirs(os.path.dirname(output_file))\n except OSError:\n pass\n with open(output_file, ''w'') as f:\n f.write(_output_serializers[idx](_outputs[idx]))\n"], "image": "python:3.7"}}, "inputs": [{"name": "number", "type": "Integer"}], "name": "Print and return number", "outputs": [{"name": "Output", "type": "Integer"}]}', 
pipelines.kubeflow.org/component_ref: '{}', pipelines.kubeflow.org/arguments.parameters: '{"number": "{{inputs.parameters.number_2}}"}'} - name: sum-and-print-numbers container: args: [--number-1, '{{inputs.parameters.print-and-return-number-Output}}', --number-2, '{{inputs.parameters.print-and-return-number-2-Output}}'] command: - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def sum_and_print_numbers(number_1, number_2): print(number_1 + number_2) import argparse _parser = argparse.ArgumentParser(prog='Sum and print numbers', description='') _parser.add_argument("--number-1", dest="number_1", type=int, required=True, default=argparse.SUPPRESS) _parser.add_argument("--number-2", dest="number_2", type=int, required=True, default=argparse.SUPPRESS) _parsed_args = vars(_parser.parse_args()) _outputs = sum_and_print_numbers(**_parsed_args) image: python:3.7 inputs: parameters: - {name: print-and-return-number-2-Output} - {name: print-and-return-number-Output} metadata: labels: {pipelines.kubeflow.org/kfp_sdk_version: 1.6.3, pipelines.kubeflow.org/pipeline-sdk-type: kfp} annotations: {pipelines.kubeflow.org/component_spec: '{"implementation": {"container": {"args": ["--number-1", {"inputValue": "number_1"}, "--number-2", {"inputValue": "number_2"}], "command": ["sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def sum_and_print_numbers(number_1, number_2):\n print(number_1 + number_2)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Sum and print numbers'', description='''')\n_parser.add_argument(\"--number-1\", dest=\"number_1\", type=int, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--number-2\", dest=\"number_2\", type=int, required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs = sum_and_print_numbers(**_parsed_args)\n"], "image": "python:3.7"}}, "inputs": 
[{"name": "number_1", "type": "Integer"}, {"name": "number_2", "type": "Integer"}], "name": "Sum and print numbers"}', pipelines.kubeflow.org/component_ref: '{}', pipelines.kubeflow.org/arguments.parameters: '{"number_1": "{{inputs.parameters.print-and-return-number-Output}}", "number_2": "{{inputs.parameters.print-and-return-number-2-Output}}"}'} arguments: parameters: - {name: number_1} - {name: number_2} serviceAccountName: pipeline-runner ```
================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/kubeflow/basic-requirements.md ================================================ --- title : "3. Install Requirements" description: "" sidebar_position: 3 contributors: ["Jongseob Jeon"] --- The recommended Python version for practice is python>=3.7. For those unfamiliar with the Python environment, please refer to [Appendix 1. Python Virtual Environment](../appendix/pyenv) and install the packages on the **client node**. The packages and versions required for the practice are as follows: - requirements.txt ```bash kfp==1.8.9 scikit-learn==1.0.1 mlflow==1.21.0 pandas==1.3.4 dill==0.3.4 ``` Activate the [Python virtual environment](../appendix/pyenv.md#python-가상환경-생성) created in the previous section. ```bash pyenv activate demo ``` We are proceeding with the package installation. ```bash pip3 install -U pip pip3 install kfp==1.8.9 scikit-learn==1.0.1 mlflow==1.21.0 pandas==1.3.4 dill==0.3.4 ``` ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/kubeflow/basic-run.md ================================================ --- title : "7. Pipeline - Run" description: "" sidebar_position: 7 contributors: ["Jongseob Jeon"] --- ## Run Pipeline Now we will run the uploaded pipeline. ## Before Run ### 1. Create Experiment Experiments in Kubeflow are units that logically manage runs executed within them. When you first enter the namespace in Kubeflow, there are no Experiments created. Therefore, you must create an Experiment beforehand in order to run the pipeline. If an Experiment already exists, you can go to [Run Pipeline](../kubeflow/basic-run.md#run-pipeline-1). Experiments can be created via the Create Experiment button. ![run-0.png](./img/run-0.png) ### 2. Name 입력 ![run-1.png](./img/run-1.png) ## Run Pipeline ### 1. Select Create Run ![run-2.png](./img/run-2.png) ### 2. 
Select Experiment ![run-9.png](./img/run-9.png) ![run-10.png](./img/run-10.png) ### 3. Enter Pipeline Config Fill in the values of the Config provided when creating the pipeline. The uploaded pipeline requires input values for `number_1` and `number_2`. ![run-3.png](./img/run-3.png) ### 4. Start Click the Start button after entering the values. The pipeline will start running. ![run-4.png](./img/run-4.png) ## Run Result The executed pipelines can be viewed in the Runs tab. Clicking on a run provides detailed information related to the executed pipeline. ![run-5.png](./img/run-5.png) Upon clicking, the following screen appears. Components that have not yet executed are displayed in gray. ![run-6.png](./img/run-6.png) When a component has completed execution, it is marked with a green checkmark. ![run-7.png](./img/run-7.png) If we look at the last component, we can see that it has outputted the sum of the input values, which in this case is 8 (the sum of 3 and 5). ![run-8.png](./img/run-8.png) ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/kubeflow/how-to-debug.md ================================================ --- title : "13. Component - Debugging" description: "" sidebar_position: 13 contributors: ["Jongseob Jeon"] --- ## Debugging Pipeline This page covers how to debug Kubeflow components. ## Failed Component We will modify a pipeline used in [Component - MLFlow](../kubeflow/advanced-mlflow.md#mlflow-pipeline) in this page. First, let's modify the pipeline so that the component fails. 
```python from functools import partial import kfp from kfp.components import InputPath, OutputPath, create_component_from_func from kfp.dsl import pipeline @partial( create_component_from_func, packages_to_install=["pandas", "scikit-learn"], ) def load_iris_data( data_path: OutputPath("csv"), target_path: OutputPath("csv"), ): import pandas as pd from sklearn.datasets import load_iris iris = load_iris() data = pd.DataFrame(iris["data"], columns=iris["feature_names"]) target = pd.DataFrame(iris["target"], columns=["target"]) data["sepal length (cm)"] = None data.to_csv(data_path, index=False) target.to_csv(target_path, index=False) @partial( create_component_from_func, packages_to_install=["pandas"], ) def drop_na_from_csv( data_path: InputPath("csv"), output_path: OutputPath("csv"), ): import pandas as pd data = pd.read_csv(data_path) data = data.dropna() data.to_csv(output_path, index=False) @partial( create_component_from_func, packages_to_install=["dill", "pandas", "scikit-learn", "mlflow"], ) def train_from_csv( train_data_path: InputPath("csv"), train_target_path: InputPath("csv"), model_path: OutputPath("dill"), input_example_path: OutputPath("dill"), signature_path: OutputPath("dill"), conda_env_path: OutputPath("dill"), kernel: str, ): import dill import pandas as pd from sklearn.svm import SVC from mlflow.models.signature import infer_signature from mlflow.utils.environment import _mlflow_conda_env train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) input_example = train_data.sample(1) with open(input_example_path, "wb") as file_writer: dill.dump(input_example, file_writer) signature = infer_signature(train_data, clf.predict(train_data)) with open(signature_path, "wb") as file_writer: dill.dump(signature, file_writer) conda_env = _mlflow_conda_env( additional_pip_deps=["dill", "pandas", 
"scikit-learn"] ) with open(conda_env_path, "wb") as file_writer: dill.dump(conda_env, file_writer) @pipeline(name="debugging_pipeline") def debugging_pipeline(kernel: str): iris_data = load_iris_data() drop_data = drop_na_from_csv(data=iris_data.outputs["data"]) model = train_from_csv( train_data=drop_data.outputs["output"], train_target=iris_data.outputs["target"], kernel=kernel, ) if __name__ == "__main__": kfp.compiler.Compiler().compile(debugging_pipeline, "debugging_pipeline.yaml") ``` The modifications are as follows: 1. In the `load_iris_data` component for loading data, `None` was injected into the `sepal length (cm)` feature. 2. In the `drop_na_from_csv` component, use the `drop_na()` function to remove rows with na values. Now let's upload and run the pipeline. After running, if you press Run you will see that it has failed in the `Train from csv` component. ![debug-0.png](./img/debug-0.png) Click on the failed component and check the log to see the reason for the failure. ![debug-2.png](./img/debug-2.png) If the log shows that the data count is 0 and the component did not run, there may be an issue with the input data. Let's investigate what might be the problem. First, click on the component and go to the Input/Output tab to download the input data. You can click on the link indicated by the red square to download the data. ![debug-5.png](./img/debug-5.png) Download both files to the same location. Then navigate to the specified path and check the downloaded files. ```bash ls ``` There are two files as follows. ```bash drop-na-from-csv-output.tgz load-iris-data-target.tgz ``` I will try to unzip it. ```bash tar -xzvf load-iris-data-target.tgz ; mv data target.csv tar -xzvf drop-na-from-csv-output.tgz ; mv data data.csv ``` And then run the component code using a Jupyter notebook. ![debug-3.png](./img/debug-3.png) Debugging revealed that dropping the data was based on rows instead of columns, resulting in all the data being removed. 
Now that we know the cause of the problem, we can modify the component to drop based on columns. ```python @partial( create_component_from_func, packages_to_install=["pandas"], ) def drop_na_from_csv( data_path: InputPath("csv"), output_path: OutputPath("csv"), ): import pandas as pd data = pd.read_csv(data_path) data = data.dropna(axis="columns") data.to_csv(output_path, index=False) ``` After modifying, upload the pipeline again and run it to confirm that it is running normally as follows. ![debug-6.png](./img/debug-6.png) ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/kubeflow/kubeflow-concepts.md ================================================ --- title : "2. Kubeflow Concepts" description: "" sidebar_position: 2 contributors: ["Jongseob Jeon"] --- ## Component A component is composed of Component contents and a Component wrapper. A single component is delivered to Kubeflow through a Component wrapper and the delivered component executes the defined Component contents and produces artifacts. ![concept-0.png](./img/concept-0.png) ### Component Contents There are three components that make up the component contents: ![concept-1.png](./img/concept-1.png) 1. Environment 2. Python code w/ Config 3. Generates Artifacts Let's explore each component with an example. Here is a Python code that loads data, trains an SVC (Support Vector Classifier) model, and saves the SVC model. ```python import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target= pd.read_csv(train_target_path) clf= SVC( kernel=kernel ) clf.fit(train_data) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) ``` The above Python code can be divided into components contents as follows. ![concept-2.png](./img/concept-2.png) Environment is the part of the Python code where the packages used in the code are imported. 
Next, Python Code w/ Config is where
In Kubeflow, an executed pipeline is called a "Run." ![concept-8.png](./img/concept-8.png) When a pipeline is executed, each component generates artifacts. Kubeflow pipeline assigns a unique ID to each Run, and all artifacts generated during the Run are stored. ![concept-9.png](./img/concept-9.png) Now, let's learn how to write components and pipelines. ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/kubeflow/kubeflow-intro.md ================================================ --- title : "1. Kubeflow Introduction" description: "" sidebar_position: 1 contributors: ["Jongseob Jeon"] --- To use Kubeflow, you need to write components and pipelines. The approach described in *MLOps for ALL* differs slightly from the method described on the [Kubeflow Pipeline official website](https://www.kubeflow.org/docs/components/pipelines/overview/quickstart/). Here, Kubeflow Pipeline is used as one of the components in the [elements that make up MLOps](../kubeflow/kubeflow-concepts.md#component-contents) rather than a standalone workflow. Now, let's understand what components and pipelines are and how to write them. ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/kubeflow-dashboard-guide/_category_.json ================================================ { "label": "Kubeflow UI Guide", "position": 5, "link": { "type": "generated-index" } } ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/kubeflow-dashboard-guide/experiments-and-others.md ================================================ --- title : "6. Kubeflow Pipeline Relates" description: "" sidebar_position: 6 contributors: ["Jaeyeon Kim"] --- In the left tabs of the Central Dashboard (KFP Experiments, Pipelines, Runs, Recurring Runs, Artifacts, Executions) you can manage Kubeflow Pipelines and the results of Pipeline execution and Pipeline Runs. 
![left-tabs](./img/left-tabs.png) Kubeflow Pipelines are the main reason for using Kubeflow in *MLOps for ALL*, and details on how to create, execute, and check the results of Kubeflow Pipelines can be found in [3.Kubeflow](../kubeflow/kubeflow-intro). ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/kubeflow-dashboard-guide/experiments.md ================================================ --- title : "5. Experiments(AutoML)" description: "" sidebar_position: 5 contributors: ["Jaeyeon Kim"] --- Next, we will click the Experiments(AutoML) tab on the left of the Central Dashboard. ![left-tabs](./img/left-tabs.png) ![automl](./img/automl.png) The Experiments(AutoML) page is where you can manage [Katib](https://www.kubeflow.org/docs/components/katib/overview/), which is responsible for AutoML through Hyperparameter Tuning and Neural Architecture Search in Kubeflow. The usage of Katib and Experiments(AutoML) is not covered in *MLOps for Everyone* v1.0, and will be added in v2.0. ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/kubeflow-dashboard-guide/intro.md ================================================ --- title : "1. Central Dashboard" description: "" sidebar_position: 1 contributors: ["Jaeyeon Kim", "SeungTae Kim"] --- Once you have completed [Kubeflow installation](../setup-components/install-components-kf.md), you can access the dashboard through the following command. ```bash kubectl port-forward --address 0.0.0.0 svc/istio-ingressgateway -n istio-system 8080:80 ``` ![after-login](./img/after-login.png) The Central Dashboard is a UI that integrates all the features provided by Kubeflow. 
The features provided by the Central Dashboard can be divided based on the tabs on the left side ![left-tabs](./img/left-tabs.png) - Home - Notebooks - Tensorboards - Volumes - Models - Experiments(AutoML) - Experiments(KFP) - Pipelines - Runs - Recurring Runs - Artifacts - Executions Let's now look at the simple usage of each feature. ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/kubeflow-dashboard-guide/notebooks.md ================================================ --- title : "2. Notebooks" description: "" sidebar_position: 2 contributors: ["Jaeyeon Kim"] --- ## Launch Notebook Server Click on the Notebooks tab on the left side of the Central Dashboard. ![left-tabs](./img/left-tabs.png) You will see a similar screen. The Notebooks tab is a page where users can independently create and access jupyter notebook and code server environments (hereinafter referred to as a notebook server). ![notebook-home](./img/notebook-home.png) Click the "+ NEW NOTEBOOK" button at the top right. ![new-notebook](./img/new-notebook.png) When the screen shown below appears, now specify the spec (Spec) of the notebook server to be created. ![create](./img/create.png)
Details of each spec field:
If you followed the [Setup GPU (Optional)](../setup-kubernetes/setup-nvidia-gpu.md), select NVIDIA if you have installed the nvidia gpu plugin. ![creating](./img/creating.png) After creation, the **Status** will change to a green check mark icon, and the **CONNECT button** will be activated. ![created](./img/created.png) --- ## Accessing the Notebook Server Clicking the **CONNECT button** will open a new browser window, where you will see the following screen: ![notebook-access](./img/notebook-access.png) You can use the Notebook, Console, and Terminal icons in the **Launcher** to start using them. Notebook Interface ![notebook-console](./img/notebook-console.png) Terminal Interface ![terminal-console](./img/terminal-console.png) --- ## Stopping the Notebook Server If you haven't used the notebook server for an extended period of time, you can stop it to optimize resource usage in the Kubernetes cluster. **Note that stopping the notebook server will result in the deletion of all data stored outside the Workspace Volume or Data Volume specified when creating the notebook server.** If you haven't changed the path during notebook server creation, the default Workspace Volume path is `/home/jovyan` inside the notebook server, so any data stored outside the `/home/jovyan` directory will be deleted. Clicking the `STOP` button as shown below will stop the notebook server: ![notebook-stop](./img/notebook-stop.png) Once the server is stopped, the `CONNECT` button will be disabled. To restart the notebook server and use it again, click the `PLAY` button. ![notebook-restart](./img/notebook-restart.png) ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/kubeflow-dashboard-guide/tensorboards.md ================================================ --- title : "3. Tensorboards" description: "" sidebar_position: 3 contributors: ["Jaeyeon Kim"] --- Let's click on the Tensorboards tab of the left tabs of the Central Dashboard next. 
![left-tabs](./img/left-tabs.png) We can see the following screen. ![tensorboard](./img/tensorboard.png) The TensorBoard server created in this way can be used just like a regular remote TensorBoard server, or it can be used for the purpose of storing data directly from a Kubeflow Pipeline run for visualization purposes. You can refer to the [TensorBoard documentation](https://www.kubeflow.org/docs/components/pipelines/sdk/output-viewer/#tensorboard) for more information on using TensorBoard with Kubeflow Pipeline runs. There are various ways to visualize the results of Kubeflow Pipeline runs, and in *MLOps for ALL*, we will utilize the Visualization feature of Kubeflow components and the visualization capabilities of MLflow to enable more general use cases. Therefore, detailed explanations of the TensorBoards page will be omitted in this context. ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/kubeflow-dashboard-guide/volumes.md ================================================ --- title : "4. Volumes" description: "" sidebar_position: 4 contributors: ["Jaeyeon Kim"] --- ## Volumes Next, let's click on the Volumes tab in the left of the Central Dashboard. ![left-tabs](./img/left-tabs.png) You will see the following screen. ![volumes](./img/volumes.png) Volumes tab provides the functionality to manage the Persistent Volume Claims (PVC) belonging to the current user's namespace in Kubernetes' Volume (Volume). By looking at the screenshot, you can see the information of the Volume created on the [1. Notebooks](../kubeflow-dashboard-guide/notebooks) page. It can be seen that the Storage Class of the Volume is set to local-path, which is the Default Storage Class installed at the time of Kubernetes cluster installation. In addition, the Volumes page can be used if you want to create, view, or delete a new Volume in the user namespace. 
--- ## Creating a Volume By clicking the `+ NEW VOLUME` button at the top right, you can see the following screen. ![new-volume](./img/new-volume.png) You can create a volume by specifying its name, size, storage class, and access mode. When you specify the desired resource specs to create a volume, its Status will be shown as Pending on this page. When you hover over the Status icon, you will see a message that this *(This volume will be bound when its first consumer is created.)* This is according to the volume creation policy of the [StorageClass](https://kubernetes.io/ko/docs/concepts/storage/storage-classes/) used in the lab, which is local-path. **This is not a problem situation.** When the Status is shown as Pending on this page, you can still specify the name of the volume in the notebook server or pod that you want to use the volume and the volume creation will be triggered at that time. ![creating-volume](./img/creating-volume.png) ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/prerequisites/_category_.json ================================================ { "label": "Prerequisites", "position": 1, "link": { "type": "generated-index" } } ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/prerequisites/docker/_category_.json ================================================ { "label": "Docker", "position": 1, "link": { "type": "generated-index" } } ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/prerequisites/docker/advanced.md ================================================ --- title : "[Practice] Docker Advanced" description: "Practice to use docker more advanced way." 
sidebar_position: 6 contributors: ["Jongseob Jeon", "Jaeyeon Kim"] --- ## Making a good Docker image ### Considerations to make Docker image: When creating a Docker image using a Dockerfile, the **order** of the commands is important. This is because Docker images are composed of many Read-Only layers and when building the image, existing layers are **cached** and reused, so if you structure your Dockerfile with this in mind, you can **reduce the build time**. Each of the `RUN`, `ADD`, `COPY` commands in a Dockerfile are stored as one layer. For example, if we have the following `Dockerfile`: ```docker # Layer 1 FROM ubuntu:latest # Layer 2 RUN apt-get update && apt-get install python3 pip3 -y # Layer 3 RUN pip3 install -U pip && pip3 install torch # Layer 4 COPY src/ src/ # Layer 5 CMD python src/app.py ``` If you run the image built with the above `Dockerfile` with the command `docker run -it app:latest /bin/bash`, it can be represented in the following layers. ![layers.png](./img/layers.png) The topmost R/W layer does not affect the image. In other words, any changes made inside the container are volatile. When a lower layer is changed, all the layers above it need to be rebuilt. Therefore, the order of Dockerfile instructions is important. It is recommended to place the parts that are frequently changed towards the end. (e.g., `COPY src/ app/src/`) Conversely, parts that are unlikely to change should be placed towards the beginning. If there are parts that are rarely changed but used in multiple places, they can be consolidated. It is advisable to create a separate image for those common parts in advance and use it as a base image. 
For example, if you want to create separate images for an environment that uses `tensorflow-cpu` and another environment that uses `tensorflow-gpu`, you can do the following: Create a base image [`ghcr.io/makinarocks/python:3.8-base`](http://ghcr.io/makinarocks/python:3.8-base-cpu) that includes Python and other basic packages installed. Then, when creating the images with the CPU and GPU versions of TensorFlow, you can use the base image as the `FROM` instruction and write the separate instructions for installing TensorFlow in each Dockerfile. Managing two Dockerfiles in this way improves readability and reduces build time. Combining layers had performance benefits in older versions of Docker. However, since you cannot guarantee the Docker version in which your Docker containers will run, it is recommended to combine layers for readability purposes. It is best to combine layers that can be combined appropriately. Here is an example of a Dockerfile: ```docker # Bad Case RUN apt-get update RUN apt-get install build-essential -y RUN apt-get install curl -y RUN apt-get install jq -y RUN apt-get install git -y ``` This can be written by combining it as follows. ```docker # Better Case RUN apt-get update && \ apt-get install -y \ build-essential \ curl \ jq \ git ``` For convenience, it is better to use `.dockerignore`. `.dockerignore` is similar to `.gitignore` in the sense that it can be excluded when doing a `docker build` just like when doing a `git add`. More information can be found in the [Docker Official Documentation](https://docs.docker.com/develop/develop-images/dockerfile_best-practices/). ### ENTRYPOINT vs CMD `ENTRYPOINT` and `CMD` are both used when you want to execute a command at the runtime of the container. One of them must be present in the Dockerfile. 
- **Difference** - `CMD`: Easily modifiable when running `docker run` command - `ENTRYPOINT`: Requires the use of `--entrypoint` to modify When `ENTRYPOINT` and `CMD` are used together, `CMD` typically represents the arguments (parameters) for the command specified in `ENTRYPOINT`. For example, consider the following Dockerfile: ```docker FROM ubuntu:latest # 아래 4 가지 option 을 바꿔가며 직접 테스트해보시면 이해하기 편합니다. # 단, NO ENTRYPOINT 옵션은 base image 인 ubuntu:latest 에 이미 있어서 테스트해볼 수는 없고 나머지 v2, 3, 5, 6, 8, 9, 11, 12 를 테스트해볼 수 있습니다. # ENTRYPOINT echo "Hello ENTRYPOINT" # ENTRYPOINT ["echo", "Hello ENTRYPOINT"] # CMD echo "Hello CMD" # CMD ["echo", "Hello CMD"] ``` If you build and run the above `Dockerfile` with the parts marked as comments deactivated, you can get the following results: | | No ENTRYPOINT | ENTRYPOINT a b | ENTRYPOINT ["a", "b"] | | ------------------ | -------------- | -------------- | --------------------- | | **NO CMD** | Error! | /bin/sh -c a b | a b | | **CMD ["x", "y"]** | x y | /bin/sh -c a b | a b x y | | **CMD x y** | /bin/sh -c x y | /bin/sh -c a b | a b /bin/sh -c x y | - In Kubernetes pod, - `ENTRYPOINT` corresponds to the command - `CMD` corresponds to the arguments ### Naming docker tag Recommend not using "latest" as a tag for a Docker image, as it is the default tag name and can be easily overwritten unintentionally. It is important to ensure uniqueness of one image with one tag for the sake of collaboration and debugging in the production stage. Using the same tag for different contents can lead to dangling images, which are not shown in the `docker images` but still take up storage space. ### ETC 1. Logs and other information are stored separately from the container, not inside it. This is because data written from within the container can be lost at any time. 2. Secrets and environment-dependent information should not be written directly into the Dockerfile but should be passed in via environment variables or a .env config file. 3. 
There is a **linter** for Dockerfiles, so it is useful to use it when collaborating. [https://github.com/hadolint/hadolint](https://github.com/hadolint/hadolint) ## Several options for docker run When using Docker containers, there are some inconveniences. Specifically, Docker does not store any of the work done within the Docker container by default. This is because Docker containers use isolated file systems. Therefore, it is difficult to share data between multiple Docker containers. To solve this problem, there are two approaches offered by Docker. ![storage.png](./img/storage.png) #### Docker volume - Use the Docker CLI to directly manage a resource called `volume`. - Create a specific directory under the Docker area (`/var/lib/docker`) on the host and mount that path to a Docker container. #### Bind mount - Mount a specific path on the host to a Docker container. #### How to use? The usage is through the same interface, using the `-v` option. However, when using volumes, you need to manage them directly by performing commands like `docker volume create`, `docker volume ls`, `docker volume rm`, etc. - Docker volume ```bash docker run \ -v my_volume:/app \ nginx:latest ``` - Bind mount ```bash docker run \ -v /home/user/some/path:/app \ nginx:latest ``` When developing locally, bind mount can be convenient, but if you want to maintain a clean environment, using Docker volume and explicitly performing create and rm operations can be another approach. The way storage is provided in Kubernetes ultimately relies on Docker's bind mount as well. ### Docker run with resource limit Basically, docker containers can **fully utilize the CPU and memory resources of the host OS**. However, when using this, depending on the resource situation of the host OS, docker containers may abnormally terminate due to issues such as **OOM**. 
To address this problem, docker provides the `-m` [option](https://docs.docker.com/config/containers/resource_constraints/#limit-a-containers-access-to-memory) which allows you to **limit the usage of CPU and memory** when running the docker container. ```bash docker run -d -m 512m --memory-reservation=256m --name 512-limit ubuntu sleep 3600 docker run -d -m 1g --memory-reservation=256m --name 1g-limit ubuntu sleep 3600 ``` After running the Docker above, you can check the usage through the 'docker stats' command. ```bash CONTAINER ID NAME CPU % MEM USAGE / LIMIT MEM % NET I/O BLOCK I/O PIDS 4ea1258e2e09 1g-limit 0.00% 300KiB / 1GiB 0.03% 1kB / 0B 0B / 0B 1 4edf94b9a3e5 512-limit 0.00% 296KiB / 512MiB 0.06% 1.11kB / 0B 0B / 0B 1 ``` In Kubernetes, when you limit the CPU and memory resources of a pod resource, it is provided using this technique. ### docker run with restart policy If there is a need to keep a particular container running continuously, the `--restart=always` option is provided to try to re-create the container immediately after it is terminated. After entering the option, run the docker. ```bash docker run --restart=always ubuntu ``` Run `watch -n1 docker ps` to check if it is restarting. If it is running normally, `Restarting (0)` will be printed in STATUS. ```bash CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES a911850276e8 ubuntu "bash" 35 seconds ago Restarting (0) 6 seconds ago hungry_vaughan ``` - [https://docs.docker.com/engine/reference/commandline/run/#restart-policies---restart](https://docs.docker.com/engine/reference/commandline/run/#restart-policies---restart) - Provides options such as "on-failure with max retries" and "always" When specifying the restart option for a job resource in Kubernetes, this approach is used. ### Running docker run as a background process By default, when running a Docker container, it is executed as a foreground process. 
This means that the terminal that launched the container is automatically attached to it, preventing you from running other commands. Let's try an example. Open two terminals, and in one terminal, continuously monitor `docker ps`, while in the other terminal, execute the following commands one by one and observe the behavior. #### First Practice ```bash docker run -it ubuntu sleep 10 ``` The terminal is blocked for 10 seconds, and you cannot run any other commands in it during that time. After 10 seconds, you can check with `docker ps` that the container has terminated. #### Second Practice ```bash docker run -it ubuntu sleep 10 ``` After that, press `ctrl + p` -> `ctrl + q`. Now you can perform other commands in that terminal, and you can also see that the container is still alive for up to 10 seconds with `docker ps`. This situation, where you exit from the Docker container, is called "detached". Docker provides an option to run containers in detached mode, which allows you to run the container in the background while executing the `run` command. #### Third Practice ```bash docker run -d ubuntu sleep 10 ``` In detached mode, you can perform other actions in the terminal that executed the command. It is good to use detached mode appropriately according to the situation. For example, when developing a backend API server that communicates with the DB, the backend API server needs to be constantly checked with hot-loading while changing the source code, but the DB does not need to be monitored, so it can be executed as follows. Run the DB container in detached mode, and run the backend API server in attached mode to follow the logs. 
## References - [https://towardsdatascience.com/docker-storage-598e385f4efe](https://towardsdatascience.com/docker-storage-598e385f4efe) - [https://vsupalov.com/docker-latest-tag/](https://vsupalov.com/docker-latest-tag/) - [https://docs.microsoft.com/ko-kr/azure/container-registry/container-registry-image-tag-version](https://docs.microsoft.com/ko-kr/azure/container-registry/container-registry-image-tag-version) - [https://stevelasker.blog/2018/03/01/docker-tagging-best-practices-for-tagging-and-versioning-docker-images/](https://stevelasker.blog/2018/03/01/docker-tagging-best-practices-for-tagging-and-versioning-docker-images/) ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/prerequisites/docker/command.md ================================================ --- title : "[Practice] Docker command" description: "Practice to use docker command." sidebar_position: 4 contributors: ["Jongseob Jeon", "Jaeyeon Kim"] --- ## 1. Normal installation confirmation ```bash docker run hello-world ``` If installed correctly, you should be able to see the following message. ```bash Hello from Docker! This message shows that your installation appears to be working correctly. .... ``` **(For ubuntu)** If you want to use without sudo, please refer to the following site. - [https://docs.docker.com/engine/install/linux-postinstall/#manage-docker-as-a-non-root-user](https://docs.docker.com/engine/install/linux-postinstall/#manage-docker-as-a-non-root-user) ## 2. Docker Pull Docker pull is a command to download Docker images from a Docker image registry (a repository where Docker images are stored and shared). You can check the arguments available in docker pull using the command below. ```bash docker pull --help ``` If performed normally, it prints out as follows. 
```bash Usage: docker pull [OPTIONS] NAME[:TAG|@DIGEST] Pull an image or a repository from a registry Options: -a, --all-tags Download all tagged images in the repository --disable-content-trust Skip image verification (default true) --platform string Set platform if server is multi-platform capable -q, --quiet Suppress verbose output ``` It can be seen here that docker pull takes two types of arguments. 1. `[OPTIONS]` 2. `NAME[:TAG|@DIGEST]` In order to use the `-a` and `-q` options from help, they must be used before the NAME. Let's try and pull the `ubuntu:18.04` image directly. ```bash docker pull ubuntu:18.04 ``` If interpreted correctly, the command means to pull an image with the tag `18.04` from an image named `ubuntu`. If performed successfully, it will produce an output similar to the following. ```bash 18.04: Pulling from library/ubuntu 20d796c36622: Pull complete Digest: sha256:42cd9143b6060261187a72716906187294b8b66653b50d70bc7a90ccade5c984 Status: Downloaded newer image for ubuntu:18.04 docker.io/library/ubuntu:18.04 ``` If you perform the above command, you will download the image called 'ubuntu:18.04' from a registry named [docker.io/library](http://docker.io/library/) to your laptop. - Note that - in the future, if you need to get a docker image from a certain **private** registry instead of docker.io or public docker hub, you can use [`docker login`](https://docs.docker.com/engine/reference/commandline/login/) to point to the certain registry, then use `docker pull`. Alternatively, you can set up an [insecure registry](https://stackoverflow.com/questions/42211380/add-insecure-registry-to-docker). - Also note that [`docker save`](https://docs.docker.com/engine/reference/commandline/save/) and [`docker load`](https://docs.docker.com/engine/reference/commandline/load/) commands are available to store and share docker images in the form of `.tar` file in an intranet. ## 3. Docker images This is the command to list the Docker images that exist locally. 
```bash docker images --help ``` The arguments available for use in docker images are as follows. ```bash Usage: docker images [OPTIONS] [REPOSITORY[:TAG]] List images Options: -a, --all Show all images (default hides intermediate images) --digests Show digests -f, --filter filter Filter output based on conditions provided --format string Pretty-print images using a Go template --no-trunc Don't truncate output -q, --quiet Only show image IDs ``` Let's try executing the command below directly. ```bash docker images ``` If you install Docker and proceed with this practice, it will output something similar to this. ```bash REPOSITORY TAG IMAGE ID CREATED SIZE ubuntu 18.04 29e70752d7b2 2 days ago 56.7MB ``` If you use the `-q` argument among the possible arguments, only the `IMAGE ID` will be printed. ```bash docker images -q ``` ```bash 29e70752d7b2 ``` ## 4. Docker ps Command to output the list of currently running Docker containers. ```bash docker ps --help ``` Use the following arguments can be used with 'docker ps': ```bash Usage: docker ps [OPTIONS] List containers Options: -a, --all Show all containers (default shows just running) -f, --filter filter Filter output based on conditions provided --format string Pretty-print containers using a Go template -n, --last int Show n last created containers (includes all states) (default -1) -l, --latest Show the latest created container (includes all states) --no-trunc Don't truncate output -q, --quiet Only display container IDs -s, --size Display total file sizes ``` Let's try running the command below directly. ```bash docker ps ``` If there are no currently running containers, it will be as follows. ```bash CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES ``` If there is a container running, it will look similar to this. ```bash CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES c1e8f5e89d8d ubuntu "sleep 3600" 13 seconds ago Up 12 seconds trusting_newton ``` ## 5. Docker run Command to run a Docker container. 
```bash docker run --help ``` The usage of `docker run` is as follows. ```bash Usage: docker run [OPTIONS] IMAGE [COMMAND] [ARG...] Run a command in a new container ``` What we need to confirm here is that the docker run command takes three types of arguments. 1. `[OPTIONS]` 2. `[COMMAND]` 3. `[ARG...]` Let's try running a docker container ourselves. ```bash ## Usage: docker run [OPTIONS] IMAGE [COMMAND] [ARG...] docker run -it --name demo1 ubuntu:18.04 /bin/bash ``` - `-it`: Combination of `-i` and `-t` options - Runs the container and connects it to an interactive terminal - `--name`: Assigns a name to the container for easier identification instead of using the container ID - `/bin/bash`: Specifies the command to be executed in the container upon startup, where `/bin/bash` opens a bash shell. After running the command, you can exit the container by using the `exit` command. When you enter the previously learned `docker ps` command, the following output will be displayed. ```bash CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES ``` We said earlier that `docker ps` lists the running containers, but for some reason the container we just ran does not appear. The reason is that `docker ps` shows the currently running containers by default. If you want to see the stopped containers too, you must give the `-a` option. ```bash docker ps -a ``` Then the list of terminated containers will also be displayed. ```bash CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES 4c1aa74a382a ubuntu:18.04 "/bin/bash" 2 minutes ago Exited (0) 2 minutes ago demo1 ``` ## 6. Docker exec Docker exec is a command that is used to issue commands or access the inside of a Docker container. ```bash docker exec --help ``` For example, let's try running the following command. 
```bash docker run -d --name demo2 ubuntu:18.04 sleep 3600 ``` Here, the `-d` option allows the Docker container to run in the background so that even if the connection to the container ends, it continues to run. Use `docker ps` to check if it is currently running. It can be confirmed that it is running as follows. ```bash CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES fc88a83e90f0 ubuntu:18.04 "sleep 3600" 4 seconds ago Up 3 seconds demo2 ``` Now let's connect to the running docker container through the `docker exec` command. ```bash docker exec -it demo2 /bin/bash ``` This is the same as the previous `docker run` command, allowing you to access the inside of the container. You can exit using `exit`. ## 7. Docker logs ```bash docker logs --help ``` Let's run the following container. ```bash docker run --name demo3 -d busybox sh -c "while true; do $(echo date); sleep 1; done" ``` By using the above command, we run a busybox container named "demo3" in the background that prints the current time once every second. Now let's check the log with the command below. ```bash docker logs demo3 ``` If performed normally, it will be similar to below. ```bash Sun Mar 6 11:06:49 UTC 2022 Sun Mar 6 11:06:50 UTC 2022 Sun Mar 6 11:06:51 UTC 2022 Sun Mar 6 11:06:52 UTC 2022 Sun Mar 6 11:06:53 UTC 2022 Sun Mar 6 11:06:54 UTC 2022 ``` However, if used this way, you can only check the logs taken so far. In this case, you can use the `-f` option to keep watching and outputting. ```bash docker logs demo3 -f ``` ## 8. Docker stop Command to stop a running Docker container. ```bash docker stop --help ``` Through `docker ps`, you can check the containers currently running, as follows. 
```bash CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES 730391669c39 busybox "sh -c 'while true; …" About a minute ago Up About a minute demo3 fc88a83e90f0 ubuntu:18.04 "sleep 3600" 4 minutes ago Up 4 minutes demo2 ``` Now let's try to stop Docker with `docker stop`. ```bash docker stop demo2 ``` After executing, type `docker ps` again. ```bash CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES 730391669c39 busybox "sh -c 'while true; …" 2 minutes ago Up 2 minutes demo3 ``` Comparing with the above result, you can see that the demo2 container has disappeared from the list of currently running containers. Let's stop the remaining container as well. ```bash docker stop demo3 ``` ## 9. Docker rm Command to delete a Docker container. ```bash docker rm --help ``` By default, Docker containers remain after they are stopped. That's why you can see stopped containers using `docker ps -a`. But why do we have to delete the stopped containers? Even when stopped, the data used inside the container remains, so you can bring the container back through restarting. But keeping this data uses disk space. So in order to delete the containers that are not used at all, we should use the `docker rm` command. First, let's check the current containers. ```bash docker ps -a ``` There are three containers as follows. ```bash CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES 730391669c39 busybox "sh -c 'while true; …" 4 minutes ago Exited (137) About a minute ago demo3 fc88a83e90f0 ubuntu:18.04 "sleep 3600" 7 minutes ago Exited (137) 2 minutes ago demo2 4c1aa74a382a ubuntu:18.04 "/bin/bash" 10 minutes ago Exited (0) 10 minutes ago demo1 ``` Let's try to delete the 'demo3' container through the following command. ```bash docker rm demo3 ``` Running `docker ps -a` again now shows only two containers, as follows. 
```bash CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES fc88a83e90f0 ubuntu:18.04 "sleep 3600" 13 minutes ago Exited (137) 8 minutes ago demo2 4c1aa74a382a ubuntu:18.04 "/bin/bash" 16 minutes ago Exited (0) 16 minutes ago demo1 ``` Delete the remaining containers as well. ```bash docker rm demo2 docker rm demo1 ``` ## 10. Docker rmi Command to delete a Docker image. ```bash docker rmi --help ``` Use the following commands to check which images are currently on the local. ```bash docker images ``` The following is output. ```bash REPOSITORY TAG IMAGE ID CREATED SIZE busybox latest a8440bba1bc0 32 hours ago 1.41MB ubuntu 18.04 29e70752d7b2 2 days ago 56.7MB ``` I will try to delete the `busybox` image. ```bash docker rmi busybox ``` If you type `docker images` again, the following will appear. ```bash REPOSITORY TAG IMAGE ID CREATED SIZE ubuntu 18.04 29e70752d7b2 2 days ago 56.7MB ``` ## References - [https://docs.github.com/en/packages/working-with-a-github-packages-registry/working-with-the-container-registry](https://docs.github.com/en/packages/working-with-a-github-packages-registry/working-with-the-container-registry) ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/prerequisites/docker/docker.md ================================================ --- title : "What is Docker?" description: "Introduction to Docker." sidebar_position: 3 contributors: ["Jongseob Jeon", "Jaeyeon Kim"] --- ## Container - Containerization: - A technology that allows applications to be executed uniformly anywhere. - Container Image: - A collection of all the files required to run an application. - → Similar to a mold for making fish-shaped bread (Bungeoppang). - Container: - A single process that is executed based on a container image. - → A fish-shaped bread (Bungeoppang) produced using a mold. ## Docker Docker is a platform that allows you to manage and use containers. 
Its slogan is "Build Once, Run Anywhere," guaranteeing the same execution results anywhere. In the Docker, the resources for the container are separated and the lifecycle is controlled by Linux kernel's cgroups, etc. However, it is too difficult to use these interfaces directly, so an abstraction layer is created. ![docker-layer.png](./img/docker-layer.png) Through this, users can easily control containers with just the user-friendly API **Docker CLI**. - Users can easily control containers using the user-friendly API called **Docker CLI**. ## Interpretation of Layer The roles of the layers mentioned above are as follows: 1. runC: Utilizes the functionality of the Linux kernel to isolate namespaces, CPUs, memory, filesystems, etc., for a container, which is a single process. 2. containerd: Acts as an abstraction layer to communicate with runC (OCI layer) and uses the standardized interface (OCI). 3. dockerd: Solely responsible for issuing commands to containerd. 4. Docker CLI: Users only need to issue commands to dockerd (Docker daemon) using Docker CLI. - During this communication process, Unix socket is used, so sometimes Docker-related errors occur, such as "the /var/run/docker.sock is in use" or "insufficient permissions" error messages. Although Docker encompasses many stages, when the term "Docker" is used, it can refer to Docker CLI, Dockerd (Docker daemon), or even a single Docker container, which can lead to confusion. In the upcoming text, the term "Docker" may be used in various contexts. ## For ML Engineer ML engineers use Docker for the following reasons: 1. ML training/inference code needs to be independent of the underlying operating system, Python version, Python environment, and specific versions of Python packages. 2. Therefore, the goal is to bundle not only the code but also all the dependent packages, environment variables, folder names, etc., into a single package. Containerization technology enables this. 3. 
Docker is one of the software tools that makes it easy to use and manage this technology, and the packaged units are referred to as Docker images. ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/prerequisites/docker/images.md ================================================ --- title : "[Practice] Docker images" description: "Practice to use docker image." sidebar_position: 5 contributors: ["Jongseob Jeon", "Jaeyeon Kim"] --- - `docker commit` - running container 를 docker image 로 만드는 방법 - `docker commit -m "message" -a "author" ` - `docker commit` 을 사용하면, 수동으로 Dockerfile 을 만들지 않고도 도커 이미지를 만들 수 있습니다. ``` touch Dockerfile ``` 3. Move to the docker-practice folder. 4. Create an empty file called Dockerfile. 1. 이미지에 특정 패키지를 설치하는 명령어는 무엇입니까? Answer: `RUN` Translation: Let's look at the basic commands that can be used in Dockerfile one by one. FROM is a command that specifies which image to use as a base image for Dockerfile. When creating a Docker image, instead of creating the environment I intend from scratch, I can use a pre-made image such as `python:3.9`, `python-3.9-alpine`, etc. as the base and install pytorch and add my source code. ```docker FROM [:] [AS ] # 예시 FROM ubuntu FROM ubuntu:18.04 FROM nginx:latest AS ngx ``` The command to copy files or directories from the `` path on the host (local) to the `` path inside the container. ```docker COPY ... # 예시 COPY a.txt /some-directory/b.txt COPY my-directory /some-directory-2 ``` ADD is similar to COPY but it has additional features. ```docker # 1 - 호스트에 압축되어있는 파일을 풀면서 컨테이너 내부로 copy 할 수 있음 ADD scripts.tar.gz /tmp # 2 - Remote URLs 에 있는 파일을 소스 경로로 지정할 수 있음 ADD http://www.example.com/script.sh /tmp # 위 두 가지 기능을 사용하고 싶을 경우에만 COPY 대신 ADD 를 사용하는 것을 권장 ``` The command to run the specified command inside a Docker container. Docker images maintain the state in which the commands are executed. 
```docker RUN RUN ["executable-command", "parameter1", "parameter2"] # 예시 RUN pip install torch RUN pip install -r requirements.txt ``` CMD specifies a command that the Docker container will **run when it starts**. There is a similar command called **ENTRYPOINT**. The difference between them will be discussed **later**. Note that only one **CMD** can be run in one Docker image, which is different from **RUN** command. ```docker CMD CMD ["executable-command", "parameter1", "parameter2"] CMD ["parameter1", "parameter2"] # ENTRYPOINT 와 함께 사용될 때 # 예시 CMD python main.py ``` WORKDIR is a command that specifies which directory inside the container to perform future additional commands. If the directory does not exist, it will be created. ```docker WORKDIR /path/to/workdir # 예시 WORKDIR /home/demo RUN pwd # /home/demo 가 출력됨 ``` This is a command to set the value of environment variables that will be used continuously inside the container. ```docker ENV ENV = # 예시 # default 언어 설정 RUN locale-gen ko_KR.UTF-8 ENV LANG ko_KR.UTF-8 ENV LANGUAGE ko_KR.UTF-8 ENV LC_ALL ko_KR.UTF-8 ``` You can specify the port/protocol to be opened from the container. If `` is not specified, TCP is set as the default. ```docker EXPOSE EXPOSE / # 예시 EXPOSE 8080 ``` Write a simple Dockerfile by using `vim Dockerfile` or an editor like vscode and write the following: ```docker # base image 를 ubuntu 18.04 로 설정합니다. FROM ubuntu:18.04 # apt-get update 명령을 실행합니다. RUN apt-get update # TEST env var의 값을 hello 로 지정합니다. ENV TEST hello # DOCKER CONTAINER 가 시작될 때, 환경변수 TEST 의 값을 출력합니다. CMD echo $TEST ``` Use the `docker build` command to create a Docker Image from a Dockerfile. ```bash docker build --help ``` Run the following command from the path where the Dockerfile is located. ```bash docker build -t my-image:v1.0.0 . ``` The command above means to build an image with the name "my-image" and the tag "v1.0.0" from the Dockerfile in the current path. Let's check if the image was built successfully. 
```bash # grep : my-image 가 있는지를 잡아내는 (grep) 하는 명령어 docker images | grep my-image ``` If performed normally, it will output as follows. ```bash my-image v1.0.0 143114710b2d 3 seconds ago 87.9MB ``` Let's now **run** a docker container with the `my-image:v1.0.0` image that we just built. ```bash docker run my-image:v1.0.0 ``` If performed normally, it will result in the following. ```bash hello ``` Let's run a docker container and change the value of the `TEST` env var at the time of running the `my-image:v1.0.0` image we just built. ```bash docker run -e TEST=bye my-image:v1.0.0 ``` If performed normally, it will be as follows. ```bash bye ``` ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/prerequisites/docker/install.md ================================================ --- title : "Install Docker" description: "Install docker to start." sidebar_position: 1 contributors: ["Jongseob Jeon", "Jaeyeon Kim"] --- ## Docker To practice Docker, you need to install Docker. The Docker installation varies depending on which OS you are using. Please refer to the official website for the Docker installation that fits your environment: - [ubuntu](https://docs.docker.com/engine/install/ubuntu/) - [mac](https://docs.docker.com/desktop/mac/install/) - [windows](https://docs.docker.com/desktop/windows/install/) ## Check Installation Check installation requires an OS, terminal environment where `docker run hello-world` runs correctly. | OS | Docker Engine | Terminal | | ------- | -------------- | ------------------ | | MacOS | Docker Desktop | zsh | | Windows | Docker Desktop | Powershell | | Windows | Docker Desktop | WSL2 | | Ubuntu | Docker Engine | bash | ## Before diving in.. It is possible that many metaphors and examples will be focused towards MLOps as they explain the necessary Docker usage to use MLOps. 
================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/prerequisites/docker/introduction.md ================================================ --- title : "Why Docker & Kubernetes ?" description: "Introduction to Docker." sidebar_position: 2 contributors: ["Jongseob Jeon", "Jaeyeon Kim"] --- ## Why Kubernetes ? To operationalize machine learning models, additional functionalities beyond model development are required. 1. Training Phase - Schedule management for model training commands - Ensuring reproducibility of trained models 2. Deployment Phase - Traffic distribution - Monitoring service failures - Troubleshooting in case of failures Fortunately, the software development field has already put a lot of thought and effort into addressing these needs. Therefore, when deploying machine learning models, leveraging the outcomes of these considerations can be highly beneficial. Docker and Kubernetes are two prominent software products widely used in MLOps to address these needs. ## Docker & Kubernetes ### Not a software but a product Docker and Kubernetes are representative software (products) that provide containerization and container orchestration functions respectively. #### Docker Docker was the mainstream in the past, but its usage has been decreasing gradually with the addition of various paid policy. However, as of March 2022, it is still the most commonly used container virtualization software. ![sysdig-2019.png](./img/sysdig-2019.png)
[from sysdig 2019]
![sysdig-2021.png](./img/sysdig-2021.png)
[from sysdig 2021]
#### Kubernetes Kubernetes: Kubernetes is a product that has almost no comparison so far. ![cncf-survey.png](./img/cncf-survey.png)
[from cncf survey]
![t4-ai.png](./img/t4-ai.png)
[from t4.ai]
### History of Open source #### Initial Docker & Kubernetes At the beginning of Docker development, **one package** called Docker Engine contained multiple features such as API, CLI, networking, storage, etc., but it began to be **divided one by one** according to the philosophy of **MSA**. However, the initial Kubernetes included Docker Engine for container virtualization. Therefore, whenever the Docker version was updated, the interface of Docker Engine changed and Kubernetes was greatly affected. #### Open Container Initiative In order to alleviate such inconveniences, many groups interested in container technology such as Google have come together to start the Open Container Initiative (OCI) project to set standards for containers. Docker further separated its interface and developed Containerd, a Container Runtime that adheres to the OCI standard, and added an abstraction layer so that dockerd calls the API of Containerd. In accordance with this flow, Kubernetes also now supports not only Docker, but any Container Runtime that adheres to the OCI standard and the specified specifications with the Container Runtime Interface (CRI) specification, starting from version 1.5. #### CRI-O CRI-O is a container runtime developed by Red Hat, Intel, SUSE, and IBM, which adheres to the OCI standard + CRI specifications, specifically for Kubernetes. #### Current docker & kubernetes Currently, Docker and Kubernetes have been using Docker Engine as the default container runtime, but since Docker's API did not match the CRI specification (*OCI follows*), Kubernetes developed and supported a **dockershim** to make Docker's API compatible with CRI, (*it was a huge burden for Kubernetes, not for Docker*). This was **deprecated from Kubernetes v1.20 and abandoned from v1.23**. - v1.23 will be released in December 2021 So from Kubernetes v1.23, you can no longer use Docker natively. 
However, **users are not much affected by this change** because Docker images created through Docker Engine comply with the OCI standard, so they can be used regardless of what container runtime Kubernetes is made of. ### References - [*https://www.linkedin.com/pulse/containerd는-무엇이고-왜-중요할까-sean-lee/?originalSubdomain=kr*](https://www.linkedin.com/pulse/containerd%EB%8A%94-%EB%AC%B4%EC%97%87%EC%9D%B4%EA%B3%A0-%EC%99%9C-%EC%A4%91%EC%9A%94%ED%95%A0%EA%B9%8C-sean-lee/?originalSubdomain=kr) - [https://kubernetes.io/blog/2021/12/07/kubernetes-1-23-release-announcement/](https://kubernetes.io/blog/2021/12/07/kubernetes-1-23-release-announcement/) - [https://kubernetes.io/blog/2020/12/02/dockershim-faq/](https://kubernetes.io/blog/2020/12/02/dockershim-faq/) - [https://kubernetes.io/blog/2020/12/02/dont-panic-kubernetes-and-docker/](https://kubernetes.io/blog/2020/12/02/dont-panic-kubernetes-and-docker/) - [https://kubernetes.io/ko/blog/2020/12/02/dont-panic-kubernetes-and-docker/](https://kubernetes.io/ko/blog/2020/12/02/dont-panic-kubernetes-and-docker/) ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/setup-components/_category_.json ================================================ { "label": "Setup Components", "position": 3, "link": { "type": "generated-index" } } ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/setup-components/install-components-kf.md ================================================ --- title : "1. Kubeflow" description: "구성요소 설치 - Kubeflow" sidebar_position: 1 date: 2021-12-13 lastmod: 2021-12-20 contributors: ["Jaeyeon Kim", "SeungTae Kim"] --- ## Prepare the installation file Prepare the installation files for installing Kubeflow **v1.4.0** Clone the [kubeflow/manifests Repository](https://github.com/kubeflow/manifests) with the **v1.4.0** tag, and move to the corresponding folder. 
```bash git clone -b v1.4.0 https://github.com/kubeflow/manifests.git cd manifests ``` ## Install each components The kubeflow/manifests repository provides installation commands for each component, but it often lacks information on potential issues that may arise during installation or how to verify if the installation was successful. This can make it challenging for first-time users. Therefore, in this document, we will provide instructions on how to verify the successful installation of each component. Please note that this document will not cover the installation of components that are not covered in *MLOps for ALL*, such as Knative, KFServing, and MPI Operator, as we prioritize efficient resource usage. ### Cert-manager 1. Install cert-manager. ```bash kustomize build common/cert-manager/cert-manager/base | kubectl apply -f - ``` If the installation is successful, you should see output similar to the following: ```bash namespace/cert-manager created customresourcedefinition.apiextensions.k8s.io/certificaterequests.cert-manager.io created customresourcedefinition.apiextensions.k8s.io/certificates.cert-manager.io created customresourcedefinition.apiextensions.k8s.io/challenges.acme.cert-manager.io created customresourcedefinition.apiextensions.k8s.io/clusterissuers.cert-manager.io created customresourcedefinition.apiextensions.k8s.io/issuers.cert-manager.io created customresourcedefinition.apiextensions.k8s.io/orders.acme.cert-manager.io created serviceaccount/cert-manager created serviceaccount/cert-manager-cainjector created serviceaccount/cert-manager-webhook created role.rbac.authorization.k8s.io/cert-manager-webhook:dynamic-serving created role.rbac.authorization.k8s.io/cert-manager-cainjector:leaderelection created role.rbac.authorization.k8s.io/cert-manager:leaderelection created clusterrole.rbac.authorization.k8s.io/cert-manager-cainjector created clusterrole.rbac.authorization.k8s.io/cert-manager-controller-approve:cert-manager-io created 
clusterrole.rbac.authorization.k8s.io/cert-manager-controller-certificates created clusterrole.rbac.authorization.k8s.io/cert-manager-controller-challenges created clusterrole.rbac.authorization.k8s.io/cert-manager-controller-clusterissuers created clusterrole.rbac.authorization.k8s.io/cert-manager-controller-ingress-shim created clusterrole.rbac.authorization.k8s.io/cert-manager-controller-issuers created clusterrole.rbac.authorization.k8s.io/cert-manager-controller-orders created clusterrole.rbac.authorization.k8s.io/cert-manager-edit created clusterrole.rbac.authorization.k8s.io/cert-manager-view created clusterrole.rbac.authorization.k8s.io/cert-manager-webhook:subjectaccessreviews created rolebinding.rbac.authorization.k8s.io/cert-manager-webhook:dynamic-serving created rolebinding.rbac.authorization.k8s.io/cert-manager-cainjector:leaderelection created rolebinding.rbac.authorization.k8s.io/cert-manager:leaderelection created clusterrolebinding.rbac.authorization.k8s.io/cert-manager-cainjector created clusterrolebinding.rbac.authorization.k8s.io/cert-manager-controller-approve:cert-manager-io created clusterrolebinding.rbac.authorization.k8s.io/cert-manager-controller-certificates created clusterrolebinding.rbac.authorization.k8s.io/cert-manager-controller-challenges created clusterrolebinding.rbac.authorization.k8s.io/cert-manager-controller-clusterissuers created clusterrolebinding.rbac.authorization.k8s.io/cert-manager-controller-ingress-shim created clusterrolebinding.rbac.authorization.k8s.io/cert-manager-controller-issuers created clusterrolebinding.rbac.authorization.k8s.io/cert-manager-controller-orders created clusterrolebinding.rbac.authorization.k8s.io/cert-manager-webhook:subjectaccessreviews created service/cert-manager created service/cert-manager-webhook created deployment.apps/cert-manager created deployment.apps/cert-manager-cainjector created deployment.apps/cert-manager-webhook created 
mutatingwebhookconfiguration.admissionregistration.k8s.io/cert-manager-webhook created validatingwebhookconfiguration.admissionregistration.k8s.io/cert-manager-webhook created ``` Wait for all 3 pods in the cert-manager namespace to become Running: ```bash kubectl get pod -n cert-manager ``` Once all the pods are Running, you should see output similar to the following: ```bash NAME READY STATUS RESTARTS AGE cert-manager-7dd5854bb4-7nmpd 1/1 Running 0 2m10s cert-manager-cainjector-64c949654c-2scxr 1/1 Running 0 2m10s cert-manager-webhook-6b57b9b886-7q6g2 1/1 Running 0 2m10s ``` 2. To install `kubeflow-issuer`, run the following command: ```bash kustomize build common/cert-manager/kubeflow-issuer/base | kubectl apply -f - ``` If the installation is successful, you should see the following output: ```bash clusterissuer.cert-manager.io/kubeflow-self-signing-issuer created ``` Note: If the `cert-manager-webhook` deployment is not in the Running state, you may encounter an error similar to the one below, and the `kubeflow-issuer` may not be installed. In this case, please ensure that all 3 pods of cert-manager are Running before retrying the command. If you encounter the below error, make sure that the `cert-manager` deployment and all its pods are running properly before proceeding. ```bash Error from server: error when retrieving current configuration of: Resource: "cert-manager.io/v1alpha2, Resource=clusterissuers", GroupVersionKind: "cert-manager.io/v1alpha2, Kind=ClusterIssuer" Name: "kubeflow-self-signing-issuer", Namespace: "" from server for: "STDIN": conversion webhook for cert-manager.io/v1, Kind=ClusterIssuer failed: Post "https://cert-manager-webhook.cert-manager.svc:443/convert?timeout=30s": dial tcp 10.101.177.157:443: connect: connection refused ``` ### Istio 1. Install Custom Resource Definition(CRD) for istio. 
```bash kustomize build common/istio-1-9/istio-crds/base | kubectl apply -f - ``` If run properly, you should see the following output: ```bash customresourcedefinition.apiextensions.k8s.io/authorizationpolicies.security.istio.io created customresourcedefinition.apiextensions.k8s.io/destinationrules.networking.istio.io created customresourcedefinition.apiextensions.k8s.io/envoyfilters.networking.istio.io created customresourcedefinition.apiextensions.k8s.io/gateways.networking.istio.io created customresourcedefinition.apiextensions.k8s.io/istiooperators.install.istio.io created customresourcedefinition.apiextensions.k8s.io/peerauthentications.security.istio.io created customresourcedefinition.apiextensions.k8s.io/requestauthentications.security.istio.io created customresourcedefinition.apiextensions.k8s.io/serviceentries.networking.istio.io created customresourcedefinition.apiextensions.k8s.io/sidecars.networking.istio.io created customresourcedefinition.apiextensions.k8s.io/virtualservices.networking.istio.io created customresourcedefinition.apiextensions.k8s.io/workloadentries.networking.istio.io created customresourcedefinition.apiextensions.k8s.io/workloadgroups.networking.istio.io created ``` 2. Install istio namespace. ```bash kustomize build common/istio-1-9/istio-namespace/base | kubectl apply -f - ``` If run properly, you should see the following output: ```bash namespace/istio-system created ``` 3. Install istio. 
```bash kustomize build common/istio-1-9/istio-install/base | kubectl apply -f - ``` if run properly, you should see the following output: ```bash serviceaccount/istio-ingressgateway-service-account created serviceaccount/istio-reader-service-account created serviceaccount/istiod-service-account created role.rbac.authorization.k8s.io/istio-ingressgateway-sds created role.rbac.authorization.k8s.io/istiod-istio-system created clusterrole.rbac.authorization.k8s.io/istio-reader-istio-system created clusterrole.rbac.authorization.k8s.io/istiod-istio-system created rolebinding.rbac.authorization.k8s.io/istio-ingressgateway-sds created rolebinding.rbac.authorization.k8s.io/istiod-istio-system created clusterrolebinding.rbac.authorization.k8s.io/istio-reader-istio-system created clusterrolebinding.rbac.authorization.k8s.io/istiod-istio-system created configmap/istio created configmap/istio-sidecar-injector created service/istio-ingressgateway created service/istiod created deployment.apps/istio-ingressgateway created deployment.apps/istiod created envoyfilter.networking.istio.io/metadata-exchange-1.8 created envoyfilter.networking.istio.io/metadata-exchange-1.9 created envoyfilter.networking.istio.io/stats-filter-1.8 created envoyfilter.networking.istio.io/stats-filter-1.9 created envoyfilter.networking.istio.io/tcp-metadata-exchange-1.8 created envoyfilter.networking.istio.io/tcp-metadata-exchange-1.9 created envoyfilter.networking.istio.io/tcp-stats-filter-1.8 created envoyfilter.networking.istio.io/tcp-stats-filter-1.9 created envoyfilter.networking.istio.io/x-forwarded-host created gateway.networking.istio.io/istio-ingressgateway created authorizationpolicy.security.istio.io/global-deny-all created authorizationpolicy.security.istio.io/istio-ingressgateway created mutatingwebhookconfiguration.admissionregistration.k8s.io/istio-sidecar-injector created validatingwebhookconfiguration.admissionregistration.k8s.io/istiod-istio-system created ``` Wait for all 2 pods in the 
istio-system namespace to become Running: ```bash kubectl get po -n istio-system ``` Once all the pods are Running, you should see output similar to the following: ```bash NAME READY STATUS RESTARTS AGE istio-ingressgateway-79b665c95-xm22l 1/1 Running 0 16s istiod-86457659bb-5h58w 1/1 Running 0 16s ``` ### Dex Now, let's install dex. ```bash kustomize build common/dex/overlays/istio | kubectl apply -f - ``` If performed normally, it will be printed as follows: ```bash namespace/auth created customresourcedefinition.apiextensions.k8s.io/authcodes.dex.coreos.com created serviceaccount/dex created clusterrole.rbac.authorization.k8s.io/dex created clusterrolebinding.rbac.authorization.k8s.io/dex created configmap/dex created secret/dex-oidc-client created service/dex created deployment.apps/dex created virtualservice.networking.istio.io/dex created ``` Wait until the one pod in the auth namespace is Running. ```bash kubectl get po -n auth ``` Once the pod is Running, similar results will be printed. ```bash NAME READY STATUS RESTARTS AGE dex-5ddf47d88d-458cs 1/1 Running 1 12s ``` Install OIDC AuthService. ```bash kustomize build common/oidc-authservice/base | kubectl apply -f - ``` If performed normally, it will be printed as follows. ```bash configmap/oidc-authservice-parameters created secret/oidc-authservice-client created service/authservice created persistentvolumeclaim/authservice-pvc created statefulset.apps/authservice created envoyfilter.networking.istio.io/authn-filter created ``` Wait until the authservice-0 pod in the istio-system namespace is Running. ```bash kubectl get po -n istio-system -w ``` Once all pods are Running, a similar result will be printed. ```bash NAME READY STATUS RESTARTS AGE authservice-0 1/1 Running 0 14s istio-ingressgateway-79b665c95-xm22l 1/1 Running 0 2m37s istiod-86457659bb-5h58w 1/1 Running 0 2m37s ``` Create a Kubeflow Namespace. 
```bash kustomize build common/kubeflow-namespace/base | kubectl apply -f - ``` If performed normally, it will be outputted as follows. ```bash namespace/kubeflow created ``` Retrieve the Kubeflow namespace. ```bash kubectl get ns kubeflow ``` If generated normally, similar results will be output. ```bash NAME STATUS AGE kubeflow Active 8s ``` Install kubeflow-roles. ```bash kustomize build common/kubeflow-roles/base | kubectl apply -f - ``` If properly performed, it will output as follows. ```bash clusterrole.rbac.authorization.k8s.io/kubeflow-admin created clusterrole.rbac.authorization.k8s.io/kubeflow-edit created clusterrole.rbac.authorization.k8s.io/kubeflow-kubernetes-admin created clusterrole.rbac.authorization.k8s.io/kubeflow-kubernetes-edit created clusterrole.rbac.authorization.k8s.io/kubeflow-kubernetes-view created clusterrole.rbac.authorization.k8s.io/kubeflow-view created ``` Retrieve the kubeflow roles just created. ```bash kubectl get clusterrole | grep kubeflow ``` The following 6 clusterroles will be output. ```bash kubeflow-admin 2021-12-03T08:51:36Z kubeflow-edit 2021-12-03T08:51:36Z kubeflow-kubernetes-admin 2021-12-03T08:51:36Z kubeflow-kubernetes-edit 2021-12-03T08:51:36Z kubeflow-kubernetes-view 2021-12-03T08:51:36Z kubeflow-view 2021-12-03T08:51:36Z ``` Install Kubeflow Istio Resources. ```bash kustomize build common/istio-1-9/kubeflow-istio-resources/base | kubectl apply -f - ``` If performed normally, it will be output as follows. ```bash clusterrole.rbac.authorization.k8s.io/kubeflow-istio-admin created clusterrole.rbac.authorization.k8s.io/kubeflow-istio-edit created clusterrole.rbac.authorization.k8s.io/kubeflow-istio-view created gateway.networking.istio.io/kubeflow-gateway created ``` Retrieve the Kubeflow roles just created. ```bash kubectl get clusterrole | grep kubeflow-istio ``` The following three clusterroles are output. 
```bash kubeflow-istio-admin 2021-12-03T08:53:17Z kubeflow-istio-edit 2021-12-03T08:53:17Z kubeflow-istio-view 2021-12-03T08:53:17Z ``` Check if the gateway is properly installed in the Kubeflow namespace. ```bash kubectl get gateway -n kubeflow ``` If generated normally, a result similar to the following will be output. ```bash NAME AGE kubeflow-gateway 31s ``` Install Kubeflow Pipelines. ```bash kustomize build apps/pipeline/upstream/env/platform-agnostic-multi-user | kubectl apply -f - ``` If performed normally, it will be output as follows. ```bash customresourcedefinition.apiextensions.k8s.io/clusterworkflowtemplates.argoproj.io created customresourcedefinition.apiextensions.k8s.io/cronworkflows.argoproj.io created customresourcedefinition.apiextensions.k8s.io/workfloweventbindings.argoproj.io created ...(omitted) authorizationpolicy.security.istio.io/ml-pipeline-visualizationserver created authorizationpolicy.security.istio.io/mysql created authorizationpolicy.security.istio.io/service-cache-server created ``` This command is installing multiple resources at once, but there are resources with dependencies on the installation order. Therefore, depending on timing, an error similar to the following may occur. ```bash "error: unable to recognize "STDIN": no matches for kind "CompositeController" in version "metacontroller.k8s.io/v1alpha1"" ``` If a similar error occurs, wait about 10 seconds and then try the command above again. ```bash kustomize build apps/pipeline/upstream/env/platform-agnostic-multi-user | kubectl apply -f - ``` Check to see if it has been installed correctly. ```bash kubectl get po -n kubeflow ``` Wait until all 16 pods are running as follows. 
```bash NAME READY STATUS RESTARTS AGE cache-deployer-deployment-79fdf9c5c9-bjnbg 2/2 Running 1 5m3s cache-server-5bdf4f4457-48gbp 2/2 Running 0 5m3s kubeflow-pipelines-profile-controller-7b947f4748-8d26b 1/1 Running 0 5m3s metacontroller-0 1/1 Running 0 5m3s metadata-envoy-deployment-5b4856dd5-xtlkd 1/1 Running 0 5m3s metadata-grpc-deployment-6b5685488-kwvv7 2/2 Running 3 5m3s metadata-writer-548bd879bb-zjkcn 2/2 Running 1 5m3s minio-5b65df66c9-k5gzg 2/2 Running 0 5m3s ml-pipeline-8c4b99589-85jw6 2/2 Running 1 5m3s ml-pipeline-persistenceagent-d6bdc77bd-ssxrv 2/2 Running 0 5m3s ml-pipeline-scheduledworkflow-5db54d75c5-zk2cw 2/2 Running 0 5m2s ml-pipeline-ui-5bd8d6dc84-j7wqr 2/2 Running 0 5m2s ml-pipeline-viewer-crd-68fb5f4d58-mbcbg 2/2 Running 1 5m2s ml-pipeline-visualizationserver-8476b5c645-wljfm 2/2 Running 0 5m2s mysql-f7b9b7dd4-xfnw4 2/2 Running 0 5m2s workflow-controller-5cbbb49bd8-5zrwx 2/2 Running 1 5m2s ``` Additionally, please check if the ml-pipeline UI is connected properly. ```bash kubectl port-forward svc/ml-pipeline-ui -n kubeflow 8888:80 ``` Open the web browser and connect to the path [http://localhost:8888/#/pipelines/](http://localhost:8888/#/pipelines/). Confirm that the following screen is displayed. If you get the error "Connection refused on localhost", you can access it through the command line by setting the address, as long as there are no security issues. To check if the ml-pipeline UI connects normally, open the bind of all addresses with 0.0.0.0. ```bash kubectl port-forward --address 0.0.0.0 svc/ml-pipeline-ui -n kubeflow 8888:80 ``` Despite running with the above options, if connection refusal issues still occur, add access permission by allowing all TCP protocol ports in the firewall settings or by adding access permission to port 8888. When you open the web browser and access the path `http://:8888/#/pipelines/`, you can see the ml-pipeline UI screen. 
When accessing the other ports path that is being processed in the bottom, run the command in the same way as above and add the port number to the firewall to run it. English: We will install Katib. ```bash kustomize build apps/katib/upstream/installs/katib-with-kubeflow | kubectl apply -f - ``` If performed normally, it will be output as follows. ```bash customresourcedefinition.apiextensions.k8s.io/experiments.kubeflow.org created customresourcedefinition.apiextensions.k8s.io/suggestions.kubeflow.org created customresourcedefinition.apiextensions.k8s.io/trials.kubeflow.org created serviceaccount/katib-controller created serviceaccount/katib-ui created clusterrole.rbac.authorization.k8s.io/katib-controller created clusterrole.rbac.authorization.k8s.io/katib-ui created clusterrole.rbac.authorization.k8s.io/kubeflow-katib-admin created clusterrole.rbac.authorization.k8s.io/kubeflow-katib-edit created clusterrole.rbac.authorization.k8s.io/kubeflow-katib-view created clusterrolebinding.rbac.authorization.k8s.io/katib-controller created clusterrolebinding.rbac.authorization.k8s.io/katib-ui created configmap/katib-config created configmap/trial-templates created secret/katib-mysql-secrets created service/katib-controller created service/katib-db-manager created service/katib-mysql created service/katib-ui created persistentvolumeclaim/katib-mysql created deployment.apps/katib-controller created deployment.apps/katib-db-manager created deployment.apps/katib-mysql created deployment.apps/katib-ui created certificate.cert-manager.io/katib-webhook-cert created issuer.cert-manager.io/katib-selfsigned-issuer created virtualservice.networking.istio.io/katib-ui created mutatingwebhookconfiguration.admissionregistration.k8s.io/katib.kubeflow.org created validatingwebhookconfiguration.admissionregistration.k8s.io/katib.kubeflow.org created ``` Confirm if it has been installed properly. 
```bash kubectl get po -n kubeflow | grep katib ``` Wait until four pods are Running, like this. ```bash katib-controller-68c47fbf8b-b985z 1/1 Running 0 82s katib-db-manager-6c948b6b76-2d9gr 1/1 Running 0 82s katib-mysql-7894994f88-scs62 1/1 Running 0 82s katib-ui-64bb96d5bf-d89kp 1/1 Running 0 82s ``` Additionally, we will confirm that the Katib UI is connected normally. ```bash kubectl port-forward svc/katib-ui -n kubeflow 8081:80 ``` Open the web browser and access the path [http://localhost:8081/katib/](http://localhost:8081/katib/) to confirm the following screen is displayed. ```bash kustomize build apps/centraldashboard/upstream/overlays/istio | kubectl apply -f - ``` If performed normally, it will be output as follows. ```bash serviceaccount/centraldashboard created role.rbac.authorization.k8s.io/centraldashboard created clusterrole.rbac.authorization.k8s.io/centraldashboard created rolebinding.rbac.authorization.k8s.io/centraldashboard created clusterrolebinding.rbac.authorization.k8s.io/centraldashboard created configmap/centraldashboard-config created configmap/centraldashboard-parameters created service/centraldashboard created deployment.apps/centraldashboard created virtualservice.networking.istio.io/centraldashboard created ``` Check to see if it has been installed normally. ```bash kubectl get po -n kubeflow | grep centraldashboard ``` Wait until one pod related to centraldashboard in the kubeflow namespace becomes Running. ```bash centraldashboard-8fc7d8cc-xl7ts 1/1 Running 0 52s ``` Additionally, we will check if the Central Dashboard UI is connected properly. ```bash kubectl port-forward svc/centraldashboard -n kubeflow 8082:80 ``` Open the web browser to connect to the path [http://localhost:8082/](http://localhost:8082/) and check that the following screen is displayed. ```bash kustomize build apps/admission-webhook/upstream/overlays/cert-manager | kubectl apply -f - ``` If performed normally, it will be output as follows. 
```bash customresourcedefinition.apiextensions.k8s.io/poddefaults.kubeflow.org created serviceaccount/admission-webhook-service-account created clusterrole.rbac.authorization.k8s.io/admission-webhook-cluster-role created clusterrole.rbac.authorization.k8s.io/admission-webhook-kubeflow-poddefaults-admin created clusterrole.rbac.authorization.k8s.io/admission-webhook-kubeflow-poddefaults-edit created clusterrole.rbac.authorization.k8s.io/admission-webhook-kubeflow-poddefaults-view created clusterrolebinding.rbac.authorization.k8s.io/admission-webhook-cluster-role-binding created service/admission-webhook-service created deployment.apps/admission-webhook-deployment created certificate.cert-manager.io/admission-webhook-cert created issuer.cert-manager.io/admission-webhook-selfsigned-issuer created mutatingwebhookconfiguration.admissionregistration.k8s.io/admission-webhook-mutating-webhook-configuration created ``` Check if it is installed normally. ```bash kubectl get po -n kubeflow | grep admission-webhook ``` Wait until one pod is running. ```bash admission-webhook-deployment-667bd68d94-2hhrx 1/1 Running 0 11s ``` Install the Notebook controller. If done successfully, it will output as follows. 
deployment.apps/notebook-controller created ``` A CustomResourceDefinition.apiextensions.k8s.io/notebooks.kubeflow.org, ServiceAccount/notebook-controller-service-account, Role.rbac.authorization.k8s.io/notebook-controller-leader-election-role, ClusterRole.rbac.authorization.k8s.io/notebook-controller-kubeflow-notebooks-admin, ClusterRole.rbac.authorization.k8s.io/notebook-controller-kubeflow-notebooks-edit, ClusterRole.rbac.authorization.k8s.io/notebook-controller-kubeflow-notebooks-view, ClusterRole.rbac.authorization.k8s.io/notebook-controller-role, RoleBinding.rbac.authorization.k8s.io/notebook-controller-leader-election-rolebinding, ClusterRoleBinding.rbac.authorization.k8s.io/notebook-controller-role-binding, ConfigMap/notebook-controller-config-m Check if the installation was successful. Wait until one pod is running with the following command: kubectl get po -n kubeflow | grep notebook-controller. Install Jupyter Web App. If performed correctly, the following will be output. ``` Confirm that the installation was successful: configmap/jupyter-web-app-config-76844k4cd7 created configmap/jupyter-web-app-logos created configmap/jupyter-web-app-parameters-chmg88cm48 created service/jupyter-web-app-service created deployment.apps/jupyter-web-app-deployment created virtualservice.networking.istio.io/jupyter-web-app-jupyter-web-app created Wait until one pod is Running. We will install the Profile Controller. ```bash kustomize build apps/profiles/upstream/overlays/kubeflow | kubectl apply -f - ``` If performed normally, it will be outputted as follows. 
```bash customresourcedefinition.apiextensions.k8s.io/profiles.kubeflow.org created serviceaccount/profiles-controller-service-account created role.rbac.authorization.k8s.io/profiles-leader-election-role created rolebinding.rbac.authorization.k8s.io/profiles-leader-election-rolebinding created clusterrolebinding.rbac.authorization.k8s.io/profiles-cluster-role-binding created configmap/namespace-labels-data-48h7kd55mc created configmap/profiles-config-46c7tgh6fd created service/profiles-kfam created deployment.apps/profiles-deployment created virtualservice.networking.istio.io/profiles-kfam created ``` Check to see if it is installed normally. ```bash kubectl get po -n kubeflow | grep profiles-deployment ``` Wait until one pod is running. ```bash profiles-deployment-89f7d88b-qsnrd 2/2 Running 0 42s ``` Install the Volumes Web App. ```bash kustomize build apps/volumes-web-app/upstream/overlays/istio | kubectl apply -f - ``` If performed normally, it will be output as follows. ```bash serviceaccount/volumes-web-app-service-account created clusterrole.rbac.authorization.k8s.io/volumes-web-app-cluster-role created clusterrole.rbac.authorization.k8s.io/volumes-web-app-kubeflow-volume-ui-admin created clusterrole.rbac.authorization.k8s.io/volumes-web-app-kubeflow-volume-ui-edit created clusterrole.rbac.authorization.k8s.io/volumes-web-app-kubeflow-volume-ui-view created clusterrolebinding.rbac.authorization.k8s.io/volumes-web-app-cluster-role-binding created configmap/volumes-web-app-parameters-4gg8cm2gmk created service/volumes-web-app-service created deployment.apps/volumes-web-app-deployment created virtualservice.networking.istio.io/volumes-web-app-volumes-web-app created ``` Check if it is installed normally. ```bash kubectl get po -n kubeflow | grep volumes-web-app ``` Wait until one pod is running. ```bash volumes-web-app-deployment-8589d664cc-62svl 1/1 Running 0 27s ``` ```bash Install Tensorboard Web App. 
Service account/tensorboards-web-app-service-account created, Cluster role.rbac.authorization.k8s.io/tensorboards-web-app-cluster-role created, Cluster role.rbac.authorization.k8s.io/tensorboards-web-app-kubeflow-tensorboard-ui-admin created, Cluster role.rbac.authorization.k8s.io/tensorboards-web-app-kubeflow-tensorboard-ui-edit created, Cluster role.rbac.authorization.k8s.io/tensorboards-web-app-kubeflow-tensorboard-ui-view created, Cluster role binding.rbac.authorization.k8s.io/tensorboards-web-app-cluster-role-binding created, Config map/tensorboards-web-app-parameters-g28fbd6cch created, Service/tensorboards-web-app-service created, Deployment.apps/tensorboards-web-app-deployment created, and Virtual service.networking.istio.io/t Check if it is installed correctly. ```bash Deployment "tensorboard-web-app-deployment-6ff79b7f44-qbzmw" created deployment.apps/tensorboard-controller-controller-manager created ``` A custom resource definition for 'tensorboards.tensorboard.kubeflow.org' was created, along with a service account, roles, role bindings, a config map, and a deployment for the controller manager metrics service. Check if the deployment.apps/tensorboard-controller-controller-manager was installed correctly. Wait for 1 pod to be Running. Translation: Installing Training Operator. ```bash kustomize build apps/training-operator/upstream/overlays/kubeflow | kubectl apply -f - ``` If performed normally, it will be output as follows. 
```bash customresourcedefinition.apiextensions.k8s.io/mxjobs.kubeflow.org created customresourcedefinition.apiextensions.k8s.io/pytorchjobs.kubeflow.org created customresourcedefinition.apiextensions.k8s.io/tfjobs.kubeflow.org created customresourcedefinition.apiextensions.k8s.io/xgboostjobs.kubeflow.org created serviceaccount/training-operator created clusterrole.rbac.authorization.k8s.io/kubeflow-training-admin created clusterrole.rbac.authorization.k8s.io/kubeflow-training-edit created clusterrole.rbac.authorization.k8s.io/kubeflow-training-view created clusterrole.rbac.authorization.k8s.io/training-operator created clusterrolebinding.rbac.authorization.k8s.io/training-operator created service/training-operator created deployment.apps/training-operator created ``` Check to see if it has been installed normally. ```bash kubectl get po -n kubeflow | grep training-operator ``` Wait until one pod is up and running. ```bash training-operator-7d98f9dd88-6887f 1/1 Running 0 28s ``` ### User Namespace For using Kubeflow, create a Kubeflow Profile for the User to be used. ```bash kustomize build common/user-namespace/base | kubectl apply -f - ``` If performed normally, it will be outputted as follows. ```bash configmap/default-install-config-9h2h2b6hbk created profile.kubeflow.org/kubeflow-user-example-com created ``` Confirm that the kubeflow-user-example-com profile has been created. ```bash kubectl get profile ``` ```bash kubeflow-user-example-com 37s ``` ## Check installation Confirm successful installation by port forwarding to access Kubeflow central dashboard with web browser. ```bash kubectl port-forward svc/istio-ingressgateway -n istio-system 8080:80 ``` Open a web browser and connect to [http://localhost:8080](http://localhost:8080) to confirm that the following screen is displayed. ![login-ui](./img/login-after-install.png) Enter the following connection information to connect. 
- Email Address: `user@example.com` - Password: `12341234` ![central-dashboard](./img/after-login.png) ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/setup-components/install-components-mlflow.md ================================================ --- title : "2. MLflow Tracking Server" description: "구성요소 설치 - MLflow" sidebar_position: 2 date: 2021-12-13 lastmod: 2021-12-13 contributors: ["Jaeyeon Kim"] --- ## Install MLflow Tracking Server MLflow is a popular open-source ML experiment management tool. In addition to [experiment management](https://mlflow.org/docs/latest/tracking.html#tracking), MLflow provides functionalities for ML [model packaging](https://mlflow.org/docs/latest/projects.html#projects), [deployment management](https://mlflow.org/docs/latest/models.html#models), and [model storage](https://mlflow.org/docs/latest/model-registry.html#registry). In *MLOps for ALL*, we will be using MLflow for experiment management purposes. To store the data managed by MLflow and provide a user interface, we will deploy the MLflow Tracking Server on the Kubernetes cluster. ## Before Install MLflow Tracking Server ### Install PostgreSQL DB MLflow Tracking Server deploys a PostgreSQL DB for use as a Backend Store to a Kubernetes cluster. First, create a namespace called `mlflow-system`. ```bash kubectl create ns mlflow-system ``` If the following message is output, it means that it has been generated normally. ```bash namespace/mlflow-system created ``` Create a Postgresql DB in the `mlflow-system` namespace. ```bash kubectl -n mlflow-system apply -f https://raw.githubusercontent.com/mlops-for-all/helm-charts/b94b5fe4133f769c04b25068b98ccfa7a505aa60/mlflow/manifests/postgres.yaml ``` If performed normally, it will be outputted as follows. 
```bash service/postgresql-mlflow-service created deployment.apps/postgresql-mlflow created persistentvolumeclaim/postgresql-mlflow-pvc created ``` Wait until one postgresql related pod is running in the mlflow-system namespace. ```bash kubectl get pod -n mlflow-system | grep postgresql ``` If it is output similar to the following, it has executed normally. ```bash postgresql-mlflow-7b9bc8c79f-srkh7 1/1 Running 0 38s ``` ### Setup Minio We will utilize the Minio that was installed in the previous Kubeflow installation step. However, in order to separate it for kubeflow and mlflow purposes, we will create a mlflow-specific bucket. First, port-forward the minio-service to access Minio and create the bucket. ```bash kubectl port-forward svc/minio-service -n kubeflow 9000:9000 ``` Open a web browser and connect to [localhost:9000](http://localhost:9000) to display the following screen. ![minio-install](./img/minio-install.png) Enter the following credentials to log in: - Username: `minio` - Password: `minio123` Click the **`+`** button on the right side bottom, then click `Create Bucket`. ![create-bucket](./img/create-bucket.png) Enter `mlflow` in `Bucket Name` to create the bucket. If successfully created, you will see a bucket named `mlflow` on the left. ![mlflow-bucket](./img/mlflow-bucket.png) --- ## Let's Install MLflow Tracking Server ### Add Helm Repository ```bash helm repo add mlops-for-all https://mlops-for-all.github.io/helm-charts ``` If the following message is displayed, it means it has been added successfully. ```bash "mlops-for-all" has been added to your repositories ``` ### Update Helm Repository ```bash helm repo update ``` If the following message is displayed, it means that the update has been successfully completed. ```bash Hang tight while we grab the latest from your chart repositories... ...Successfully got an update from the "mlops-for-all" chart repository Update Complete. 
⎈Happy Helming!⎈ ``` ### Helm Install Install mlflow-server Helm Chart version 0.2.0. ```bash helm install mlflow-server mlops-for-all/mlflow-server \ --namespace mlflow-system \ --version 0.2.0 ``` - The above Helm chart installs MLflow with the connection information for its backend store and artifacts store set to the default minio created during the Kubeflow installation process and the postgresql information created in the [Install PostgreSQL DB](#install-postgresql-db) section above. - If you want to use a separate DB or object storage, please refer to the [Helm Chart Repo](https://github.com/mlops-for-all/helm-charts/tree/main/mlflow/chart) and set the values separately during helm install. The following message should be displayed: ```bash NAME: mlflow-server LAST DEPLOYED: Sat Dec 18 22:02:13 2021 NAMESPACE: mlflow-system STATUS: deployed REVISION: 1 TEST SUITE: None ``` Check to see if it was installed normally. ```bash kubectl get pod -n mlflow-system | grep mlflow-server ``` Wait until one mlflow-server related pod is running in the mlflow-system namespace. If it is output similar to the following, then it has been successfully executed. ```bash mlflow-server-ffd66d858-6hm62 1/1 Running 0 74s ``` ### Check installation Let's now check if we can successfully connect to the MLflow Server. First, we will perform port forwarding in order to connect from the client node. ```bash kubectl port-forward svc/mlflow-server-service -n mlflow-system 5000:5000 ``` Open a web browser and connect to [localhost:5000](http://localhost:5000) and the following screen will be output. ![mlflow-install](./img/mlflow-install.png) ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/setup-components/install-components-pg.md ================================================ --- title : "4. 
Prometheus & Grafana" description: "구성요소 설치 - Prometheus & Grafana" sidebar_position: 4 date: 2021-12-13 lastmod: 2021-12-13 contributors: ["Jaeyeon Kim"] --- ## Prometheus & Grafana Prometheus and Grafana are tools for monitoring. For stable service operation, it is necessary to continuously observe the status of the service and infrastructure where the service is operating, and to respond quickly based on the observed metrics when a problem arises. Among the many tools to efficiently perform such monitoring, *MLOps for ALL* will use open source Prometheus and Grafana. For more information, please refer to the [Prometheus Official Documentation](https://prometheus.io/docs/introduction/overview/) and [Grafana Official Documentation](https://grafana.com/docs/). Prometheus is a tool to collect metrics from various targets, and Grafana is a tool to help visualize the gathered data. Although there is no dependency between them, they are often used together, complementing each other. In this page, we will install Prometheus and Grafana on a Kubernetes cluster, then send API requests to a SeldonDeployment created with Seldon-Core and check if metrics are collected successfully. We also install a dashboard to efficiently monitor the metrics of the SeldonDeployment created in Seldon-Core using Helm Chart version 1.12.0 from seldonio/seldon-core-analytics Helm Repository. ### Add Helm Repository ```bash helm repo add seldonio https://storage.googleapis.com/seldon-charts ``` If the following message is output, it means that it has been added successfully. ```bash "seldonio" has been added to your repositories ``` ### Update Helm Repository ```bash helm repo update ``` If the following message is displayed, it means that the update was successful. ```bash Hang tight while we grab the latest from your chart repositories... ...Successfully got an update from the "seldonio" chart repository ...Successfully got an update from the "datawire" chart repository Update Complete. 
⎈Happy Helming!⎈ ``` ### Helm Install Install version 1.12.0 of the seldon-core-analytics Helm Chart. ```bash helm install seldon-core-analytics seldonio/seldon-core-analytics \ --namespace seldon-system \ --version 1.12.0 ``` The following message should be output. ```bash Skip... NAME: seldon-core-analytics LAST DEPLOYED: Tue Dec 14 18:29:38 2021 NAMESPACE: seldon-system STATUS: deployed REVISION: 1 ``` Check to see if it was installed normally. ```bash kubectl get pod -n seldon-system | grep seldon-core-analytics ``` Wait until 6 seldon-core-analytics related pods are Running in the seldon-system namespace. ```bash seldon-core-analytics-grafana-657c956c88-ng8wn 2/2 Running 0 114s seldon-core-analytics-kube-state-metrics-94bb6cb9-svs82 1/1 Running 0 114s seldon-core-analytics-prometheus-alertmanager-64cf7b8f5-nxbl8 2/2 Running 0 114s seldon-core-analytics-prometheus-node-exporter-5rrj5 1/1 Running 0 114s seldon-core-analytics-prometheus-pushgateway-8476474cff-sr4n6 1/1 Running 0 114s seldon-core-analytics-prometheus-seldon-685c664894-7cr45 2/2 Running 0 114s ``` ### Check installation Let's now check if we can connect to Grafana normally. First, we will port forward to connect to the client node. ```bash kubectl port-forward svc/seldon-core-analytics-grafana -n seldon-system 8090:80 ``` Open the web browser and connect to [localhost:8090](http://localhost:8090), then the following screen will be displayed. ![grafana-install](./img/grafana-install.png) Enter the following connection information to connect. - Email or username: `admin` - Password: `password` When you log in, the following screen will be displayed. ![grafana-login](./img/grafana-login.png) Click the dashboard icon on the left and click the `Manage` button. ![dashboard-click](./img/dashboard-click.png) You can see that the basic Grafana dashboard is included. Click the `Prediction Analytics` dashboard among them. 
![dashboard](./img/dashboard.png) The Seldon Core API Dashboard is visible and can be confirmed with the following output. ![seldon-dashboard](./img/seldon-dashboard.png) ## References - [Seldon-Core-Analytics Helm Chart](https://github.com/SeldonIO/seldon-core/tree/master/helm-charts/seldon-core-analytics) ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/setup-components/install-components-seldon.md ================================================ --- title : "3. Seldon-Core" description: "구성요소 설치 - Seldon-Core" sidebar_position: 3 date: 2021-12-13 lastmod: 2021-12-13 contributors: ["Jaeyeon Kim"] --- ## Seldon-Core Seldon-Core is one of the open source frameworks that can deploy and manage numerous machine learning models in Kubernetes environments. For more details, please refer to the official [product description page](https://www.seldon.io/tech/products/core/) and [GitHub](https://github.com/SeldonIO/seldon-core) of Seldon-Core and API Deployment part. ## Installing Seldon-Core In order to use Seldon-Core, a module responsible for the Ingress of Kubernetes, such as Ambassador or Istio, is required (see [the installation guide](https://docs.seldon.io/projects/seldon-core/en/latest/workflow/install.html)). Seldon-Core officially supports only Ambassador and Istio, and *MLOps for ALL* will use Ambassador to use Seldon-Core, so we will install Ambassador. ### Adding Ambassador to the Helm Repository ```bash helm repo add datawire https://www.getambassador.io ``` If the following message is displayed, it means it has been added normally. ```bash "datawire" has been added to your repositories ``` ### Update Ambassador - Helm Repository ```bash helm repo update ``` If the following message is output, it means that the update has been completed normally. ```bash Hang tight while we grab the latest from your chart repositories... ...Successfully got an update from the "datawire" chart repository Update Complete. 
⎈Happy Helming!⎈ ``` ### Ambassador - Helm Install Install version 6.9.3 of the Ambassador Chart. ```bash helm install ambassador datawire/ambassador \ --namespace seldon-system \ --create-namespace \ --set image.repository=quay.io/datawire/ambassador \ --set enableAES=false \ --set crds.keep=false \ --version 6.9.3 ``` The following message should be displayed. ```bash 생략... W1206 17:01:36.026326 26635 warnings.go:70] rbac.authorization.k8s.io/v1beta1 Role is deprecated in v1.17+, unavailable in v1.22+; use rbac.authorization.k8s.io/v1 Role W1206 17:01:36.029764 26635 warnings.go:70] rbac.authorization.k8s.io/v1beta1 RoleBinding is deprecated in v1.17+, unavailable in v1.22+; use rbac.authorization.k8s.io/v1 RoleBinding NAME: ambassador LAST DEPLOYED: Mon Dec 6 17:01:34 2021 NAMESPACE: seldon-system STATUS: deployed REVISION: 1 NOTES: ------------------------------------------------------------------------------- Congratulations! You've successfully installed Ambassador! ------------------------------------------------------------------------------- To get the IP address of Ambassador, run the following commands: NOTE: It may take a few minutes for the LoadBalancer IP to be available. You can watch the status of by running 'kubectl get svc -w --namespace seldon-system ambassador' On GKE/Azure: export SERVICE_IP=$(kubectl get svc --namespace seldon-system ambassador -o jsonpath='{.status.loadBalancer.ingress[0].ip}') On AWS: export SERVICE_IP=$(kubectl get svc --namespace seldon-system ambassador -o jsonpath='{.status.loadBalancer.ingress[0].hostname}') echo http://$SERVICE_IP: For help, visit our Slack at http://a8r.io/Slack or view the documentation online at https://www.getambassador.io. ``` Wait until four pods become running in the seldon-system. 
```bash kubectl get pod -n seldon-system ``` ```bash ambassador-7f596c8b57-4s9xh 1/1 Running 0 7m15s ambassador-7f596c8b57-dt6lr 1/1 Running 0 7m15s ambassador-7f596c8b57-h5l6f 1/1 Running 0 7m15s ambassador-agent-77bccdfcd5-d5jxj 1/1 Running 0 7m15s ``` ### Seldon-Core - Helm Install Install version 1.11.2 of the seldon-core-operator Chart. ```bash helm install seldon-core seldon-core-operator \ --repo https://storage.googleapis.com/seldon-charts \ --namespace seldon-system \ --set usageMetrics.enabled=true \ --set ambassador.enabled=true \ --version 1.11.2 ``` The following message should be displayed. ```bash Skip... W1206 17:05:38.336391 28181 warnings.go:70] admissionregistration.k8s.io/v1beta1 ValidatingWebhookConfiguration is deprecated in v1.16+, unavailable in v1.22+; use admissionregistration.k8s.io/v1 ValidatingWebhookConfiguration NAME: seldon-core LAST DEPLOYED: Mon Dec 6 17:05:34 2021 NAMESPACE: seldon-system STATUS: deployed REVISION: 1 TEST SUITE: None ``` Wait until one seldon-controller-manager pod is Running in the seldon-system namespace. ```bash kubectl get pod -n seldon-system | grep seldon-controller ``` ```bash seldon-controller-manager-8457b8b5c7-r2frm 1/1 Running 0 2m22s ``` ## References - [Example Model Servers with Seldon](https://docs.seldon.io/projects/seldon-core/en/latest/examples/server_examples.html#examples-server-examples--page-root) ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/setup-kubernetes/_category_.json ================================================ { "label": "Setup Kubernetes", "position": 2, "link": { "type": "generated-index" } } ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/setup-kubernetes/install-kubernetes/_category_.json ================================================ { "label": "4. 
Install Kubernetes", "position": 4, "link": { "type": "generated-index" } } ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/setup-kubernetes/install-kubernetes/kubernetes-with-k3s.md ================================================ --- title: "4.1. K3s" description: "" sidebar_position: 1 date: 2021-12-13 lastmod: 2021-12-20 draft: false weight: 221 contributors: ["Jongseob Jeon"] menu: docs: parent: "../setup-kubernetes" images: [] --- ## 1. Prerequisite Before setting up a Kubernetes cluster, install the necessary components on the **cluster**. Please refer to [Install Prerequisite](../../setup-kubernetes/install-prerequisite.md) to install the necessary components on the **cluster** before installing Kubernetes. k3s uses containerd as the backend by default. However, we need to use docker as the backend to use GPU, so we will install the backend with the `--docker` option. ```bash curl -sfL https://get.k3s.io | INSTALL_K3S_VERSION=v1.21.7+k3s1 sh -s - server --disable traefik --disable servicelb --disable local-storage --docker ``` After installing k3s, check the k3s config. ```bash sudo cat /etc/rancher/k3s/k3s.yaml ``` If installed correctly, the following items will be output. (Security related keys are hidden with <...>.) ```bash apiVersion: v1 clusters: - cluster: certificate-authority-data: <...> server: https://127.0.0.1:6443 name: default contexts: - context: cluster: default user: default name: default current-context: default kind: Config preferences: {} users: - name: default user: client-certificate-data: <...> client-key-data: <...> ``` ## 2. Setup Kubernetes Cluster Set up the Kubernetes cluster by copying the k3s config to be used as the cluster’s kubeconfig. ```bash mkdir .kube sudo cp /etc/rancher/k3s/k3s.yaml .kube/config ``` Grant user access permission to the copied config file. ```bash sudo chown $USER:$USER .kube/config ``` ## 3. 
Setup Kubernetes Client Now move the kubeconfig configured in the cluster to the local. Set the path to `~/.kube/config` on the local. The config file copied at first has the server ip set to `https://127.0.0.1:6443`. Modify this value to match the ip of the cluster. (We modified it to `https://192.168.0.19:6443` to match the ip of the cluster used in this page.) ```bash apiVersion: v1 clusters: - cluster: certificate-authority-data: <...> server: https://192.168.0.19:6443 name: default contexts: - context: cluster: default user: default name: default current-context: default kind: Config preferences: {} users: - name: default user: client-certificate-data: <...> client-key-data: <...> ``` ## 4. Install Kubernetes Default Modules Please refer to [Setup Kubernetes Modules](../../setup-kubernetes/install-kubernetes-module.md) to install the following components: - helm - kustomize - CSI plugin - [Optional] nvidia-docker, nvidia-device-plugin ## 5. Verify Successful Installation Finally, check if the nodes are Ready and verify the OS, Docker, and Kubernetes versions. ```bash kubectl get nodes -o wide ``` If you see the following message, it means that the installation was successful. ```bash NAME STATUS ROLES AGE VERSION INTERNAL-IP EXTERNAL-IP OS-IMAGE KERNEL-VERSION CONTAINER-RUNTIME ubuntu Ready control-plane,master 11m v1.21.7+k3s1 192.168.0.19 Ubuntu 20.04.3 LTS 5.4.0-91-generic docker://20.10.11 ``` ## 6. References - [https://rancher.com/docs/k3s/latest/en/installation/install-options/](https://rancher.com/docs/k3s/latest/en/installation/install-options/) ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/setup-kubernetes/install-kubernetes/kubernetes-with-kubeadm.md ================================================ --- title: "4.3. Kubeadm" description: "" sidebar_position: 3 date: 2021-12-13 lastmod: 2021-12-20 contributors: ["Youngcheol Jang"] --- ## 1. 
Prerequisite Before building a Kubernetes cluster, install the necessary components to the **cluster**. Please refer to [Install Prerequisite](../../setup-kubernetes/install-prerequisite.md) and install the necessary components to the **cluster**. Change the configuration of the network for Kubernetes. ```bash sudo modprobe br_netfilter cat < Ubuntu 20.04.3 LTS 5.4.0-91-generic docker://20.10.11 ``` ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/setup-kubernetes/install-kubernetes-module.md ================================================ --- title: "5. Install Kubernetes Modules" description: "Install Helm, Kustomize" sidebar_position: 5 date: 2021-12-13 lastmod: 2021-12-20 contributors: ["Jaeyeon Kim"] --- ## Setup Kubernetes Modules On this page, we will explain how to install the modules that will be used on the cluster from the client nodes. All the processes introduced here will be done on the **client nodes**. ## Helm Helm is one of the package management tools that helps to deploy and manage resources related to Kubernetes packages at once. 1. Download Helm version 3.7.1 into the current folder. - For Linux amd64 ```bash wget https://get.helm.sh/helm-v3.7.1-linux-amd64.tar.gz ``` - Other OS refer to the [official website](https://github.com/helm/helm/releases/tag/v3.7.1) for the download path of the binary that matches the OS and CPU of your client node. 2. Unzip the file to use helm and move the file to its desired location. ```bash tar -zxvf helm-v3.7.1-linux-amd64.tar.gz sudo mv linux-amd64/helm /usr/local/bin/helm ``` 3. Check to see if the installation was successful: ```bash helm help ``` If you see the following message, it means that it has been installed normally. 
```bash The Kubernetes package manager Common actions for Helm: - helm search: search for charts - helm pull: download a chart to your local directory to view - helm install: upload the chart to Kubernetes - helm list: list releases of charts Environment variables: | Name | Description | |--------------------------|---------------------------------------------------------------------| | $HELM_CACHE_HOME | set an alternative location for storing cached files. | | $HELM_CONFIG_HOME | set an alternative location for storing Helm configuration. | | $HELM_DATA_HOME | set an alternative location for storing Helm data. | ... ``` ## Kustomize Kustomize is one of the package management tools that helps to deploy and manage multiple Kubernetes resources at once. 1. Download the binary version of kustomize v3.10.0 in the current folder. - For Linux amd64 ```bash wget https://github.com/kubernetes-sigs/kustomize/releases/download/kustomize%2Fv3.10.0/kustomize_v3.10.0_linux_amd64.tar.gz ``` - Other OS can be downloaded from [kustomize/v3.10.0](https://github.com/kubernetes-sigs/kustomize/releases/tag/kustomize%2Fv3.10.0) after checking. 2. Unzip to use kustomize, and change the file location. ```bash tar -zxvf kustomize_v3.10.0_linux_amd64.tar.gz sudo mv kustomize /usr/local/bin/kustomize ``` 3. Check if it is installed correctly. ```bash kustomize help ``` If you see the following message, it means that it has been installed normally. ```bash Manages declarative configuration of Kubernetes. See https://sigs.k8s.io/kustomize Usage: kustomize [command] Available Commands: build Print configuration per contents of kustomization.yaml cfg Commands for reading and writing configuration. completion Generate shell completion script create Create a new kustomization in the current directory edit Edits a kustomization file fn Commands for running functions against configuration. ... ``` ## CSI Plugin : Local Path Provisioner 1. 
The CSI Plugin is a module that is responsible for storage within Kubernetes. Install the CSI Plugin, Local Path Provisioner, which is easy to use in single node clusters. ```bash kubectl apply -f https://raw.githubusercontent.com/rancher/local-path-provisioner/v0.0.20/deploy/local-path-storage.yaml ``` If you see the following messages, it means that the installation was successful: ```bash namespace/local-path-storage created serviceaccount/local-path-provisioner-service-account created clusterrole.rbac.authorization.k8s.io/local-path-provisioner-role created clusterrolebinding.rbac.authorization.k8s.io/local-path-provisioner-bind created deployment.apps/local-path-provisioner created storageclass.storage.k8s.io/local-path created configmap/local-path-config created ``` 2. Also, check if the provisioner pod in the local-path-storage namespace is Running by executing the following command: ```bash kubectl -n local-path-storage get pod ``` If successful, it will display the following output: ```bash NAME READY STATUS RESTARTS AGE local-path-provisioner-d744ccf98-xfcbk 1/1 Running 0 7m ``` 4. Execute the following command to change the default storage class: ```bash kubectl patch storageclass local-path -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}' ``` If the command is successful, the following output will be displayed: ```bash storageclass.storage.k8s.io/local-path patched ``` 5. Verify that the default storage class has been set: ```bash kubectl get sc ``` Check if there is a storage class with the name `local-path (default)` in the NAME column: ```bash NAME PROVISIONER RECLAIMPOLICY VOLUMEBINDINGMODE ALLOWVOLUMEEXPANSION AGE local-path (default) rancher.io/local-path Delete WaitForFirstConsumer false 2h ``` ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/setup-kubernetes/install-prerequisite.md ================================================ --- title: "3. 
Install Prerequisite" description: "Install docker" sidebar_position: 3 date: 2021-12-13 lastmod: 2021-12-20 contributors: ["Jaeyeon Kim", "Jongsun Shinn", "Sangwoo Shim"] --- On this page, we describe the components that need to be installed or configured on the **Cluster** and **Client** prior to installing Kubernetes. ## Install apt packages In order to enable smooth communication between the Client and the Cluster, Port-Forwarding needs to be performed. To enable Port-Forwarding, the following packages need to be installed on the **Cluster**. ```bash sudo apt-get update sudo apt-get install -y socat ``` ## Install Docker 1. Install apt packages for docker. ```bash sudo apt-get update && sudo apt-get install -y ca-certificates curl gnupg lsb-release ``` 2. add docker official GPG key. ```bash curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg ``` 3. When installing Docker using the apt package manager, configure it to retrieve from the stable repository: ```bash echo \ "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu \ $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null ``` 4. Check the currently available Docker versions for installation: ```bash sudo apt-get update && apt-cache madison docker-ce ``` Verify if the version `5:20.10.11~3-0~ubuntu-focal` is listed among the output: ```bash apt-cache madison docker-ce | grep 5:20.10.11~3-0~ubuntu-focal ``` If the addition was successful, the following output will be displayed: ```bash docker-ce | 5:20.10.11~3-0~ubuntu-focal | https://download.docker.com/linux/ubuntu focal/stable amd64 Packages ``` 5. Install Docker version `5:20.10.11~3-0~ubuntu-focal`: ```bash sudo apt-get install -y containerd.io docker-ce=5:20.10.11~3-0~ubuntu-focal docker-ce-cli=5:20.10.11~3-0~ubuntu-focal ``` 6. Check docker is installed. 
```bash sudo docker run hello-world ``` If added successfully, it will output as follows: ```bash mlops@ubuntu:~$ sudo docker run hello-world Hello from Docker! This message shows that your installation appears to be working correctly. To generate this message, Docker took the following steps: 1. The Docker client contacted the Docker daemon. 2. The Docker daemon pulled the "hello-world" image from the Docker Hub. (amd64) 3. The Docker daemon created a new container from that image which runs the executable that produces the output you are currently reading. 4. The Docker daemon streamed that output to the Docker client, which sent it to your terminal. To try something more ambitious, you can run an Ubuntu container with: $ docker run -it ubuntu bash Share images, automate workflows, and more with a free Docker ID: https://hub.docker.com/ For more examples and ideas, visit: https://docs.docker.com/get-started/ ``` 7. Add permissions to use Docker commands without the `sudo` keyword by executing the following commands: ```bash sudo groupadd docker sudo usermod -aG docker $USER newgrp docker ``` 8. To verify that you can now use Docker commands without `sudo`, run the `docker run` command again: ```bash docker run hello-world ``` If you see the following message after executing the command, it means that the permissions have been successfully added: ```bash mlops@ubuntu:~$ docker run hello-world Hello from Docker! This message shows that your installation appears to be working correctly. To generate this message, Docker took the following steps: 1. The Docker client contacted the Docker daemon. 2. The Docker daemon pulled the "hello-world" image from the Docker Hub. (amd64) 3. The Docker daemon created a new container from that image which runs the executable that produces the output you are currently reading. 4. The Docker daemon streamed that output to the Docker client, which sent it to your terminal. 
To try something more ambitious, you can run an Ubuntu container with: $ docker run -it ubuntu bash Share images, automate workflows, and more with a free Docker ID: https://hub.docker.com/ For more examples and ideas, visit: https://docs.docker.com/get-started/ ``` ## Turn off Swap Memory In order for kubelet to work properly, **cluster** nodes must turn off the virtual memory called swap. The following command turns off the swap. **(When using cluster and client on the same desktop, turning off swap memory may result in a slowdown in speed)** ```bash sudo sed -i '/ swap / s/^\(.*\)$/#\1/g' /etc/fstab sudo swapoff -a ``` ## Install Kubectl kubectl is a client tool used to make API requests to a Kubernetes cluster. It needs to be installed on the client node. 1. Download kubectl version v1.21.7 to the current folder: ```bash curl -LO https://dl.k8s.io/release/v1.21.7/bin/linux/amd64/kubectl ``` 2. Change the file permissions and move it to the appropriate location to make kubectl executable: ```bash sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl ``` 3. Verify that kubectl is installed correctly: ```bash kubectl version --client ``` If you see the following message, it means that kubectl is installed successfully: ```bash Client Version: version.Info{Major:"1", Minor:"21", GitVersion:"v1.21.7", GitCommit:"1f86634ff08f37e54e8bfcd86bc90b61c98f84d4", GitTreeState:"clean", BuildDate:"2021-11-17T14:41:19Z", GoVersion:"go1.16.10", Compiler:"gc", Platform:"linux/amd64"} ``` 4. 
If you work with multiple Kubernetes clusters and need to manage multiple kubeconfig files or kube-contexts efficiently, you can refer to the following resources: - [Configuring Multiple kubeconfig on Your Machine](https://dev.to/aabiseverywhere/configuring-multiple-kubeconfig-on-your-machine-59eo) - [kubectx - Switch between Kubernetes contexts easily](https://github.com/ahmetb/kubectx) ## References - [Install Docker Engine on Ubuntu](https://docs.docker.com/engine/install/ubuntu/) - [Install and Set Up kubectl on Linux](https://kubernetes.io/docs/tasks/tools/install-kubectl-linux/) ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/setup-kubernetes/intro.md ================================================ --- title: "1. Introduction" description: "Setup Introduction" sidebar_position: 1 date: 2021-12-13 lastmod: 2021-12-13 contributors: ["Jaeyeon Kim", "Jongsun Shinn", "Youngdon Tae", "SeungTae Kim"] --- ## Build MLOps System The biggest barrier when studying MLOps is the difficulty of setting up and using an MLOps system. Using public cloud platforms like AWS or GCP, or commercial tools like Weights & Biases or neptune.ai, can be costly, and starting from scratch to build the entire environment can be overwhelming and confusing. To address these challenges and help those who haven't been able to start with MLOps, *MLOps for ALL* will guide you on how to build and use an MLOps system from scratch, requiring only a desktop with Ubuntu installed. For those who cannot prepare a Ubuntu desktop environment, use virtual machines to set up the environment. > If you are using Windows or an Intel-based Mac for the *MLOps for ALL* practical exercises, you can prepare an Ubuntu desktop environment using virtual machine software such as VirtualBox or VMware. Please make sure to meet the recommended specifications when creating the virtual machine. 
> However, for those using an M1 Mac, as of the date of writing (February 2022), VirtualBox and VMware are not available. ([Check if macOS apps are optimized for M1 Apple Silicon Mac](https://isapplesiliconready.com/kr)) > Therefore, if you are not using a cloud environment, you can install UTM, a virtual machine app for Mac, to use virtual machines. > (Purchasing and downloading software from the App Store is a form of donation-based payment. The free version is sufficient as it only differs in automatic updates.) > This virtual machine software supports the *Ubuntu 20.04.3 LTS* practice operating system, enabling you to perform the exercises on an M1 Mac. However, since it is not possible to use all the elements described in the [Components of MLOps](../introduction/component.md), *MLOps for ALL* will mainly focus on installing the representative open source software and connecting them to each other. This does not mean that the open source software installed in *MLOps for ALL* is the standard; we recommend choosing the appropriate tool that fits your situation. ## Components The components of the MLOps system that we will make in this article and each version have been verified in the following environment. To facilitate smooth testing, I will explain the setup of the **Cluster** and **Client** as separate entities. The **Cluster** refers to a single desktop with Ubuntu installed. The **Client** is recommended to be a different desktop, such as a laptop or another desktop with access to the Cluster or Kubernetes installation. However, if you only have one machine available, you can use the same desktop for both Cluster and Client purposes. ### Cluster #### 1. Software Below is the list of software that needs to be installed on the Cluster: | Software | Version | | --------------- | ----------- | | Ubuntu | 20.04.3 LTS | | Docker (Server) | 20.10.11 | | NVIDIA Driver | 470.86 | | Kubernetes | v1.21.7 | | Kubeflow | v1.4.0 | | MLFlow | v1.21.0 | #### 2. 
Helm Chart Below is the list of third-party software that needs to be installed using Helm: | Helm Chart Repo Name | Version | | ----------------------------- | ------- | | datawire/ambassador | 6.9.3 | | seldonio/seldon-core-operator | 1.11.2 | ### Client The Client has been validated on MacOS (Intel CPU) and Ubuntu 20.04. | Software | Version | | --------------- | ----------| | kubectl | v1.21.7 | | helm | v3.7.1 | | kustomize | v3.10.0 | ### Minimum System Requirements It is recommended that the Cluster meet the following specifications, which are dependent on the recommended specifications for Kubernetes and Kubeflow: - CPU: 6 cores - RAM: 12GB - DISK: 50GB - GPU: NVIDIA GPU (optional) ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/setup-kubernetes/kubernetes.md ================================================ --- title : "2. Setup Kubernetes" description: "Setup Kubernetes" sidebar_position: 2 date: 2021-12-13 lastmod: 2021-12-13 contributors: ["Jaeyeon Kim"] --- ## Setup Kubernetes Cluster For those learning Kubernetes for the first time, the first barrier to entry is setting up a Kubernetes practice environment. The official tool that supports building a production-level Kubernetes cluster is kubeadm, but there are also tools such as kubespray and kops that help users set up more easily, and tools such as k3s, minikube, microk8s, and kind that help you set up a compact Kubernetes cluster easily for learning purposes. Each tool has its own advantages and disadvantages, so considering the preferences of each user, this article will use three tools: kubeadm, k3s, and minikube to set up a Kubernetes cluster. For detailed comparisons of each tool, please refer to the official Kubernetes [documentation](https://kubernetes.io/docs/tasks/tools/). *MLOps for ALL* recommends **k3s** as a tool that is easy to use when setting up a Kubernetes cluster. 
If you want to use all the features of Kubernetes and configure the nodes, we recommend **kubeadm**. **minikube** has the advantage of being able to easily install other Kubernetes in an add-on format, in addition to the components we describe. In this *MLOps for ALL*, in order to use the components that will be built for MLOps smoothly, there are additional settings that must be configured when building the Kubernetes cluster using each of the tools. The scope of this **Setup Kubernetes** section is to build a k8s cluster on a desktop that already has Ubuntu OS installed and to confirm that external client nodes can access the Kubernetes cluster. The detailed setup procedure is composed of the following flow, as each of the three tools has its own setup procedure. ```bash 3. Setup Prerequisite 4. Setup Kubernetes 4.1. with k3s 4.2. with minikube 4.3. with kubeadm 5. Setup Kubernetes Modules ``` Let's now build a Kubernetes cluster by using each of the tools. You don't have to use all the tools, and you can use the tools that you are familiar with. ================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0/setup-kubernetes/setup-nvidia-gpu.md ================================================ --- title: "6. (Optional) Setup GPU" description: "Install nvidia docker, nvidia device plugin" sidebar_position: 6 date: 2021-12-13 lastmod: 2021-12-13 contributors: ["Jaeyeon Kim"] --- For using GPU in Kubernetes and Kubeflow, the following tasks are required. ## 1. Install NVIDIA Driver If the following screen is output when executing `nvidia-smi`, please omit this step. ```bash mlops@ubuntu:~$ nvidia-smi +-----------------------------------------------------------------------------+ | NVIDIA-SMI 470.86 Driver Version: 470.86 CUDA Version: 11.4 | |-------------------------------+----------------------+----------------------+ | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. 
ECC | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |===============================+======================+======================| | 0 NVIDIA GeForce ... Off | 00000000:01:00.0 Off | N/A | | 25% 32C P8 4W / 120W | 211MiB / 6078MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ | 1 NVIDIA GeForce ... Off | 00000000:02:00.0 Off | N/A | | 0% 34C P8 7W / 175W | 5MiB / 7982MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ +-----------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=============================================================================| | 0 N/A N/A 1644 G /usr/lib/xorg/Xorg 198MiB | | 0 N/A N/A 1893 G /usr/bin/gnome-shell 10MiB | | 1 N/A N/A 1644 G /usr/lib/xorg/Xorg 4MiB | +-----------------------------------------------------------------------------+ ``` If the output of nvidia-smi is not as above, please install the nvidia driver that fits your installed GPU. If you are not familiar with the installation of nvidia drivers, please install it through the following command. ```bash sudo add-apt-repository ppa:graphics-drivers/ppa sudo apt update && sudo apt install -y ubuntu-drivers-common sudo ubuntu-drivers autoinstall sudo reboot ``` ## 2. Install NVIDIA-Docker. Let's install NVIDIA-Docker. ```bash curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | \ sudo apt-key add - distribution=$(. /etc/os-release;echo $ID$VERSION_ID) curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list sudo apt-get update sudo apt-get install -y nvidia-docker2 && sudo systemctl restart docker ``` To check if it is installed correctly, we will run the docker container using the GPU. 
```bash sudo docker run --rm --gpus all nvidia/cuda:11.0-base nvidia-smi ``` If the following message appears, it means that the installation was successful: ```bash mlops@ubuntu:~$ sudo docker run --rm --gpus all nvidia/cuda:11.0-base nvidia-smi +-----------------------------------------------------------------------------+ | NVIDIA-SMI 470.86 Driver Version: 470.86 CUDA Version: 11.4 | |-------------------------------+----------------------+----------------------+ | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |===============================+======================+======================| | 0 NVIDIA GeForce ... Off | 00000000:01:00.0 Off | N/A | | 25% 32C P8 4W / 120W | 211MiB / 6078MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ | 1 NVIDIA GeForce ... Off | 00000000:02:00.0 Off | N/A | | 0% 34C P8 6W / 175W | 5MiB / 7982MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ +-----------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=============================================================================| +-----------------------------------------------------------------------------+ ``` ## 3. Setting NVIDIA-Docker as the Default Container Runtime By default, Kubernetes uses Docker-CE as the default container runtime. To use NVIDIA GPU within Docker containers, you need to configure NVIDIA-Docker as the container runtime and modify the default runtime for creating pods. 1. Open the `/etc/docker/daemon.json` file and make the following modifications: ```bash sudo vi /etc/docker/daemon.json { "default-runtime": "nvidia", "runtimes": { "nvidia": { "path": "nvidia-container-runtime", "runtimeArgs": [] } } } ``` 2. 
After confirming the file changes, restart Docker. ```bash sudo systemctl daemon-reload sudo service docker restart ``` 3. Verify that the changes have been applied. ```bash sudo docker info | grep nvidia ``` If you see the following message, it means that the installation was successful. ```bash mlops@ubuntu:~$ docker info | grep nvidia Runtimes: io.containerd.runc.v2 io.containerd.runtime.v1.linux nvidia runc Default Runtime: nvidia ``` ## 4. Nvidia-Device-Plugin 1. Create the nvidia-device-plugin daemonset. ```bash kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.10.0/nvidia-device-plugin.yml ``` 2. Verify that the nvidia-device-plugin pod is in the RUNNING state. ```bash kubectl get pod -n kube-system | grep nvidia ``` You should see the following output: ```bash kube-system nvidia-device-plugin-daemonset-nlqh2 1/1 Running 0 1h ``` 3. Verify that the nodes have been configured to have GPUs available. ```bash kubectl get nodes "-o=custom-columns=NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu" ``` If you see the following message, it means that the configuration was successful. (In the *MLOps for ALL* tutorial cluster, there are two GPUs, so the output is 2. If the output shows the correct number of GPUs for your cluster, it is fine.) ```bash NAME GPU ubuntu 2 ``` If it is not configured, the GPU value will be displayed as ``. 
================================================ FILE: i18n/en/docusaurus-plugin-content-docs/version-1.0.json ================================================ { "version.label": { "message": "1.0", "description": "The label for version 1.0" }, "sidebar.tutorialSidebar.category.Introduction": { "message": "Introduction", "description": "The label for category Introduction in sidebar tutorialSidebar" }, "sidebar.tutorialSidebar.category.Setup Kubernetes": { "message": "Setup Kubernetes", "description": "The label for category Setup Kubernetes in sidebar tutorialSidebar" }, "sidebar.tutorialSidebar.category.4. Install Kubernetes": { "message": "4. Install Kubernetes", "description": "The label for category 4. Install Kubernetes in sidebar tutorialSidebar" }, "sidebar.tutorialSidebar.category.Setup Components": { "message": "Setup Components", "description": "The label for category Setup Components in sidebar tutorialSidebar" }, "sidebar.tutorialSidebar.category.Kubeflow UI Guide": { "message": "Kubeflow UI Guide", "description": "The label for category Kubeflow UI Guide in sidebar tutorialSidebar" }, "sidebar.tutorialSidebar.category.Kubeflow": { "message": "Kubeflow", "description": "The label for category Kubeflow in sidebar tutorialSidebar" }, "sidebar.tutorialSidebar.category.API Deployment": { "message": "API Deployment", "description": "The label for category API Deployment in sidebar tutorialSidebar" }, "sidebar.tutorialSidebar.category.Appendix": { "message": "Appendix", "description": "The label for category Appendix in sidebar tutorialSidebar" }, "sidebar.tutorialSidebar.category.Further Readings": { "message": "Further Readings", "description": "The label for category Further Readings in sidebar tutorialSidebar" }, "sidebar.preSidebar.category.Docker": { "message": "Docker", "description": "The label for category Docker in sidebar preSidebar" } } ================================================ FILE: 
i18n/en/docusaurus-plugin-content-docs-community/current/community/community.md ================================================ --- title: "Community" sidebar_position: 1 --- ### *MLOps for ALL* 릴리즈 소식 새로운 포스트나 수정사항은 [Announcements](https://github.com/mlops-for-all/mlops-for-all.github.io/discussions/categories/announcements)에서 확인할 수 있습니다. ### Question 프로젝트 내용과 관련된 궁금점은 [Q&A](https://github.com/mlops-for-all/mlops-for-all.github.io/discussions/categories/q-a)를 통해 질문할 수 있습니다. ### Suggestion 제안점은 [Ideas](https://github.com/mlops-for-all/mlops-for-all.github.io/discussions/categories/ideas)를 통해 제안해 주시면 됩니다. ================================================ FILE: i18n/en/docusaurus-plugin-content-docs-community/current/community/contributors.md ================================================ --- sidebar_position: 3 --- # Contributors ## Main Authors import { MainAuthorRow, } from '@site/src/components/TeamProfileCards'; ## Contributors Thank you for contributing our tutorials! import { ContributorsRow, } from '@site/src/components/TeamProfileCards'; ================================================ FILE: i18n/en/docusaurus-plugin-content-docs-community/current/community/how-to-contribute.md ================================================ --- title: "How to Contribute" sidebar_position: 2 --- ## How to Start ### Git Repo 준비 1. [*MLOps for ALL* GitHub Repository](https://github.com/mlops-for-all/mlops-for-all.github.io)에 접속합니다. 2. 여러분의 개인 Repository로 `Fork`합니다. 3. Forked Repository를 여러분의 작업 환경으로 `git clone`합니다. ### 환경 설정 1. MLOps for ALL는 Hugo 와 Node를 이용하고 있습니다. 다음 명령어를 통해 필요한 패키지가 설치되어 있는지 확인합니다. - node & npm ```bash npm --version ``` - hugo ```bash hugo version ``` 1. 필요한 node module을 설치합니다. ```bash npm install ``` 2. 프로젝트에서는 각 글의 일관성을 위해서 여러 markdown lint를 적용하고 있습니다. 다음 명령어를 실행해 test를 진행한 후 커밋합니다.내용 수정 및 추가 후 lint가 맞는지 확인합니다. ```bash npm test ``` 4. lint 확인 완료 후 ci 를 실행합니다. ```bash npm ci ``` 4. 로컬에서 실행 후 수정한 글이 정상적으로 나오는지 확인합니다. 
```bash npm run start ``` ## How to Contribute ### 1. 새로운 포스트를 작성할 때 새로운 포스트는 각 챕터와 포스트의 위치에 맞는 weight를 설정합니다. - Introduction: 1xx - Setup: 2xx - Kubeflow: 3xx - API Deployment: 4xx - Help: 10xx ### 2. 기존의 포스트를 수정할 때 기존의 포스트를 수정할 때 Contributor에 본인의 이름을 입력합니다. ```markdown contributors: ["John Doe", "Adam Smith"] ``` ### 3. 프로젝트에 처음 기여할 때 만약 프로젝트에 처음 기여 할 때 `content/kor/contributors`에 본인의 이름으로 폴더를 생성한 후, `_index.md`라는 파일을 작성합니다. 예를 들어, `minsoo kim`이 본인의 영어 이름이라면, 폴더명은 `minsoo-kim`으로 하여 해당 폴더 내부의 `_index.md`파일에 다음의 내용을 작성합니다. 폴더명은 하이픈(-)으로 연결한 소문자로, title은 띄어쓰기를 포함한 CamelCase로 작성합니다. ```markdown --- title: "John Doe" draft: false --- ``` ## After Pull Request Pull Request를 생성하면 프로젝트에서는 자동으로 *MLOps for ALL* 운영진에게 리뷰 요청이 전해집니다. 최대 일주일 이내로 확인 후 Comment를 드릴 예정입니다. ================================================ FILE: i18n/en/docusaurus-plugin-content-docs-community/current.json ================================================ { "version.label": { "message": "Next", "description": "The label for version current" } } ================================================ FILE: i18n/en/docusaurus-theme-classic/footer.json ================================================ { "copyright": { "message": "Copyright © 2021-2023 MakinaRocks. 
Built with Docusaurus.", "description": "The footer copyright" } } ================================================ FILE: i18n/en/docusaurus-theme-classic/navbar.json ================================================ { "title": { "message": "MLOps for ALL", "description": "The title in the navbar" }, "logo.alt": { "message": "My Site Logo", "description": "The alt text of navbar logo" }, "item.label.Tutorial": { "message": "Tutorial", "description": "Navbar item with label Tutorial" }, "item.label.Prerequisites": { "message": "Prerequisites", "description": "Navbar item with label Prerequisites" }, "item.label.Community": { "message": "Community", "description": "Navbar item with label Community" }, "item.label.GitHub": { "message": "GitHub", "description": "Navbar item with label GitHub" } } ================================================ FILE: i18n/ko/code.json ================================================ { "team.profile.Jongseob Jeon.body": { "message": "마키나락스에서 머신러닝 엔지니어로 일하고 있습니다. 모두의 딥러닝을 통해 많은 사람들이 딥러닝을 쉽게 접했듯이 모두의 MLOps를 통해 많은 사람들이 MLOps에 쉽게 접할수 있길 바랍니다." }, "team.profile.Jaeyeon Kim.body": { "message": "비효율적인 작업을 자동화하는 것에 관심이 많습니다." }, "team.profile.Youngchel Jang.body": { "message": "마키나락스에서 MLOps Engineer로 일하고 있습니다. 단순하게 생각하는 노력을 하고 있습니다." }, "team.profile.Jongsun Shinn.body": { "message": "마키나락스에서 ML Engineer로 일하고 있습니다." }, "team.profile.Sangwoo Shim.body": { "message": "마키나락스에서 CTO로 일하고 있습니다. 마키나락스는 머신러닝 기반의 산업용 AI 솔루션을 개발하는 스타트업입니다. 산업 현장의 문제 해결을 통해 사람이 본연의 일에 집중할 수 있게 만드는 것, 그것이 우리가 하는 일입니다." }, "team.profile.Seunghyun Ko.body": { "message": "3i에서 MLOps Engineer로 일하고 있습니다. kubeflow에 관심이 많습니다." }, "team.profile.SeungTae Kim.body": { "message": "Genesis Lab이라는 스타트업에서 Applied AI Engineer 인턴 업무를 수행하고 있습니다. 머신러닝 생태계가 우리 산업 전반에 큰 변화을 가져올 것이라 믿으며, 한 걸음씩 나아가고 있습니다." }, "team.profile.Youngdon Tae.body": { "message": "백패커에서 ML 엔지니어로 일하고 있습니다. 자연어처리, 추천시스템, MLOps에 관심이 많습니다." 
}, "theme.ErrorPageContent.title": { "message": "페이지에 오류가 발생하였습니다.", "description": "The title of the fallback page when the page crashed" }, "theme.NotFound.title": { "message": "페이지를 찾을 수 없습니다.", "description": "The title of the 404 page" }, "theme.NotFound.p1": { "message": "원하는 페이지를 찾을 수 없습니다.", "description": "The first paragraph of the 404 page" }, "theme.NotFound.p2": { "message": "사이트 관리자에게 링크가 깨진 것을 알려주세요.", "description": "The 2nd paragraph of the 404 page" }, "theme.admonition.note": { "message": "노트", "description": "The default label used for the Note admonition (:::note)" }, "theme.admonition.tip": { "message": "팁", "description": "The default label used for the Tip admonition (:::tip)" }, "theme.admonition.danger": { "message": "위험", "description": "The default label used for the Danger admonition (:::danger)" }, "theme.admonition.info": { "message": "정보", "description": "The default label used for the Info admonition (:::info)" }, "theme.admonition.caution": { "message": "주의", "description": "The default label used for the Caution admonition (:::caution)" }, "theme.BackToTopButton.buttonAriaLabel": { "message": "맨 위로 스크롤하기", "description": "The ARIA label for the back to top button" }, "theme.blog.paginator.navAriaLabel": { "message": "블로그 게시물 목록 탐색", "description": "The ARIA label for the blog pagination" }, "theme.blog.paginator.newerEntries": { "message": "이전 페이지", "description": "The label used to navigate to the newer blog posts page (previous page)" }, "theme.blog.paginator.olderEntries": { "message": "다음 페이지", "description": "The label used to navigate to the older blog posts page (next page)" }, "theme.blog.archive.title": { "message": "게시물 목록", "description": "The page & hero title of the blog archive page" }, "theme.blog.archive.description": { "message": "게시물 목록", "description": "The page & hero description of the blog archive page" }, "theme.blog.post.paginator.navAriaLabel": { "message": "블로그 게시물 탐색", "description": "The ARIA label for 
the blog posts pagination" }, "theme.blog.post.paginator.newerPost": { "message": "이전 게시물", "description": "The blog post button label to navigate to the newer/previous post" }, "theme.blog.post.paginator.olderPost": { "message": "다음 게시물", "description": "The blog post button label to navigate to the older/next post" }, "theme.blog.post.plurals": { "message": "{count}개 게시물", "description": "Pluralized label for \"{count} posts\". Use as much plural forms (separated by \"|\") as your language support (see https://www.unicode.org/cldr/cldr-aux/charts/34/supplemental/language_plural_rules.html)" }, "theme.blog.tagTitle": { "message": "\"{tagName}\" 태그로 연결된 {nPosts}개의 게시물이 있습니다.", "description": "The title of the page for a blog tag" }, "theme.tags.tagsPageLink": { "message": "모든 태그 보기", "description": "The label of the link targeting the tag list page" }, "theme.colorToggle.ariaLabel": { "message": "어두운 모드와 밝은 모드 전환하기 (현재 {mode})", "description": "The ARIA label for the navbar color mode toggle" }, "theme.colorToggle.ariaLabel.mode.dark": { "message": "어두운 모드", "description": "The name for the dark color mode" }, "theme.colorToggle.ariaLabel.mode.light": { "message": "밝은 모드", "description": "The name for the light color mode" }, "theme.docs.DocCard.categoryDescription": { "message": "{count} 항목", "description": "The default description for a category card in the generated index about how many items this category includes" }, "theme.docs.breadcrumbs.navAriaLabel": { "message": "Breadcrumbs", "description": "The ARIA label for the breadcrumbs" }, "theme.docs.paginator.navAriaLabel": { "message": "문서 페이지", "description": "The ARIA label for the docs pagination" }, "theme.docs.paginator.previous": { "message": "이전", "description": "The label used to navigate to the previous doc" }, "theme.docs.paginator.next": { "message": "다음", "description": "The label used to navigate to the next doc" }, "theme.docs.tagDocListPageTitle.nDocsTagged": { "message": "{count}개 문서가", 
"description": "Pluralized label for \"{count} docs tagged\". Use as much plural forms (separated by \"|\") as your language support (see https://www.unicode.org/cldr/cldr-aux/charts/34/supplemental/language_plural_rules.html)" }, "theme.docs.tagDocListPageTitle": { "message": "{nDocsTagged} \"{tagName}\" 태그에 분류되었습니다", "description": "The title of the page for a docs tag" }, "theme.docs.versionBadge.label": { "message": "버전: {versionLabel}" }, "theme.docs.versions.unreleasedVersionLabel": { "message": "{siteTitle} {versionLabel} 문서는 아직 정식 공개되지 않았습니다.", "description": "The label used to tell the user that he's browsing an unreleased doc version" }, "theme.docs.versions.unmaintainedVersionLabel": { "message": "{siteTitle} {versionLabel} 문서는 더 이상 업데이트되지 않습니다.", "description": "The label used to tell the user that he's browsing an unmaintained doc version" }, "theme.docs.versions.latestVersionSuggestionLabel": { "message": "최신 문서는 {latestVersionLink} ({versionLabel})을 확인하세요.", "description": "The label used to tell the user to check the latest version" }, "theme.docs.versions.latestVersionLinkLabel": { "message": "최신 버전", "description": "The label used for the latest version suggestion link label" }, "theme.common.editThisPage": { "message": "페이지 편집", "description": "The link label to edit the current page" }, "theme.common.headingLinkTitle": { "message": "{heading}에 대한 직접 링크", "description": "Title for link to heading" }, "theme.lastUpdated.atDate": { "message": " {date}에", "description": "The words used to describe on which date a page has been last updated" }, "theme.lastUpdated.byUser": { "message": " {user}가", "description": "The words used to describe by who the page has been last updated" }, "theme.lastUpdated.lastUpdatedAtBy": { "message": "최종 수정: {atDate}{byUser}", "description": "The sentence used to display when a page has been last updated, and by who" }, "theme.navbar.mobileVersionsDropdown.label": { "message": "버전", "description": "The label for the 
navbar versions dropdown on mobile view" }, "theme.tags.tagsListLabel": { "message": "태그:", "description": "The label alongside a tag list" }, "theme.AnnouncementBar.closeButtonAriaLabel": { "message": "닫기", "description": "The ARIA label for close button of announcement bar" }, "theme.blog.sidebar.navAriaLabel": { "message": "최근 블로그 문서 둘러보기", "description": "The ARIA label for recent posts in the blog sidebar" }, "theme.CodeBlock.wordWrapToggle": { "message": "줄 바꿈 전환", "description": "The title attribute for toggle word wrapping button of code block lines" }, "theme.CodeBlock.copied": { "message": "복사했습니다", "description": "The copied button label on code blocks" }, "theme.CodeBlock.copyButtonAriaLabel": { "message": "클립보드에 코드 복사", "description": "The ARIA label for copy code blocks button" }, "theme.CodeBlock.copy": { "message": "복사", "description": "The copy button label on code blocks" }, "theme.DocSidebarItem.toggleCollapsedCategoryAriaLabel": { "message": "접을 수 있는 사이드바 분류 '{label}' 접기(펼치기)", "description": "The ARIA label to toggle the collapsible sidebar category" }, "theme.NavBar.navAriaLabel": { "message": "Main", "description": "The ARIA label for the main navigation" }, "theme.navbar.mobileLanguageDropdown.label": { "message": "언어", "description": "The label for the mobile language switcher dropdown" }, "theme.TOCCollapsible.toggleButtonLabel": { "message": "이 페이지에서", "description": "The label used by the button on the collapsible TOC component" }, "theme.blog.post.readMore": { "message": "자세히 보기", "description": "The label used in blog post item excerpts to link to full blog posts" }, "theme.blog.post.readMoreLabel": { "message": "{title} 에 대해 더 읽어보기", "description": "The ARIA label for the link to full blog posts from excerpts" }, "theme.blog.post.readingTime.plurals": { "message": "약 {readingTime}분", "description": "Pluralized label for \"{readingTime} min read\". 
Use as much plural forms (separated by \"|\") as your language support (see https://www.unicode.org/cldr/cldr-aux/charts/34/supplemental/language_plural_rules.html)" }, "theme.docs.breadcrumbs.home": { "message": "홈", "description": "The ARIA label for the home page in the breadcrumbs" }, "theme.docs.sidebar.collapseButtonTitle": { "message": "사이드바 숨기기", "description": "The title attribute for collapse button of doc sidebar" }, "theme.docs.sidebar.collapseButtonAriaLabel": { "message": "사이드바 숨기기", "description": "The title attribute for collapse button of doc sidebar" }, "theme.docs.sidebar.navAriaLabel": { "message": "Docs sidebar", "description": "The ARIA label for the sidebar navigation" }, "theme.docs.sidebar.closeSidebarButtonAriaLabel": { "message": "Close navigation bar", "description": "The ARIA label for close button of mobile sidebar" }, "theme.navbar.mobileSidebarSecondaryMenu.backButtonLabel": { "message": "← 메인 메뉴로 돌아가기", "description": "The label of the back button to return to main menu, inside the mobile navbar sidebar secondary menu (notably used to display the docs sidebar)" }, "theme.docs.sidebar.toggleSidebarButtonAriaLabel": { "message": "Toggle navigation bar", "description": "The ARIA label for hamburger menu button of mobile navigation" }, "theme.docs.sidebar.expandButtonTitle": { "message": "사이드바 열기", "description": "The ARIA label and title attribute for expand button of doc sidebar" }, "theme.docs.sidebar.expandButtonAriaLabel": { "message": "사이드바 열기", "description": "The ARIA label and title attribute for expand button of doc sidebar" }, "theme.ErrorPageContent.tryAgain": { "message": "다시 시도해 보세요", "description": "The label of the button to try again rendering when the React error boundary captures an error" }, "theme.common.skipToMainContent": { "message": "본문으로 건너뛰기", "description": "The skip to content label used for accessibility, allowing to rapidly navigate to main content with keyboard tab/enter navigation" }, 
"theme.tags.tagsPageTitle": { "message": "태그", "description": "The title of the tag list page" } } ================================================ FILE: i18n/ko/docusaurus-plugin-content-blog/options.json ================================================ { "title": { "message": "Blog", "description": "The title for the blog used in SEO" }, "description": { "message": "Blog", "description": "The description for the blog used in SEO" }, "sidebar.title": { "message": "Recent posts", "description": "The label for the left sidebar" } } ================================================ FILE: i18n/ko/docusaurus-plugin-content-docs/current.json ================================================ { "version.label": { "message": "1.0", "description": "The label for version current" }, "sidebar.tutorialSidebar.category.Introduction": { "message": "Introduction", "description": "The label for category Introduction in sidebar tutorialSidebar" }, "sidebar.tutorialSidebar.category.Setup Kubernetes": { "message": "Setup Kubernetes", "description": "The label for category Setup Kubernetes in sidebar tutorialSidebar" }, "sidebar.tutorialSidebar.category.4. Install Kubernetes": { "message": "4. Install Kubernetes", "description": "The label for category 4. 
Install Kubernetes in sidebar tutorialSidebar" }, "sidebar.tutorialSidebar.category.Setup Components": { "message": "Setup Components", "description": "The label for category Setup Components in sidebar tutorialSidebar" }, "sidebar.tutorialSidebar.category.Kubeflow UI Guide": { "message": "Kubeflow UI Guide", "description": "The label for category Kubeflow UI Guide in sidebar tutorialSidebar" }, "sidebar.tutorialSidebar.category.Kubeflow": { "message": "Kubeflow", "description": "The label for category Kubeflow in sidebar tutorialSidebar" }, "sidebar.tutorialSidebar.category.API Deployment": { "message": "API Deployment", "description": "The label for category API Deployment in sidebar tutorialSidebar" }, "sidebar.tutorialSidebar.category.Appendix": { "message": "Appendix", "description": "The label for category Appendix in sidebar tutorialSidebar" }, "sidebar.tutorialSidebar.category.Further Readings": { "message": "Further Readings", "description": "The label for category Further Readings in sidebar tutorialSidebar" }, "sidebar.preSidebar.category.Docker": { "message": "Docker", "description": "The label for category Docker in sidebar preSidebar" } } ================================================ FILE: i18n/ko/docusaurus-plugin-content-docs/version-1.0.json ================================================ { "version.label": { "message": "1.0", "description": "The label for version 1.0" }, "sidebar.tutorialSidebar.category.Introduction": { "message": "Introduction", "description": "The label for category Introduction in sidebar tutorialSidebar" }, "sidebar.tutorialSidebar.category.Setup Kubernetes": { "message": "Setup Kubernetes", "description": "The label for category Setup Kubernetes in sidebar tutorialSidebar" }, "sidebar.tutorialSidebar.category.4. Install Kubernetes": { "message": "4. Install Kubernetes", "description": "The label for category 4. 
Install Kubernetes in sidebar tutorialSidebar" }, "sidebar.tutorialSidebar.category.Setup Components": { "message": "Setup Components", "description": "The label for category Setup Components in sidebar tutorialSidebar" }, "sidebar.tutorialSidebar.category.Kubeflow UI Guide": { "message": "Kubeflow UI Guide", "description": "The label for category Kubeflow UI Guide in sidebar tutorialSidebar" }, "sidebar.tutorialSidebar.category.Kubeflow": { "message": "Kubeflow", "description": "The label for category Kubeflow in sidebar tutorialSidebar" }, "sidebar.tutorialSidebar.category.API Deployment": { "message": "API Deployment", "description": "The label for category API Deployment in sidebar tutorialSidebar" }, "sidebar.tutorialSidebar.category.Appendix": { "message": "Appendix", "description": "The label for category Appendix in sidebar tutorialSidebar" }, "sidebar.tutorialSidebar.category.Further Readings": { "message": "Further Readings", "description": "The label for category Further Readings in sidebar tutorialSidebar" }, "sidebar.preSidebar.category.Docker": { "message": "Docker", "description": "The label for category Docker in sidebar preSidebar" } } ================================================ FILE: i18n/ko/docusaurus-plugin-content-docs-community/current.json ================================================ { "version.label": { "message": "Next", "description": "The label for version current" } } ================================================ FILE: i18n/ko/docusaurus-theme-classic/footer.json ================================================ { "copyright": { "message": "Copyright © 2021-2023 MakinaRocks. 
Built with Docusaurus.", "description": "The footer copyright" } } ================================================ FILE: i18n/ko/docusaurus-theme-classic/navbar.json ================================================ { "title": { "message": "MLOps for ALL", "description": "The title in the navbar" }, "logo.alt": { "message": "My Site Logo", "description": "The alt text of navbar logo" }, "item.label.Tutorial": { "message": "Tutorial", "description": "Navbar item with label Tutorial" }, "item.label.Prerequisites": { "message": "Prerequisites", "description": "Navbar item with label Prerequisites" }, "item.label.Community": { "message": "Community", "description": "Navbar item with label Community" }, "item.label.GitHub": { "message": "GitHub", "description": "Navbar item with label GitHub" } } ================================================ FILE: package.json ================================================ { "name": "v-2", "version": "0.0.0", "private": true, "scripts": { "docusaurus": "docusaurus", "start": "docusaurus start", "build": "docusaurus build", "swizzle": "docusaurus swizzle", "deploy": "docusaurus deploy", "clear": "docusaurus clear", "serve": "docusaurus serve", "write-translations": "docusaurus write-translations", "write-heading-ids": "docusaurus write-heading-ids", "typecheck": "tsc" }, "dependencies": { "@docusaurus/core": "2.4.1", "@docusaurus/plugin-content-docs": "^2.4.1", "@docusaurus/plugin-google-gtag": "^2.4.1", "@docusaurus/plugin-sitemap": "^2.4.1", "@docusaurus/preset-classic": "2.4.1", "@mdx-js/react": "^1.6.22", "clsx": "^1.2.1", "prism-react-renderer": "^1.3.5", "react": "^17.0.2", "react-dom": "^17.0.2" }, "devDependencies": { "@docusaurus/module-type-aliases": "2.4.1", "@tsconfig/docusaurus": "^1.0.5", "typescript": "^4.7.4" }, "browserslist": { "production": [ ">0.5%", "not dead", "not op_mini all" ], "development": [ "last 1 chrome version", "last 1 firefox version", "last 1 safari version" ] }, "engines": { "node": ">=16.14" } } 
================================================ FILE: python/env/.gitkeep ================================================ ================================================ FILE: python/pyproject.toml ================================================ [tool.poetry] name = "mlops-for-all" version = "0.1.0" description = "Scripts for translation" authors = ["Aiden-Jeon "] readme = "README.md" packages = [{include = "mlops_for_all"}] [tool.poetry.dependencies] python = "^3.9" openai = "^0.27.8" python-dotenv = "^1.0.0" langchain = {extras = ["llms"], version = "^0.0.228"} [build-system] requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" ================================================ FILE: python/translation/main.py ================================================ import os from pathlib import Path import dotenv from langchain.llms import OpenAI from langchain.schema import HumanMessage ROOT_PATH = Path(__file__).parent OPENAI_ENV_PATH = ROOT_PATH.parent / "env" / "openai.env" dotenv.load_dotenv(OPENAI_ENV_PATH) OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") OPENAI_MODEL = OpenAI(openai_api_key=OPENAI_API_KEY) def request_prompt(source_sentence): translated_sentence = "\n" if source_sentence: translation_prompt = HumanMessage( content=f"Translate those sentences from Korean to English. {source_sentence}" ) translated_sentence = OPENAI_MODEL.predict_messages([translation_prompt]).content return translated_sentence + "\n" def translate(source_path, dest_path): translate_lines = [] with open(source_path, "r") as f: line = f.readline() translate_lines += [line] lines = [] is_codeblock = False is_header = True while line: line = f.readline() # 헤더 블록인 경우 if line.startswith("---"): is_header = False translate_lines += [line] continue if is_header: translate_lines += [line] continue # 코드 블록인 경우 if line.startswith("```"): if not is_codeblock: # 코드 블록 시작인 경우 번역한다. 
source_sentence = "".join(lines) translated_sentence = request_prompt(source_sentence) translate_lines += [translated_sentence] # 모으는 부분을 초기화 하고 코드 블록임을 선언한다. lines = [] is_codeblock = True else: # 코드 블록이 끝난 경우 is_codeblock = False translate_lines += [line] continue if is_codeblock: # 코드 블록 내부인 경우 통과한다. translate_lines += [line] continue lines += [line] if len(lines) > 10: # 많이 모이면 먼저 번역한다. source_sentence = "".join(lines) translated_sentence = request_prompt(source_sentence) translate_lines += [translated_sentence] lines = [] source_sentence = "".join(lines) # # request # translated_sentence = request_prompt(source_sentence) translate_lines += [translated_sentence] docs = "".join(translate_lines) with open(dest_path, "w") as f: f.write(docs) if __name__ == "__main__": from argparse import ArgumentParser parser = ArgumentParser() parser.add_argument("--chapter", type=str) args = parser.parse_args() REPO_ROOT = ROOT_PATH.parent.parent DOCS_ROOT = REPO_ROOT / "docs" / args.chapter DEST_ROOT = REPO_ROOT / "i18n/en/docusaurus-plugin-content-docs/version-1.0" / args.chapter for source_path in DOCS_ROOT.glob("*.md"): dest_path = DEST_ROOT / source_path.name print("source : ", source_path) translate(source_path, dest_path) print("dest : ", dest_path) ================================================ FILE: sidebars.js ================================================ /** * Creating a sidebar enables you to: - create an ordered group of docs - render a sidebar for each doc of that group - provide next/previous navigation The sidebars can be generated from the filesystem, or explicitly defined here. Create as many sidebars as you want. 
*/ // @ts-check /** @type {import('@docusaurus/plugin-content-docs').SidebarsConfig} */ const sidebars = { tutorialSidebar: [ { type: "category", label: "Introduction", items: [ "introduction/intro", "introduction/levels", "introduction/component", "introduction/why_kubernetes", ], }, { type: "category", label: "Setup Kubernetes", items: [ "setup-kubernetes/intro", "setup-kubernetes/kubernetes", "setup-kubernetes/install-prerequisite", { type: "category", label: "4. Install Kubernetes", items: [ "setup-kubernetes/install-kubernetes/kubernetes-with-k3s", "setup-kubernetes/install-kubernetes/kubernetes-with-kubeadm", "setup-kubernetes/install-kubernetes/kubernetes-with-minikube", ], }, "setup-kubernetes/install-kubernetes-module", "setup-kubernetes/setup-nvidia-gpu", ], }, { type: "category", label: "Setup Components", items: [ "setup-components/install-components-kf", "setup-components/install-components-mlflow", "setup-components/install-components-seldon", "setup-components/install-components-pg", ], }, { type: "category", label: "Kubeflow UI Guide", items: [ "kubeflow-dashboard-guide/intro", "kubeflow-dashboard-guide/notebooks", "kubeflow-dashboard-guide/tensorboards", "kubeflow-dashboard-guide/volumes", "kubeflow-dashboard-guide/experiments", "kubeflow-dashboard-guide/experiments-and-others", ], }, { type: "category", label: "Kubeflow", items: [ "kubeflow/kubeflow-intro", "kubeflow/kubeflow-concepts", "kubeflow/basic-requirements", "kubeflow/basic-component", "kubeflow/basic-pipeline", "kubeflow/basic-pipeline-upload", "kubeflow/basic-run", "kubeflow/advanced-component", "kubeflow/advanced-environment", "kubeflow/advanced-pipeline", "kubeflow/advanced-run", "kubeflow/advanced-mlflow", "kubeflow/how-to-debug", ], }, { type: "category", label: "API Deployment", items: [ "api-deployment/what-is-api-deployment", "api-deployment/seldon-iris", "api-deployment/seldon-pg", "api-deployment/seldon-fields", "api-deployment/seldon-mlflow", "api-deployment/seldon-children", 
], }, { type: "category", label: "Appendix", items: ["appendix/pyenv", "appendix/metallb"], }, { type: "category", label: "Further Readings", items: ["further-readings/info"], }, ], preSidebar: [ { type: "category", label: "Docker", items: [ "prerequisites/docker/install", "prerequisites/docker/introduction", "prerequisites/docker/docker", "prerequisites/docker/command", "prerequisites/docker/images", "prerequisites/docker/advanced", ], }, ], }; module.exports = sidebars; ================================================ FILE: sidebarsCommunity.js ================================================ /** * Creating a sidebar enables you to: - create an ordered group of docs - render a sidebar for each doc of that group - provide next/previous navigation The sidebars can be generated from the filesystem, or explicitly defined here. Create as many sidebars as you want. */ // @ts-check /** @type {import('@docusaurus/plugin-content-docs').SidebarsConfig} */ const sidebars = { // By default, Docusaurus generates a sidebar from the docs folder structure tutorialSidebar: [{type: 'autogenerated', dirName: '.'}], // But you can create a sidebar manually /* tutorialSidebar: [ 'intro', 'hello', { type: 'category', label: 'Tutorial', items: ['tutorial-basics/create-a-document'], }, ], */ }; module.exports = sidebars; ================================================ FILE: src/components/HomepageFeatures/index.tsx ================================================ import React from 'react'; import clsx from 'clsx'; import styles from './styles.module.css'; type FeatureItem = { title: string; Svg: React.ComponentType>; description: JSX.Element; }; const FeatureList: FeatureItem[] = [ { title: MakinaRocks, Svg: require('@site/static/img/undraw_docusaurus_tree.svg').default, description: ( <>

Sponsored by MakinaRocks

이 프로젝트는 MakinaRocks의 지원을 받아 제작되었습니다. ), }, { title: MLOps for MLE, Svg: require('@site/static/img/undraw_docusaurus_mountain.svg').default, description: ( <>

ML Engineer를 위한 MLOps Release!

구글에서 제안한 MLOps 0단계를 직접 구현하며 MLOps 가 무엇인지 공부할 수 있는 튜토리얼을 오픈했습니다! ), }, ]; function Feature({title, Svg, description}: FeatureItem) { return (

{title}

{description}

); } export default function HomepageFeatures(): JSX.Element { return (
{FeatureList.map((props, idx) => ( ))}
); } ================================================ FILE: src/components/HomepageFeatures/styles.module.css ================================================ .features { display: flex; align-items: center; padding: 2rem 0; width: 100%; } .featureSvg { height: 200px; width: 200px; } ================================================ FILE: src/components/TeamProfileCards/index.tsx ================================================ /** * Copyright (c) Facebook, Inc. and its affiliates. * * This source code is licensed under the MIT license found in the * LICENSE file in the root directory of this source tree. */ import React, {type ReactNode} from 'react'; import Translate from '@docusaurus/Translate'; type ProfileProps = { className?: string; name: string; children: ReactNode; githubUrl: string; linkedinUrl?: string; }; function TeamProfileCard({ className, name, children, githubUrl, linkedinUrl, role, }: ProfileProps) { return (
{`${name}'s

{name}

{role}
{children}
{githubUrl && ( GitHub )} {linkedinUrl && ( LinkedIn )}
); } function TeamProfileCardCol(props: ProfileProps) { return ( ); } export function MainAuthorRow(): JSX.Element { return (
마키나락스에서 머신러닝 엔지니어로 일하고 있습니다. 모두의 딥러닝을 통해 많은 사람들이 딥러닝을 쉽게 접했듯이 모두의 MLOps를 통해 많은 사람들이 MLOps에 쉽게 접할 수 있길 바랍니다. 비효율적인 작업을 자동화하는 것에 관심이 많습니다. 마키나락스에서 MLOps Engineer로 일하고 있습니다. 단순하게 생각하는 노력을 하고 있습니다.
); } export function ContributorsRow(): JSX.Element { return (
마키나락스에서 ML Engineer로 일하고 있습니다. 마키나락스에서 CTO로 일하고 있습니다. 마키나락스는 머신러닝 기반의 산업용 AI 솔루션을 개발하는 스타트업입니다. 산업 현장의 문제 해결을 통해 사람이 본연의 일에 집중할 수 있게 만드는 것, 그것이 우리가 하는 일입니다. 3i에서 MLOps Engineer로 일하고 있습니다. kubeflow에 관심이 많습니다. Genesis Lab이라는 스타트업에서 Applied AI Engineer 인턴 업무를 수행하고 있습니다. 머신러닝 생태계가 우리 산업 전반에 큰 변화를 가져올 것이라 믿으며, 한 걸음씩 나아가고 있습니다. 백패커에서 ML 엔지니어로 일하고 있습니다. 자연어처리, 추천시스템, MLOps에 관심이 많습니다.
); } ================================================ FILE: src/css/custom.css ================================================ /** * Any CSS included here will be global. The classic template * bundles Infima by default. Infima is a CSS framework designed to * work well for content-centric websites. */ /* You can override the default Infima variables here. */ :root { --ifm-color-primary: #2e8555; --ifm-color-primary-dark: #29784c; --ifm-color-primary-darker: #277148; --ifm-color-primary-darkest: #205d3b; --ifm-color-primary-light: #33925d; --ifm-color-primary-lighter: #359962; --ifm-color-primary-lightest: #3cad6e; --ifm-code-font-size: 95%; --docusaurus-highlighted-code-line-bg: rgba(0, 0, 0, 0.1); } /* For readability concerns, you should choose a lighter palette in dark mode. */ [data-theme='dark'] { --ifm-color-primary: #25c2a0; --ifm-color-primary-dark: #21af90; --ifm-color-primary-darker: #1fa588; --ifm-color-primary-darkest: #1a8870; --ifm-color-primary-light: #29d5b0; --ifm-color-primary-lighter: #32d8b4; --ifm-color-primary-lightest: #4fddbf; --docusaurus-highlighted-code-line-bg: rgba(0, 0, 0, 0.3); } ================================================ FILE: src/pages/index.module.css ================================================ /** * CSS files with the .module.css suffix will be treated as CSS modules * and scoped locally. 
*/ .heroBanner { padding: 4rem 0; text-align: center; position: relative; overflow: hidden; } @media screen and (max-width: 996px) { .heroBanner { padding: 2rem; } } .buttons { display: flex; align-items: center; justify-content: center; } ================================================ FILE: src/pages/index.tsx ================================================ import React from "react"; import clsx from "clsx"; import Link from "@docusaurus/Link"; import useDocusaurusContext from "@docusaurus/useDocusaurusContext"; import Layout from "@theme/Layout"; import HomepageFeatures from "@site/src/components/HomepageFeatures"; import styles from "./index.module.css"; function HomepageHeader() { const { siteConfig } = useDocusaurusContext(); return (

{siteConfig.title}

{siteConfig.tagline}

Let's Start!
); } export default function Home(): JSX.Element { const { siteConfig } = useDocusaurusContext(); return (
); } ================================================ FILE: src/pages/markdown-page.md ================================================ --- title: Markdown page example --- # Markdown page example You don't need React to write simple standalone pages. ================================================ FILE: static/.nojekyll ================================================ ================================================ FILE: static/googlee5904fe980148e9b.html ================================================ google-site-verification: googlee5904fe980148e9b.html ================================================ FILE: static/img/site.webmanifest ================================================ {"name":"","short_name":"","icons":[{"src":"/android-chrome-192x192.png","sizes":"192x192","type":"image/png"},{"src":"/android-chrome-512x512.png","sizes":"512x512","type":"image/png"}],"theme_color":"#ffffff","background_color":"#ffffff","display":"standalone"} ================================================ FILE: tsconfig.json ================================================ { // This file is not used in compilation. It is here just for a nice editor experience. "extends": "@tsconfig/docusaurus/tsconfig.json", "compilerOptions": { "baseUrl": "." } } ================================================ FILE: versioned_docs/version-1.0/api-deployment/_category_.json ================================================ { "label": "API Deployment", "position": 7, "link": { "type": "generated-index" } } ================================================ FILE: versioned_docs/version-1.0/api-deployment/seldon-children.md ================================================ --- title : "6. Multi Models" description: "" sidebar_position: 6 contributors: ["Jongseob Jeon"] --- ## Multi Models 앞서 설명했던 방법들은 모두 단일 모델을 대상으로 했습니다. 이번 페이지에서는 여러 개의 모델을 연결하는 방법에 대해서 알아봅니다. ## Pipeline 우선 모델을 2개를 생성하는 파이프라인을 작성하겠습니다. 모델은 앞서 사용한 SVC 모델에 StandardScaler를 추가하고 저장하도록 하겠습니다. 
```python from functools import partial import kfp from kfp.components import InputPath, OutputPath, create_component_from_func @partial( create_component_from_func, packages_to_install=["pandas", "scikit-learn"], ) def load_iris_data( data_path: OutputPath("csv"), target_path: OutputPath("csv"), ): import pandas as pd from sklearn.datasets import load_iris iris = load_iris() data = pd.DataFrame(iris["data"], columns=iris["feature_names"]) target = pd.DataFrame(iris["target"], columns=["target"]) data.to_csv(data_path, index=False) target.to_csv(target_path, index=False) @partial( create_component_from_func, packages_to_install=["dill", "pandas", "scikit-learn", "mlflow"], ) def train_scaler_from_csv( data_path: InputPath("csv"), scaled_data_path: OutputPath("csv"), model_path: OutputPath("dill"), input_example_path: OutputPath("dill"), signature_path: OutputPath("dill"), conda_env_path: OutputPath("dill"), ): import dill import pandas as pd from sklearn.preprocessing import StandardScaler from mlflow.models.signature import infer_signature from mlflow.utils.environment import _mlflow_conda_env data = pd.read_csv(data_path) scaler = StandardScaler() scaled_data = scaler.fit_transform(data) scaled_data = pd.DataFrame(scaled_data, columns=data.columns, index=data.index) scaled_data.to_csv(scaled_data_path, index=False) with open(model_path, mode="wb") as file_writer: dill.dump(scaler, file_writer) input_example = data.sample(1) with open(input_example_path, "wb") as file_writer: dill.dump(input_example, file_writer) signature = infer_signature(data, scaler.transform(data)) with open(signature_path, "wb") as file_writer: dill.dump(signature, file_writer) conda_env = _mlflow_conda_env( additional_pip_deps=["scikit-learn"], install_mlflow=False ) with open(conda_env_path, "wb") as file_writer: dill.dump(conda_env, file_writer) @partial( create_component_from_func, packages_to_install=["dill", "pandas", "scikit-learn", "mlflow"], ) def train_svc_from_csv( 
train_data_path: InputPath("csv"), train_target_path: InputPath("csv"), model_path: OutputPath("dill"), input_example_path: OutputPath("dill"), signature_path: OutputPath("dill"), conda_env_path: OutputPath("dill"), kernel: str, ): import dill import pandas as pd from sklearn.svm import SVC from mlflow.models.signature import infer_signature from mlflow.utils.environment import _mlflow_conda_env train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) input_example = train_data.sample(1) with open(input_example_path, "wb") as file_writer: dill.dump(input_example, file_writer) signature = infer_signature(train_data, clf.predict(train_data)) with open(signature_path, "wb") as file_writer: dill.dump(signature, file_writer) conda_env = _mlflow_conda_env( additional_pip_deps=["scikit-learn"], install_mlflow=False ) with open(conda_env_path, "wb") as file_writer: dill.dump(conda_env, file_writer) @partial( create_component_from_func, packages_to_install=["dill", "pandas", "scikit-learn", "mlflow", "boto3"], ) def upload_sklearn_model_to_mlflow( model_name: str, model_path: InputPath("dill"), input_example_path: InputPath("dill"), signature_path: InputPath("dill"), conda_env_path: InputPath("dill"), ): import os import dill from mlflow.sklearn import save_model from mlflow.tracking.client import MlflowClient os.environ["MLFLOW_S3_ENDPOINT_URL"] = "http://minio-service.kubeflow.svc:9000" os.environ["AWS_ACCESS_KEY_ID"] = "minio" os.environ["AWS_SECRET_ACCESS_KEY"] = "minio123" client = MlflowClient("http://mlflow-server-service.mlflow-system.svc:5000") with open(model_path, mode="rb") as file_reader: clf = dill.load(file_reader) with open(input_example_path, "rb") as file_reader: input_example = dill.load(file_reader) with open(signature_path, "rb") as file_reader: signature = dill.load(file_reader) with 
open(conda_env_path, "rb") as file_reader: conda_env = dill.load(file_reader) save_model( sk_model=clf, path=model_name, serialization_format="cloudpickle", conda_env=conda_env, signature=signature, input_example=input_example, ) run = client.create_run(experiment_id="0") client.log_artifact(run.info.run_id, model_name) from kfp.dsl import pipeline @pipeline(name="multi_model_pipeline") def multi_model_pipeline(kernel: str = "rbf"): iris_data = load_iris_data() scaled_data = train_scaler_from_csv(data=iris_data.outputs["data"]) _ = upload_sklearn_model_to_mlflow( model_name="scaler", model=scaled_data.outputs["model"], input_example=scaled_data.outputs["input_example"], signature=scaled_data.outputs["signature"], conda_env=scaled_data.outputs["conda_env"], ) model = train_svc_from_csv( train_data=scaled_data.outputs["scaled_data"], train_target=iris_data.outputs["target"], kernel=kernel, ) _ = upload_sklearn_model_to_mlflow( model_name="svc", model=model.outputs["model"], input_example=model.outputs["input_example"], signature=model.outputs["signature"], conda_env=model.outputs["conda_env"], ) if __name__ == "__main__": kfp.compiler.Compiler().compile(multi_model_pipeline, "multi_model_pipeline.yaml") ``` 파이프라인을 업로드하면 다음과 같이 나옵니다. ![children-kubeflow.png](./img/children-kubeflow.png) MLflow 대시보드를 확인하면 다음과 같이 두 개의 모델이 생성됩니다. ![children-mlflow.png](./img/children-mlflow.png) 각각의 run_id를 확인 후 다음과 같이 SeldonDeployment 스펙을 정의합니다. 
```bash apiVersion: machinelearning.seldon.io/v1 kind: SeldonDeployment metadata: name: multi-model-example namespace: kubeflow-user-example-com spec: name: model predictors: - name: model componentSpecs: - spec: volumes: - name: model-provision-location emptyDir: {} initContainers: - name: scaler-initializer image: gcr.io/kfserving/storage-initializer:v0.4.0 args: - "s3://mlflow/mlflow/artifacts/0/7f445015a0e94519b003d316478766ef/artifacts/scaler" - "/mnt/models" volumeMounts: - mountPath: /mnt/models name: model-provision-location envFrom: - secretRef: name: seldon-init-container-secret - name: svc-initializer image: gcr.io/kfserving/storage-initializer:v0.4.0 args: - "s3://mlflow/mlflow/artifacts/0/87eb168e76264b39a24b0e5ca0fe922b/artifacts/svc" - "/mnt/models" volumeMounts: - mountPath: /mnt/models name: model-provision-location envFrom: - secretRef: name: seldon-init-container-secret containers: - name: scaler image: seldonio/mlflowserver:1.8.0-dev volumeMounts: - mountPath: /mnt/models name: model-provision-location readOnly: true securityContext: privileged: true runAsUser: 0 runAsGroup: 0 - name: svc image: seldonio/mlflowserver:1.8.0-dev volumeMounts: - mountPath: /mnt/models name: model-provision-location readOnly: true securityContext: privileged: true runAsUser: 0 runAsGroup: 0 graph: name: scaler type: MODEL parameters: - name: model_uri type: STRING value: "/mnt/models" - name: predict_method type: STRING value: "transform" children: - name: svc type: MODEL parameters: - name: model_uri type: STRING value: "/mnt/models" ``` 모델이 두 개가 되었으므로 각 모델의 initContainer와 container를 정의해주어야 합니다. 이 필드는 입력값을 array로 받으며 순서는 관계없습니다. 모델이 실행하는 순서는 graph에서 정의됩니다. 
```bash graph: name: scaler type: MODEL parameters: - name: model_uri type: STRING value: "/mnt/models" - name: predict_method type: STRING value: "transform" children: - name: svc type: MODEL parameters: - name: model_uri type: STRING value: "/mnt/models" ``` graph의 동작 방식은 처음 받은 값을 정해진 predict_method로 변환한 뒤 children으로 정의된 모델에 전달하는 방식입니다. 이 경우 scaler -> svc 로 데이터가 전달됩니다. 이제 위의 스펙을 yaml파일로 생성해 보겠습니다. ```bash cat < multi-model.yaml apiVersion: machinelearning.seldon.io/v1 kind: SeldonDeployment metadata: name: multi-model-example namespace: kubeflow-user-example-com spec: name: model predictors: - name: model componentSpecs: - spec: volumes: - name: model-provision-location emptyDir: {} initContainers: - name: scaler-initializer image: gcr.io/kfserving/storage-initializer:v0.4.0 args: - "s3://mlflow/mlflow/artifacts/0/7f445015a0e94519b003d316478766ef/artifacts/scaler" - "/mnt/models" volumeMounts: - mountPath: /mnt/models name: model-provision-location envFrom: - secretRef: name: seldon-init-container-secret - name: svc-initializer image: gcr.io/kfserving/storage-initializer:v0.4.0 args: - "s3://mlflow/mlflow/artifacts/0/87eb168e76264b39a24b0e5ca0fe922b/artifacts/svc" - "/mnt/models" volumeMounts: - mountPath: /mnt/models name: model-provision-location envFrom: - secretRef: name: seldon-init-container-secret containers: - name: scaler image: ghcr.io/mlops-for-all/mlflowserver volumeMounts: - mountPath: /mnt/models name: model-provision-location readOnly: true securityContext: privileged: true runAsUser: 0 runAsGroup: 0 - name: svc image: ghcr.io/mlops-for-all/mlflowserver volumeMounts: - mountPath: /mnt/models name: model-provision-location readOnly: true securityContext: privileged: true runAsUser: 0 runAsGroup: 0 graph: name: scaler type: MODEL parameters: - name: model_uri type: STRING value: "/mnt/models" - name: predict_method type: STRING value: "transform" children: - name: svc type: MODEL parameters: - name: model_uri type: STRING value: "/mnt/models" EOF ``` 
다음 명령어를 통해 API를 생성합니다. ```bash kubectl apply -f multi-model.yaml ``` 정상적으로 수행되면 다음과 같이 출력됩니다. ```bash seldondeployment.machinelearning.seldon.io/multi-model-example created ``` 정상적으로 생성됐는지 확인합니다. ```bash kubectl get po -n kubeflow-user-example-com | grep multi-model-example ``` 정상적으로 생성되면 다음과 비슷한 pod이 생성됩니다. ```bash multi-model-example-model-0-scaler-svc-9955fb795-n9ffw 4/4 Running 0 2m30s ``` ================================================ FILE: versioned_docs/version-1.0/api-deployment/seldon-fields.md ================================================ --- title : "4. Seldon Fields" description: "" sidebar_position: 4 contributors: ["Jongseob Jeon"] --- ## How Seldon Core works? Seldon Core가 API 서버를 생성하는 과정을 요약하면 다음과 같습니다. ![seldon-fields-0.png](./img/seldon-fields-0.png) 1. initContainer는 모델 저장소에서 필요한 모델을 다운로드 받습니다. 2. 다운로드받은 모델을 container로 전달합니다. 3. container는 전달받은 모델을 감싼 API 서버를 실행합니다. 4. 생성된 API 서버 주소로 API를 요청하여 모델의 추론 값을 받을 수 있습니다. ## SeldonDeployment Spec Seldon Core를 사용할 때, 주로 사용하게 되는 커스텀 리소스인 SeldonDeployment를 정의하는 yaml 파일은 다음과 같습니다. ```bash apiVersion: machinelearning.seldon.io/v1 kind: SeldonDeployment metadata: name: seldon-example namespace: kubeflow-user-example-com spec: name: model predictors: - name: model componentSpecs: - spec: volumes: - name: model-provision-location emptyDir: {} initContainers: - name: model-initializer image: gcr.io/kfserving/storage-initializer:v0.4.0 args: - "gs://seldon-models/v1.12.0-dev/sklearn/iris" - "/mnt/models" volumeMounts: - mountPath: /mnt/models name: model-provision-location containers: - name: model image: seldonio/sklearnserver:1.8.0-dev volumeMounts: - mountPath: /mnt/models name: model-provision-location readOnly: true securityContext: privileged: true runAsUser: 0 runAsGroup: 0 graph: name: model type: MODEL parameters: - name: model_uri type: STRING value: "/mnt/models" children: [] ``` SeldonDeployment spec 중 `name` 과 `predictors` 필드는 required 필드입니다. `name`은 쿠버네티스 상에서 pod의 구분을 위한 이름으로 크게 영향을 미치지 않습니다. 
`predictors`는 한 개로 구성된 array로 `name`, `componentSpecs` 와 `graph` 가 정의되어야 합니다. 여기서도 `name`은 pod의 구분을 위한 이름으로 크게 영향을 미치지 않습니다. 이제 `componentSpecs` 와 `graph`에서 정의해야 할 필드들에 대해서 알아보겠습니다. ## componentSpecs `componentSpecs` 는 하나로 구성된 array로 `spec` 키값이 정의되어야 합니다. `spec` 에는 `volumes`, `initContainers`, `containers` 의 필드가 정의되어야 합니다. ### volumes ```bash volumes: - name: model-provision-location emptyDir: {} ``` `volumes`은 initContainer에서 다운로드받는 모델을 저장하기 위한 공간을 의미합니다. array로 입력을 받으며 array의 구성 요소는 `name`과 `emptyDir` 입니다. 이 값들은 모델을 다운로드받고 옮길 때 한번 사용되므로 크게 수정하지 않아도 됩니다. ### initContainer ```bash - name: model-initializer image: gcr.io/kfserving/storage-initializer:v0.4.0 args: - "gs://seldon-models/v1.12.0-dev/sklearn/iris" - "/mnt/models" volumeMounts: - mountPath: /mnt/models name: model-provision-location ``` initContainer는 API에서 사용할 모델을 다운로드받는 역할을 합니다. 그래서 사용되는 필드들은 모델 저장소(Model Registry)로부터 데이터를 다운로드받을 때 필요한 정보들을 정해줍니다. initContainer의 값은 n개의 array로 구성되어 있으며 사용하는 모델마다 각각 지정해주어야 합니다. #### name `name`은 쿠버네티스 상의 pod의 이름입니다. 디버깅을 위해 `{model_name}-initializer` 로 사용하길 권장합니다. #### image `image` 는 모델을 다운로드 받기 위해 사용할 이미지 이름입니다. seldon core에서 권장하는 이미지는 크게 두 가지입니다. - gcr.io/kfserving/storage-initializer:v0.4.0 - seldonio/rclone-storage-initializer:1.13.0-dev 각각의 자세한 내용은 다음을 참고 바랍니다. - [kfserving](https://docs.seldon.io/projects/seldon-core/en/latest/servers/kfserving-storage-initializer.html) - [rclone](https://github.com/SeldonIO/seldon-core/tree/master/components/rclone-storage-initializer) *모두의 MLOps* 에서는 kfserving을 사용합니다. #### args ```bash args: - "gs://seldon-models/v1.12.0-dev/sklearn/iris" - "/mnt/models" ``` gcr.io/kfserving/storage-initializer:v0.4.0 도커 이미지가 실행(`run`)될 때 입력받는 argument를 입력합니다. array로 구성되며 첫 번째 array의 값은 다운로드받을 모델의 주소를 적습니다. 두 번째 array의 값은 다운로드받은 모델을 저장할 주소를 적습니다. (seldon core에서는 주로 `/mnt/models`에 저장합니다.) 
### volumeMounts ```bash volumeMounts: - mountPath: /mnt/models name: model-provision-location ``` `volumneMounts`는 volumes에서 설명한 것과 같이 `/mnt/models`를 쿠버네티스 상에서 공유할 수 있도록 볼륨을 붙여주는 필드입니다. 자세한 내용은 [쿠버네티스 Volume](https://kubernetes.io/docs/concepts/storage/volumes/)을 참조 바랍니다. ### container ```bash containers: - name: model image: seldonio/sklearnserver:1.8.0-dev volumeMounts: - mountPath: /mnt/models name: model-provision-location readOnly: true securityContext: privileged: true runAsUser: 0 runAsGroup: 0 ``` container는 실제로 모델이 API 형식으로 실행될 때의 설정을 정의하는 필드입니다. #### name `name`은 쿠버네티스 상의 pod의 이름입니다. 사용하는 모델의 이름을 적습니다. #### image `image` 는 모델을 API로 만드는 데 사용할 이미지입니다. 이미지에는 모델이 로드될 때 필요한 패키지들이 모두 설치되어 있어야 합니다. Seldon Core에서 지원하는 공식 이미지는 다음과 같습니다. - seldonio/sklearnserver - seldonio/mlflowserver - seldonio/xgboostserver - seldonio/tfserving #### volumeMounts ```bash volumeMounts: - mountPath: /mnt/models name: model-provision-location readOnly: true ``` initContainer에서 다운로드받은 데이터가 있는 경로를 알려주는 필드입니다. 이때 모델이 수정되는 것을 방지하기 위해 `readOnly: true`도 같이 주겠습니다. #### securityContext ```bash securityContext: privileged: true runAsUser: 0 runAsGroup: 0 ``` 필요한 패키지를 설치할 때 pod이 권한이 없어서 패키지 설치를 수행하지 못할 수 있습니다. 이를 위해서 root 권한을 부여합니다. (다만 이 작업은 실제 서빙 시 보안 문제가 생길 수 있습니다.) ## graph ```bash graph: name: model type: MODEL parameters: - name: model_uri type: STRING value: "/mnt/models" children: [] ``` 모델이 동작하는 순서를 정의한 필드입니다. ### name 모델 그래프의 이름입니다. container에서 정의된 이름을 사용합니다. ### type type은 크게 4가지가 있습니다. 1. TRANSFORMER 2. MODEL 3. OUTPUT_TRANSFORMER 4. ROUTER 각 type에 대한 자세한 설명은 [Seldon Core Complex Graphs Metadata Example](https://docs.seldon.io/projects/seldon-core/en/latest/examples/graph-metadata.html)을 참조 바랍니다. ### parameters class init 에서 사용되는 값들입니다. sklearnserver에서 필요한 값은 [다음 파일](https://github.com/SeldonIO/seldon-core/blob/master/servers/sklearnserver/sklearnserver/SKLearnServer.py)에서 확인할 수 있습니다. 
```python class SKLearnServer(SeldonComponent): def __init__(self, model_uri: str = None, method: str = "predict_proba"): ``` 코드를 보면 `model_uri`와 `method`를 정의할 수 있습니다. ### children 순서도를 작성할 때 사용됩니다. 자세한 내용은 다음 페이지에서 설명합니다. ================================================ FILE: versioned_docs/version-1.0/api-deployment/seldon-iris.md ================================================ --- title : "2. Deploy SeldonDeployment" description: "" sidebar_position: 2 date: 2021-12-22 lastmod: 2021-12-22 contributors: ["Youngcheol Jang", "SeungTae Kim"] --- ## SeldonDeployment를 통해 배포하기 이번에는 학습된 모델이 있을 때 SeldonDeployment를 통해 API Deployment를 해보겠습니다. SeldonDeployment는 쿠버네티스(Kubernetes)에 모델을 REST/gRPC 서버의 형태로 배포하기 위해 정의된 CRD(CustomResourceDefinition)입니다. ### 1. Prerequisites SeldonDeployment 관련된 실습은 seldon-deploy라는 새로운 네임스페이스(namespace)에서 진행하도록 하겠습니다. 네임스페이스를 생성한 뒤, seldon-deploy를 현재 네임스페이스로 설정합니다. ```bash kubectl create namespace seldon-deploy kubectl config set-context --current --namespace=seldon-deploy ``` ### 2. 스펙 정의 SeldonDeployment를 배포하기 위한 yaml 파일을 생성합니다. 이번 페이지에서는 공개된 iris model을 사용하도록 하겠습니다. 이 iris model은 sklearn 프레임워크를 통해 학습되었기 때문에 SKLEARN_SERVER를 사용합니다. ```bash cat < iris-sdep.yaml apiVersion: machinelearning.seldon.io/v1alpha2 kind: SeldonDeployment metadata: name: sklearn namespace: seldon-deploy spec: name: iris predictors: - graph: children: [] implementation: SKLEARN_SERVER modelUri: gs://seldon-models/v1.12.0-dev/sklearn/iris name: classifier name: default replicas: 1 EOF ``` yaml 파일을 배포합니다. ```bash kubectl apply -f iris-sdep.yaml ``` 다음 명령어를 통해 정상적으로 배포가 되었는지 확인합니다. ```bash kubectl get pods --selector seldon-app=sklearn-default -n seldon-deploy ``` 모두 Running 이 되면 다음과 비슷한 결과가 출력됩니다. ```bash NAME READY STATUS RESTARTS AGE sklearn-default-0-classifier-5fdfd7bb77-ls9tr 2/2 Running 0 5m ``` ## Ingress URL 이제 배포된 모델에 추론 요청(predict request)를 보내서 추론 결괏값을 받아옵니다. 배포된 API는 다음과 같은 규칙으로 생성됩니다. 
`http://{NODE_IP}:{NODE_PORT}/seldon/{namespace}/{seldon-deployment-name}/api/v1.0/{method-name}/` ### NODE_IP / NODE_PORT [Seldon Core 설치 시, Ambassador를 Ingress Controller로 설정하였으므로](../setup-components/install-components-seldon.md), SeldonDeployment로 생성된 API 서버는 모두 Ambassador의 Ingress gateway를 통해 요청할 수 있습니다. 따라서 우선 Ambassador Ingress Gateway의 url을 환경 변수로 설정합니다. ```bash export NODE_IP=$(kubectl get nodes -o jsonpath='{ $.items[*].status.addresses[?(@.type=="InternalIP")].address }') export NODE_PORT=$(kubectl get service ambassador -n seldon-system -o jsonpath="{.spec.ports[0].nodePort}") ``` 설정된 url을 확인합니다. ```bash echo "NODE_IP"=$NODE_IP echo "NODE_PORT"=$NODE_PORT ``` 다음과 비슷하게 출력되어야 하며, 클라우드 등을 통해 설정할 경우, internal ip 주소가 설정되는 것을 확인할 수 있습니다. ```bash NODE_IP=192.168.0.19 NODE_PORT=30486 ``` ### namespace / seldon-deployment-name SeldonDeployment가 배포된 `namespace`와 `seldon-deployment-name`를 의미합니다. 이는 스펙을 정의할 때 metadata에 정의된 값을 사용합니다. ```bash metadata: name: sklearn namespace: seldon-deploy ``` 위의 예시에서는 `namespace`는 seldon-deploy, `seldon-deployment-name`은 sklearn 입니다. ### method-name SeldonDeployment에서 주로 사용하는 `method-name`은 두 가지가 있습니다. 1. doc 2. predictions 각각의 method의 자세한 사용 방법은 아래에서 설명합니다. ## Using Swagger 우선 doc method를 사용하는 방법입니다. doc method를 이용하면 seldon에서 생성한 swagger에 접속할 수 있습니다. ### 1. Swagger 접속 위에서 설명한 ingress url 규칙에 따라 아래 주소를 통해 swagger에 접근할 수 있습니다. `http://192.168.0.19:30486/seldon/seldon-deploy/sklearn/api/v1.0/doc/` ![iris-swagger1.png](./img/iris-swagger1.png) ### 2. Swagger Predictions 메뉴 선택 UI에서 `/seldon/seldon-deploy/sklearn/api/v1.0/predictions` 메뉴를 선택합니다. ![iris-swagger2.png](./img/iris-swagger2.png) ### 3. *Try it out* 선택 ![iris-swagger3.png](./img/iris-swagger3.png) ### 4. Request body에 data 입력 ![iris-swagger4.png](./img/iris-swagger4.png) 다음 데이터를 입력합니다. ```bash { "data": { "ndarray":[[1.0, 2.0, 5.0, 6.0]] } } ``` ### 5. 추론 결과 확인 `Execute` 버튼을 눌러서 추론 결과를 확인할 수 있습니다. ![iris-swagger5.png](./img/iris-swagger5.png) 정상적으로 수행되면 다음과 같은 추론 결과를 얻습니다. 
```bash { "data": { "names": [ "t:0", "t:1", "t:2" ], "ndarray": [ [ 9.912315378486697e-7, 0.0007015931307746079, 0.9992974156376876 ] ] }, "meta": { "requestPath": { "classifier": "seldonio/sklearnserver:1.11.2" } } } ``` ## Using CLI 또한, curl과 같은 http client CLI 도구를 활용해서도 API 요청을 수행할 수 있습니다. 예를 들어, 다음과 같이 `/predictions`를 요청하면 ```bash curl -X POST http://$NODE_IP:$NODE_PORT/seldon/seldon-deploy/sklearn/api/v1.0/predictions \ -H 'Content-Type: application/json' \ -d '{ "data": { "ndarray": [[1,2,3,4]] } }' ``` 아래와 같은 응답이 정상적으로 출력되는 것을 확인할 수 있습니다. ```bash {"data":{"names":["t:0","t:1","t:2"],"ndarray":[[0.0006985194531162835,0.00366803903943666,0.995633441507447]]},"meta":{"requestPath":{"classifier":"seldonio/sklearnserver:1.11.2"}}} ``` ================================================ FILE: versioned_docs/version-1.0/api-deployment/seldon-mlflow.md ================================================ --- title : "5. Model from MLflow" description: "" sidebar_position: 5 contributors: ["Jongseob Jeon"] --- ## Model from MLflow 이번 페이지에서는 [MLflow Component](../kubeflow/advanced-mlflow.md)에서 저장된 모델을 이용해 API를 생성하는 방법에 대해서 알아보겠습니다. ## Secret initContainer가 minio에 접근해서 모델을 다운로드받으려면 credentials가 필요합니다. minio에 접근하기 위한 credentials는 다음과 같습니다. ```bash apiVersion: v1 type: Opaque kind: Secret metadata: name: seldon-init-container-secret namespace: kubeflow-user-example-com data: AWS_ACCESS_KEY_ID: bWluaW8K= AWS_SECRET_ACCESS_KEY: bWluaW8xMjM= AWS_ENDPOINT_URL: aHR0cDovL21pbmlvLm1ha2luYXJvY2tzLmFp USE_SSL: ZmFsc2U= ``` `AWS_ACCESS_KEY_ID` 의 입력값은 `minio`입니다. 다만 secret의 입력값은 인코딩된 값이여야 되기 때문에 실제로 입력되는 값은 다음을 수행후 나오는 값이어야 합니다. data에 입력되어야 하는 값들은 다음과 같습니다. - AWS_ACCESS_KEY_ID: minio - AWS_SECRET_ACCESS_KEY: minio123 - AWS_ENDPOINT_URL: http://minio-service.kubeflow.svc:9000 - USE_SSL: false 인코딩은 다음 명령어를 통해서 할 수 있습니다. ```bash echo -n minio | base64 ``` 그러면 다음과 같은 값이 출력됩니다. ```bash bWluaW8= ``` 인코딩을 전체 값에 대해서 진행하면 다음과 같이 됩니다. 
- AWS_ACCESS_KEY_ID: bWluaW8= - AWS_SECRET_ACCESS_KEY: bWluaW8xMjM= - AWS_ENDPOINT_URL: aHR0cDovL21pbmlvLXNlcnZpY2Uua3ViZWZsb3cuc3ZjOjkwMDA= - USE_SSL: ZmFsc2U= 다음 명령어를 통해 secret을 생성할 수 있는 yaml파일을 생성합니다. ```bash cat < seldon-init-container-secret.yaml apiVersion: v1 kind: Secret metadata: name: seldon-init-container-secret namespace: kubeflow-user-example-com type: Opaque data: AWS_ACCESS_KEY_ID: bWluaW8= AWS_SECRET_ACCESS_KEY: bWluaW8xMjM= AWS_ENDPOINT_URL: aHR0cDovL21pbmlvLXNlcnZpY2Uua3ViZWZsb3cuc3ZjOjkwMDA= USE_SSL: ZmFsc2U= EOF ``` 다음 명령어를 통해 secret을 생성합니다. ```bash kubectl apply -f seldon-init-container-secret.yaml ``` 정상적으로 수행되면 다음과 같이 출력됩니다. ```bash secret/seldon-init-container-secret created ``` ## Seldon Core yaml 이제 Seldon Core를 생성하는 yaml파일을 작성합니다. ```bash apiVersion: machinelearning.seldon.io/v1 kind: SeldonDeployment metadata: name: seldon-example namespace: kubeflow-user-example-com spec: name: model predictors: - name: model componentSpecs: - spec: volumes: - name: model-provision-location emptyDir: {} initContainers: - name: model-initializer image: gcr.io/kfserving/storage-initializer:v0.4.0 args: - "s3://mlflow/mlflow/artifacts/0/74ba8e33994144f599e50b3be176cdb0/artifacts/svc" - "/mnt/models" volumeMounts: - mountPath: /mnt/models name: model-provision-location envFrom: - secretRef: name: seldon-init-container-secret containers: - name: model image: ghcr.io/mlops-for-all/mlflowserver volumeMounts: - mountPath: /mnt/models name: model-provision-location readOnly: true securityContext: privileged: true runAsUser: 0 runAsGroup: 0 graph: name: model type: MODEL parameters: - name: model_uri type: STRING value: "/mnt/models" children: [] ``` 이 전에 작성한 [Seldon Fields](../api-deployment/seldon-fields.md)와 달라진 점은 크게 두 부분입니다. initContainer에 `envFrom` 필드가 추가되었으며 args의 주소가 `s3://mlflow/mlflow/artifacts/0/74ba8e33994144f599e50b3be176cdb0/artifacts/svc` 로 바뀌었습니다. ### args 앞서 args의 첫번째 array는 우리가 다운로드받을 모델의 경로라고 했습니다. 그럼 mlflow에 저장된 모델의 경로는 어떻게 알 수 있을까요? 
다시 mlflow에 들어가서 run을 클릭하고 모델을 누르면 다음과 같이 확인할 수 있습니다. ![seldon-mlflow-0.png](./img/seldon-mlflow-0.png) 이렇게 확인된 경로를 입력하면 됩니다. ### envFrom minio에 접근해서 모델을 다운로드 받는 데 필요한 환경변수를 입력해주는 과정입니다. 앞서 만든 `seldon-init-container-secret`를 이용합니다. ## API 생성 우선 위에서 정의한 스펙을 yaml 파일로 생성하겠습니다. ```bash apiVersion: machinelearning.seldon.io/v1 kind: SeldonDeployment metadata: name: seldon-example namespace: kubeflow-user-example-com spec: name: model predictors: - name: model componentSpecs: - spec: volumes: - name: model-provision-location emptyDir: {} initContainers: - name: model-initializer image: gcr.io/kfserving/storage-initializer:v0.4.0 args: - "s3://mlflow/mlflow/artifacts/0/74ba8e33994144f599e50b3be176cdb0/artifacts/svc" - "/mnt/models" volumeMounts: - mountPath: /mnt/models name: model-provision-location envFrom: - secretRef: name: seldon-init-container-secret containers: - name: model image: ghcr.io/mlops-for-all/mlflowserver volumeMounts: - mountPath: /mnt/models name: model-provision-location readOnly: true securityContext: privileged: true runAsUser: 0 runAsGroup: 0 graph: name: model type: MODEL parameters: - name: model_uri type: STRING value: "/mnt/models" - name: xtype type: STRING value: "dataframe" children: [] EOF ``` seldon pod을 생성합니다. ```bash kubectl apply -f seldon-mlflow.yaml ``` 정상적으로 수행되면 다음과 같이 출력됩니다. ```bash seldondeployment.machinelearning.seldon.io/seldon-example created ``` 이제 pod이 정상적으로 뜰 때까지 기다립니다. ```bash kubectl get po -n kubeflow-user-example-com | grep seldon ``` 다음과 비슷하게 출력되면 정상적으로 API를 생성했습니다. ```bash seldon-example-model-0-model-5c949bd894-c5f28 3/3 Running 0 69s ``` CLI를 이용해 생성된 API에는 다음 request를 통해 실행을 확인할 수 있습니다. ```bash curl -X POST http://$NODE_IP:$NODE_PORT/seldon/seldon-deploy/sklearn/api/v1.0/predictions \ -H 'Content-Type: application/json' \ -d '{ "data": { "ndarray": [ [ 143.0, 0.0, 30.0, 30.0 ] ], "names": [ "sepal length (cm)", "sepal width (cm)", "petal length (cm)", "petal width (cm)" ] } }' ``` 정상적으로 실행될 경우 다음과 같은 결과를 받을 수 있습니다. 
```bash {"data":{"names":[],"ndarray":["Virginica"]},"meta":{"requestPath":{"model":"ghcr.io/mlops-for-all/mlflowserver:e141f57"}}} ``` ================================================ FILE: versioned_docs/version-1.0/api-deployment/seldon-pg.md ================================================ --- title : "3. Seldon Monitoring" description: "Prometheus & Grafana 확인하기" sidebar_position: 3 date: 2021-12-24 lastmod: 2021-12-24 contributors: ["Jongseob Jeon"] --- ## Grafana & Prometheus 이제, [지난 페이지](../api-deployment/seldon-iris.md)에서 생성했던 SeldonDeployment 로 API Request 를 반복적으로 수행해보고, 대시보드에 변화가 일어나는지 확인해봅니다. ### 대시보드 [앞서 생성한 대시보드](../setup-components/install-components-pg.md)를 포트 포워딩합니다. ```bash kubectl port-forward svc/seldon-core-analytics-grafana -n seldon-system 8090:80 ``` ### API 요청 [앞서 생성한 Seldon Deployment](../api-deployment/seldon-iris.md#using-cli)에 요청을 **반복해서** 보냅니다. ```bash curl -X POST http://$NODE_IP:$NODE_PORT/seldon/seldon-deploy/sklearn/api/v1.0/predictions \ -H 'Content-Type: application/json' \ -d '{ "data": { "ndarray": [[1,2,3,4]] } }' ``` 그리고 그라파나 대시보드를 확인하면 다음과 같이 Global Request Rate 이 `0 ops` 에서 순간적으로 상승하는 것을 확인할 수 있습니다. ![repeat-raise.png](./img/repeat-raise.png) 이렇게 프로메테우스와 그라파나가 정상적으로 설치된 것을 확인할 수 있습니다. ================================================ FILE: versioned_docs/version-1.0/api-deployment/what-is-api-deployment.md ================================================ --- title : "1. What is API Deployment?" description: "" sidebar_position: 1 date: 2021-12-22 lastmod: 2021-12-22 contributors: ["Youngcheol Jang"] --- ## API Deployment란? 머신러닝 모델을 학습한 뒤에는 어떻게 사용해야 할까요? 머신러닝을 학습할 때는 더 높은 성능의 모델이 나오기를 기대하지만, 학습된 모델을 사용하여 추론을 할 때는 빠르고 쉽게 추론 결과를 받아보고 싶을 것입니다. 모델의 추론 결과를 확인하고자 할 때 주피터 노트북이나 파이썬 스크립트를 통해 학습된 모델을 로드한 뒤 추론할 수 있습니다. 그렇지만 이런 방법은 모델이 클수록 모델을 불러오는 데 많은 시간을 소요하게 되어서 비효율적입니다. 또한 이렇게 이용하면 많은 사람이 모델을 이용할 수 없고 학습된 모델이 있는 환경에서밖에 사용할 수 없습니다. 그래서 실제 서비스에서 머신러닝이 사용될 때는 API를 이용해서 학습된 모델을 사용합니다. 
모델은 API 서버가 구동되는 환경에서 한 번만 로드가 되며, DNS를 활용하여 외부에서도 쉽게 추론 결과를 받을 수 있고 다른 서비스와 연동할 수 있습니다. 하지만 모델을 API로 만드는 작업에는 생각보다 많은 부수적인 작업이 필요합니다. 그래서 API로 만드는 작업을 더 쉽게 하기 위해서 Tensorflow와 같은 머신러닝 프레임워크 진영에서는 추론 엔진(Inference engine)을 개발하였습니다. 추론 엔진들을 이용하면 해당 머신러닝 프레임워크로 개발되고 학습된 모델을 불러와 추론이 가능한 API(REST 또는 gRPC)를 생성합니다. 이러한 추론 엔진을 활용하여 구축한 API 서버로 추론하고자 하는 데이터를 담아 요청을 보내면, 추론 엔진이 추론 결과를 응답에 담아 전송하는 것입니다. 대표적으로 다음과 같은 오픈소스 추론 엔진들이 개발되었습니다. - [Tensorflow : Tensorflow Serving](https://github.com/tensorflow/serving) - [PyTorch : Torchserve](https://github.com/pytorch/serve) - [Onnx : Onnx Runtime](https://github.com/microsoft/onnxruntime) 오프소스에서 공식적으로 지원하지는 않지만, 많이 쓰이는 sklearn, xgboost 프레임워크를 위한 추론 엔진도 개발되어 있습니다. 이처럼 모델의 추론 결과를 API의 형태로 받아볼 수 있도록 배포하는 것을 **API Deployment**라고 합니다. ## Serving Framework 위에서 다양한 추론 엔진들이 개발되었다는 사실을 소개해 드렸습니다. 쿠버네티스 환경에서 이러한 추론 엔진들을 사용하여 API Deployment를 한다면 어떤 작업이 필요할까요? 추론 엔진을 배포하기 위한 Deployment, 추론 요청을 보낼 Endpoint를 생성하기 위한 Service, 외부에서의 추론 요청을 추론 엔진으로 보내기 위한 Ingress 등 많은 쿠버네티스 리소스를 배포해 주어야 합니다. 이것 이외에도, 많은 추론 요청이 들어왔을 경우의 스케일 아웃(scale-out), 추론 엔진 상태에 대한 모니터링, 개선된 모델이 나왔을 경우 버전 업데이트 등 추론 엔진을 운영할 때의 요구사항은 한두 가지가 아닙니다. 이러한 많은 요구사항을 처리하기 위해 추론 엔진들을 쿠버네티스 환경 위에서 한 번 더 추상화한 **Serving Framework**들이 개발되었습니다. 개발된 Serving Framework들은 다음과 같은 오픈소스들이 있습니다. - [Seldon Core](https://github.com/SeldonIO/seldon-core) - [Kserve](https://github.com/kserve) - [BentoML](https://github.com/bentoml/BentoML) *모두의 MLOps*에서는 Seldon Core를 사용하여 API Deployment를 하는 과정을 다루어 보도록 하겠습니다. ================================================ FILE: versioned_docs/version-1.0/appendix/_category_.json ================================================ { "label": "Appendix", "position": 9, "link": { "type": "generated-index" } } ================================================ FILE: versioned_docs/version-1.0/appendix/metallb.md ================================================ --- title: "2. Bare Metal 클러스터용 load balancer metallb 설치" sidebar_position: 2 --- ## MetalLB란? 
Kubernetes 사용 시 AWS, GCP, Azure 와 같은 클라우드 플랫폼에서는 자체적으로 로드 벨런서(Load Balancer)를 제공해 주지만, 온프레미스 클러스터에서는 로드 벨런싱 기능을 제공하는 모듈을 추가적으로 설치해야 합니다. [MetalLB](https://metallb.universe.tf/)는 베어메탈 환경에서 사용할 수 있는 로드 벨런서를 제공하는 오픈소스 프로젝트 입니다. ## 요구사항 | 요구 사항 | 버전 및 내용 | | ------------------------------------------------------------ | ------------------------------------------------------------ | | Kubernetes | 로드 벨런싱 기능이 없는 >= v1.13.0 | | [호환가능한 네트워크 CNI](https://metallb.universe.tf/installation/network-addons/) | Calico, Canal, Cilium, Flannel, Kube-ovn, Kube-router, Weave Net | | IPv4 주소 | MetalLB 배포에 사용 | | BGP 모드를 사용할 경우 | BGP 기능을 지원하는 하나 이상의 라우터 | | 노드 간 포트 TCP/UDP 7946 오픈 | memberlist 요구 사항 ## MetalLB 설치 ### Preparation IPVS 모드에서 kube-proxy를 사용하는 경우 Kubernetes v1.14.2 이후부터는 엄격한 ARP(strictARP) 모드를 사용하도록 설정해야 합니다. Kube-router는 기본적으로 엄격한 ARP를 활성화하므로 서비스 프록시로 사용할 경우에는 이 기능이 필요하지 않습니다. 엄격한 ARP 모드를 적용하기에 앞서, 현재 모드를 확인합니다. ```bash # see what changes would be made, returns nonzero returncode if different kubectl get configmap kube-proxy -n kube-system -o yaml | \ grep strictARP ``` ```bash strictARP: false ``` strictARP: false 가 출력되는 경우 다음을 실행하여 strictARP: true로 변경합니다. (strictARP: true가 이미 출력된다면 다음 커맨드를 수행하지 않으셔도 됩니다.) ```bash # actually apply the changes, returns nonzero returncode on errors only kubectl get configmap kube-proxy -n kube-system -o yaml | \ sed -e "s/strictARP: false/strictARP: true/" | \ kubectl apply -f - -n kube-system ``` 정상적으로 수행되면 다음과 같이 출력됩니다. ```bash Warning: resource configmaps/kube-proxy is missing the kubectl.kubernetes.io/last-applied-configuration annotation which is required by kubectl apply. kubectl apply should only be used on resources created declaratively by either kubectl create --save-config or kubectl apply. The missing annotation will be patched automatically. configmap/kube-proxy configured ``` ### 설치 - Manifest #### 1. MetalLB 를 설치합니다. 
```bash kubectl apply -f https://raw.githubusercontent.com/metallb/metallb/v0.11.0/manifests/namespace.yaml kubectl apply -f https://raw.githubusercontent.com/metallb/metallb/v0.11.0/manifests/metallb.yaml ``` #### 2. 정상 설치 확인 metallb-system namespace 의 2 개의 pod 이 모두 Running 이 될 때까지 기다립니다. ```bash kubectl get pod -n metallb-system ``` 모두 Running 이 되면 다음과 비슷한 결과가 출력됩니다. ```bash NAME READY STATUS RESTARTS AGE controller-7dcc8764f4-8n92q 1/1 Running 1 1m speaker-fnf8l 1/1 Running 1 1m ``` 매니페스트의 구성 요소는 다음과 같습니다. - metallb-system/controller - deployment 로 배포되며, 로드 벨런싱을 수행할 external IP 주소의 할당을 처리하는 역할을 담당합니다. - metallb-system/speaker - daemonset 형태로 배포되며, 외부 트래픽과 서비스를 연결해 네트워크 통신이 가능하도록 구성하는 역할을 담당합니다. 서비스에는 컨트롤러 및 스피커와 구성 요소가 작동하는 데 필요한 RBAC 사용 권한이 포함됩니다. ## Configuration MetalLB 의 로드 벨런싱 정책 설정은 관련 설정 정보를 담은 configmap 을 배포하여 설정할 수 있습니다. MetalLB 에서 구성할 수 있는 모드로는 다음과 같이 2가지가 있습니다. 1. [Layer 2 모드](https://metallb.universe.tf/concepts/layer2/) 2. [BGP 모드](https://metallb.universe.tf/concepts/bgp/) 여기에서는 Layer 2 모드로 진행하겠습니다. ### Layer 2 Configuration Layer 2 모드는 간단하게 사용할 IP 주소의 대역만 설정하면 됩니다. Layer 2 모드를 사용할 경우 워커 노드의 네트워크 인터페이스에 IP를 바인딩 하지 않아도 되는데 로컬 네트워크의 ARP 요청에 직접 응답하여 컴퓨터의 MAC주소를 클라이언트에 제공하는 방식으로 작동하기 때문입니다. 다음 `metallb_config.yaml` 파일은 MetalLB 가 192.168.35.100 ~ 192.168.35.110의 IP에 대한 제어 권한을 제공하고 Layer 2 모드를 구성하는 설정입니다. 클러스터 노드와 클라이언트 노드가 분리된 경우, 192.168.35.100 ~ 192.168.35.110 대역이 클라이언트 노드와 클러스터 노드 모두 접근 가능한 대역이어야 합니다. #### metallb_config.yaml ```bash apiVersion: v1 kind: ConfigMap metadata: namespace: metallb-system name: config data: config: | address-pools: - name: default protocol: layer2 addresses: - 192.168.35.100-192.168.35.110 # IP 대역폭 ``` 위의 설정을 적용합니다. ```test kubectl apply -f metallb_config.yaml ``` 정상적으로 배포하면 다음과 같이 출력됩니다. ```test configmap/config created ``` ## MetalLB 사용 ### Kubeflow Dashboard 먼저 kubeflow의 Dashboard 를 제공하는 istio-system 네임스페이스의 istio-ingressgateway 서비스의 타입을 `LoadBalancer`로 변경하여 MetalLB로부터 로드 벨런싱 기능을 제공받기 전에, 현재 상태를 확인합니다. 
```bash kubectl get svc/istio-ingressgateway -n istio-system ``` 해당 서비스의 타입은 ClusterIP이며, External-IP 값은 `none` 인 것을 확인할 수 있습니다. ```bash NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE istio-ingressgateway ClusterIP 10.103.72.5 15021/TCP,80/TCP,443/TCP,31400/TCP,15443/TCP 4h21m ``` type 을 LoadBalancer 로 변경하고 원하는 IP 주소를 입력하고 싶은 경우 loadBalancerIP 항목을 추가합니다. 추가 하지 않을 경우에는 위에서 설정한 IP 주소풀에서 순차적으로 IP 주소가 배정됩니다. ```bash kubectl edit svc/istio-ingressgateway -n istio-system ``` ```bash spec: clusterIP: 10.103.72.5 clusterIPs: - 10.103.72.5 ipFamilies: - IPv4 ipFamilyPolicy: SingleStack ports: - name: status-port port: 15021 protocol: TCP targetPort: 15021 - name: http2 port: 80 protocol: TCP targetPort: 8080 - name: https port: 443 protocol: TCP targetPort: 8443 - name: tcp port: 31400 protocol: TCP targetPort: 31400 - name: tls port: 15443 protocol: TCP targetPort: 15443 selector: app: istio-ingressgateway istio: ingressgateway sessionAffinity: None type: LoadBalancer # Change ClusterIP to LoadBalancer loadBalancerIP: 192.168.35.100 # Add IP status: loadBalancer: {} ``` 다시 확인을 해보면 External-IP 값이 `192.168.35.100` 인 것을 확인합니다. ```bash kubectl get svc/istio-ingressgateway -n istio-system ``` ```bash NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE istio-ingressgateway LoadBalancer 10.103.72.5 192.168.35.100 15021:31054/TCP,80:30853/TCP,443:30443/TCP,31400:30012/TCP,15443:31650/TCP 5h1m ``` Web Browser 를 열어 [http://192.168.35.100](http://192.168.35.100) 으로 접속하여, 다음과 같은 화면이 출력되는 것을 확인합니다. ![login-after-istio-ingressgateway-setting.png](./img/login-after-istio-ingressgateway-setting.png) ### minio Dashboard 먼저 minio 의 Dashboard 를 제공하는 kubeflow 네임스페이스의 minio-service 서비스의 타입을 LoadBalancer로 변경하여 MetalLB로부터 로드 벨런싱 기능을 제공받기 전에, 현재 상태를 확인합니다. ```bash kubectl get svc/minio-service -n kubeflow ``` 해당 서비스의 타입은 ClusterIP이며, External-IP 값은 `none` 인 것을 확인할 수 있습니다. 
```bash NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE minio-service ClusterIP 10.109.209.87 9000/TCP 5h14m ``` type 을 LoadBalancer 로 변경하고 원하는 IP 주소를 입력하고 싶은 경우 loadBalancerIP 항목을 추가합니다. 추가 하지 않을 경우에는 위에서 설정한 IP 주소풀에서 순차적으로 IP 주소가 배정됩니다. ```bash kubectl edit svc/minio-service -n kubeflow ``` ```bash apiVersion: v1 kind: Service metadata: annotations: kubectl.kubernetes.io/last-applied-configuration: | {"apiVersion":"v1","kind":"Service","metadata":{"annotations":{},"labels":{"application-crd-id":"kubeflow-pipelines"},"name":"minio-ser> creationTimestamp: "2022-01-05T08:44:23Z" labels: application-crd-id: kubeflow-pipelines name: minio-service namespace: kubeflow resourceVersion: "21120" uid: 0053ee28-4f87-47bb-ad6b-7ad68aa29a48 spec: clusterIP: 10.109.209.87 clusterIPs: - 10.109.209.87 ipFamilies: - IPv4 ipFamilyPolicy: SingleStack ports: - name: http port: 9000 protocol: TCP targetPort: 9000 selector: app: minio application-crd-id: kubeflow-pipelines sessionAffinity: None type: LoadBalancer # Change ClusterIP to LoadBalancer loadBalancerIP: 192.168.35.101 # Add IP status: loadBalancer: {} ``` 다시 확인을 해보면 External-IP 값이 `192.168.35.101` 인 것을 확인할 수 있습니다. ```bash kubectl get svc/minio-service -n kubeflow ``` ```bash NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE minio-service LoadBalancer 10.109.209.87 192.168.35.101 9000:31371/TCP 5h21m ``` Web Browser 를 열어 [http://192.168.35.101:9000](http://192.168.35.101:9000) 으로 접속하여, 다음과 같은 화면이 출력되는 것을 확인합니다. ![login-after-minio-setting.png](./img/login-after-minio-setting.png) ### mlflow Dashboard 먼저 mlflow 의 Dashboard 를 제공하는 mlflow-system 네임스페이스의 mlflow-server-service 서비스의 타입을 LoadBalancer로 변경하여 MetalLB로부터 로드 벨런싱 기능을 제공받기 전에, 현재 상태를 확인합니다. ```bash kubectl get svc/mlflow-server-service -n mlflow-system ``` 해당 서비스의 타입은 ClusterIP이며, External-IP 값은 `none` 인 것을 확인할 수 있습니다. 
```bash NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE mlflow-server-service ClusterIP 10.111.173.209 5000/TCP 4m50s ``` type 을 LoadBalancer 로 변경하고 원하는 IP 주소를 입력하고 싶은 경우 loadBalancerIP 항목을 추가합니다. 추가 하지 않을 경우에는 위에서 설정한 IP 주소풀에서 순차적으로 IP 주소가 배정됩니다. ```bash kubectl edit svc/mlflow-server-service -n mlflow-system ``` ```bash apiVersion: v1 kind: Service metadata: annotations: meta.helm.sh/release-name: mlflow-server meta.helm.sh/release-namespace: mlflow-system creationTimestamp: "2022-01-07T04:00:19Z" labels: app.kubernetes.io/managed-by: Helm name: mlflow-server-service namespace: mlflow-system resourceVersion: "276246" uid: e5d39fb7-ad98-47e7-b512-f9c673055356 spec: clusterIP: 10.111.173.209 clusterIPs: - 10.111.173.209 ipFamilies: - IPv4 ipFamilyPolicy: SingleStack ports: - port: 5000 protocol: TCP targetPort: 5000 selector: app.kubernetes.io/name: mlflow-server sessionAffinity: None type: LoadBalancer # Change ClusterIP to LoadBalancer loadBalancerIP: 192.168.35.102 # Add IP status: loadBalancer: {} ``` 다시 확인을 해보면 External-IP 값이 `192.168.35.102` 인 것을 확인할 수 있습니다. ```bash kubectl get svc/mlflow-server-service -n mlflow-system ``` ```bash NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE mlflow-server-service LoadBalancer 10.111.173.209 192.168.35.102 5000:32287/TCP 6m11s ``` Web Browser 를 열어 [http://192.168.35.102:5000](http://192.168.35.102:5000) 으로 접속하여, 다음과 같은 화면이 출력되는 것을 확인합니다. ![login-after-mlflow-setting.png](./img/login-after-mlflow-setting.png) ### Grafana Dashboard 먼저 Grafana 의 Dashboard 를 제공하는 seldon-system 네임스페이스의 seldon-core-analytics-grafana 서비스의 타입을 LoadBalancer로 변경하여 MetalLB로부터 로드 벨런싱 기능을 제공받기 전에, 현재 상태를 확인합니다. ```bash kubectl get svc/seldon-core-analytics-grafana -n seldon-system ``` 해당 서비스의 타입은 ClusterIP이며, External-IP 값은 `none` 인 것을 확인할 수 있습니다. ```bash NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE seldon-core-analytics-grafana ClusterIP 10.109.20.161 80/TCP 94s ``` type 을 LoadBalancer 로 변경하고 원하는 IP 주소를 입력하고 싶은 경우 loadBalancerIP 항목을 추가합니다. 
추가 하지 않을 경우에는 위에서 설정한 IP 주소풀에서 순차적으로 IP 주소가 배정됩니다. ```bash kubectl edit svc/seldon-core-analytics-grafana -n seldon-system ``` ```bash apiVersion: v1 kind: Service metadata: annotations: meta.helm.sh/release-name: seldon-core-analytics meta.helm.sh/release-namespace: seldon-system creationTimestamp: "2022-01-07T04:16:47Z" labels: app.kubernetes.io/instance: seldon-core-analytics app.kubernetes.io/managed-by: Helm app.kubernetes.io/name: grafana app.kubernetes.io/version: 7.0.3 helm.sh/chart: grafana-5.1.4 name: seldon-core-analytics-grafana namespace: seldon-system resourceVersion: "280605" uid: 75073b78-92ec-472c-b0d5-240038ea8fa5 spec: clusterIP: 10.109.20.161 clusterIPs: - 10.109.20.161 ipFamilies: - IPv4 ipFamilyPolicy: SingleStack ports: - name: service port: 80 protocol: TCP targetPort: 3000 selector: app.kubernetes.io/instance: seldon-core-analytics app.kubernetes.io/name: grafana sessionAffinity: None type: LoadBalancer # Change ClusterIP to LoadBalancer loadBalancerIP: 192.168.35.103 # Add IP status: loadBalancer: {} ``` 다시 확인을 해보면 External-IP 값이 `192.168.35.103` 인 것을 확인할 수 있습니다. ```bash kubectl get svc/seldon-core-analytics-grafana -n seldon-system ``` ```bash NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE seldon-core-analytics-grafana LoadBalancer 10.109.20.161 192.168.35.103 80:31191/TCP 5m14s ``` Web Browser 를 열어 [http://192.168.35.103:80](http://192.168.35.103:80) 으로 접속하여, 다음과 같은 화면이 출력되는 것을 확인합니다. ![login-after-grafana-setting.png](./img/login-after-grafana-setting.png) ================================================ FILE: versioned_docs/version-1.0/appendix/pyenv.md ================================================ --- title: "1. Python 가상환경 설치" sidebar_position: 1 --- ## 파이썬 가상환경 Python 환경을 사용하다 보면 여러 버전의 Python 환경을 사용하고 싶은 경우나, 여러 프로젝트별 패키지 버전을 따로 관리하고 싶은 경우가 발생합니다. 이처럼 Python 환경 혹은 Python Package 환경을 가상화하여 관리하는 것을 쉽게 도와주는 도구로는 pyenv, conda, virtualenv, venv 등이 존재합니다. 
이 중 *모두의 MLOps*에서는 [pyenv](https://github.com/pyenv/pyenv)와 [pyenv-virtualenv](https://github.com/pyenv/pyenv-virtualenv)를 설치하는 방법을 다룹니다. pyenv는 Python 버전을 관리하는 것을 도와주며, pyenv-virtualenv는 pyenv의 plugin으로써 파이썬 패키지 환경을 관리하는 것을 도와줍니다. ## pyenv 설치 ### Prerequisites 운영 체제별로 Prerequisites가 다릅니다. [다음 페이지](https://github.com/pyenv/pyenv/wiki#suggested-build-environment)를 참고하여 필수 패키지들을 설치해주시기 바랍니다. ### 설치 - macOS 1. pyenv, pyenv-virtualenv 설치 ```bash brew update brew install pyenv brew install pyenv-virtualenv ``` 2. pyenv 설정 macOS의 경우 카탈리나 버전 이후 기본 shell이 zsh로 변경되었기 때문에 zsh을 사용하는 경우를 가정하였습니다. ```bash echo 'eval "$(pyenv init -)"' >> ~/.zshrc echo 'eval "$(pyenv virtualenv-init -)"' >> ~/.zshrc source ~/.zshrc ``` pyenv 명령이 정상적으로 수행되는지 확인합니다. ```bash pyenv --help ``` ```bash $ pyenv --help Usage: pyenv [] Some useful pyenv commands are: --version Display the version of pyenv activate Activate virtual environment commands List all available pyenv commands deactivate Deactivate virtual environment exec Run an executable with the selected Python version global Set or show the global Python version(s) help Display help for a command hooks List hook scripts for a given pyenv command init Configure the shell environment for pyenv install Install a Python version using python-build local Set or show the local application-specific Python version(s) prefix Display prefix for a Python version rehash Rehash pyenv shims (run this after installing executables) root Display the root directory where versions and shims are kept shell Set or show the shell-specific Python version shims List existing pyenv shims uninstall Uninstall a specific Python version version Show the current Python version(s) and its origin version-file Detect the file that sets the current pyenv version version-name Show the current Python version version-origin Explain how the current Python version is set versions List all Python versions available to pyenv virtualenv Create a Python virtualenv using the 
pyenv-virtualenv plugin virtualenv-delete Uninstall a specific Python virtualenv virtualenv-init Configure the shell environment for pyenv-virtualenv virtualenv-prefix Display real_prefix for a Python virtualenv version virtualenvs List all Python virtualenvs found in `$PYENV_ROOT/versions/*'. whence List all Python versions that contain the given executable which Display the full path to an executable See `pyenv help ' for information on a specific command. For full documentation, see: https://github.com/pyenv/pyenv#readme ``` ### 설치 - Ubuntu 1. pyenv, pyenv-virtualenv 설치 ```bash curl https://pyenv.run | bash ``` 다음과 같은 내용이 출력되면 정상적으로 설치된 것을 의미합니다. ```bash % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 0 0 0 0 0 0 0 0 --:--:-- --:--:-- 0 0 0 0 0 0 0 0 --:--:-- --:--:-- 100 270 100 270 0 0 239 0 0:00:01 0:00:01 --:--:-- 239 Cloning into '/home/mlops/.pyenv'... r ... 중략... ... remote: Enumerating objects: 10, done. remote: Counting objects: 100% (10/10), done. remote: Compressing objects: 100% (6/6), done. remote: Total 10 (delta 1), reused 6 (delta 0), pack-reused 0 Unpacking objects: 100% (10/10), 2.92 KiB | 2.92 MiB/s, done. WARNING: seems you still have not added 'pyenv' to the load path. # See the README for instructions on how to set up # your shell environment for Pyenv. # Load pyenv-virtualenv automatically by adding # the following to ~/.bashrc: eval "$(pyenv virtualenv-init -)" ``` 2. pyenv 설정 기본 shell로 bash shell을 사용하는 경우를 가정하였습니다. bash에서 pyenv와 pyenv-virtualenv 를 사용할 수 있도록 설정합니다. ```bash sudo vi ~/.bashrc ``` 다음 문자열을 입력한 후 저장합니다. ```bash export PATH="$HOME/.pyenv/bin:$PATH" eval "$(pyenv init -)" eval "$(pyenv virtualenv-init -)" ``` shell을 restart 합니다. ```bash exec $SHELL ``` pyenv 명령이 정상적으로 수행되는지 확인합니다. ```bash pyenv --help ``` 다음과 같은 메시지가 출력되면 정상적으로 설정된 것을 의미합니다. 
```bash $ pyenv pyenv 2.2.2 Usage: pyenv [] Some useful pyenv commands are: --version Display the version of pyenv activate Activate virtual environment commands List all available pyenv commands deactivate Deactivate virtual environment doctor Verify pyenv installation and development tools to build pythons. exec Run an executable with the selected Python version global Set or show the global Python version(s) help Display help for a command hooks List hook scripts for a given pyenv command init Configure the shell environment for pyenv install Install a Python version using python-build local Set or show the local application-specific Python version(s) prefix Display prefix for a Python version rehash Rehash pyenv shims (run this after installing executables) root Display the root directory where versions and shims are kept shell Set or show the shell-specific Python version shims List existing pyenv shims uninstall Uninstall a specific Python version version Show the current Python version(s) and its origin version-file Detect the file that sets the current pyenv version version-name Show the current Python version version-origin Explain how the current Python version is set versions List all Python versions available to pyenv virtualenv Create a Python virtualenv using the pyenv-virtualenv plugin virtualenv-delete Uninstall a specific Python virtualenv virtualenv-init Configure the shell environment for pyenv-virtualenv virtualenv-prefix Display real_prefix for a Python virtualenv version virtualenvs List all Python virtualenvs found in `$PYENV_ROOT/versions/*'. whence List all Python versions that contain the given executable which Display the full path to an executable See `pyenv help ' for information on a specific command. For full documentation, see: https://github.com/pyenv/pyenv#readme ``` ## pyenv 사용 ### Python 버전 설치 `pyenv install ` 명령을 통해 원하는 파이썬 버전을 설치할 수 있습니다. 이번 페이지에서는 예시로 kubeflow에서 기본으로 사용하는 파이썬 3.7.12 버전을 설치하겠습니다. 
```bash pyenv install 3.7.12 ``` 정상적으로 설치되면 다음과 같은 메시지가 출력됩니다. ```bash $ pyenv install 3.7.12 Downloading Python-3.7.12.tar.xz... -> https://www.python.org/ftp/python/3.7.12/Python-3.7.12.tar.xz Installing Python-3.7.12... patching file Doc/library/ctypes.rst patching file Lib/test/test_unicode.py patching file Modules/_ctypes/_ctypes.c patching file Modules/_ctypes/callproc.c patching file Modules/_ctypes/ctypes.h patching file setup.py patching file 'Misc/NEWS.d/next/Core and Builtins/2020-06-30-04-44-29.bpo-41100.PJwA6F.rst' patching file Modules/_decimal/libmpdec/mpdecimal.h Installed Python-3.7.12 to /home/mlops/.pyenv/versions/3.7.12 ``` ### Python 가상환경 생성 `pyenv virtualenv <가상환경-이름>` 명령을 통해 원하는 파이썬 버전의 파이썬 가상환경을 생성할 수 있습니다. 예시로 Python 3.7.12 버전의 `demo`라는 이름의 Python 가상환경을 생성하겠습니다. ```bash pyenv virtualenv 3.7.12 demo ``` ```bash $ pyenv virtualenv 3.7.12 demo Looking in links: /tmp/tmpffqys0gv Requirement already satisfied: setuptools in /home/mlops/.pyenv/versions/3.7.12/envs/demo/lib/python3.7/site-packages (47.1.0) Requirement already satisfied: pip in /home/mlops/.pyenv/versions/3.7.12/envs/demo/lib/python3.7/site-packages (20.1.1) ``` ### Python 가상환경 사용 `pyenv activate <가상환경 이름>` 명령을 통해 위와 같은 방식으로 생성한 가상환경을 사용할 수 있습니다. 예시로는 `demo`라는 이름의 Python 가상환경을 사용하겠습니다. ```bash pyenv activate demo ``` 다음과 같이 현재 가상환경의 정보가 shell의 맨 앞에 출력되는 것을 확인할 수 있습니다. Before ```bash mlops@ubuntu:~$ pyenv activate demo ``` After ```bash pyenv-virtualenv: prompt changing will be removed from future release. configure `export PYENV_VIRTUALENV_DISABLE_PROMPT=1' to simulate the behavior. (demo) mlops@ubuntu:~$ ``` ### Python 가상환경 비활성화 `source deactivate` 명령을 통해 현재 사용 중인 가상환경을 비활성화할 수 있습니다. 
```bash source deactivate ``` Before ```bash (demo) mlops@ubuntu:~$ source deactivate ``` After ```bash mlops@ubuntu:~$ ``` ================================================ FILE: versioned_docs/version-1.0/further-readings/_category_.json ================================================ { "label": "Further Readings", "position": 8, "link": { "type": "generated-index" } } ================================================ FILE: versioned_docs/version-1.0/further-readings/info.md ================================================ --- title: "다루지 못한 것들" date: 2021-12-21 lastmod: 2021-12-21 --- ## MLOps Component [MLOps Concepts](../introduction/component.md)에서 다루었던 컴포넌트를 도식화하면 다음과 같습니다. ![open-stacks-0.png](./img/open-stacks-0.png) 이 중 *모두의 MLOps* 에서 다룬 기술 스택들은 다음과 같습니다. ![open-stacks-1.png](./img/open-stacks-1.png) 보시는 것처럼 아직 우리가 다루지 못한 많은 MLOps 컴포넌트들이 있습니다. 시간 관계상 이번에 모두 다루지는 못했지만, 만약 필요하다면 다음과 같은 오픈소스들을 먼저 참고해보면 좋을 것 같습니다. ![open-stacks-2.png](./img/open-stacks-2.png) 세부 내용은 다음과 같습니다. | Mgmt. | Component | Open Soruce | | -------------------------- | --------------------------- | ------------------------------------- | | Data Mgmt. | Collection | [Kafka](https://kafka.apache.org/) | | | Validation | [Beam](https://beam.apache.org/) | | | Feature Store | [Flink](https://flink.apache.org/) | | ML Model Dev. & Experiment | Modeling | [Jupyter](https://jupyter.org/) | | | Analysis & Experiment Mgmt. | [MLflow](https://mlflow.org/) | | | HPO Tuning & AutoML | [Katib](https://github.com/kubeflow/katib) | | Deploy Mgmt. | Serving Framework | [Seldon Core](https://docs.seldon.io/projects/seldon-core/en/latest/index.html) | | | A/B Test | [Iter8](https://iter8.tools/) | | | Monitoring | [Grafana](https://grafana.com/oss/grafana/), [Prometheus](https://prometheus.io/) | | Process Mgmt. 
| pipeline | [Kubeflow](https://www.kubeflow.org/) | | | CI/CD | [Github Action](https://docs.github.com/en/actions) | | | Continuous Training | [Argo Events](https://argoproj.github.io/events/) | | Platform Mgmt. | Configuration Mgmt. | [Consul](https://www.consul.io/) | | | Code Version Mgmt. | [Github](https://github.com/), [Minio](https://min.io/) | | | Logging | (EFK) [Elastic Search](https://www.elastic.co/kr/elasticsearch/), [Fluentd](https://www.fluentd.org/), [Kibana](https://www.elastic.co/kr/kibana/) | | | Resource Mgmt. | [Kubernetes](https://kubernetes.io/) | ================================================ FILE: versioned_docs/version-1.0/introduction/_category_.json ================================================ { "label": "Introduction", "position": 1, "link": { "type": "generated-index" } } ================================================ FILE: versioned_docs/version-1.0/introduction/component.md ================================================ --- title : "3. Components of MLOps" description: "Describe MLOps Components" sidebar_position: 3 date: 2021-12-03 lastmod: 2021-12-10 contributors: ["Youngcheol Jang"] --- ## Practitioners guide to MLOps 2021년 5월에 발표된 구글의 [white paper : Practitioners guide to MLOps: A framework for continuous delivery and automation of machine learning](https://services.google.com/fh/files/misc/practitioners_guide_to_mlops_whitepaper.pdf)에서는 MLOps의 핵심 기능들로 다음과 같은 것들을 언급하였습니다. ![mlops-component](./img/mlops-component.png) 각 기능이 어떤 역할을 하는지 살펴보겠습니다. ### 1. Experimentation 실험(Experimentation)은 머신러닝 엔지니어들이 데이터를 분석하고, 프로토타입 모델을 만들며 학습 기능을 구현할 수 있도록 하는 다음과 같은 기능을 제공합니다. - 깃(Git)과 같은 버전 컨트롤 도구와 통합된 노트북(Jupyter Notebook) 환경 제공 - 사용한 데이터, 하이퍼 파라미터, 평가 지표를 포함한 실험 추적 기능 제공 - 데이터와 모델에 대한 분석 및 시각화 기능 제공 ### 2. Data Processing 데이터 처리(Data Processing)는 머신러닝 모델 개발 단계, 지속적인 학습(Continuous Training) 단계, 그리고 API 배포(API Deployment) 단계에서 많은 양의 데이터를 사용할 수 있게 해 주는 다음과 같은 기능을 제공합니다. 
- 다양한 데이터 소스와 서비스에 호환되는 데이터 커넥터(connector) 기능 제공 - 다양한 형태의 데이터와 호환되는 데이터 인코더(encoder) & 디코더(decoder) 기능 제공 - 다양한 형태의 데이터에 대한 데이터 변환과 피처 엔지니어링(feature engineering) 기능 제공 - 학습과 서빙을 위한 확장 가능한 배치, 스트림 데이터 처리 기능 제공 ### 3. Model training 모델 학습(Model training)은 모델 학습을 위한 알고리즘을 효율적으로 실행시켜주는 다음과 같은 기능을 제공합니다. - ML 프레임워크의 실행을 위한 환경 제공 - 다수의 GPU / 분산 학습 사용을 위한 분산 학습 환경 제공 - 하이퍼 파라미터 튜닝과 최적화 기능 제공 ### 4. Model evaluation 모델 평가(Model evaluation)는 실험 환경과 상용 환경에서 동작하는 모델의 성능을 관찰할 수 있는 다음과 같은 기능을 제공합니다. - 평가 데이터에 대한 모델 성능 평가 기능 - 서로 다른 지속 학습 실행 결과에 대한 예측 성능 추적 - 서로 다른 모델의 성능 비교와 시각화 - 해석할 수 있는 AI 기술을 이용한 모델 출력 해석 기능 제공 ### 5. Model serving 모델 서빙(Model serving)은 상용 환경에 모델을 배포하고 서빙하기 위한 다음과 같은 기능들을 제공합니다. - 저 지연 추론과 고가용성 추론 기능 제공 - 다양한 ML 모델 서빙 프레임워크 지원(Tensorflow Serving, TorchServe, NVIDIA Triton, Scikit-learn, XGGoost. etc) - 복잡한 형태의 추론 루틴 기능 제공, 예를 들어 전처리(preprocess) 또는 후처리(postprocess) 기능과 최종 결과를 위해 다수의 모델이 사용되는 경우를 말합니다. - 순간적으로 치솟는 추론 요청을 처리하기 위한 오토 스케일링(autoscaling) 기능 제공 - 추론 요청과 추론 결과에 대한 로깅 기능 제공 ### 6. Online experimentation 온라인 실험(Online experimentation)은 새로운 모델이 생성되었을 때, 이 모델을 배포하면 어느 정도의 성능을 보일 것인지 검증하는 기능을 제공합니다. 이 기능은 새 모델을 배포하는 것까지 연동하기 위해 모델 저장소(Model Registry)와 연동되어야 합니다. - 카나리(canary) & 섀도(shadow) 배포 기능 제공 - A/B 테스트 기능 제공 - 멀티 암드 밴딧(Multi-armed bandit) 테스트 기능 제공 ### 7. Model Monitoring 모델 모니터링(Model Monitoring)은 상용 환경에 배포된 모델이 정상적으로 동작하고 있는지를 모니터링하는 기능을 제공합니다. 예를 들어 모델의 성능이 떨어져 업데이트가 필요한지에 대한 정보 등을 제공합니다. ### 8. ML Pipeline 머신러닝 파이프라인(ML Pipeline)은 상용 환경에서 복잡한 ML 학습과 추론 작업을 구성하고 제어하고 자동화하기 위한 다음과 같은 기능을 제공합니다. - 다양한 이벤트를 소스를 통한 파이프라인 실행 기능 - 파이프라인 파라미터와 생성되는 산출물 관리를 위한 머신러닝 메타데이터 추적과 연동 기능 - 일반적인 머신러닝 작업을 위한 내장 컴포넌트 지원과 사용자가 직접 구현한 컴포넌트에 대한 지원 기능 - 서로 다른 실행 환경 제공 기능 ### 9. Model Registry 모델 저장소(Model Registry)는 머신러닝 모델의 생명 주기(Lifecycle)을 중앙 저장소에서 관리할 수 있게 해 주는 기능을 제공합니다. - 학습된 모델 그리고 배포된 모델에 대한 등록, 추적, 버저닝 기능 제공 - 배포를 위해 필요한 데이터와 런타임 패키지들에 대한 정보 저장 기능 ### 10. 
Dataset and Feature Repository - 데이터에 대한 공유, 검색, 재사용 그리고 버전 관리 기능 - 이벤트 스트리밍 및 온라인 추론 작업에 대한 실시간 처리 및 저 지연 서빙 기능 - 사진, 텍스트, 테이블 형태의 데이터와 같은 다양한 형태의 데이터 지원 기능 ### 11. ML Metadata and Artifact Tracking MLOps의 각 단계에서는 다양한 형태의 산출물들이 생성됩니다. ML 메타데이터는 이런 산출물들에 대한 정보를 의미합니다. ML 메타데이터와 산출물 관리는 산출물의 위치, 타입, 속성, 그리고 관련된 실험(experiment)에 대한 정보를 관리하기 위해 다음과 같은 기능들을 제공합니다. - ML 산출물에 대한 히스토리 관리 기능 - 실험과 파이프라인 파라미터 설정에 대한 추적, 공유 기능 - ML 산출물에 대한 저장, 접근, 시각화, 다운로드 기능 제공 - 다른 MLOps 기능과의 통합 기능 제공 ================================================ FILE: versioned_docs/version-1.0/introduction/intro.md ================================================ --- title : "1. What is MLOps?" description: "Introduction to MLOps" sidebar_position: 1 date: 2021-1./img to MLOps" lastmod: 2022-03-05 contributors: ["Jongseob Jeon"] --- ## Machine Learning Project 2012년 Alexnet 이후 CV, NLP를 비롯하여 데이터가 존재하는 도메인이라면 어디서든 머신러닝과 딥러닝을 도입하고자 하였습니다. 딥러닝과 머신러닝은 AI라는 단어로 묶이며 불렸고 많은 매체에서 AI의 필요성을 외쳤습니다. 그리고 무수히 많은 기업에서 머신러닝과 딥러닝을 이용한 수많은 프로젝트를 진행하였습니다. 하지만 그 결과는 어떻게 되었을까요? 엘리먼트 AI의 음병찬 동북아 지역 총괄책임자는 [*"10개 기업에 AI 프로젝트를 시작한다면 그중 9개는 컨셉검증(POC)만 하다 끝난다"*](https://zdnet.co.kr/view/?no=20200611062002)고 말했습니다. 이처럼 많은 프로젝트에서 머신러닝과 딥러닝은 이 문제를 풀 수 있을 것 같다는 가능성만을 보여주고 사라졌습니다. 그리고 이 시기쯤에 [AI에 다시 겨울](https://www.aifutures.org/2021/ai-winter-is-coming/)이 다가오고 있다는 전망도 나오기 시작했습니다. 왜 프로젝트 대부분이 컨셉검증(POC) 단계에서 끝났을까요? 머신러닝과 딥러닝 코드만으로는 실제 서비스를 운영할 수 없기 때문입니다. 실제 서비스 단계에서 머신러닝과 딥러닝의 코드가 차지하는 부분은 생각보다 크지 않기 때문에, 단순히 모델의 성능만이 아닌 다른 많은 부분을 고려해야 합니다. 구글은 이런 문제를 2015년 [Hidden Technical Debt in Machine Learning Systems](https://proceedings.neurips.cc/paper/2015/file/86df7dcfd896fcaf2674f757a2463eba-Paper.pdf)에서 지적한 바 있습니다. 하지만 이 논문이 나올 당시에는 아직 많은 머신러닝 엔지니어들이 딥러닝과 머신러닝의 가능성을 입증하기 바쁜 시기였기 때문에, 논문이 지적하는 바에 많은 주의를 기울이지는 않았습니다. 그리고 몇 년이 지난 후 머신러닝과 딥러닝은 가능성을 입증해내어, 이제 사람들은 실제 서비스에 적용하고자 했습니다. 하지만 곧 많은 사람이 실제 서비스는 쉽지 않다는 것을 깨달았습니다. ## Devops MLOps는 이전에 없던 새로운 개념이 아니라 DevOps라고 불리는 개발 방법론에서 파생된 단어입니다. 그렇기에 DevOps를 이해한다면 MLOps를 이해하는 데 도움이 됩니다. 
### DevOps DevOps는 Development(개발)와 Operations(운영)의 합성어로 소프트웨어의 개발(Development)과 운영(Operations)의 합성어로서 소프트웨어 개발자와 정보기술 전문가 간의 소통, 협업 및 통합을 강조하는 개발 환경이나 문화를 말합니다. DevOps의 목적은 소프트웨어 개발 조직과 운영 조직간의 상호 의존적 대응이며 조직이 소프트웨어 제품과 서비스를 빠른 시간에 개발 및 배포하는 것을 목적으로 합니다. ### Silo Effect 그럼 간단한 상황 설명을 통해 DevOps가 왜 필요한지 알아보도록 하겠습니다. 서비스 초기에는 지원하는 기능이 많지 않으며 팀 또는 회사의 규모가 작습니다. 이때에는 개발팀과 운영팀의 구분이 없거나 작은 규모의 팀으로 구분되어 있습니다. 핵심은 규모가 작다는 것에 있습니다. 이때는 서로 소통할 수 있는 접점이 많고, 집중해야 하는 서비스가 적기 때문에 빠르게 서비스를 개선해 나갈 수 있습니다. 하지만 서비스의 규모가 커질수록 개발팀과 운영팀은 분리되고 서로 소통할 수 있는 채널의 물리적인 한계가 오게 됩니다. 예를 들어서 다른 팀과 함께하는 미팅에 팀원 전체가 미팅을 하는 것이 아니라 각 팀의 팀장 혹은 소수의 시니어만 참석하여 미팅을 진행하게 됩니다. 이런 소통 채널의 한계는 필연적으로 소통의 부재로 이어지게 됩니다. 그러다 보면 개발팀은 새로운 기능들을 계속해서 개발하고 운영팀 입장에서는 개발팀에서 개발한 기능이 배포 시 장애를 일으키는 등 여러 문제가 생기게 됩니다. 위와 같은 상황이 반복되면 조직 이기주의라고 불리는 사일로 현상이 생길 수 있습니다. ![silo](./img/silo.png) > 사일로(silo)는 곡식이나 사료를 저장하는 굴뚝 모양의 창고를 의미한다. 사일로는 독립적으로 존재하며 저장되는 물품이 서로 섞이지 않도록 철저히 관리할 수 있도록 도와준다. > 사일로 효과(Organizational Silos Effect)는 조직 부서 간에 서로 협력하지 않고 내부 이익만을 추구하는 현상을 의미한다. 조직 내에서 개별 부서끼리 서로 담을 쌓고 각자의 이익에만 몰두하는 부서 이기주의를 일컫는다. 사일로 현상은 서비스 품질의 저하로 이어지게 됩니다. 이러한 사일로 현상을 해결하기 위해 나온 것이 바로 DevOps입니다. ### CI/CD Continuous Integration(CI) 와 Continuous Delivery (CD)는 개발팀과 운영팀의 장벽을 해제하기 위한 구체적인 방법입니다. ![cicd](./img/cicd.png) 이 방법을 통해서 개발팀에서는 운영팀의 환경을 이해하고 개발팀에서 개발 중인 기능이 정상적으로 배포까지 이어질 수 있는지 확인합니다. 운영팀은 검증된 기능 또는 개선된 제품을 더 자주 배포해 고객의 제품 경험을 상승시킵니다. 앞에서 설명한 내용을 종합하자면 DevOps는 개발팀과 운영팀 간의 문제가 있었고 이를 해결하기 위한 방법론입니다. ## MLOps ### 1) ML+Ops MLOps는 Machine Learning 과 Operations의 합성어로 DevOps에서 Dev가 ML로 바뀌었습니다. 이제 앞에서 살펴본 DevOps를 통해 MLOps가 무엇인지 짐작해 볼 수 있습니다. “MLOps는 머신러닝팀과 운영팀의 문제를 해결하기 위한 방법입니다.” 이 말은 머신러닝팀과 운영팀 사이에 문제가 발생했다는 의미입니다. 그럼 왜 머신러닝팀과 운영팀에는 문제가 발생했을까요? 두 팀 간의 문제를 알아보기 위해서 추천시스템을 예시로 알아보겠습니다. #### Rule Based 처음 추천시스템을 만드는 경우 간단한 규칙을 기반으로 아이템을 추천합니다. 예를 들어서 1주일간 판매량이 가장 많은 순서대로 보여주는 식의 방식을 이용합니다. 이 방식으로 모델을 정한다면 특별한 이유가 없는 이상 모델의 수정이 필요 없습니다. #### Machine Learning 서비스의 규모가 조금 커지고 로그 데이터가 많이 쌓인다면 이를 이용해 아이템 기반 혹은 유저 기반의 머신러닝 모델을 생성합니다. 
이때 모델은 정해진 주기에 따라 모델을 재학습 후 재배포합니다. #### Deep Learning 개인화 추천에 대한 요구가 더 커지고 더 좋은 성능을 내는 모델을 필요해질 경우 딥러닝을 이용한 모델을 개발하기 시작합니다. 이때 만드는 모델은 머신러닝과 같이 정해진 주기에 따라 모델을 재학습 후 재배포합니다. ![graph](./img/graph.png) 위에서 설명한 것을 x축을 모델의 복잡도, y축을 모델의 성능으로 두고 그래프로 표현한다면 다음과 같이 복잡도가 올라갈 때 모델의 성능이 올라가는 상승 관계를 갖습니다. 머신러닝에서 딥러닝으로 넘어갈 머신러닝 팀이 새로 생기게 됩니다. 만약 관리해야할 모델이 적다면 서로 협업을 통해서 충분히 해결할 수 있지만 개발해야 할 모델이 많아진다면 DevOps의 경우와 같이 사일로 현상이 나타나게 됩니다. DevOps의 목표와 맞춰서 생각해보면 MLOps의 목표는 개발한 모델이 정상적으로 배포될 수 있는지 테스트하는 것입니다. 개발팀에서 개발한 기능이 정상적으로 배포될 수 있는지 확인하는 것이 DevOps의 목표였다면, MLOps의 목표는 머신러닝 팀에서 개발한 모델이 정상적으로 배포될 수 있는지 확인하는 것입니다. ### 2) ML -> Ops 하지만 최근 나오고 있는 MLOps 관련 제품과 설명을 보면 꼭 앞에서 설명한 목표만을 대상으로 하고 있지 않습니다. 어떤 경우에는 머신러닝 팀에서 만든 모델을 이용해 직접 운영을 할 수 있도록 도와주려고 합니다. 이러한 니즈는 최근 머신러닝 프로젝트가 진행되는 과정에서 알 수 있습니다. 추천시스템의 경우 운영에서 간단한 모델부터 시작해 운영할 수 있었습니다. 하지만 자연어, 이미지와 같은 곳에서는 규칙 기반의 모델보다는 딥러닝을 이용해 주어진 태스크를 해결할 수 있는지 검증(POC)를 선행하는 경우가 많습니다. 검증이 끝난 프로젝트는 이제 서비스를 위한 운영 환경을 개발하기 시작합니다. 하지만 머신러닝 팀 내의 자체 역량으로는 이 문제를 해결하기 쉽지 않습니다. 이를 해결하기 위해서 MLOps가 필요한 경우도 있습니다. ### 3) 결론 요약하자면 MLOps는 두 가지 목표가 있습니다. 앞에서 설명한 MLOps는 ML+Ops 로 두 팀의 생산성 향상을 위한 것이였습니다. 반면, 뒤에서 설명한 것은 ML->Ops 로 머신러닝 팀에서 직접 운영을 할 수 있도록 도와주는 것을 말합니다. ================================================ FILE: versioned_docs/version-1.0/introduction/levels.md ================================================ --- title : "2. Levels of MLOps" description: "Levels of MLOps" sidebar_position: 2 date: 2021-12-03 lastmod: 2022-03-05 contributors: ["Jongseob Jeon"] --- 이번 페이지에서는 구글에서 발표한 MLOps의 단계를 보며 MLOps의 핵심 기능은 무엇인지 알아 보겠습니다. ## Hidden Technical Debt in ML System 구글은 무려 2015년부터 MLOps의 필요성을 말했습니다. Hidden Technical Debt in Machine Learning Systems 은 그런 구글의 생각을 담은 논문입니다. ![paper](./img/paper.png) 이 논문의 핵심은 바로 머신러닝을 이용한 제품을 만드는데 있어서 머신러닝 코드는 전체 시스템을 구성하는데 있어서 아주 일부일 뿐이라는 것입니다. ![paper-2](./img/paper-2.png) 구글은 이 논문을 더 발전시켜서 MLOps라는 용어를 만들어 확장시켰습니다. 
더 자세한 내용은 [구글 클라우드 홈페이지](https://cloud.google.com/architecture/mlops-continuous-delivery-and-automation-pipelines-in-machine-learning)에서 확인할 수 있습니다.
변화된 데이터 분포에 맞춰서 모델을 재학습하면 다시 준수한 성능을 낼 수 있습니다. #### Auto Deploy 하지만 제조업과 같이 한 공장에서 여러 레시피를 처리하는 경우 무조건 재학습을 하는 것이 좋지 않을 수 도 있습니다. Blind Spot이 대표적인 예입니다. 예를 들어서 자동차 생산 라인에서 모델 A에 대해서 모델을 만들고 이를 이용해 예측을 진행하고 있었습니다. 만약 전혀 다른 모델 B가 들어오면 이전에 보지 못한 데이터 패턴이기 때문에 모델 B에 대해서 새로운 모델을 학습합니다. 이제 모델 B에 대해서 모델을 만들었기 때문에 모델은 예측을 진행할 것 입니다. 그런데 만약 데이터가 다시 모델 A로 바뀐다면 어떻게 할까요? 만약 Retraining 규칙만 있다면 다시 모델 A에 대해서 새로운 모델을 학습하게 됩니다. 그런데 머신러닝 모델이 충분한 성능을 보이기 위해서는 충분한 양의 데이터가 모여야 합니다. Blind Spot이란 이렇게 데이터를 모으기 위해서 모델이 동작하지 않는 구간을 말합니다. 이러한 Blind Spot을 해결하는 방법은 간단할 수 있습니다. 바로 모델 A에 대한 모델이 과거에 있었는지 확인하고 만약 있었다면 새로운 모델을 바로 학습하기 보다는 이 전 모델을 이용해 다시 예측을 하면 이런 Blind Spot을 해결할 수 있습니다. 이렇게 모델와 같은 메타 데이터를 이용해 모델을 자동으로 변환해주는 것을 Auto Deploy라고 합니다. 정리하자면 CT를 위해서는 Auto Retraining과 Auto Deploy 두 가지 기능이 필요합니다. 둘은 서로의 단점을 보완해 계속해서 모델의 성능을 유지할 수 있게 합니다. ## 2단계: CI/CD 파이프라인의 자동화 ![level-2](./img/level-2.png) 2단계의 제목은 CI와 CD의 자동화 입니다. DevOps에서의 CI/CD의 대상은 소스 코드입니다. 그렇다면 MLOps는 어떤 것이 CI/CD의 대상일까요? MLOps의 CI/CD 대상 또한 소스 코드인 것은 맞지만 조금 더 엄밀히 정의하자면 학습 파이프라인이라고 볼 수 있습니다. 그래서 모델을 학습하는데 있어서 영향이 있는 변화에 대해서 실제로 모델이 정상적으로 학습이 되는지 (CI), 학습된 모델이 정상적으로 동작하는지 (CD)를 확인해야 합니다. 그래서 학습을 하는 코드에 직접적인 수정이 있는 경우에는 CI/CD를 진행해야 합니다. 코드 외에도 사용하는 패키지의 버전, 파이썬의 버전 변경도 CI/CD의 대상입니다. 많은 경우 머신 러닝은 오픈 소스를 이용합니다. 하지만 오픈 소스는 그 특성상 버전이 바뀌었을 때 함수의 내부 로직이 변하는 경우도 있습니다. 물론 어느 정도 버전이 올라 갈 때 이와 관련된 알림을 주지만 한 번에 버전이 크게 바뀐다면 이러한 변화를 모를 수도 있습니다. 그래서 사용하는 패키지의 버전이 변하는 경우에도 CI/CD를 통해 정상적으로 모델이 학습, 동작하는지 확인을 해야 합니다. ================================================ FILE: versioned_docs/version-1.0/introduction/why_kubernetes.md ================================================ --- title : "4. Why Kubernetes?" description: "Reason for using k8s in MLOps" sidebar_position: 4 date: 2021-12-03 lastmod: 2021-12-10 contributors: ["Jaeyeon Kim"] --- ## MLOps & Kubernetes 그렇다면 MLOps를 이야기할 때, 쿠버네티스(Kubernetes)라는 단어가 항상 함께 들리는 이유가 무엇일까요? 
성공적인 MLOps 시스템을 구축하기 위해서는 [MLOps의 구성요소](../introduction/component.md) 에서 설명한 것처럼 다양한 구성 요소들이 필요하지만, 각각의 구성 요소들이 유기적으로 운영되기 위해서는 인프라 레벨에서 수많은 이슈를 해결해야 합니다. 간단하게는 수많은 머신러닝 모델의 학습 요청을 차례대로 실행하는 것, 다른 작업 공간에서도 같은 실행 환경을 보장해야 하는 것, 배포된 서비스에 장애가 생겼을 때 빠르게 대응해야 하는 것 등의 이슈 등을 생각해볼 수 있습니다. 여기서 컨테이너(Container)와 컨테이너 오케스트레이션 시스템(Container Orchestration System)의 필요성이 등장합니다. 쿠버네티스와 같은 컨테이너 오케스트레이션 시스템을 도입하면 실행 환경의 격리와 관리를 효율적으로 수행할 수 있습니다. 컨테이너 오케스트레이션 시스템을 도입한다면, 머신러닝 모델을 개발하고 배포하는 과정에서 다수의 개발자가 소수의 클러스터를 공유하면서 *'1번 클러스터 사용 중이신가요?', 'GPU 사용 중이던 제 프로세스 누가 죽였나요?', '누가 클러스터에 x 패키지 업데이트했나요?'* 와 같은 상황을 방지할 수 있습니다. ## Container 그렇다면 컨테이너란 무엇일까요? 마이크로소프트에서는 컨테이너를 [다음](https://azure.microsoft.com/ko-kr/overview/what-is-a-container/)과 같이 정의하고 있습니다. > 컨테이너란 : 애플리케이션의 표준화된 이식 가능한 패키징 그런데 왜 머신러닝에서 컨테이너가 필요할까요? 머신러닝 모델들은 운영체제나 Python 실행 환경, 패키지 버전 등에 따라 다르게 동작할 수 있습니다. 이를 방지하기 위해서 머신러닝에 사용된 소스 코드와 함께 종속적인 실행 환경 전체를 **하나로 묶어서(패키징해서)** 공유하고 실행하는 데 활용할 수 있는 기술이 컨테이너라이제이션(Containerization) 기술입니다. 이렇게 패키징된 형태를 컨테이너 이미지라고 부르며, 컨테이너 이미지를 공유함으로써 사용자들은 어떤 시스템에서든 같은 실행 결과를 보장할 수 있게 됩니다. 즉, 단순히 Jupyter Notebook 파일이나, 모델의 소스 코드와 requirements.txt 파일을 공유하는 것이 아닌, 모든 실행 환경이 담긴 컨테이너 이미지를 공유한다면 *"제 노트북에서는 잘 되는데요?"* 와 같은 상황을 피할 수 있습니다. 컨테이너를 처음 접하시는 분들이 흔히 하시는 오해 중 하나는 "**컨테이너 == 도커**"라고 받아들이는 것입니다. 도커는 컨테이너와 같은 의미를 지니는 개념이 아니라, 컨테이너를 띄우거나, 컨테이너 이미지를 만들고 공유하는 것과 같이 컨테이너를 더욱더 쉽고 유연하게 사용할 수 있는 기능을 제공해주는 도구입니다. 정리하자면 컨테이너는 가상화 기술이고, 도커는 가상화 기술의 구현체라고 말할 수 있습니다. 다만, 도커는 여러 컨테이너 가상화 도구 중에서 쉬운 사용성과 높은 효율성을 바탕으로 가장 빠르게 성장하여 대세가 되었기에 컨테이너하면 도커라는 이미지가 자동으로 떠오르게 되었습니다. 이렇게 컨테이너와 도커 생태계가 대세가 되기까지는 다양한 이유가 있지만, 기술적으로 자세한 이야기는 *모두의 MLOps*의 범위를 넘어서기 때문에 다루지는 않겠습니다. 컨테이너 혹은 도커를 처음 들어보시는 분들에게는 *모두의 MLOps*의 내용이 다소 어렵게 느껴질 수 있으므로, [생활코딩](https://opentutorials.org/course/4781), [subicura 님의 개인 블로그 글](https://subicura.com/2017/01/19/docker-guide-for-beginners-1.html) 등의 자료를 먼저 살펴보는 것을 권장합니다. ## Container Orchestration System 그렇다면 컨테이너 오케스트레이션 시스템은 무엇일까요? 
**오케스트레이션**이라는 단어에서 추측해 볼 수 있듯이, 수많은 컨테이너가 있을 때 컨테이너들이 서로 조화롭게 구동될 수 있도록 지휘하는 시스템에 비유할 수 있습니다. 컨테이너 기반의 시스템에서 서비스는 컨테이너의 형태로 사용자들에게 제공됩니다. 이때 관리해야 할 컨테이너의 수가 적다면 운영 담당자 한 명이서도 충분히 모든 상황에 대응할 수 있습니다. 하지만, 수백 개 이상의 컨테이너가 수 십 대 이상의 클러스터에서 구동되고 있고 장애를 일으키지 않고 항상 정상 동작해야 한다면, 모든 서비스의 정상 동작 여부를 담당자 한 명이 파악하고 이슈에 대응하는 것은 불가능에 가깝습니다. 예를 들면, 모든 서비스가 정상적으로 동작하고 있는지를 계속해서 모니터링(Monitoring)해야 합니다. 만약, 특정 서비스가 장애를 일으켰다면 여러 컨테이너의 로그를 확인해가며 문제를 파악해야 합니다. 또한, 특정 클러스터나 특정 컨테이너에 작업이 몰리지 않도록 스케줄링(Scheduling)하고 로드 밸런싱(Load Balancing)하며, 스케일링(Scaling)하는 등의 수많은 작업을 담당해야 합니다. 이렇게 수많은 컨테이너의 상태를 지속해서 관리하고 운영하는 과정을 조금이나마 쉽게, 자동으로 할 수 있는 기능을 제공해주는 소프트웨어가 바로 컨테이너 오케스트레이션 시스템입니다. 머신러닝에서는 어떻게 쓰일 수 있을까요? 예를 들어서 GPU가 있어야 하는 딥러닝 학습 코드가 패키징된 컨테이너는 사용 가능한 GPU가 있는 클러스터에서 수행하고, 많은 메모리를 필요로 하는 데이터 전처리 코드가 패키징된 컨테이너는 메모리의 여유가 많은 클러스터에서 수행하고, 학습 중에 클러스터에 문제가 생기면 자동으로 같은 컨테이너를 다른 클러스터로 이동시키고 다시 학습을 진행하는 등의 작업을 사람이 일일이 수행하지 않고, 자동으로 관리하는 시스템을 개발한 뒤 맡기는 것입니다. 집필을 하는 2022년을 기준으로 쿠버네티스는 컨테이너 오케스트레이션 시스템의 사실상의 표준(De facto standard)입니다. CNCF에서 2018년 발표한 [Survey](https://www.cncf.io/blog/2018/08/29/cncf-survey-use-of-cloud-native-technologies-in-production-has-grown-over-200-percent/) 에 따르면 다음 그림과 같이 이미 두각을 나타내고 있었으며, 2019년 발표한 [Survey](https://www.cncf.io/wp-content/uploads/2020/08/CNCF_Survey_Report.pdf)에 따르면 그중 78%가 상용 수준(Production Level)에서 사용하고 있다는 것을 알 수 있습니다. ![k8s-graph](./img/k8s-graph.png) 쿠버네티스 생태계가 이처럼 커지게 된 이유에는 여러 가지 이유가 있습니다. 하지만 도커와 마찬가지로 쿠버네티스 역시 머신러닝 기반의 서비스에서만 사용하는 기술이 아니기에, 자세히 다루기에는 상당히 많은 양의 기술적인 내용을 다루어야 하므로 이번 *모두의 MLOps*에서는 자세한 내용은 생략할 예정입니다. 다만, *모두의 MLOps*에서 앞으로 다룰 내용은 도커와 쿠버네티스에 대한 내용을 어느 정도 알고 계신 분들을 대상으로 작성하였습니다. 따라서 쿠버네티스에 대해 익숙하지 않으신 분들은 다음 [쿠버네티스 공식 문서](https://kubernetes.io/ko/docs/concepts/overview/what-is-kubernetes/), [subicura 님의 개인 블로그 글](https://subicura.com/k8s/) 등의 쉽고 자세한 자료들을 먼저 참고해주시는 것을 권장합니다. 
================================================ FILE: versioned_docs/version-1.0/kubeflow/_category_.json ================================================ { "label": "Kubeflow", "position": 6, "link": { "type": "generated-index" } } ================================================ FILE: versioned_docs/version-1.0/kubeflow/advanced-component.md ================================================ --- title : "8. Component - InputPath/OutputPath" description: "" sidebar_position: 8 contributors: ["Jongseob Jeon", "SeungTae Kim"] --- ## Complex Outputs 이번 페이지에서는 [Kubeflow Concepts](../kubeflow/kubeflow-concepts.md#component-contents) 예시로 나왔던 코드를 컴포넌트로 작성해 보겠습니다. ## Component Contents 아래 코드는 [Kubeflow Concepts](../kubeflow/kubeflow-concepts.md#component-contents)에서 사용했던 컴포넌트 콘텐츠입니다. ```python import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) ``` ## Component Wrapper ### Define a standalone Python function 컴포넌트 래퍼에 필요한 Config들과 함께 작성하면 다음과 같이 됩니다. ```python def train_from_csv( train_data_path: str, train_target_path: str, model_path: str, kernel: str, ): import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) ``` [Basic Usage Component](../kubeflow/basic-component)에서 설명할 때 입력과 출력에 대한 타입 힌트를 적어야 한다고 설명 했었습니다. 그런데 만약 json에서 사용할 수 있는 기본 타입이 아닌 dataframe, model와 같이 복잡한 객체들은 어떻게 할까요? 파이썬에서 함수간에 값을 전달할 때, 객체를 반환해도 그 값이 호스트의 메모리에 저장되어 있으므로 다음 함수에서도 같은 객체를 사용할 수 있습니다. 하지만 kubeflow에서 컴포넌트들은 각각 컨테이너 위에서 서로 독립적으로 실행됩니다. 즉, 같은 메모리를 공유하고 있지 않기 때문에, 보통의 파이썬 함수에서 사용하는 방식과 같이 객체를 전달할 수 없습니다. 컴포넌트 간에 넘겨 줄 수 있는 정보는 `json` 으로만 가능합니다. 
따라서 Model이나 DataFrame과 같이 json 형식으로 변환할 수 없는 타입의 객체는 다른 방법을 통해야 합니다. Kubeflow에서는 이를 해결하기 위해 json-serializable 하지 않은 타입의 객체는 메모리 대신 파일에 데이터를 저장한 뒤, 그 파일을 이용해 정보를 전달합니다. 저장된 파일의 경로는 str이기 때문에 컴포넌트 간에 전달할 수 있기 때문입니다. 그런데 kubeflow에서는 minio를 이용해 파일을 저장하는데 유저는 실행을 하기 전에는 각 파일의 경로를 알 수 없습니다. 이를 위해서 kubeflow에서는 입력과 출력의 경로와 관련된 매직을 제공하는데 바로 `InputPath`와 `OutputPath` 입니다. `InputPath`는 단어 그대로 입력 경로를 `OutputPath` 는 단어 그대로 출력 경로를 의미합니다. 예를 들어서 데이터를 생성하고 반환하는 컴포넌트에서는 `data_path: OutputPath()`를 argument로 만듭니다. 그리고 데이터를 받는 컴포넌트에서는 `data_path: InputPath()`을 argument로 생성합니다. 이렇게 만든 후 파이프라인에서 서로 연결을 하면 kubeflow에서 필요한 경로를 자동으로 생성후 입력해 주기 때문에 더 이상 유저는 경로를 신경쓰지 않고 컴포넌트간의 관계만 신경쓰면 됩니다. 이제 이 내용을 바탕으로 다시 컴포넌트 래퍼를 작성하면 다음과 같이 됩니다. ```python from kfp.components import InputPath, OutputPath def train_from_csv( train_data_path: InputPath("csv"), train_target_path: InputPath("csv"), model_path: OutputPath("dill"), kernel: str, ): import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) ``` InputPath나 OutputPath는 string을 입력할 수 있습니다. 이 string은 입력 또는 출력하려고 하는 파일의 포맷입니다. 그렇다고 꼭 이 포맷으로 파일 형태로 저장이 강제되는 것은 아닙니다. 다만 파이프라인을 컴파일할 때 최소한의 타입 체크를 위한 도우미 역할을 합니다. 만약 파일 포맷이 고정되지 않는다면 입력하지 않으면 됩니다 (타입 힌트 에서 `Any` 와 같은 역할을 합니다). ### Convert to Kubeflow Format 작성한 컴포넌트를 kubeflow에서 사용할 수 있는 포맷으로 변환합니다. 
```python from kfp.components import InputPath, OutputPath, create_component_from_func @create_component_from_func def train_from_csv( train_data_path: InputPath("csv"), train_target_path: InputPath("csv"), model_path: OutputPath("dill"), kernel: str, ): import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) ``` ## Rule to use InputPath/OutputPath InputPath나 OutputPath argument는 파이프라인으로 작성할 때 지켜야하는 규칙이 있습니다. ### Load Data Component 위에서 작성한 컴포넌트를 실행하기 위해서는 데이터가 필요하므로 데이터를 생성하는 컴포넌트를 작성합니다. ```python from functools import partial from kfp.components import InputPath, OutputPath, create_component_from_func @create_component_from_func def load_iris_data( data_path: OutputPath("csv"), target_path: OutputPath("csv"), ): import pandas as pd from sklearn.datasets import load_iris iris = load_iris() data = pd.DataFrame(iris["data"], columns=iris["feature_names"]) target = pd.DataFrame(iris["target"], columns=["target"]) data.to_csv(data_path, index=False) target.to_csv(target_path, index=False) ``` ### Write Pipeline 이제 파이프라인을 작성해 보도록 하겠습니다. ```python from kfp.dsl import pipeline @pipeline(name="complex_pipeline") def complex_pipeline(kernel: str): iris_data = load_iris_data() model = train_from_csv( train_data=iris_data.outputs["data"], train_target=iris_data.outputs["target"], kernel=kernel, ) ``` 한 가지 이상한 점을 확인하셨나요? 바로 입력과 출력에서 받는 argument중 경로와 관련된 것들에 `_path` 접미사가 모두 사라졌습니다. `iris_data.outputs["data_path"]` 가 아닌 `iris_data.outputs["data"]` 으로 접근하는 것을 확인할 수 있습니다. 이는 kubeflow에서 정한 법칙으로 `InputPath` 와 `OutputPath` 으로 생성된 경로들은 파이프라인에서 접근할 때는 `_path` 접미사를 생략하여 접근합니다. 다만 방금 작성한 파이프라인을 업로드할 경우 실행이 되지 않습니다. 이유는 다음 페이지에서 설명합니다. 
================================================ FILE: versioned_docs/version-1.0/kubeflow/advanced-environment.md ================================================ --- title : "9. Component - Environment" description: "" sidebar_position: 9 contributors: ["Jongseob Jeon"] --- ## Component Environment 앞서 [8. Component - InputPath/OutputPath](../kubeflow/advanced-component.md)에서 작성한 파이프라인을 실행하면 실패하게 됩니다. 왜 실패하는지 알아보고 정상적으로 실행될 수 있도록 수정합니다. ### Convert to Kubeflow Format [앞에서 작성한 컴포넌트](../kubeflow/advanced-component.md#convert-to-kubeflow-format)를 yaml파일로 변환하도록 하겠습니다. ```python from kfp.components import InputPath, OutputPath, create_component_from_func @create_component_from_func def train_from_csv( train_data_path: InputPath("csv"), train_target_path: InputPath("csv"), model_path: OutputPath("dill"), kernel: str, ): import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) if __name__ == "__main__": train_from_csv.component_spec.save("train_from_csv.yaml") ``` 위의 스크립트를 실행하면 다음과 같은 `train_from_csv.yaml` 파일을 얻을 수 있습니다. 
```bash name: Train from csv inputs: - {name: train_data, type: csv} - {name: train_target, type: csv} - {name: model, type: dill} - {name: kernel, type: String} implementation: container: image: python:3.7 command: - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def train_from_csv( train_data_path, train_target_path, model_path, kernel, ): import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) import argparse _parser = argparse.ArgumentParser(prog='Train from csv', description='') _parser.add_argument("--train-data", dest="train_data_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--train-target", dest="train_target_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--model", dest="model_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--kernel", dest="kernel", type=str, required=True, default=argparse.SUPPRESS) _parsed_args = vars(_parser.parse_args()) _outputs = train_from_csv(**_parsed_args) args: - --train-data - {inputPath: train_data} - --train-target - {inputPath: train_target} - --model - {inputPath: model} - --kernel - {inputValue: kernel} ``` 앞서 [Basic Usage Component](../kubeflow/basic-component.md#convert-to-kubeflow-format)에서 설명한 내용에 따르면 이 컴포넌트는 다음과 같이 실행됩니다. 1. `docker pull python:3.7` 2. run `command` 하지만 위에서 생성된 컴포넌트를 실행하면 오류가 발생하게 됩니다. 그 이유는 컴포넌트 래퍼가 실행되는 방식에 있습니다. Kubeflow는 쿠버네티스를 이용하기 때문에 컴포넌트 래퍼는 각각 독립된 컨테이너 위에서 컴포넌트 콘텐츠를 실행합니다. 자세히 보면 생성된 만든 `train_from_csv.yaml` 에서 정해진 이미지는 `image: python:3.7` 입니다. 이제 어떤 이유 때문에 실행이 안 되는지 눈치채신 분들도 있을 것입니다. `python:3.7` 이미지에는 우리가 사용하고자 하는 `dill`, `pandas`, `sklearn` 이 설치되어 있지 않습니다. 그러므로 실행할 때 해당 패키지가 존재하지 않는다는 에러와 함께 실행이 안 됩니다. 
그럼 어떻게 패키지를 추가할 수 있을까요? ## 패키지 추가 방법 Kubeflow를 변환하는 과정에서 두 가지 방법을 통해 패키지를 추가할 수 있습니다. 1. `base_image` 사용 2. `package_to_install` 사용 컴포넌트를 컴파일할 때 사용했던 함수 `create_component_from_func` 가 어떤 argument들을 받을 수 있는지 확인해 보겠습니다. ```bash def create_component_from_func( func: Callable, output_component_file: Optional[str] = None, base_image: Optional[str] = None, packages_to_install: List[str] = None, annotations: Optional[Mapping[str, str]] = None, ): ``` - `func`: 컴포넌트로 만들 컴포넌트 래퍼 함수 - `base_image`: 컴포넌트 래퍼가 실행할 이미지 - `packages_to_install`: 컴포넌트에서 사용해서 추가로 설치해야 하는 패키지 ### 1. base_image 컴포넌트가 실행되는 순서를 좀 더 자세히 들여다보면 다음과 같습니다. 1. `docker pull base_image` 2. `pip install packages_to_install` 3. run `command` 만약 컴포넌트가 사용하는 base_image에 패키지들이 전부 설치되어 있다면 추가적인 패키지 설치 없이 바로 사용할 수 있습니다. 예를 들어, 이번 페이지에서는 다음과 같은 Dockerfile을 작성하겠습니다. ```dockerfile FROM python:3.7 RUN pip install dill pandas scikit-learn ``` 위의 Dockerfile을 이용해 이미지를 빌드해 보겠습니다. 실습에서 사용해볼 도커 허브는 ghcr입니다. 각자 환경에 맞추어서 도커 허브를 선택 후 업로드하면 됩니다. ```bash docker build . -f Dockerfile -t ghcr.io/mlops-for-all/base-image docker push ghcr.io/mlops-for-all/base-image ``` 이제 base_image를 입력해 보겠습니다. ```python from functools import partial from kfp.components import InputPath, OutputPath, create_component_from_func @partial( create_component_from_func, base_image="ghcr.io/mlops-for-all/base-image:latest", ) def train_from_csv( train_data_path: InputPath("csv"), train_target_path: InputPath("csv"), model_path: OutputPath("dill"), kernel: str, ): import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) if __name__ == "__main__": train_from_csv.component_spec.save("train_from_csv.yaml") ``` 이제 생성된 컴포넌트를 컴파일하면 다음과 같이 나옵니다. 
```bash name: Train from csv inputs: - {name: train_data, type: csv} - {name: train_target, type: csv} - {name: kernel, type: String} outputs: - {name: model, type: dill} implementation: container: image: ghcr.io/mlops-for-all/base-image:latest command: - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def _make_parent_dirs_and_return_path(file_path: str): import os os.makedirs(os.path.dirname(file_path), exist_ok=True) return file_path def train_from_csv( train_data_path, train_target_path, model_path, kernel, ): import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) import argparse _parser = argparse.ArgumentParser(prog='Train from csv', description='') _parser.add_argument("--train-data", dest="train_data_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--train-target", dest="train_target_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--kernel", dest="kernel", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--model", dest="model_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) _parsed_args = vars(_parser.parse_args()) _outputs = train_from_csv(**_parsed_args) args: - --train-data - {inputPath: train_data} - --train-target - {inputPath: train_target} - --kernel - {inputValue: kernel} - --model - {outputPath: model} ``` base_image가 우리가 설정한 값으로 바뀐 것을 확인할 수 있습니다. ### 2. packages_to_install 하지만 패키지가 추가될 때마다 docker 이미지를 계속해서 새로 생성하는 작업은 많은 시간이 소요됩니다. 이 때, `packages_to_install` argument 를 사용하면 패키지를 컨테이너에 쉽게 추가할 수 있습니다. 
```python from functools import partial from kfp.components import InputPath, OutputPath, create_component_from_func @partial( create_component_from_func, packages_to_install=["dill==0.3.4", "pandas==1.3.4", "scikit-learn==1.0.1"], ) def train_from_csv( train_data_path: InputPath("csv"), train_target_path: InputPath("csv"), model_path: OutputPath("dill"), kernel: str, ): import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) if __name__ == "__main__": train_from_csv.component_spec.save("train_from_csv.yaml") ``` 스크립트를 실행하면 다음과 같은 `train_from_csv.yaml` 파일이 생성됩니다. ```bash name: Train from csv inputs: - {name: train_data, type: csv} - {name: train_target, type: csv} - {name: kernel, type: String} outputs: - {name: model, type: dill} implementation: container: image: python:3.7 command: - sh - -c - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'dill==0.3.4' 'pandas==1.3.4' 'scikit-learn==1.0.1' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'dill==0.3.4' 'pandas==1.3.4' 'scikit-learn==1.0.1' --user) && "$0" "$@" - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def _make_parent_dirs_and_return_path(file_path: str): import os os.makedirs(os.path.dirname(file_path), exist_ok=True) return file_path def train_from_csv( train_data_path, train_target_path, model_path, kernel, ): import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) import argparse _parser = 
argparse.ArgumentParser(prog='Train from csv', description='') _parser.add_argument("--train-data", dest="train_data_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--train-target", dest="train_target_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--kernel", dest="kernel", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--model", dest="model_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) _parsed_args = vars(_parser.parse_args()) _outputs = train_from_csv(**_parsed_args) args: - --train-data - {inputPath: train_data} - --train-target - {inputPath: train_target} - --kernel - {inputValue: kernel} - --model - {outputPath: model} ``` 위에 작성한 컴포넌트가 실행되는 순서를 좀 더 자세히 들여다보면 다음과 같습니다. 1. `docker pull python:3.7` 2. `pip install dill==0.3.4 pandas==1.3.4 scikit-learn==1.0.1` 3. run `command` 생성된 yaml 파일을 자세히 보면, 다음과 같은 줄이 자동으로 추가되어 필요한 패키지가 설치되기 때문에 오류 없이 정상적으로 실행됩니다. ```bash command: - sh - -c - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'dill==0.3.4' 'pandas==1.3.4' 'scikit-learn==1.0.1' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'dill==0.3.4' 'pandas==1.3.4' 'scikit-learn==1.0.1' --user) && "$0" "$@" ``` ================================================ FILE: versioned_docs/version-1.0/kubeflow/advanced-mlflow.md ================================================ --- title : "12. Component - MLFlow" description: "" sidebar_position: 12 date: 2021-12-13 lastmod: 2021-12-20 contributors: ["Jongseob Jeon", "SeungTae Kim"] --- ## MLFlow Component [Advanced Usage Component](../kubeflow/advanced-component.md) 에서 학습한 모델이 API Deployment까지 이어지기 위해서는 MLFlow에 모델을 저장해야 합니다. 이번 페이지에서는 MLFlow에 모델을 저장할 수 있는 컴포넌트를 작성하는 과정을 설명합니다. ## MLFlow in Local MLFlow에서 모델을 저장하고 서빙에서 사용하기 위해서는 다음의 항목들이 필요합니다. 
- model - signature - input_example - conda_env 파이썬 코드를 통해서 MLFlow에 모델을 저장하는 과정에 대해서 알아보겠습니다. ### 1. 모델 학습 아래 과정은 iris 데이터를 이용해 SVC 모델을 학습하는 과정입니다. ```python import pandas as pd from sklearn.datasets import load_iris from sklearn.svm import SVC iris = load_iris() data = pd.DataFrame(iris["data"], columns=iris["feature_names"]) target = pd.DataFrame(iris["target"], columns=["target"]) clf = SVC(kernel="rbf") clf.fit(data, target) ``` ### 2. MLFlow Infos mlflow에 필요한 정보들을 만드는 과정입니다. ```python from mlflow.models.signature import infer_signature from mlflow.utils.environment import _mlflow_conda_env input_example = data.sample(1) signature = infer_signature(data, clf.predict(data)) conda_env = _mlflow_conda_env(additional_pip_deps=["dill", "pandas", "scikit-learn"]) ``` 각 변수의 내용을 확인하면 다음과 같습니다. - `input_example` | sepal length (cm) | sepal width (cm) | petal length (cm) | petal width (cm) | | --- | --- | --- | --- | | 6.7 | 3.1 | 4.4 | 1.4 | - `signature` ```python inputs: ['sepal length (cm)': double, 'sepal width (cm)': double, 'petal length (cm)': double, 'petal width (cm)': double] outputs: [Tensor('int64', (-1,))] ``` - `conda_env` ```python {'name': 'mlflow-env', 'channels': ['conda-forge'], 'dependencies': ['python=3.8.10', 'pip', {'pip': ['mlflow', 'dill', 'pandas', 'scikit-learn']}]} ``` ### 3. Save MLFlow Infos 다음으로 학습한 정보들과 모델을 저장합니다. 학습한 모델이 sklearn 패키지를 이용하기 때문에 `mlflow.sklearn` 을 이용하면 쉽게 모델을 저장할 수 있습니다. ```python from mlflow.sklearn import save_model save_model( sk_model=clf, path="svc", serialization_format="cloudpickle", conda_env=conda_env, signature=signature, input_example=input_example, ) ``` 로컬에서 작업하면 다음과 같은 svc 폴더가 생기며 아래와 같은 파일들이 생성됩니다. ```bash ls svc ``` 위의 명령어를 실행하면 다음의 출력값을 확인할 수 있습니다. ```bash MLmodel conda.yaml input_example.json model.pkl requirements.txt ``` 각 파일을 확인하면 다음과 같습니다. 
- MLmodel ```bash flavors: python_function: env: conda.yaml loader_module: mlflow.sklearn model_path: model.pkl python_version: 3.8.10 sklearn: pickled_model: model.pkl serialization_format: cloudpickle sklearn_version: 1.0.1 saved_input_example_info: artifact_path: input_example.json pandas_orient: split type: dataframe signature: inputs: '[{"name": "sepal length (cm)", "type": "double"}, {"name": "sepal width (cm)", "type": "double"}, {"name": "petal length (cm)", "type": "double"}, {"name": "petal width (cm)", "type": "double"}]' outputs: '[{"type": "tensor", "tensor-spec": {"dtype": "int64", "shape": [-1]}}]' utc_time_created: '2021-12-06 06:52:30.612810' ``` - conda.yaml ```bash channels: - conda-forge dependencies: - python=3.8.10 - pip - pip: - mlflow - dill - pandas - scikit-learn name: mlflow-env ``` - input_example.json ```bash { "columns": [ "sepal length (cm)", "sepal width (cm)", "petal length (cm)", "petal width (cm)" ], "data": [ [6.7, 3.1, 4.4, 1.4] ] } ``` - requirements.txt ```bash mlflow dill pandas scikit-learn ``` - model.pkl ## MLFlow on Server 이제 저장된 모델을 mlflow 서버에 올리는 작업을 해보겠습니다. ```python import mlflow with mlflow.start_run(): mlflow.log_artifact("svc/") ``` 저장하고 `mlruns` 가 생성된 경로에서 `mlflow ui` 명령어를 이용해 mlflow 서버와 대시보드를 띄웁니다. mlflow 대시보드에 접속하여 생성된 run을 클릭하면 다음과 같이 보입니다. ![mlflow-0.png](./img/mlflow-0.png) (해당 화면은 mlflow 버전에 따라 다를 수 있습니다.) ## MLFlow Component 이제 Kubeflow에서 재사용할 수 있는 컴포넌트를 작성해 보겠습니다. 재사용할 수 있는 컴포넌트를 작성하는 방법은 크게 3가지가 있습니다. 1. 모델을 학습하는 컴포넌트에서 필요한 환경을 저장 후 MLFlow 컴포넌트는 업로드만 담당 ![mlflow-1.png](./img/mlflow-1.png) 2. 학습된 모델과 데이터를 MLFlow 컴포넌트에 전달 후 컴포넌트에서 저장과 업로드 담당 ![mlflow-2.png](./img/mlflow-2.png) 3. 모델을 학습하는 컴포넌트에서 저장과 업로드를 담당 ![mlflow-3.png](./img/mlflow-3.png) 저희는 이 중 1번의 접근 방법을 통해 모델을 관리하려고 합니다. 이유는 MLFlow 모델을 업로드하는 코드는 바뀌지 않기 때문에 매번 3번처럼 컴포넌트 작성마다 작성할 필요는 없기 때문입니다. 컴포넌트를 재활용하는 방법은 1번과 2번의 방법으로 가능합니다. 다만 2번의 경우 모델이 학습된 이미지와 패키지들을 전달해야 하므로 결국 컴포넌트에 대한 추가 정보를 전달해야 합니다. 1번의 방법으로 진행하기 위해서는 학습하는 컴포넌트 또한 변경되어야 합니다. 
모델을 저장하는데 필요한 환경들을 저장해주는 코드가 추가되어야 합니다. ```python from functools import partial from kfp.components import InputPath, OutputPath, create_component_from_func @partial( create_component_from_func, packages_to_install=["dill", "pandas", "scikit-learn", "mlflow"], ) def train_from_csv( train_data_path: InputPath("csv"), train_target_path: InputPath("csv"), model_path: OutputPath("dill"), input_example_path: OutputPath("dill"), signature_path: OutputPath("dill"), conda_env_path: OutputPath("dill"), kernel: str, ): import dill import pandas as pd from sklearn.svm import SVC from mlflow.models.signature import infer_signature from mlflow.utils.environment import _mlflow_conda_env train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) input_example = train_data.sample(1) with open(input_example_path, "wb") as file_writer: dill.dump(input_example, file_writer) signature = infer_signature(train_data, clf.predict(train_data)) with open(signature_path, "wb") as file_writer: dill.dump(signature, file_writer) conda_env = _mlflow_conda_env( additional_pip_deps=["dill", "pandas", "scikit-learn"] ) with open(conda_env_path, "wb") as file_writer: dill.dump(conda_env, file_writer) ``` 그리고 MLFlow에 업로드하는 컴포넌트를 작성합니다. 이 때 업로드되는 MLflow의 endpoint를 우리가 설치한 [mlflow service](../setup-components/install-components-mlflow.md) 로 이어지게 설정해주어야 합니다. 이 때 S3 Endpoint의 주소는 MLflow Server 설치 당시 설치한 minio의 [쿠버네티스 서비스 DNS 네임을 활용](https://kubernetes.io/ko/docs/concepts/services-networking/dns-pod-service/)합니다. 해당 service 는 kubeflow namespace에서 minio-service라는 이름으로 생성되었으므로, `http://minio-service.kubeflow.svc:9000` 로 설정합니다. 이와 비슷하게 tracking_uri의 주소는 mlflow server의 쿠버네티스 서비스 DNS 네임을 활용하여, `http://mlflow-server-service.mlflow-system.svc:5000` 로 설정합니다. 
```python from functools import partial from kfp.components import InputPath, create_component_from_func @partial( create_component_from_func, packages_to_install=["dill", "pandas", "scikit-learn", "mlflow", "boto3"], ) def upload_sklearn_model_to_mlflow( model_name: str, model_path: InputPath("dill"), input_example_path: InputPath("dill"), signature_path: InputPath("dill"), conda_env_path: InputPath("dill"), ): import os import dill from mlflow.sklearn import save_model from mlflow.tracking.client import MlflowClient os.environ["MLFLOW_S3_ENDPOINT_URL"] = "http://minio-service.kubeflow.svc:9000" os.environ["AWS_ACCESS_KEY_ID"] = "minio" os.environ["AWS_SECRET_ACCESS_KEY"] = "minio123" client = MlflowClient("http://mlflow-server-service.mlflow-system.svc:5000") with open(model_path, mode="rb") as file_reader: clf = dill.load(file_reader) with open(input_example_path, "rb") as file_reader: input_example = dill.load(file_reader) with open(signature_path, "rb") as file_reader: signature = dill.load(file_reader) with open(conda_env_path, "rb") as file_reader: conda_env = dill.load(file_reader) save_model( sk_model=clf, path=model_name, serialization_format="cloudpickle", conda_env=conda_env, signature=signature, input_example=input_example, ) run = client.create_run(experiment_id="0") client.log_artifact(run.info.run_id, model_name) ``` ## MLFlow Pipeline 이제 작성한 컴포넌트들을 연결해서 파이프라인으로 만들어 보겠습니다. ### Data Component 모델을 학습할 때 쓸 데이터는 sklearn의 iris 입니다. 데이터를 생성하는 컴포넌트를 작성합니다. 
```python from functools import partial from kfp.components import InputPath, OutputPath, create_component_from_func @partial( create_component_from_func, packages_to_install=["pandas", "scikit-learn"], ) def load_iris_data( data_path: OutputPath("csv"), target_path: OutputPath("csv"), ): import pandas as pd from sklearn.datasets import load_iris iris = load_iris() data = pd.DataFrame(iris["data"], columns=iris["feature_names"]) target = pd.DataFrame(iris["target"], columns=["target"]) data.to_csv(data_path, index=False) target.to_csv(target_path, index=False) ``` ### Pipeline 파이프라인 코드는 다음과 같이 작성할 수 있습니다. ```python from kfp.dsl import pipeline @pipeline(name="mlflow_pipeline") def mlflow_pipeline(kernel: str, model_name: str): iris_data = load_iris_data() model = train_from_csv( train_data=iris_data.outputs["data"], train_target=iris_data.outputs["target"], kernel=kernel, ) _ = upload_sklearn_model_to_mlflow( model_name=model_name, model=model.outputs["model"], input_example=model.outputs["input_example"], signature=model.outputs["signature"], conda_env=model.outputs["conda_env"], ) ``` ### Run 위에서 작성된 컴포넌트와 파이프라인을 하나의 파이썬 파일에 정리하면 다음과 같습니다. 
```python from functools import partial import kfp from kfp.components import InputPath, OutputPath, create_component_from_func from kfp.dsl import pipeline @partial( create_component_from_func, packages_to_install=["pandas", "scikit-learn"], ) def load_iris_data( data_path: OutputPath("csv"), target_path: OutputPath("csv"), ): import pandas as pd from sklearn.datasets import load_iris iris = load_iris() data = pd.DataFrame(iris["data"], columns=iris["feature_names"]) target = pd.DataFrame(iris["target"], columns=["target"]) data.to_csv(data_path, index=False) target.to_csv(target_path, index=False) @partial( create_component_from_func, packages_to_install=["dill", "pandas", "scikit-learn", "mlflow"], ) def train_from_csv( train_data_path: InputPath("csv"), train_target_path: InputPath("csv"), model_path: OutputPath("dill"), input_example_path: OutputPath("dill"), signature_path: OutputPath("dill"), conda_env_path: OutputPath("dill"), kernel: str, ): import dill import pandas as pd from sklearn.svm import SVC from mlflow.models.signature import infer_signature from mlflow.utils.environment import _mlflow_conda_env train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) input_example = train_data.sample(1) with open(input_example_path, "wb") as file_writer: dill.dump(input_example, file_writer) signature = infer_signature(train_data, clf.predict(train_data)) with open(signature_path, "wb") as file_writer: dill.dump(signature, file_writer) conda_env = _mlflow_conda_env( additional_pip_deps=["dill", "pandas", "scikit-learn"] ) with open(conda_env_path, "wb") as file_writer: dill.dump(conda_env, file_writer) @partial( create_component_from_func, packages_to_install=["dill", "pandas", "scikit-learn", "mlflow", "boto3"], ) def upload_sklearn_model_to_mlflow( model_name: str, model_path: InputPath("dill"), 
input_example_path: InputPath("dill"), signature_path: InputPath("dill"), conda_env_path: InputPath("dill"), ): import os import dill from mlflow.sklearn import save_model from mlflow.tracking.client import MlflowClient os.environ["MLFLOW_S3_ENDPOINT_URL"] = "http://minio-service.kubeflow.svc:9000" os.environ["AWS_ACCESS_KEY_ID"] = "minio" os.environ["AWS_SECRET_ACCESS_KEY"] = "minio123" client = MlflowClient("http://mlflow-server-service.mlflow-system.svc:5000") with open(model_path, mode="rb") as file_reader: clf = dill.load(file_reader) with open(input_example_path, "rb") as file_reader: input_example = dill.load(file_reader) with open(signature_path, "rb") as file_reader: signature = dill.load(file_reader) with open(conda_env_path, "rb") as file_reader: conda_env = dill.load(file_reader) save_model( sk_model=clf, path=model_name, serialization_format="cloudpickle", conda_env=conda_env, signature=signature, input_example=input_example, ) run = client.create_run(experiment_id="0") client.log_artifact(run.info.run_id, model_name) @pipeline(name="mlflow_pipeline") def mlflow_pipeline(kernel: str, model_name: str): iris_data = load_iris_data() model = train_from_csv( train_data=iris_data.outputs["data"], train_target=iris_data.outputs["target"], kernel=kernel, ) _ = upload_sklearn_model_to_mlflow( model_name=model_name, model=model.outputs["model"], input_example=model.outputs["input_example"], signature=model.outputs["signature"], conda_env=model.outputs["conda_env"], ) if __name__ == "__main__": kfp.compiler.Compiler().compile(mlflow_pipeline, "mlflow_pipeline.yaml") ```

mlflow_pipeline.yaml ```bash apiVersion: argoproj.io/v1alpha1 kind: Workflow metadata: generateName: mlflow-pipeline- annotations: {pipelines.kubeflow.org/kfp_sdk_version: 1.8.10, pipelines.kubeflow.org/pipeline_compilation_time: '2022-01-19T14:14:11.999807', pipelines.kubeflow.org/pipeline_spec: '{"inputs": [{"name": "kernel", "type": "String"}, {"name": "model_name", "type": "String"}], "name": "mlflow_pipeline"}'} labels: {pipelines.kubeflow.org/kfp_sdk_version: 1.8.10} spec: entrypoint: mlflow-pipeline templates: - name: load-iris-data container: args: [--data, /tmp/outputs/data/data, --target, /tmp/outputs/target/data] command: - sh - -c - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'pandas' 'scikit-learn' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'pandas' 'scikit-learn' --user) && "$0" "$@" - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def _make_parent_dirs_and_return_path(file_path: str): import os os.makedirs(os.path.dirname(file_path), exist_ok=True) return file_path def load_iris_data( data_path, target_path, ): import pandas as pd from sklearn.datasets import load_iris iris = load_iris() data = pd.DataFrame(iris["data"], columns=iris["feature_names"]) target = pd.DataFrame(iris["target"], columns=["target"]) data.to_csv(data_path, index=False) target.to_csv(target_path, index=False) import argparse _parser = argparse.ArgumentParser(prog='Load iris data', description='') _parser.add_argument("--data", dest="data_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) _parser.add_argument("--target", dest="target_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) _parsed_args = vars(_parser.parse_args()) _outputs = load_iris_data(**_parsed_args) image: python:3.7 outputs: artifacts: - {name: load-iris-data-data, path: 
/tmp/outputs/data/data} - {name: load-iris-data-target, path: /tmp/outputs/target/data} metadata: labels: pipelines.kubeflow.org/kfp_sdk_version: 1.8.10 pipelines.kubeflow.org/pipeline-sdk-type: kfp pipelines.kubeflow.org/enable_caching: "true" annotations: {pipelines.kubeflow.org/component_spec: '{"implementation": {"container": {"args": ["--data", {"outputPath": "data"}, "--target", {"outputPath": "target"}], "command": ["sh", "-c", "(PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location ''pandas'' ''scikit-learn'' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location ''pandas'' ''scikit-learn'' --user) && \"$0\" \"$@\"", "sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def _make_parent_dirs_and_return_path(file_path: str):\n import os\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\n return file_path\n\ndef load_iris_data(\n data_path,\n target_path,\n):\n import pandas as pd\n from sklearn.datasets import load_iris\n\n iris = load_iris()\n\n data = pd.DataFrame(iris[\"data\"], columns=iris[\"feature_names\"])\n target = pd.DataFrame(iris[\"target\"], columns=[\"target\"])\n\n data.to_csv(data_path, index=False)\n target.to_csv(target_path, index=False)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Load iris data'', description='''')\n_parser.add_argument(\"--data\", dest=\"data_path\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--target\", dest=\"target_path\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs = load_iris_data(**_parsed_args)\n"], "image": "python:3.7"}}, "name": "Load iris data", "outputs": [{"name": "data", "type": "csv"}, {"name": "target", "type": "csv"}]}', pipelines.kubeflow.org/component_ref: '{}'} - name: mlflow-pipeline 
inputs: parameters: - {name: kernel} - {name: model_name} dag: tasks: - {name: load-iris-data, template: load-iris-data} - name: train-from-csv template: train-from-csv dependencies: [load-iris-data] arguments: parameters: - {name: kernel, value: '{{inputs.parameters.kernel}}'} artifacts: - {name: load-iris-data-data, from: '{{tasks.load-iris-data.outputs.artifacts.load-iris-data-data}}'} - {name: load-iris-data-target, from: '{{tasks.load-iris-data.outputs.artifacts.load-iris-data-target}}'} - name: upload-sklearn-model-to-mlflow template: upload-sklearn-model-to-mlflow dependencies: [train-from-csv] arguments: parameters: - {name: model_name, value: '{{inputs.parameters.model_name}}'} artifacts: - {name: train-from-csv-conda_env, from: '{{tasks.train-from-csv.outputs.artifacts.train-from-csv-conda_env}}'} - {name: train-from-csv-input_example, from: '{{tasks.train-from-csv.outputs.artifacts.train-from-csv-input_example}}'} - {name: train-from-csv-model, from: '{{tasks.train-from-csv.outputs.artifacts.train-from-csv-model}}'} - {name: train-from-csv-signature, from: '{{tasks.train-from-csv.outputs.artifacts.train-from-csv-signature}}'} - name: train-from-csv container: args: [--train-data, /tmp/inputs/train_data/data, --train-target, /tmp/inputs/train_target/data, --kernel, '{{inputs.parameters.kernel}}', --model, /tmp/outputs/model/data, --input-example, /tmp/outputs/input_example/data, --signature, /tmp/outputs/signature/data, --conda-env, /tmp/outputs/conda_env/data] command: - sh - -c - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'dill' 'pandas' 'scikit-learn' 'mlflow' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'dill' 'pandas' 'scikit-learn' 'mlflow' --user) && "$0" "$@" - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def _make_parent_dirs_and_return_path(file_path: str): import os 
os.makedirs(os.path.dirname(file_path), exist_ok=True) return file_path def train_from_csv( train_data_path, train_target_path, model_path, input_example_path, signature_path, conda_env_path, kernel, ): import dill import pandas as pd from sklearn.svm import SVC from mlflow.models.signature import infer_signature from mlflow.utils.environment import _mlflow_conda_env train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) input_example = train_data.sample(1) with open(input_example_path, "wb") as file_writer: dill.dump(input_example, file_writer) signature = infer_signature(train_data, clf.predict(train_data)) with open(signature_path, "wb") as file_writer: dill.dump(signature, file_writer) conda_env = _mlflow_conda_env( additional_pip_deps=["dill", "pandas", "scikit-learn"] ) with open(conda_env_path, "wb") as file_writer: dill.dump(conda_env, file_writer) import argparse _parser = argparse.ArgumentParser(prog='Train from csv', description='') _parser.add_argument("--train-data", dest="train_data_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--train-target", dest="train_target_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--kernel", dest="kernel", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--model", dest="model_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) _parser.add_argument("--input-example", dest="input_example_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) _parser.add_argument("--signature", dest="signature_path", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) _parser.add_argument("--conda-env", dest="conda_env_path", type=_make_parent_dirs_and_return_path, required=True, 
default=argparse.SUPPRESS) _parsed_args = vars(_parser.parse_args()) _outputs = train_from_csv(**_parsed_args) image: python:3.7 inputs: parameters: - {name: kernel} artifacts: - {name: load-iris-data-data, path: /tmp/inputs/train_data/data} - {name: load-iris-data-target, path: /tmp/inputs/train_target/data} outputs: artifacts: - {name: train-from-csv-conda_env, path: /tmp/outputs/conda_env/data} - {name: train-from-csv-input_example, path: /tmp/outputs/input_example/data} - {name: train-from-csv-model, path: /tmp/outputs/model/data} - {name: train-from-csv-signature, path: /tmp/outputs/signature/data} metadata: labels: pipelines.kubeflow.org/kfp_sdk_version: 1.8.10 pipelines.kubeflow.org/pipeline-sdk-type: kfp pipelines.kubeflow.org/enable_caching: "true" annotations: {pipelines.kubeflow.org/component_spec: '{"implementation": {"container": {"args": ["--train-data", {"inputPath": "train_data"}, "--train-target", {"inputPath": "train_target"}, "--kernel", {"inputValue": "kernel"}, "--model", {"outputPath": "model"}, "--input-example", {"outputPath": "input_example"}, "--signature", {"outputPath": "signature"}, "--conda-env", {"outputPath": "conda_env"}], "command": ["sh", "-c", "(PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location ''dill'' ''pandas'' ''scikit-learn'' ''mlflow'' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location ''dill'' ''pandas'' ''scikit-learn'' ''mlflow'' --user) && \"$0\" \"$@\"", "sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def _make_parent_dirs_and_return_path(file_path: str):\n import os\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\n return file_path\n\ndef train_from_csv(\n train_data_path,\n train_target_path,\n model_path,\n input_example_path,\n signature_path,\n conda_env_path,\n kernel,\n):\n import dill\n import pandas as pd\n from sklearn.svm import SVC\n\n from 
mlflow.models.signature import infer_signature\n from mlflow.utils.environment import _mlflow_conda_env\n\n train_data = pd.read_csv(train_data_path)\n train_target = pd.read_csv(train_target_path)\n\n clf = SVC(kernel=kernel)\n clf.fit(train_data, train_target)\n\n with open(model_path, mode=\"wb\") as file_writer:\n dill.dump(clf, file_writer)\n\n input_example = train_data.sample(1)\n with open(input_example_path, \"wb\") as file_writer:\n dill.dump(input_example, file_writer)\n\n signature = infer_signature(train_data, clf.predict(train_data))\n with open(signature_path, \"wb\") as file_writer:\n dill.dump(signature, file_writer)\n\n conda_env = _mlflow_conda_env(\n additional_pip_deps=[\"dill\", \"pandas\", \"scikit-learn\"]\n )\n with open(conda_env_path, \"wb\") as file_writer:\n dill.dump(conda_env, file_writer)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Train from csv'', description='''')\n_parser.add_argument(\"--train-data\", dest=\"train_data_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--train-target\", dest=\"train_target_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--kernel\", dest=\"kernel\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--model\", dest=\"model_path\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--input-example\", dest=\"input_example_path\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--signature\", dest=\"signature_path\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--conda-env\", dest=\"conda_env_path\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs = train_from_csv(**_parsed_args)\n"], "image": "python:3.7"}}, "inputs": [{"name": 
"train_data", "type": "csv"}, {"name": "train_target", "type": "csv"}, {"name": "kernel", "type": "String"}], "name": "Train from csv", "outputs": [{"name": "model", "type": "dill"}, {"name": "input_example", "type": "dill"}, {"name": "signature", "type": "dill"}, {"name": "conda_env", "type": "dill"}]}', pipelines.kubeflow.org/component_ref: '{}', pipelines.kubeflow.org/arguments.parameters: '{"kernel": "{{inputs.parameters.kernel}}"}'} - name: upload-sklearn-model-to-mlflow container: args: [--model-name, '{{inputs.parameters.model_name}}', --model, /tmp/inputs/model/data, --input-example, /tmp/inputs/input_example/data, --signature, /tmp/inputs/signature/data, --conda-env, /tmp/inputs/conda_env/data] command: - sh - -c - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'dill' 'pandas' 'scikit-learn' 'mlflow' 'boto3' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'dill' 'pandas' 'scikit-learn' 'mlflow' 'boto3' --user) && "$0" "$@" - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def upload_sklearn_model_to_mlflow( model_name, model_path, input_example_path, signature_path, conda_env_path, ): import os import dill from mlflow.sklearn import save_model from mlflow.tracking.client import MlflowClient os.environ["MLFLOW_S3_ENDPOINT_URL"] = "http://minio-service.kubeflow.svc:9000" os.environ["AWS_ACCESS_KEY_ID"] = "minio" os.environ["AWS_SECRET_ACCESS_KEY"] = "minio123" client = MlflowClient("http://mlflow-server-service.mlflow-system.svc:5000") with open(model_path, mode="rb") as file_reader: clf = dill.load(file_reader) with open(input_example_path, "rb") as file_reader: input_example = dill.load(file_reader) with open(signature_path, "rb") as file_reader: signature = dill.load(file_reader) with open(conda_env_path, "rb") as file_reader: conda_env = dill.load(file_reader) save_model( sk_model=clf, path=model_name, 
serialization_format="cloudpickle", conda_env=conda_env, signature=signature, input_example=input_example, ) run = client.create_run(experiment_id="0") client.log_artifact(run.info.run_id, model_name) import argparse _parser = argparse.ArgumentParser(prog='Upload sklearn model to mlflow', description='') _parser.add_argument("--model-name", dest="model_name", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--model", dest="model_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--input-example", dest="input_example_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--signature", dest="signature_path", type=str, required=True, default=argparse.SUPPRESS) _parser.add_argument("--conda-env", dest="conda_env_path", type=str, required=True, default=argparse.SUPPRESS) _parsed_args = vars(_parser.parse_args()) _outputs = upload_sklearn_model_to_mlflow(**_parsed_args) image: python:3.7 inputs: parameters: - {name: model_name} artifacts: - {name: train-from-csv-conda_env, path: /tmp/inputs/conda_env/data} - {name: train-from-csv-input_example, path: /tmp/inputs/input_example/data} - {name: train-from-csv-model, path: /tmp/inputs/model/data} - {name: train-from-csv-signature, path: /tmp/inputs/signature/data} metadata: labels: pipelines.kubeflow.org/kfp_sdk_version: 1.8.10 pipelines.kubeflow.org/pipeline-sdk-type: kfp pipelines.kubeflow.org/enable_caching: "true" annotations: {pipelines.kubeflow.org/component_spec: '{"implementation": {"container": {"args": ["--model-name", {"inputValue": "model_name"}, "--model", {"inputPath": "model"}, "--input-example", {"inputPath": "input_example"}, "--signature", {"inputPath": "signature"}, "--conda-env", {"inputPath": "conda_env"}], "command": ["sh", "-c", "(PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location ''dill'' ''pandas'' ''scikit-learn'' ''mlflow'' ''boto3'' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip 
install --quiet --no-warn-script-location ''dill'' ''pandas'' ''scikit-learn'' ''mlflow'' ''boto3'' --user) && \"$0\" \"$@\"", "sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def upload_sklearn_model_to_mlflow(\n model_name,\n model_path,\n input_example_path,\n signature_path,\n conda_env_path,\n):\n import os\n import dill\n from mlflow.sklearn import save_model\n\n from mlflow.tracking.client import MlflowClient\n\n os.environ[\"MLFLOW_S3_ENDPOINT_URL\"] = \"http://minio-service.kubeflow.svc:9000\"\n os.environ[\"AWS_ACCESS_KEY_ID\"] = \"minio\"\n os.environ[\"AWS_SECRET_ACCESS_KEY\"] = \"minio123\"\n\n client = MlflowClient(\"http://mlflow-server-service.mlflow-system.svc:5000\")\n\n with open(model_path, mode=\"rb\") as file_reader:\n clf = dill.load(file_reader)\n\n with open(input_example_path, \"rb\") as file_reader:\n input_example = dill.load(file_reader)\n\n with open(signature_path, \"rb\") as file_reader:\n signature = dill.load(file_reader)\n\n with open(conda_env_path, \"rb\") as file_reader:\n conda_env = dill.load(file_reader)\n\n save_model(\n sk_model=clf,\n path=model_name,\n serialization_format=\"cloudpickle\",\n conda_env=conda_env,\n signature=signature,\n input_example=input_example,\n )\n run = client.create_run(experiment_id=\"0\")\n client.log_artifact(run.info.run_id, model_name)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Upload sklearn model to mlflow'', description='''')\n_parser.add_argument(\"--model-name\", dest=\"model_name\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--model\", dest=\"model_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--input-example\", dest=\"input_example_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--signature\", dest=\"signature_path\", type=str, required=True, 
default=argparse.SUPPRESS)\n_parser.add_argument(\"--conda-env\", dest=\"conda_env_path\", type=str, required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs = upload_sklearn_model_to_mlflow(**_parsed_args)\n"], "image": "python:3.7"}}, "inputs": [{"name": "model_name", "type": "String"}, {"name": "model", "type": "dill"}, {"name": "input_example", "type": "dill"}, {"name": "signature", "type": "dill"}, {"name": "conda_env", "type": "dill"}], "name": "Upload sklearn model to mlflow"}', pipelines.kubeflow.org/component_ref: '{}', pipelines.kubeflow.org/arguments.parameters: '{"model_name": "{{inputs.parameters.model_name}}"}'} arguments: parameters: - {name: kernel} - {name: model_name} serviceAccountName: pipeline-runner ```

실행후 생성된 mlflow_pipeline.yaml 파일을 파이프라인 업로드한 후, 실행하여 run 의 결과를 확인합니다. ![mlflow-svc-0](./img/mlflow-svc-0.png) mlflow service를 포트포워딩해서 MLflow ui에 접속합니다. ```bash kubectl port-forward svc/mlflow-server-service -n mlflow-system 5000:5000 ``` 웹 브라우저를 열어 localhost:5000으로 접속하면, 다음과 같이 run이 생성된 것을 확인할 수 있습니다. ![mlflow-svc-1](./img/mlflow-svc-1.png) run 을 클릭해서 확인하면 학습한 모델 파일이 있는 것을 확인할 수 있습니다. ![mlflow-svc-2](./img/mlflow-svc-2.png) ================================================ FILE: versioned_docs/version-1.0/kubeflow/advanced-pipeline.md ================================================ --- title : "10. Pipeline - Setting" description: "" sidebar_position: 10 contributors: ["Jongseob Jeon"] --- ## Pipeline Setting 이번 페이지에서는 파이프라인에서 설정할 수 있는 값들에 대해 알아보겠습니다. ## Display Name 생성된 파이프라인 내에서 컴포넌트는 두 개의 이름을 갖습니다. - task_name: 컴포넌트를 작성할 때 작성한 함수 이름 - display_name: kubeflow UI상에 보이는 이름 예를 들어서 다음과 같은 경우 두 컴포넌트 모두 Print and return number로 설정되어 있어서 어떤 컴포넌트가 1번인지 2번인지 확인하기 어렵습니다. ![run-7](./img/run-7.png) ### set_display_name 이를 위한 것이 바로 display_name 입니다. 설정하는 방법은 파이프라인에서 컴포넌트에 다음과 같이 `set_display_name` [attribute](https://kubeflow-pipelines.readthedocs.io/en/latest/source/kfp.dsl.html#kfp.dsl.ContainerOp.set_display_name)를 이용하면 됩니다. 
```python import kfp from kfp.components import create_component_from_func from kfp.dsl import pipeline @create_component_from_func def print_and_return_number(number: int) -> int: print(number) return number @create_component_from_func def sum_and_print_numbers(number_1: int, number_2: int): print(number_1 + number_2) @pipeline(name="example_pipeline") def example_pipeline(number_1: int, number_2: int): number_1_result = print_and_return_number(number_1).set_display_name("This is number 1") number_2_result = print_and_return_number(number_2).set_display_name("This is number 2") sum_result = sum_and_print_numbers( number_1=number_1_result.output, number_2=number_2_result.output ).set_display_name("This is sum of number 1 and number 2") if __name__ == "__main__": kfp.compiler.Compiler().compile(example_pipeline, "example_pipeline.yaml") ``` 이 스크립트를 실행해서 나온 `example_pipeline.yaml`을 확인하면 다음과 같습니다.

example_pipeline.yaml ```bash apiVersion: argoproj.io/v1alpha1 kind: Workflow metadata: generateName: example-pipeline- annotations: {pipelines.kubeflow.org/kfp_sdk_version: 1.8.9, pipelines.kubeflow.org/pipeline_compilation_time: '2021-12-09T18:11:43.193190', pipelines.kubeflow.org/pipeline_spec: '{"inputs": [{"name": "number_1", "type": "Integer"}, {"name": "number_2", "type": "Integer"}], "name": "example_pipeline"}'} labels: {pipelines.kubeflow.org/kfp_sdk_version: 1.8.9} spec: entrypoint: example-pipeline templates: - name: example-pipeline inputs: parameters: - {name: number_1} - {name: number_2} dag: tasks: - name: print-and-return-number template: print-and-return-number arguments: parameters: - {name: number_1, value: '{{inputs.parameters.number_1}}'} - name: print-and-return-number-2 template: print-and-return-number-2 arguments: parameters: - {name: number_2, value: '{{inputs.parameters.number_2}}'} - name: sum-and-print-numbers template: sum-and-print-numbers dependencies: [print-and-return-number, print-and-return-number-2] arguments: parameters: - {name: print-and-return-number-2-Output, value: '{{tasks.print-and-return-number-2.outputs.parameters.print-and-return-number-2-Output}}'} - {name: print-and-return-number-Output, value: '{{tasks.print-and-return-number.outputs.parameters.print-and-return-number-Output}}'} - name: print-and-return-number container: args: [--number, '{{inputs.parameters.number_1}}', '----output-paths', /tmp/outputs/Output/data] command: - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def print_and_return_number(number): print(number) return number def _serialize_int(int_value: int) -> str: if isinstance(int_value, str): return int_value if not isinstance(int_value, int): raise TypeError('Value "{}" has type "{}" instead of int.'.format( str(int_value), str(type(int_value)))) return str(int_value) import argparse _parser = argparse.ArgumentParser(prog='Print and 
return number', description='') _parser.add_argument("--number", dest="number", type=int, required=True, default=argparse.SUPPRESS) _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1) _parsed_args = vars(_parser.parse_args()) _output_files = _parsed_args.pop("_output_paths", []) _outputs = print_and_return_number(**_parsed_args) _outputs = [_outputs] _output_serializers = [ _serialize_int, ] import os for idx, output_file in enumerate(_output_files): try: os.makedirs(os.path.dirname(output_file)) except OSError: pass with open(output_file, 'w') as f: f.write(_output_serializers[idx](_outputs[idx])) image: python:3.7 inputs: parameters: - {name: number_1} outputs: parameters: - name: print-and-return-number-Output valueFrom: {path: /tmp/outputs/Output/data} artifacts: - {name: print-and-return-number-Output, path: /tmp/outputs/Output/data} metadata: annotations: {pipelines.kubeflow.org/task_display_name: This is number 1, pipelines.kubeflow.org/component_spec: '{"implementation": {"container": {"args": ["--number", {"inputValue": "number"}, "----output-paths", {"outputPath": "Output"}], "command": ["sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def print_and_return_number(number):\n print(number)\n return number\n\ndef _serialize_int(int_value: int) -> str:\n if isinstance(int_value, str):\n return int_value\n if not isinstance(int_value, int):\n raise TypeError(''Value \"{}\" has type \"{}\" instead of int.''.format(\n str(int_value), str(type(int_value))))\n return str(int_value)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Print and return number'', description='''')\n_parser.add_argument(\"--number\", dest=\"number\", type=int, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"----output-paths\", dest=\"_output_paths\", type=str, nargs=1)\n_parsed_args = vars(_parser.parse_args())\n_output_files = 
_parsed_args.pop(\"_output_paths\", [])\n\n_outputs = print_and_return_number(**_parsed_args)\n\n_outputs = [_outputs]\n\n_output_serializers = [\n _serialize_int,\n\n]\n\nimport os\nfor idx, output_file in enumerate(_output_files):\n try:\n os.makedirs(os.path.dirname(output_file))\n except OSError:\n pass\n with open(output_file, ''w'') as f:\n f.write(_output_serializers[idx](_outputs[idx]))\n"], "image": "python:3.7"}}, "inputs": [{"name": "number", "type": "Integer"}], "name": "Print and return number", "outputs": [{"name": "Output", "type": "Integer"}]}', pipelines.kubeflow.org/component_ref: '{}', pipelines.kubeflow.org/arguments.parameters: '{"number": "{{inputs.parameters.number_1}}"}'} labels: pipelines.kubeflow.org/kfp_sdk_version: 1.8.9 pipelines.kubeflow.org/pipeline-sdk-type: kfp pipelines.kubeflow.org/enable_caching: "true" - name: print-and-return-number-2 container: args: [--number, '{{inputs.parameters.number_2}}', '----output-paths', /tmp/outputs/Output/data] command: - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def print_and_return_number(number): print(number) return number def _serialize_int(int_value: int) -> str: if isinstance(int_value, str): return int_value if not isinstance(int_value, int): raise TypeError('Value "{}" has type "{}" instead of int.'.format( str(int_value), str(type(int_value)))) return str(int_value) import argparse _parser = argparse.ArgumentParser(prog='Print and return number', description='') _parser.add_argument("--number", dest="number", type=int, required=True, default=argparse.SUPPRESS) _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1) _parsed_args = vars(_parser.parse_args()) _output_files = _parsed_args.pop("_output_paths", []) _outputs = print_and_return_number(**_parsed_args) _outputs = [_outputs] _output_serializers = [ _serialize_int, ] import os for idx, output_file in enumerate(_output_files): try: 
os.makedirs(os.path.dirname(output_file)) except OSError: pass with open(output_file, 'w') as f: f.write(_output_serializers[idx](_outputs[idx])) image: python:3.7 inputs: parameters: - {name: number_2} outputs: parameters: - name: print-and-return-number-2-Output valueFrom: {path: /tmp/outputs/Output/data} artifacts: - {name: print-and-return-number-2-Output, path: /tmp/outputs/Output/data} metadata: annotations: {pipelines.kubeflow.org/task_display_name: This is number 2, pipelines.kubeflow.org/component_spec: '{"implementation": {"container": {"args": ["--number", {"inputValue": "number"}, "----output-paths", {"outputPath": "Output"}], "command": ["sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def print_and_return_number(number):\n print(number)\n return number\n\ndef _serialize_int(int_value: int) -> str:\n if isinstance(int_value, str):\n return int_value\n if not isinstance(int_value, int):\n raise TypeError(''Value \"{}\" has type \"{}\" instead of int.''.format(\n str(int_value), str(type(int_value))))\n return str(int_value)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Print and return number'', description='''')\n_parser.add_argument(\"--number\", dest=\"number\", type=int, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"----output-paths\", dest=\"_output_paths\", type=str, nargs=1)\n_parsed_args = vars(_parser.parse_args())\n_output_files = _parsed_args.pop(\"_output_paths\", [])\n\n_outputs = print_and_return_number(**_parsed_args)\n\n_outputs = [_outputs]\n\n_output_serializers = [\n _serialize_int,\n\n]\n\nimport os\nfor idx, output_file in enumerate(_output_files):\n try:\n os.makedirs(os.path.dirname(output_file))\n except OSError:\n pass\n with open(output_file, ''w'') as f:\n f.write(_output_serializers[idx](_outputs[idx]))\n"], "image": "python:3.7"}}, "inputs": [{"name": "number", "type": "Integer"}], "name": "Print and return number", 
"outputs": [{"name": "Output", "type": "Integer"}]}', pipelines.kubeflow.org/component_ref: '{}', pipelines.kubeflow.org/arguments.parameters: '{"number": "{{inputs.parameters.number_2}}"}'} labels: pipelines.kubeflow.org/kfp_sdk_version: 1.8.9 pipelines.kubeflow.org/pipeline-sdk-type: kfp pipelines.kubeflow.org/enable_caching: "true" - name: sum-and-print-numbers container: args: [--number-1, '{{inputs.parameters.print-and-return-number-Output}}', --number-2, '{{inputs.parameters.print-and-return-number-2-Output}}'] command: - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def sum_and_print_numbers(number_1, number_2): print(number_1 + number_2) import argparse _parser = argparse.ArgumentParser(prog='Sum and print numbers', description='') _parser.add_argument("--number-1", dest="number_1", type=int, required=True, default=argparse.SUPPRESS) _parser.add_argument("--number-2", dest="number_2", type=int, required=True, default=argparse.SUPPRESS) _parsed_args = vars(_parser.parse_args()) _outputs = sum_and_print_numbers(**_parsed_args) image: python:3.7 inputs: parameters: - {name: print-and-return-number-2-Output} - {name: print-and-return-number-Output} metadata: annotations: {pipelines.kubeflow.org/task_display_name: This is sum of number 1 and number 2, pipelines.kubeflow.org/component_spec: '{"implementation": {"container": {"args": ["--number-1", {"inputValue": "number_1"}, "--number-2", {"inputValue": "number_2"}], "command": ["sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def sum_and_print_numbers(number_1, number_2):\n print(number_1 + number_2)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Sum and print numbers'', description='''')\n_parser.add_argument(\"--number-1\", dest=\"number_1\", type=int, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--number-2\", dest=\"number_2\", type=int, 
required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs = sum_and_print_numbers(**_parsed_args)\n"], "image": "python:3.7"}}, "inputs": [{"name": "number_1", "type": "Integer"}, {"name": "number_2", "type": "Integer"}], "name": "Sum and print numbers"}', pipelines.kubeflow.org/component_ref: '{}', pipelines.kubeflow.org/arguments.parameters: '{"number_1": "{{inputs.parameters.print-and-return-number-Output}}", "number_2": "{{inputs.parameters.print-and-return-number-2-Output}}"}'} labels: pipelines.kubeflow.org/kfp_sdk_version: 1.8.9 pipelines.kubeflow.org/pipeline-sdk-type: kfp pipelines.kubeflow.org/enable_caching: "true" arguments: parameters: - {name: number_1} - {name: number_2} serviceAccountName: pipeline-runner ```

이 전의 파일과 비교하면 `pipelines.kubeflow.org/task_display_name` key가 새로 생성되었습니다. ### UI in Kubeflow 위에서 만든 파일을 이용해 이전에 생성한 [파이프라인](../kubeflow/basic-pipeline-upload.md#upload-pipeline-version)의 버전을 올리겠습니다. ![adv-pipeline-0.png](./img/adv-pipeline-0.png) 그러면 위와 같이 설정한 이름이 노출되는 것을 확인할 수 있습니다. ## Resources ### GPU 특별한 설정이 없다면 파이프라인은 컴포넌트를 쿠버네티스 파드(pod)로 실행할 때, 기본 리소스 스펙으로 실행하게 됩니다. 만약 GPU를 사용해 모델을 학습해야 할 때 쿠버네티스상에서 GPU를 할당받지 못해 제대로 학습이 이루어지지 않습니다. 이를 위해 `set_gpu_limit()` [attribute](https://kubeflow-pipelines.readthedocs.io/en/latest/source/kfp.dsl.html?highlight=set_gpu_limit#kfp.dsl.UserContainer.set_gpu_limit)을 이용해 설정할 수 있습니다. ```python import kfp from kfp.components import create_component_from_func from kfp.dsl import pipeline @create_component_from_func def print_and_return_number(number: int) -> int: print(number) return number @create_component_from_func def sum_and_print_numbers(number_1: int, number_2: int): print(number_1 + number_2) @pipeline(name="example_pipeline") def example_pipeline(number_1: int, number_2: int): number_1_result = print_and_return_number(number_1).set_display_name("This is number 1") number_2_result = print_and_return_number(number_2).set_display_name("This is number 2") sum_result = sum_and_print_numbers( number_1=number_1_result.output, number_2=number_2_result.output ).set_display_name("This is sum of number 1 and number 2").set_gpu_limit(1) if __name__ == "__main__": kfp.compiler.Compiler().compile(example_pipeline, "example_pipeline.yaml") ``` 위의 스크립트를 실행하면 생성된 파일에서 `sum-and-print-numbers`를 자세히 보면 resources에 `{nvidia.com/gpu: 1}` 도 추가된 것을 볼 수 있습니다. 이를 통해 GPU를 할당받을 수 있습니다. 
```bash - name: sum-and-print-numbers container: args: [--number-1, '{{inputs.parameters.print-and-return-number-Output}}', --number-2, '{{inputs.parameters.print-and-return-number-2-Output}}'] command: - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def sum_and_print_numbers(number_1, number_2): print(number_1 + number_2) import argparse _parser = argparse.ArgumentParser(prog='Sum and print numbers', description='') _parser.add_argument("--number-1", dest="number_1", type=int, required=True, default=argparse.SUPPRESS) _parser.add_argument("--number-2", dest="number_2", type=int, required=True, default=argparse.SUPPRESS) _parsed_args = vars(_parser.parse_args()) _outputs = sum_and_print_numbers(**_parsed_args) image: python:3.7 resources: limits: {nvidia.com/gpu: 1} ``` ### CPU cpu의 개수를 정하기 위해서 이용하는 함수는 `.set_cpu_limit()` [attribute](https://kubeflow-pipelines.readthedocs.io/en/latest/source/kfp.dsl.html?highlight=set_gpu_limit#kfp.dsl.Sidecar.set_cpu_limit)을 이용해 설정할 수 있습니다. gpu와는 다른 점은 int가 아닌 string으로 입력해야 한다는 점입니다. 
```python import kfp from kfp.components import create_component_from_func from kfp.dsl import pipeline @create_component_from_func def print_and_return_number(number: int) -> int: print(number) return number @create_component_from_func def sum_and_print_numbers(number_1: int, number_2: int): print(number_1 + number_2) @pipeline(name="example_pipeline") def example_pipeline(number_1: int, number_2: int): number_1_result = print_and_return_number(number_1).set_display_name("This is number 1") number_2_result = print_and_return_number(number_2).set_display_name("This is number 2") sum_result = sum_and_print_numbers( number_1=number_1_result.output, number_2=number_2_result.output ).set_display_name("This is sum of number 1 and number 2").set_gpu_limit(1).set_cpu_limit("16") if __name__ == "__main__": kfp.compiler.Compiler().compile(example_pipeline, "example_pipeline.yaml") ``` 바뀐 부분만 확인하면 다음과 같습니다. ```bash resources: limits: {nvidia.com/gpu: 1, cpu: '16'} ``` ### Memory 메모리는 `.set_memory_limit()` [attribute](https://kubeflow-pipelines.readthedocs.io/en/latest/source/kfp.dsl.html?highlight=set_gpu_limit#kfp.dsl.Sidecar.set_memory_limit)을 이용해 설정할 수 있습니다. 
```python import kfp from kfp.components import create_component_from_func from kfp.dsl import pipeline @create_component_from_func def print_and_return_number(number: int) -> int: print(number) return number @create_component_from_func def sum_and_print_numbers(number_1: int, number_2: int): print(number_1 + number_2) @pipeline(name="example_pipeline") def example_pipeline(number_1: int, number_2: int): number_1_result = print_and_return_number(number_1).set_display_name("This is number 1") number_2_result = print_and_return_number(number_2).set_display_name("This is number 2") sum_result = sum_and_print_numbers( number_1=number_1_result.output, number_2=number_2_result.output ).set_display_name("This is sum of number 1 and number 2").set_gpu_limit(1).set_memory_limit("1G") if __name__ == "__main__": kfp.compiler.Compiler().compile(example_pipeline, "example_pipeline.yaml") ``` 바뀐 부분만 확인하면 다음과 같습니다. ```bash resources: limits: {nvidia.com/gpu: 1, memory: 1G} ``` ================================================ FILE: versioned_docs/version-1.0/kubeflow/advanced-run.md ================================================ --- title : "11. Pipeline - Run Result" description: "" sidebar_position: 11 contributors: ["Jongseob Jeon", "SeungTae Kim"] --- ## Run Result Run 실행 결과를 눌러보면 3개의 탭이 존재합니다. 각각 Graph, Run output, Config 입니다. ![advanced-run-0.png](./img/advanced-run-0.png) ## Graph ![advanced-run-1.png](./img/advanced-run-1.png) 그래프에서는 실행된 컴포넌트를 누르면 컴포넌트의 실행 정보를 확인할 수 있습니다. ### Input/Output Input/Output 탭은 컴포넌트에서 사용한 Config들과 Input, Output Artifacts를 확인하고 다운로드 받을 수 있습니다. ### Logs Logs에서는 파이썬 코드 실행 중 나오는 모든 stdout을 확인할 수 있습니다. 다만 pod은 일정 시간이 지난 후 지워지기 때문에 일정 시간이 지나면 이 탭에서는 확인할 수 없습니다. 이때는 Output artifacts의 main-logs에서 확인할 수 있습니다. ### Visualizations Visualizations에서는 컴포넌트에서 생성된 플랏을 보여줍니다. 플랏을 생성하기 위해서는 `mlpipeline_ui_metadata: OutputPath("UI_Metadata")` argument로 보여주고 싶은 값을 저장하면 됩니다. 이 때 플랏의 형태는 html 포맷이어야 합니다. 변환하는 과정은 다음과 같습니다. 
```python @partial( create_component_from_func, packages_to_install=["matplotlib"], ) def plot_linear( mlpipeline_ui_metadata: OutputPath("UI_Metadata") ): import base64 import json from io import BytesIO import matplotlib.pyplot as plt plt.plot(x=[1, 2, 3], y=[1, 2,3]) tmpfile = BytesIO() plt.savefig(tmpfile, format="png") encoded = base64.b64encode(tmpfile.getvalue()).decode("utf-8") html = f"<img src='data:image/png;base64,{encoded}'>" metadata = { "outputs": [ { "type": "web-app", "storage": "inline", "source": html, }, ], } with open(mlpipeline_ui_metadata, "w") as html_writer: json.dump(metadata, html_writer) ``` 파이프라인으로 작성하면 다음과 같이 됩니다. ```python from functools import partial import kfp from kfp.components import create_component_from_func, OutputPath from kfp.dsl import pipeline @partial( create_component_from_func, packages_to_install=["matplotlib"], ) def plot_linear(mlpipeline_ui_metadata: OutputPath("UI_Metadata")): import base64 import json from io import BytesIO import matplotlib.pyplot as plt plt.plot([1, 2, 3], [1, 2, 3]) tmpfile = BytesIO() plt.savefig(tmpfile, format="png") encoded = base64.b64encode(tmpfile.getvalue()).decode("utf-8") html = f"<img src='data:image/png;base64,{encoded}'>" metadata = { "outputs": [ { "type": "web-app", "storage": "inline", "source": html, }, ], } with open(mlpipeline_ui_metadata, "w") as html_writer: json.dump(metadata, html_writer) @pipeline(name="plot_pipeline") def plot_pipeline(): plot_linear() if __name__ == "__main__": kfp.compiler.Compiler().compile(plot_pipeline, "plot_pipeline.yaml") ``` 이 스크립트를 실행해서 나온 `plot_pipeline.yaml`을 확인하면 다음과 같습니다.

plot_pipeline.yaml ```bash apiVersion: argoproj.io/v1alpha1 kind: Workflow metadata: generateName: plot-pipeline- annotations: {pipelines.kubeflow.org/kfp_sdk_version: 1.8.9, pipelines.kubeflow.org/pipeline_compilation_time: '2 022-01-17T13:31:32.963214', pipelines.kubeflow.org/pipeline_spec: '{"name": "plot_pipeline"}'} labels: {pipelines.kubeflow.org/kfp_sdk_version: 1.8.9} spec: entrypoint: plot-pipeline templates: - name: plot-linear container: args: [--mlpipeline-ui-metadata, /tmp/outputs/mlpipeline_ui_metadata/data] command: - sh - -c - (PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'matplotlib' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location 'matplotlib' --user) && "$0" "$@" - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def _make_parent_dirs_and_return_path(file_path: str): import os os.makedirs(os.path.dirname(file_path), exist_ok=True) return file_path def plot_linear(mlpipeline_ui_metadata): import base64 import json from io import BytesIO import matplotlib.pyplot as plt plt.plot([1, 2, 3], [1, 2, 3]) tmpfile = BytesIO() plt.savefig(tmpfile, format="png") encoded = base64.b64encode(tmpfile.getvalue()).decode("utf-8") html = f"" metadata = { "outputs": [ { "type": "web-app", "storage": "inline", "source": html, }, ], } with open(mlpipeline_ui_metadata, "w") as html_writer: json.dump(metadata, html_writer) import argparse _parser = argparse.ArgumentParser(prog='Plot linear', description='') _parser.add_argument("--mlpipeline-ui-metadata", dest="mlpipeline_ui_metadata", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS) _parsed_args = vars(_parser.parse_args()) _outputs = plot_linear(**_parsed_args) image: python:3.7 outputs: artifacts: - {name: mlpipeline-ui-metadata, path: /tmp/outputs/mlpipeline_ui_metadata/data} metadata: labels: pipelines.kubeflow.org/kfp_sdk_version: 1.8.9 
pipelines.kubeflow.org/pipeline-sdk-type: kfp pipelines.kubeflow.org/enable_caching: "true" annotations: {pipelines.kubeflow.org/component_spec: '{"implementation": {"container": {"args": ["--mlpipeline-ui-metadata", {"outputPath": "mlpipeline_ui_metadata"}], "command": ["sh", "-c", "(PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location ''matplotlib'' || PIP_DISABLE_PIP_VERSION_CHECK=1 python3 -m pip install --quiet --no-warn-script-location ''matplotlib'' --user) && \"$0\" \"$@\"", "sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def _make_parent_dirs_and_return_path(file_path: str):\n import os\n os.makedirs(os.path.dirname(file_path), exist_ok=True)\n return file_path\n\ndef plot_linear(mlpipeline_ui_metadata):\n import base64\n import json\n from io import BytesIO\n\n import matplotlib.pyplot as plt\n\n plt.plot([1, 2, 3], [1, 2, 3])\n\n tmpfile = BytesIO()\n plt.savefig(tmpfile, format=\"png\")\n encoded = base64.b64encode(tmpfile.getvalue()).decode(\"utf-8\")\n\n html = f\"\"\n metadata = {\n \"outputs\": [\n {\n \"type\": \"web-app\",\n \"storage\": \"inline\",\n \"source\": html,\n },\n ],\n }\n with open(mlpipeline_ui_metadata, \"w\") as html_writer:\n json.dump(metadata, html_writer)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Plot linear'', description='''')\n_parser.add_argument(\"--mlpipeline-ui-metadata\", dest=\"mlpipeline_ui_metadata\", type=_make_parent_dirs_and_return_path, required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs = plot_linear(**_parsed_args)\n"], "image": "python:3.7"}}, "name": "Plot linear", "outputs": [{"name": "mlpipeline_ui_metadata", "type": "UI_Metadata"}]}', pipelines.kubeflow.org/component_ref: '{}'} - name: plot-pipeline dag: tasks: - {name: plot-linear, template: plot-linear} arguments: parameters: [] serviceAccountName: pipeline-runner ```

실행 후 Visualization을 클릭합니다. ![advanced-run-5.png](./img/advanced-run-5.png) ## Run output ![advanced-run-2.png](./img/advanced-run-2.png) Run output은 kubeflow에서 지정한 형태로 생긴 Artifacts를 모아서 보여주는 곳이며 평가 지표(Metric)를 보여줍니다. 평가 지표(Metric)을 보여주기 위해서는 `mlpipeline_metrics_path: OutputPath("Metrics")` argument에 보여주고 싶은 이름과 값을 json 형태로 저장하면 됩니다. 예를 들어서 다음과 같이 작성할 수 있습니다. ```python @create_component_from_func def show_metric_of_sum( number: int, mlpipeline_metrics_path: OutputPath("Metrics"), ): import json metrics = { "metrics": [ { "name": "sum_value", "numberValue": number, }, ], } with open(mlpipeline_metrics_path, "w") as f: json.dump(metrics, f) ``` 평가 지표를 생성하는 컴포넌트를 [파이프라인](../kubeflow/basic-pipeline.md)에서 생성한 파이프라인에 추가 후 실행해 보겠습니다. 전체 파이프라인은 다음과 같습니다. ```python import kfp from kfp.components import create_component_from_func, OutputPath from kfp.dsl import pipeline @create_component_from_func def print_and_return_number(number: int) -> int: print(number) return number @create_component_from_func def sum_and_print_numbers(number_1: int, number_2: int) -> int: sum_number = number_1 + number_2 print(sum_number) return sum_number @create_component_from_func def show_metric_of_sum( number: int, mlpipeline_metrics_path: OutputPath("Metrics"), ): import json metrics = { "metrics": [ { "name": "sum_value", "numberValue": number, }, ], } with open(mlpipeline_metrics_path, "w") as f: json.dump(metrics, f) @pipeline(name="example_pipeline") def example_pipeline(number_1: int, number_2: int): number_1_result = print_and_return_number(number_1) number_2_result = print_and_return_number(number_2) sum_result = sum_and_print_numbers( number_1=number_1_result.output, number_2=number_2_result.output ) show_metric_of_sum(sum_result.output) if __name__ == "__main__": kfp.compiler.Compiler().compile(example_pipeline, "example_pipeline.yaml") ``` 실행 후 Run Output을 클릭하면 다음과 같이 나옵니다. 
![advanced-run-4.png](./img/advanced-run-4.png) ## Config ![advanced-run-3.png](./img/advanced-run-3.png) Config에서는 파이프라인 Config로 입력받은 모든 값을 확인할 수 있습니다. ================================================ FILE: versioned_docs/version-1.0/kubeflow/basic-component.md ================================================ --- title : "4. Component - Write" description: "" sidebar_position: 4 contributors: ["Jongseob Jeon"] --- ## Component 컴포넌트(Component)를 작성하기 위해서는 다음과 같은 내용을 작성해야 합니다. 1. 컴포넌트 콘텐츠(Component Contents) 작성 2. 컴포넌트 래퍼(Component Wrapper) 작성 이제 각 과정에 대해서 알아보도록 하겠습니다. ## Component Contents 컴포넌트 콘텐츠는 우리가 흔히 작성하는 파이썬 코드와 다르지 않습니다. 예를 들어서 숫자를 입력으로 받고 입력받은 숫자를 출력한 뒤 반환하는 컴포넌트를 작성해 보겠습니다. 파이썬 코드로 작성하면 다음과 같이 작성할 수 있습니다. ```python print(number) ``` 그런데 이 코드를 실행하면 에러가 나고 동작하지 않는데 그 이유는 출력해야 할 `number`가 정의되어 있지 않기 때문입니다. [Kubeflow Concepts](../kubeflow/kubeflow-concepts.md)에서 `number` 와 같이 컴포넌트 콘텐츠에서 필요한 값들은 **Config**로 정의한다고 했습니다. 컴포넌트 콘텐츠를 실행시키기 위해 필요한 Config들은 컴포넌트 래퍼에서 전달이 되어야 합니다. ## Component Wrapper ### Define a standalone Python function 이제 필요한 Config를 전달할 수 있도록 컴포넌트 래퍼를 만들어야 합니다. 별도의 Config 없이 컴포넌트 래퍼로 감쌀 경우 다음과 같이 됩니다. ```python def print_and_return_number(): print(number) return number ``` 이제 콘텐츠에서 필요한 Config를 래퍼의 argument로 추가합니다. 다만, argument 만을 적는 것이 아니라 argument의 타입 힌트도 작성해야 합니다. Kubeflow에서는 파이프라인을 Kubeflow 포맷으로 변환할 때, 컴포넌트 간의 연결에서 정해진 입력과 출력의 타입이 일치하는지 체크합니다. 만약 컴포넌트가 필요로 하는 입력과 다른 컴포넌트로부터 전달받은 출력의 포맷이 일치하지 않을 경우 파이프라인 생성을 할 수 없습니다. 이제 다음과 같이 argument와 그 타입, 그리고 반환하는 타입을 적어서 컴포넌트 래퍼를 완성합니다. ```python def print_and_return_number(number: int) -> int: print(number) return number ``` Kubeflow에서 반환 값으로 사용할 수 있는 타입은 json에서 표현할 수 있는 타입들만 사용할 수 있습니다. 대표적으로 사용되며 권장하는 타입들은 다음과 같습니다. - int - float - str 만약 단일 값이 아닌 여러 값을 반환하려면 `collections.namedtuple` 을 이용해야 합니다. 자세한 내용은 [Kubeflow 공식 문서](https://www.kubeflow.org/docs/components/pipelines/sdk/python-function-components/#passing-parameters-by-value)를 참고 하시길 바랍니다. 예를 들어서 입력받은 숫자를 2로 나눈 몫과 나머지를 반환하는 컴포넌트는 다음과 같이 작성해야 합니다. 
```python from typing import NamedTuple def divide_and_return_number( number: int, ) -> NamedTuple("DivideOutputs", [("quotient", int), ("remainder", int)]): from collections import namedtuple quotient, remainder = divmod(number, 2) print("quotient is", quotient) print("remainder is", remainder) divide_outputs = namedtuple( "DivideOutputs", [ "quotient", "remainder", ], ) return divide_outputs(quotient, remainder) ``` ### Convert to Kubeflow Format 이제 작성한 컴포넌트를 kubeflow에서 사용할 수 있는 포맷으로 변환해야 합니다. 변환은 `kfp.components.create_component_from_func` 를 통해서 할 수 있습니다. 이렇게 변환된 형태는 파이썬에서 함수로 import 하여서 파이프라인에서 사용할 수 있습니다. ```python from kfp.components import create_component_from_func @create_component_from_func def print_and_return_number(number: int) -> int: print(number) return number ``` ### Share component with yaml file 만약 파이썬 코드로 공유를 할 수 없는 경우 YAML 파일로 컴포넌트를 공유해서 사용할 수 있습니다. 이를 위해서는 우선 컴포넌트를 YAML 파일로 변환한 뒤 `kfp.components.load_component_from_file` 을 통해 파이프라인에서 사용할 수 있습니다. 우선 작성한 컴포넌트를 YAML 파일로 변환하는 과정에 대해서 설명합니다. ```python from kfp.components import create_component_from_func @create_component_from_func def print_and_return_number(number: int) -> int: print(number) return number if __name__ == "__main__": print_and_return_number.component_spec.save("print_and_return_number.yaml") ``` 작성한 파이썬 코드를 실행하면 `print_and_return_number.yaml` 파일이 생성됩니다. 파일을 확인하면 다음과 같습니다. 
```bash name: Print and return number inputs: - {name: number, type: Integer} outputs: - {name: Output, type: Integer} implementation: container: image: python:3.7 command: - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def print_and_return_number(number): print(number) return number def _serialize_int(int_value: int) -> str: if isinstance(int_value, str): return int_value if not isinstance(int_value, int): raise TypeError('Value "{}" has type "{}" instead of int.'.format(str(int_value), str(type(int_value)))) return str(int_value) import argparse _parser = argparse.ArgumentParser(prog='Print and return number', description='') _parser.add_argument("--number", dest="number", type=int, required=True, default=argparse.SUPPRESS) _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1) _parsed_args = vars(_parser.parse_args()) _output_files = _parsed_args.pop("_output_paths", []) _outputs = print_and_return_number(**_parsed_args) _outputs = [_outputs] _output_serializers = [ _serialize_int, ] import os for idx, output_file in enumerate(_output_files): try: os.makedirs(os.path.dirname(output_file)) except OSError: pass with open(output_file, 'w') as f: f.write(_output_serializers[idx](_outputs[idx])) args: - --number - {inputValue: number} - '----output-paths' - {outputPath: Output} ``` 이제 생성된 파일을 공유해서 파이프라인에서 다음과 같이 사용할 수 있습니다. ```python from kfp.components import load_component_from_file print_and_return_number = load_component_from_file("print_and_return_number.yaml") ``` ## How Kubeflow executes component Kubeflow에서 컴포넌트가 실행되는 순서는 다음과 같습니다. 1. `docker pull `: 정의된 컴포넌트의 실행 환경 정보가 담긴 이미지를 pull 2. run `command`: pull 한 이미지에서 컴포넌트 콘텐츠를 실행합니다. `print_and_return_number.yaml` 를 예시로 들자면 `@create_component_from_func` 의 default image 는 python:3.7 이므로 해당 이미지를 기준으로 컴포넌트 콘텐츠를 실행하게 됩니다. 1. `docker pull python:3.7` 2. 
`print(number)` ## References: - [Getting Started With Python function based components](https://www.kubeflow.org/docs/components/pipelines/sdk/python-function-components/#getting-started-with-python-function-based-components) ================================================ FILE: versioned_docs/version-1.0/kubeflow/basic-pipeline-upload.md ================================================ --- title : "6. Pipeline - Upload" description: "" sidebar_position: 6 contributors: ["Jongseob Jeon"] --- ## Upload Pipeline 이제 우리가 만든 파이프라인을 직접 kubeflow에서 업로드 해 보겠습니다. 파이프라인 업로드는 kubeflow 대시보드 UI를 통해 진행할 수 있습니다. [Install Kubeflow](../setup-components/install-components-kf.md#정상-설치-확인) 에서 사용한 방법을 이용해 포트포워딩합니다. ```bash kubectl port-forward svc/istio-ingressgateway -n istio-system 8080:80 ``` [http://localhost:8080](http://localhost:8080)에 접속해 대시보드를 열어줍니다. ### 1. Pipelines 탭 선택 ![pipeline-gui-0.png](./img/pipeline-gui-0.png) ### 2. Upload Pipeline 선택 ![pipeline-gui-1.png](./img/pipeline-gui-1.png) ### 3. Choose file 선택 ![pipeline-gui-2.png](./img/pipeline-gui-2.png) ### 4. 생성된 yaml파일 업로드 ![pipeline-gui-3.png](./img/pipeline-gui-3.png) ### 5. Create ![pipeline-gui-4.png](./img/pipeline-gui-4.png) ## Upload Pipeline Version 업로드된 파이프라인은 업로드를 통해서 버전을 관리할 수 있습니다. 다만 깃헙과 같은 코드 차원의 버전 관리가 아닌 같은 이름의 파이프라인을 모아서 보여주는 역할을 합니다. 위의 예시에서 파이프라인을 업로드한 경우 다음과 같이 example_pipeline이 생성된 것을 확인할 수 있습니다. ![pipeline-gui-5.png](./img/pipeline-gui-5.png) 클릭하면 다음과 같은 화면이 나옵니다. ![pipeline-gui-4.png](./img/pipeline-gui-4.png) Upload Version을 클릭하면 다음과 같이 파이프라인을 업로드할 수 있는 화면이 생성됩니다. ![pipeline-gui-6.png](./img/pipeline-gui-6.png) 파이프라인을 업로드 합니다. ![pipeline-gui-7.png](./img/pipeline-gui-7.png) 업로드된 경우 다음과 같이 파이프라인 버전을 확인할 수 있습니다. ![pipeline-gui-8.png](./img/pipeline-gui-8.png) ================================================ FILE: versioned_docs/version-1.0/kubeflow/basic-pipeline.md ================================================ --- title : "5. 
Pipeline - Write" description: "" sidebar_position: 5 contributors: ["Jongseob Jeon"] --- ## Pipeline 컴포넌트는 독립적으로 실행되지 않고 파이프라인의 구성요소로써 실행됩니다. 그러므로 컴포넌트를 실행해 보려면 파이프라인을 작성해야 합니다. 그리고 파이프라인을 작성하기 위해서는 컴포넌트의 집합과 컴포넌트의 실행 순서가 필요합니다. 이번 페이지에서는 숫자를 입력받고 출력하는 컴포넌트와 두 개의 컴포넌트로부터 숫자를 받아서 합을 출력하는 컴포넌트가 있는 파이프라인을 만들어 보도록 하겠습니다. ## Component Set 우선 파이프라인에서 사용할 컴포넌트들을 작성합니다. 1. `print_and_return_number` 입력받은 숫자를 출력하고 반환하는 컴포넌트입니다. 컴포넌트가 입력받은 값을 반환하기 때문에 int를 return의 타입 힌트로 입력합니다. ```python @create_component_from_func def print_and_return_number(number: int) -> int: print(number) return number ``` 2. `sum_and_print_numbers` 입력받은 두 개의 숫자의 합을 출력하는 컴포넌트입니다. 이 컴포넌트 역시 두 숫자의 합을 반환하기 때문에 int를 return의 타입 힌트로 입력합니다. ```python @create_component_from_func def sum_and_print_numbers(number_1: int, number_2: int) -> int: sum_num = number_1 + number_2 print(sum_num) return sum_num ``` ## Component Order ### Define Order 필요한 컴포넌트의 집합을 만들었으면, 다음으로는 이들의 순서를 정의해야 합니다. 이번 페이지에서 만들 파이프라인의 순서를 그림으로 표현하면 다음과 같이 됩니다. ![pipeline-0.png](./img/pipeline-0.png) ### Single Output 이제 이 순서를 코드로 옮겨보겠습니다. 우선 위의 그림에서 `print_and_return_number_1` 과 `print_and_return_number_2` 를 작성하면 다음과 같이 됩니다. ```python def example_pipeline(): number_1_result = print_and_return_number(number_1) number_2_result = print_and_return_number(number_2) ``` 컴포넌트를 실행하고 그 반환 값을 각각 `number_1_result` 와 `number_2_result` 에 저장합니다. 저장된 `number_1_result` 의 반환 값은 `number_1_result.output` 를 통해 사용할 수 있습니다. ### Multi Output 위의 예시에서 컴포넌트는 단일 값만을 반환하기 때문에 `output`을 이용해 바로 사용할 수 있습니다. 만약, 여러 개의 반환 값이 있다면 `outputs`에 저장이 되며 dict 타입이기에 key를 이용해 원하는 반환 값을 사용할 수 있습니다. 예를 들어서 앞에서 작성한 여러 개를 반환하는 [컴포넌트](../kubeflow/basic-component.md#define-a-standalone-python-function) 의 경우를 보겠습니다. `divide_and_return_number` 의 return 값은 `quotient` 와 `remainder` 가 있습니다. 이 두 값을 `print_and_return_number` 에 전달하는 예시를 보면 다음과 같습니다. 
```python def multi_pipeline(): divided_result = divde_and_return_number(number) num_1_result = print_and_return_number(divided_result.outputs["quotient"]) num_2_result = print_and_return_number(divided_result.outputs["remainder"]) ``` `divde_and_return_number`의 결과를 `divided_result`에 저장하고 각각 `divided_result.outputs["quotient"]`, `divided_result.outputs["remainder"]`로 값을 가져올 수 있습니다. ### Write to python code 이제 다시 본론으로 돌아와서 이 두 값의 결과를 `sum_and_print_numbers` 에 전달합니다. ```python def example_pipeline(): number_1_result = print_and_return_number(number_1) number_2_result = print_and_return_number(number_2) sum_result = sum_and_print_numbers( number_1=number_1_result.output, number_2=number_2_result.output ) ``` 다음으로 각 컴포넌트에 필요한 Config들을 모아서 파이프라인 Config로 정의 합니다. ```python def example_pipeline(number_1: int, number_2:int): number_1_result = print_and_return_number(number_1) number_2_result = print_and_return_number(number_2) sum_result = sum_and_print_numbers( number_1=number_1_result.output, number_2=number_2_result.output ) ``` ## Convert to Kubeflow Format 마지막으로 kubeflow에서 사용할 수 있는 형식으로 변환합니다. 변환은 `kfp.dsl.pipeline` 함수를 이용해 할 수 있습니다. ```python from kfp.dsl import pipeline @pipeline(name="example_pipeline") def example_pipeline(number_1: int, number_2: int): number_1_result = print_and_return_number(number_1) number_2_result = print_and_return_number(number_2) sum_result = sum_and_print_numbers( number_1=number_1_result.output, number_2=number_2_result.output ) ``` Kubeflow에서 파이프라인을 실행하기 위해서는 yaml 형식으로만 가능하기 때문에 생성한 파이프라인을 정해진 yaml 형식으로 컴파일(Compile) 해 주어야 합니다. 컴파일은 다음 명령어를 이용해 생성할 수 있습니다. ```python if __name__ == "__main__": import kfp kfp.compiler.Compiler().compile(example_pipeline, "example_pipeline.yaml") ``` ## Conclusion 앞서 설명한 내용을 한 파이썬 코드로 모으면 다음과 같이 됩니다. 
```python import kfp from kfp.components import create_component_from_func from kfp.dsl import pipeline @create_component_from_func def print_and_return_number(number: int) -> int: print(number) return number @create_component_from_func def sum_and_print_numbers(number_1: int, number_2: int): print(number_1 + number_2) @pipeline(name="example_pipeline") def example_pipeline(number_1: int, number_2: int): number_1_result = print_and_return_number(number_1) number_2_result = print_and_return_number(number_2) sum_result = sum_and_print_numbers( number_1=number_1_result.output, number_2=number_2_result.output ) if __name__ == "__main__": kfp.compiler.Compiler().compile(example_pipeline, "example_pipeline.yaml") ``` 컴파일된 결과를 보면 다음과 같습니다.
example_pipeline.yaml ```bash apiVersion: argoproj.io/v1alpha1 kind: Workflow metadata: generateName: example-pipeline- annotations: {pipelines.kubeflow.org/kfp_sdk_version: 1.6.3, pipelines.kubeflow.org/pipeline_compilation_time: '2021-12-05T13:38:51.566777', pipelines.kubeflow.org/pipeline_spec: '{"inputs": [{"name": "number_1", "type": "Integer"}, {"name": "number_2", "type": "Integer"}], "name": "example_pipeline"}'} labels: {pipelines.kubeflow.org/kfp_sdk_version: 1.6.3} spec: entrypoint: example-pipeline templates: - name: example-pipeline inputs: parameters: - {name: number_1} - {name: number_2} dag: tasks: - name: print-and-return-number template: print-and-return-number arguments: parameters: - {name: number_1, value: '{{inputs.parameters.number_1}}'} - name: print-and-return-number-2 template: print-and-return-number-2 arguments: parameters: - {name: number_2, value: '{{inputs.parameters.number_2}}'} - name: sum-and-print-numbers template: sum-and-print-numbers dependencies: [print-and-return-number, print-and-return-number-2] arguments: parameters: - {name: print-and-return-number-2-Output, value: '{{tasks.print-and-return-number-2.outputs.parameters.print-and-return-number-2-Output}}'} - {name: print-and-return-number-Output, value: '{{tasks.print-and-return-number.outputs.parameters.print-and-return-number-Output}}'} - name: print-and-return-number container: args: [--number, '{{inputs.parameters.number_1}}', '----output-paths', /tmp/outputs/Output/data] command: - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def print_and_return_number(number): print(number) return number def _serialize_int(int_value: int) -> str: if isinstance(int_value, str): return int_value if not isinstance(int_value, int): raise TypeError('Value "{}" has type "{}" instead of int.'.format(str(int_value), str(type(int_value)))) return str(int_value) import argparse _parser = argparse.ArgumentParser(prog='Print and 
return number', description='') _parser.add_argument("--number", dest="number", type=int, required=True, default=argparse.SUPPRESS) _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1) _parsed_args = vars(_parser.parse_args()) _output_files = _parsed_args.pop("_output_paths", []) _outputs = print_and_return_number(**_parsed_args) _outputs = [_outputs] _output_serializers = [ _serialize_int, ] import os for idx, output_file in enumerate(_output_files): try: os.makedirs(os.path.dirname(output_file)) except OSError: pass with open(output_file, 'w') as f: f.write(_output_serializers[idx](_outputs[idx])) image: python:3.7 inputs: parameters: - {name: number_1} outputs: parameters: - name: print-and-return-number-Output valueFrom: {path: /tmp/outputs/Output/data} artifacts: - {name: print-and-return-number-Output, path: /tmp/outputs/Output/data} metadata: labels: {pipelines.kubeflow.org/kfp_sdk_version: 1.6.3, pipelines.kubeflow.org/pipeline-sdk-type: kfp} annotations: {pipelines.kubeflow.org/component_spec: '{"implementation": {"container": {"args": ["--number", {"inputValue": "number"}, "----output-paths", {"outputPath": "Output"}], "command": ["sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def print_and_return_number(number):\n print(number)\n return number\n\ndef _serialize_int(int_value: int) -> str:\n if isinstance(int_value, str):\n return int_value\n if not isinstance(int_value, int):\n raise TypeError(''Value \"{}\" has type \"{}\" instead of int.''.format(str(int_value), str(type(int_value))))\n return str(int_value)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Print and return number'', description='''')\n_parser.add_argument(\"--number\", dest=\"number\", type=int, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"----output-paths\", dest=\"_output_paths\", type=str, nargs=1)\n_parsed_args = 
vars(_parser.parse_args())\n_output_files = _parsed_args.pop(\"_output_paths\", [])\n\n_outputs = print_and_return_number(**_parsed_args)\n\n_outputs = [_outputs]\n\n_output_serializers = [\n _serialize_int,\n\n]\n\nimport os\nfor idx, output_file in enumerate(_output_files):\n try:\n os.makedirs(os.path.dirname(output_file))\n except OSError:\n pass\n with open(output_file, ''w'') as f:\n f.write(_output_serializers[idx](_outputs[idx]))\n"], "image": "python:3.7"}}, "inputs": [{"name": "number", "type": "Integer"}], "name": "Print and return number", "outputs": [{"name": "Output", "type": "Integer"}]}', pipelines.kubeflow.org/component_ref: '{}', pipelines.kubeflow.org/arguments.parameters: '{"number": "{{inputs.parameters.number_1}}"}'} - name: print-and-return-number-2 container: args: [--number, '{{inputs.parameters.number_2}}', '----output-paths', /tmp/outputs/Output/data] command: - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def print_and_return_number(number): print(number) return number def _serialize_int(int_value: int) -> str: if isinstance(int_value, str): return int_value if not isinstance(int_value, int): raise TypeError('Value "{}" has type "{}" instead of int.'.format(str(int_value), str(type(int_value)))) return str(int_value) import argparse _parser = argparse.ArgumentParser(prog='Print and return number', description='') _parser.add_argument("--number", dest="number", type=int, required=True, default=argparse.SUPPRESS) _parser.add_argument("----output-paths", dest="_output_paths", type=str, nargs=1) _parsed_args = vars(_parser.parse_args()) _output_files = _parsed_args.pop("_output_paths", []) _outputs = print_and_return_number(**_parsed_args) _outputs = [_outputs] _output_serializers = [ _serialize_int, ] import os for idx, output_file in enumerate(_output_files): try: os.makedirs(os.path.dirname(output_file)) except OSError: pass with open(output_file, 'w') as f: 
f.write(_output_serializers[idx](_outputs[idx])) image: python:3.7 inputs: parameters: - {name: number_2} outputs: parameters: - name: print-and-return-number-2-Output valueFrom: {path: /tmp/outputs/Output/data} artifacts: - {name: print-and-return-number-2-Output, path: /tmp/outputs/Output/data} metadata: labels: {pipelines.kubeflow.org/kfp_sdk_version: 1.6.3, pipelines.kubeflow.org/pipeline-sdk-type: kfp} annotations: {pipelines.kubeflow.org/component_spec: '{"implementation": {"container": {"args": ["--number", {"inputValue": "number"}, "----output-paths", {"outputPath": "Output"}], "command": ["sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def print_and_return_number(number):\n print(number)\n return number\n\ndef _serialize_int(int_value: int) -> str:\n if isinstance(int_value, str):\n return int_value\n if not isinstance(int_value, int):\n raise TypeError(''Value \"{}\" has type \"{}\" instead of int.''.format(str(int_value), str(type(int_value))))\n return str(int_value)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Print and return number'', description='''')\n_parser.add_argument(\"--number\", dest=\"number\", type=int, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"----output-paths\", dest=\"_output_paths\", type=str, nargs=1)\n_parsed_args = vars(_parser.parse_args())\n_output_files = _parsed_args.pop(\"_output_paths\", [])\n\n_outputs = print_and_return_number(**_parsed_args)\n\n_outputs = [_outputs]\n\n_output_serializers = [\n _serialize_int,\n\n]\n\nimport os\nfor idx, output_file in enumerate(_output_files):\n try:\n os.makedirs(os.path.dirname(output_file))\n except OSError:\n pass\n with open(output_file, ''w'') as f:\n f.write(_output_serializers[idx](_outputs[idx]))\n"], "image": "python:3.7"}}, "inputs": [{"name": "number", "type": "Integer"}], "name": "Print and return number", "outputs": [{"name": "Output", "type": "Integer"}]}', 
pipelines.kubeflow.org/component_ref: '{}', pipelines.kubeflow.org/arguments.parameters: '{"number": "{{inputs.parameters.number_2}}"}'} - name: sum-and-print-numbers container: args: [--number-1, '{{inputs.parameters.print-and-return-number-Output}}', --number-2, '{{inputs.parameters.print-and-return-number-2-Output}}'] command: - sh - -ec - | program_path=$(mktemp) printf "%s" "$0" > "$program_path" python3 -u "$program_path" "$@" - | def sum_and_print_numbers(number_1, number_2): print(number_1 + number_2) import argparse _parser = argparse.ArgumentParser(prog='Sum and print numbers', description='') _parser.add_argument("--number-1", dest="number_1", type=int, required=True, default=argparse.SUPPRESS) _parser.add_argument("--number-2", dest="number_2", type=int, required=True, default=argparse.SUPPRESS) _parsed_args = vars(_parser.parse_args()) _outputs = sum_and_print_numbers(**_parsed_args) image: python:3.7 inputs: parameters: - {name: print-and-return-number-2-Output} - {name: print-and-return-number-Output} metadata: labels: {pipelines.kubeflow.org/kfp_sdk_version: 1.6.3, pipelines.kubeflow.org/pipeline-sdk-type: kfp} annotations: {pipelines.kubeflow.org/component_spec: '{"implementation": {"container": {"args": ["--number-1", {"inputValue": "number_1"}, "--number-2", {"inputValue": "number_2"}], "command": ["sh", "-ec", "program_path=$(mktemp)\nprintf \"%s\" \"$0\" > \"$program_path\"\npython3 -u \"$program_path\" \"$@\"\n", "def sum_and_print_numbers(number_1, number_2):\n print(number_1 + number_2)\n\nimport argparse\n_parser = argparse.ArgumentParser(prog=''Sum and print numbers'', description='''')\n_parser.add_argument(\"--number-1\", dest=\"number_1\", type=int, required=True, default=argparse.SUPPRESS)\n_parser.add_argument(\"--number-2\", dest=\"number_2\", type=int, required=True, default=argparse.SUPPRESS)\n_parsed_args = vars(_parser.parse_args())\n\n_outputs = sum_and_print_numbers(**_parsed_args)\n"], "image": "python:3.7"}}, "inputs": 
[{"name": "number_1", "type": "Integer"}, {"name": "number_2", "type": "Integer"}], "name": "Sum and print numbers"}', pipelines.kubeflow.org/component_ref: '{}', pipelines.kubeflow.org/arguments.parameters: '{"number_1": "{{inputs.parameters.print-and-return-number-Output}}", "number_2": "{{inputs.parameters.print-and-return-number-2-Output}}"}'} arguments: parameters: - {name: number_1} - {name: number_2} serviceAccountName: pipeline-runner ```
================================================ FILE: versioned_docs/version-1.0/kubeflow/basic-requirements.md ================================================ --- title : "3. Install Requirements" description: "" sidebar_position: 3 contributors: ["Jongseob Jeon"] --- 실습을 위해 권장하는 파이썬 버전은 python>=3.7입니다. 파이썬 환경에 익숙하지 않은 분들은 다음 [Appendix 1. 파이썬 가상환경](../appendix/pyenv)을 참고하여 **클라이언트 노드**에 설치해주신 뒤 패키지 설치를 진행해주시기를 바랍니다. 실습을 진행하기 위해 필요한 패키지들과 버전은 다음과 같습니다. - requirements.txt ```bash kfp==1.8.9 scikit-learn==1.0.1 mlflow==1.21.0 pandas==1.3.4 dill==0.3.4 ``` [앞에서 만든 파이썬 가상환경](../appendix/pyenv.md#python-가상환경-생성)을 활성화합니다. ```bash pyenv activate demo ``` 패키지 설치를 진행합니다. ```bash pip3 install -U pip pip3 install kfp==1.8.9 scikit-learn==1.0.1 mlflow==1.21.0 pandas==1.3.4 dill==0.3.4 ``` ================================================ FILE: versioned_docs/version-1.0/kubeflow/basic-run.md ================================================ --- title : "7. Pipeline - Run" description: "" sidebar_position: 7 contributors: ["Jongseob Jeon"] --- ## Run Pipeline 이제 업로드한 파이프라인을 실행시켜 보겠습니다. ## Before Run ### 1. Create Experiment Experiment란 Kubeflow 에서 실행되는 Run을 논리적으로 관리하는 단위입니다. Kubeflow에서 namespace를 처음 들어오면 생성되어 있는 Experiment가 없습니다. 따라서 파이프라인을 실행하기 전에 미리 Experiment를 생성해두어야 합니다. Experiment가 있다면 [Run Pipeline](../kubeflow/basic-run.md#run-pipeline-1)으로 넘어가도 무방합니다. Experiment는 Create Experiment 버튼을 통해 생성할 수 있습니다. ![run-0.png](./img/run-0.png) ### 2. Name 입력 Experiment로 사용할 이름을 입력합니다. ![run-1.png](./img/run-1.png) ## Run Pipeline ### 1. Create Run 선택 ![run-2.png](./img/run-2.png) ### 2. Experiment 선택 ![run-9.png](./img/run-9.png) ![run-10.png](./img/run-10.png) ### 3. Pipeline Config 입력 파이프라인을 생성할 때 입력한 Config 값들을 채워 넣습니다. 업로드한 파이프라인은 number_1과 number_2를 입력해야 합니다. ![run-3.png](./img/run-3.png) ### 4. Start 입력 후 Start 버튼을 누르면 파이프라인이 실행됩니다. ![run-4.png](./img/run-4.png) ## Run Result 실행된 파이프라인들은 Runs 탭에서 확인할 수 있습니다. Run을 클릭하면 실행된 파이프라인과 관련된 자세한 내용을 확인해 볼 수 있습니다. 
![run-5.png](./img/run-5.png) 클릭하면 다음과 같은 화면이 나옵니다. 아직 실행되지 않은 컴포넌트는 회색 표시로 나옵니다. ![run-6.png](./img/run-6.png) 컴포넌트가 실행이 완료되면 초록색 체크 표시가 나옵니다. ![run-7.png](./img/run-7.png) 가장 마지막 컴포넌트를 보면 입력한 Config인 3과 5의 합인 8이 출력된 것을 확인할 수 있습니다. ![run-8.png](./img/run-8.png) ================================================ FILE: versioned_docs/version-1.0/kubeflow/how-to-debug.md ================================================ --- title : "13. Component - Debugging" description: "" sidebar_position: 13 contributors: ["Jongseob Jeon"] --- ## Debugging Pipeline 이번 페이지에서는 Kubeflow 컴포넌트를 디버깅하는 방법에 대해서 알아봅니다. ## Failed Component 이번 페이지에서는 [Component - MLFlow](../kubeflow/advanced-mlflow.md#mlflow-pipeline) 에서 이용한 파이프라인을 조금 수정해서 사용합니다. 우선 컴포넌트가 실패하도록 파이프라인을 변경하도록 하겠습니다. ```python from functools import partial import kfp from kfp.components import InputPath, OutputPath, create_component_from_func from kfp.dsl import pipeline @partial( create_component_from_func, packages_to_install=["pandas", "scikit-learn"], ) def load_iris_data( data_path: OutputPath("csv"), target_path: OutputPath("csv"), ): import pandas as pd from sklearn.datasets import load_iris iris = load_iris() data = pd.DataFrame(iris["data"], columns=iris["feature_names"]) target = pd.DataFrame(iris["target"], columns=["target"]) data["sepal length (cm)"] = None data.to_csv(data_path, index=False) target.to_csv(target_path, index=False) @partial( create_component_from_func, packages_to_install=["pandas"], ) def drop_na_from_csv( data_path: InputPath("csv"), output_path: OutputPath("csv"), ): import pandas as pd data = pd.read_csv(data_path) data = data.dropna() data.to_csv(output_path, index=False) @partial( create_component_from_func, packages_to_install=["dill", "pandas", "scikit-learn", "mlflow"], ) def train_from_csv( train_data_path: InputPath("csv"), train_target_path: InputPath("csv"), model_path: OutputPath("dill"), input_example_path: OutputPath("dill"), signature_path: OutputPath("dill"), conda_env_path: 
OutputPath("dill"), kernel: str, ): import dill import pandas as pd from sklearn.svm import SVC from mlflow.models.signature import infer_signature from mlflow.utils.environment import _mlflow_conda_env train_data = pd.read_csv(train_data_path) train_target = pd.read_csv(train_target_path) clf = SVC(kernel=kernel) clf.fit(train_data, train_target) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) input_example = train_data.sample(1) with open(input_example_path, "wb") as file_writer: dill.dump(input_example, file_writer) signature = infer_signature(train_data, clf.predict(train_data)) with open(signature_path, "wb") as file_writer: dill.dump(signature, file_writer) conda_env = _mlflow_conda_env( additional_pip_deps=["dill", "pandas", "scikit-learn"] ) with open(conda_env_path, "wb") as file_writer: dill.dump(conda_env, file_writer) @pipeline(name="debugging_pipeline") def debugging_pipeline(kernel: str): iris_data = load_iris_data() drop_data = drop_na_from_csv(data=iris_data.outputs["data"]) model = train_from_csv( train_data=drop_data.outputs["output"], train_target=iris_data.outputs["target"], kernel=kernel, ) if __name__ == "__main__": kfp.compiler.Compiler().compile(debugging_pipeline, "debugging_pipeline.yaml") ``` 수정한 점은 다음과 같습니다. 1. 데이터를 불러오는 `load_iris_data` 컴포넌트에서 `sepal length (cm)` 피처에 `None` 값을 주입 2. `drop_na_from_csv` 컴포넌트에서 `dropna()` 함수를 이용해 na 값이 포함된 `row`를 제거 이제 파이프라인을 업로드하고 실행해 보겠습니다. 실행 후 Run을 눌러서 확인해보면 `Train from csv` 컴포넌트에서 실패했다고 나옵니다. ![debug-0.png](./img/debug-0.png) 실패한 컴포넌트를 클릭하고 로그를 확인해서 실패한 이유를 확인해 보겠습니다. ![debug-2.png](./img/debug-2.png) 로그를 확인하면 데이터의 개수가 0이어서 실행되지 않았다고 나옵니다. 분명 정상적으로 데이터를 전달했는데 왜 데이터의 개수가 0개일까요? 이제 입력받은 데이터에 어떤 문제가 있었는지 확인해 보겠습니다. 우선 컴포넌트를 클릭하고 Input/Output 탭에서 입력값으로 들어간 데이터들을 다운로드 받습니다. 다운로드는 빨간색 네모로 표시된 곳의 링크를 클릭하면 됩니다. ![debug-5.png](./img/debug-5.png) 두 개의 파일을 같은 경로에 다운로드합니다. 그리고 해당 경로로 이동해서 파일을 확인합니다. ```bash ls ``` 다음과 같이 두 개의 파일이 있습니다. 
```bash drop-na-from-csv-output.tgz load-iris-data-target.tgz ``` 압축을 풀어보겠습니다. ```bash tar -xzvf load-iris-data-target.tgz ; mv data target.csv tar -xzvf drop-na-from-csv-output.tgz ; mv data data.csv ``` 그리고 이를 주피터 노트북을 이용해 컴포넌트 코드를 실행합니다. ![debug-3.png](./img/debug-3.png) 디버깅을 해본 결과 dropna 할 때 column을 기준으로 drop을 해야 하는데 row를 기준으로 drop을 해서 데이터가 모두 사라졌습니다. 이제 문제의 원인을 알아냈으니 column을 기준으로 drop이 되게 컴포넌트를 수정합니다. ```python @partial( create_component_from_func, packages_to_install=["pandas"], ) def drop_na_from_csv( data_path: InputPath("csv"), output_path: OutputPath("csv"), ): import pandas as pd data = pd.read_csv(data_path) data = data.dropna(axis="columns") data.to_csv(output_path, index=False) ``` 수정 후 파이프라인을 다시 업로드하고 실행하면 다음과 같이 정상적으로 수행하는 것을 확인할 수 있습니다. ![debug-6.png](./img/debug-6.png) ================================================ FILE: versioned_docs/version-1.0/kubeflow/kubeflow-concepts.md ================================================ --- title : "2. Kubeflow Concepts" description: "" sidebar_position: 2 contributors: ["Jongseob Jeon"] --- ## Component 컴포넌트(Component)는 컴포넌트 콘텐츠(Component contents)와 컴포넌트 래퍼(Component wrapper)로 구성되어 있습니다. 하나의 컴포넌트는 컴포넌트 래퍼를 통해 kubeflow에 전달되며 전달된 컴포넌트는 정의된 컴포넌트 콘텐츠를 실행(execute)하고 아티팩트(artifacts)들을 생산합니다. ![concept-0.png](./img/concept-0.png) ### Component Contents 컴포넌트 콘텐츠를 구성하는 것은 총 3가지가 있습니다. ![concept-1.png](./img/concept-1.png) 1. Environment 2. Python code w\ Config 3. Generates Artifacts 예시와 함께 각 구성 요소가 어떤 것인지 알아보도록 하겠습니다. 다음과 같이 데이터를 불러와 SVC(Support Vector Classifier)를 학습한 후 SVC 모델을 저장하는 과정을 적은 파이썬 코드가 있습니다. ```python import dill import pandas as pd from sklearn.svm import SVC train_data = pd.read_csv(train_data_path) train_target= pd.read_csv(train_target_path) clf= SVC( kernel=kernel ) clf.fit(train_data) with open(model_path, mode="wb") as file_writer: dill.dump(clf, file_writer) ``` 위의 파이썬 코드는 다음과 같이 컴포넌트 콘텐츠로 나눌 수 있습니다. ![concept-2.png](./img/concept-2.png) Environment는 파이썬 코드에서 사용하는 패키지들을 import하는 부분입니다. 
다음으로 Python Code w\ Config 에서는 주어진 Config를 이용해 실제로 학습을 수행합니다. 마지막으로 아티팩트를 저장하는 과정이 있습니다. ### Component Wrapper 컴포넌트 래퍼는 컴포넌트 콘텐츠에 필요한 Config를 전달하고 실행시키는 작업을 합니다. ![concept-3.png](./img/concept-3.png) Kubeflow에서는 컴포넌트 래퍼를 위의 `train_svc_from_csv`와 같이 함수의 형태로 정의합니다. 컴포넌트 래퍼가 콘텐츠를 감싸면 다음과 같이 됩니다. ![concept-4.png](./img/concept-4.png) ### Artifacts 위의 설명에서 컴포넌트는 아티팩트(Artifacts)를 생성한다고 했습니다. 아티팩트란 evaluation result, log 등 어떤 형태로든 파일로 생성되는 것을 통틀어서 칭하는 용어입니다. 그중 우리가 관심을 두는 유의미한 것들은 다음과 같은 것들이 있습니다. ![concept-5.png](./img/concept-5.png) - Model - Data - Metric - etc #### Model 저희는 모델을 다음과 같이 정의 했습니다. > 모델이란 파이썬 코드와 학습된 Weights와 Network 구조 그리고 이를 실행시키기 위한 환경이 모두 포함된 형태 #### Data 데이터는 전 처리된 피처, 모델의 예측 값 등을 포함합니다. #### Metric Metric은 동적 지표와 정적 지표 두 가지로 나누었습니다. - 동적 지표란 train loss와 같이 학습이 진행되는 중 에폭(Epoch)마다 계속해서 변화하는 값을 의미합니다. - 정적 지표란 학습이 끝난 후 최종적으로 모델을 평가하는 정확도 등을 의미합니다. ## Pipeline 파이프라인은 컴포넌트의 집합과 컴포넌트를 실행시키는 순서도로 구성되어 있습니다. 이 때, 순서도는 방향 순환이 없는 그래프로 이루어져 있으며, 간단한 조건문을 포함할 수 있습니다. ![concept-6.png](./img/concept-6.png) ### Pipeline Config 앞서 컴포넌트를 실행시키기 위해서는 Config가 필요하다고 설명했습니다. 파이프라인을 구성하는 컴포넌트의 Config 들을 모아 둔 것이 파이프라인 Config입니다. ![concept-7.png](./img/concept-7.png) ## Run 파이프라인이 필요로 하는 파이프라인 Config가 주어져야지만 파이프라인을 실행할 수 있습니다. Kubeflow에서는 실행된 파이프라인을 Run 이라고 부릅니다. ![concept-8.png](./img/concept-8.png) 파이프라인이 실행되면 각 컴포넌트가 아티팩트들을 생성합니다. Kubeflow pipeline에서는 Run 하나당 고유한 ID 를 생성하고, Run에서 생성되는 모든 아티팩트들을 저장합니다. ![concept-9.png](./img/concept-9.png) 그러면 이제 직접 컴포넌트와 파이프라인을 작성하는 방법에 대해서 알아보도록 하겠습니다. ================================================ FILE: versioned_docs/version-1.0/kubeflow/kubeflow-intro.md ================================================ --- title : "1. Kubeflow Introduction" description: "" sidebar_position: 1 contributors: ["Jongseob Jeon"] --- Kubeflow를 사용하기 위해서는 컴포넌트(Component)와 파이프라인(Pipeline)을 작성해야 합니다. *모두의 MLOps*에서 설명하는 방식은 [Kubeflow Pipeline 공식 홈페이지](https://www.kubeflow.org/docs/components/pipelines/overview/quickstart/)에서 설명하는 방식과는 다소 차이가 있습니다. 
여기에서는 Kubeflow Pipeline을 워크플로(Workflow)가 아닌 앞서 설명한 [MLOps를 구성하는 요소](../kubeflow/kubeflow-concepts.md#component-contents) 중 하나의 컴포넌트로 사용하기 때문입니다. 그럼 이제 컴포넌트와 파이프라인은 무엇이며 어떻게 작성할 수 있는지 알아보도록 하겠습니다. ================================================ FILE: versioned_docs/version-1.0/kubeflow-dashboard-guide/_category_.json ================================================ { "label": "Kubeflow UI Guide", "position": 5, "link": { "type": "generated-index" } } ================================================ FILE: versioned_docs/version-1.0/kubeflow-dashboard-guide/experiments-and-others.md ================================================ --- title : "6. Kubeflow Pipeline 관련" description: "" sidebar_position: 6 contributors: ["Jaeyeon Kim"] --- Central Dashboard의 왼쪽 탭의 Experiments(KFP), Pipelines, Runs, Recurring Runs, Artifacts, Executions 페이지들에서는 Kubeflow Pipeline과 Pipeline의 실행 그리고 Pipeline Run의 결과를 관리합니다. ![left-tabs](./img/left-tabs.png) Kubeflow Pipeline이 *모두의 MLOps*에서 Kubeflow를 사용하는 주된 이유이며, Kubeflow Pipeline을 만드는 방법, 실행하는 방법, 결과를 확인하는 방법 등 자세한 내용은 [3.Kubeflow](../kubeflow/kubeflow-intro)에서 다룹니다. ================================================ FILE: versioned_docs/version-1.0/kubeflow-dashboard-guide/experiments.md ================================================ --- title : "5. Experiments(AutoML)" description: "" sidebar_position: 5 contributors: ["Jaeyeon Kim"] --- 다음으로는 Central Dashboard의 왼쪽 탭의 Experiments(AutoML)을 클릭해보겠습니다. ![left-tabs](./img/left-tabs.png) ![automl](./img/automl.png) Experiments(AutoML) 페이지는 Kubeflow에서 Hyperparameter Tuning과 Neural Architecture Search를 통한 AutoML을 담당하는 [Katib](https://www.kubeflow.org/docs/components/katib/overview/)를 관리할 수 있는 페이지입니다. Katib와 Experiments(AutoML)에 대한 사용법은 *모두의 MLOps* v1.0에서는 다루지 않으며, v2.0에 추가될 예정입니다. ================================================ FILE: versioned_docs/version-1.0/kubeflow-dashboard-guide/intro.md ================================================ --- title : "1. 
Central Dashboard" description: "" sidebar_position: 1 contributors: ["Jaeyeon Kim", "SeungTae Kim"] --- [Kubeflow 설치](../setup-components/install-components-kf.md)를 완료하면, 다음 커맨드를 통해 대시보드에 접속할 수 있습니다. ```bash kubectl port-forward --address 0.0.0.0 svc/istio-ingressgateway -n istio-system 8080:80 ``` ![after-login](./img/after-login.png) Central Dashboard는 Kubeflow에서 제공하는 모든 기능을 통합하여 제공하는 UI입니다. Central Dashboard에서 제공하는 기능은 크게 왼쪽의 탭을 기준으로 구분할 수 있습니다. ![left-tabs](./img/left-tabs.png) - Home - Notebooks - Tensorboards - Volumes - Models - Experiments(AutoML) - Experiments(KFP) - Pipelines - Runs - Recurring Runs - Artifacts - Executions 그럼 이제 기능별 간단한 사용법을 알아보겠습니다. ================================================ FILE: versioned_docs/version-1.0/kubeflow-dashboard-guide/notebooks.md ================================================ --- title : "2. Notebooks" description: "" sidebar_position: 2 contributors: ["Jaeyeon Kim"] --- ## 노트북 서버(Notebook Server) 생성하기 다음 Central Dashboard의 왼쪽 탭의 Notebooks를 클릭해보겠습니다. ![left-tabs](./img/left-tabs.png) 다음과 같은 화면을 볼 수 있습니다. Notebooks 탭은 JupyterHub와 비슷하게 유저별로 jupyter notebook 및 code server 환경(이하 노트북 서버)을 독립적으로 생성하고 접속할 수 있는 페이지입니다. ![notebook-home](./img/notebook-home.png) 오른쪽 위의 `+ NEW NOTEBOOK` 버튼을 클릭합니다. ![new-notebook](./img/new-notebook.png) 아래와 같은 화면이 나타나면, 이제 생성할 노트북 서버의 스펙(Spec)을 명시하여 생성합니다. ![create](./img/create.png)
각 스펙에 대한 자세한 내용은 아래와 같습니다. - **name**: - 노트북 서버를 구분할 수 있는 이름으로 생성합니다. - **namespace** : - 따로 변경할 수 없습니다. (현재 로그인한 user 계정의 namespace가 자동으로 지정되어 있습니다.) - **Image**: - sklearn, pytorch, tensorflow 등의 파이썬 패키지가 미리 설치된 jupyter lab 이미지 중 사용할 이미지를 선택합니다. - 노트북 서버 내에서 GPU를 사용하기 위해 tensorflow-cuda, pytorch-cuda 등의 이미지를 사용하는 경우, **하단의 GPUs** 부분을 확인하시기 바랍니다. - 추가적인 패키지나 소스코드 등을 포함한 커스텀(Custom) 노트북 서버를 사용하고 싶은 경우에는 커스텀 이미지(Custom Image)를 만들고 배포 후 사용할 수도 있습니다. - **CPU / RAM** - 필요한 자원 사용량을 입력합니다. - cpu : core 단위 - 가상 core 개수 단위를 의미하며, int 형식이 아닌 `1.5`, `2.7` 등의 float 형식도 입력할 수 있습니다. - memory : Gi 단위 - **GPUs** - 주피터 노트북에 할당할 GPU 개수를 입력합니다. - `None` - GPU 자원이 필요하지 않은 상황 - 1, 2, 4 - GPU 1, 2, 4 개 할당 - GPU Vendor - 앞의 [(Optional) Setup GPU](../setup-kubernetes/setup-nvidia-gpu.md) 를 따라 nvidia gpu plugin을 설치하였다면 NVIDIA를 선택합니다. - **Workspace Volume** - 노트북 서버 내에서 필요한 만큼의 디스크 용량을 입력합니다. - Type 과 Name 은 변경하지 않고, **디스크 용량을 늘리고 싶거나** **AccessMode 를 변경하고 싶을** 때에만 변경해서 사용하시면 됩니다. - **"Don't use Persistent Storage for User's home"** 체크박스는 노트북 서버의 작업 내용을 저장하지 않아도 상관없을 때에만 클릭합니다. **일반적으로는 누르지 않는 것을 권장합니다.** - 기존에 미리 생성해두었던 PVC를 사용하고 싶을 때에는, Type을 "Existing" 으로 입력하여 해당 PVC의 이름을 입력하여 사용하시면 됩니다. - **Data Volumes** - 추가적인 스토리지 자원이 필요하다면 **"+ ADD VOLUME"** 버튼을 클릭하여 생성할 수 있습니다. - ~~Configurations, Affinity/Tolerations, Miscellaneous Settings~~ - 일반적으로는 필요하지 않으므로 *모두의 MLOps*에서는 자세한 설명을 생략합니다.
모두 정상적으로 입력하였다면 하단의 **LAUNCH** 버튼이 활성화되며, 버튼을 클릭하면 노트북 서버 생성이 시작됩니다. ![creating](./img/creating.png) 생성 후 아래와 같이 **Status** 가 초록색 체크 표시 아이콘으로 변하며, **CONNECT 버튼**이 활성화됩니다. ![created](./img/created.png) --- ## 노트북 서버 접속하기 **CONNECT 버튼**을 클릭하면 브라우저에 새 창이 열리며, 다음과 같은 화면이 보입니다. ![notebook-access](./img/notebook-access.png) **Launcher**의 Notebook, Console, Terminal 아이콘을 클릭하여 사용할 수 있습니다. 생성된 Notebook 화면 ![notebook-console](./img/notebook-console.png) 생성된 Terminal 화면 ![terminal-console](./img/terminal-console.png) --- ## 노트북 서버 중단하기 노트북 서버를 오랜 시간 사용하지 않는 경우, 쿠버네티스 클러스터의 효율적인 리소스 사용을 위해서 노트북 서버를 중단(Stop)할 수 있습니다. **단, 이 경우 노트북 서버 생성 시 Workspace Volume 또는 Data Volume으로 지정해놓은 경로 외에 저장된 데이터는 모두 초기화되는 것에 주의하시기 바랍니다.** 노트북 서버 생성 당시 경로를 변경하지 않았다면, 디폴트(Default) Workspace Volume의 경로는 노트북 서버 내의 `/home/jovyan` 이므로, `/home/jovyan` 의 하위 경로 이외의 경로에 저장된 데이터는 모두 사라집니다. 다음과 같이 `STOP` 버튼을 클릭하면 노트북 서버가 중단됩니다. ![notebook-stop](./img/notebook-stop.png) 중단이 완료되면 다음과 같이 `CONNECT` 버튼이 비활성화되며, `PLAY` 버튼을 클릭하면 다시 정상적으로 사용할 수 있습니다. ![notebook-restart](./img/notebook-restart.png) ================================================ FILE: versioned_docs/version-1.0/kubeflow-dashboard-guide/tensorboards.md ================================================ --- title : "3. Tensorboards" description: "" sidebar_position: 3 contributors: ["Jaeyeon Kim"] --- 다음으로는 Central Dashboard의 왼쪽 탭의 Tensorboards를 클릭해보겠습니다. ![left-tabs](./img/left-tabs.png) 다음과 같은 화면을 볼 수 있습니다. ![tensorboard](./img/tensorboard.png) Tensorboards 탭은 Tensorflow, PyTorch 등의 프레임워크에서 제공하는 Tensorboard 유틸이 생성한 ML 학습 관련 데이터를 시각화하는 텐서보드 서버(Tensorboard Server)를 쿠버네티스 클러스터에 생성하는 기능을 제공합니다. 이렇게 생성한 텐서보드 서버는, 일반적인 원격 텐서보드 서버의 사용법과 같이 사용할 수도 있으며, [Kubeflow 파이프라인 런에서 바로 텐서보드 서버에 데이터를 저장하는 용도](https://www.kubeflow.org/docs/components/pipelines/sdk/output-viewer/#tensorboard)로 활용할 수 있습니다. 
Kubeflow 파이프라인 런의 결과를 시각화하는 방법에는 [다양한 방식](https://www.kubeflow.org/docs/components/pipelines/sdk/output-viewer/)이 있으며, *모두의 MLOps*에서는 더 일반적으로 활용할 수 있도록 Kubeflow 컴포넌트의 Visualization 기능과 MLflow의 시각화 기능을 활용할 예정이므로, Tensorboards 페이지에 대한 자세한 설명은 생략하겠습니다. ================================================ FILE: versioned_docs/version-1.0/kubeflow-dashboard-guide/volumes.md ================================================ --- title : "4. Volumes" description: "" sidebar_position: 4 contributors: ["Jaeyeon Kim"] --- ## Volumes 다음으로는 Central Dashboard의 왼쪽 탭의 Volumes를 클릭해보겠습니다. ![left-tabs](./img/left-tabs.png) 다음과 같은 화면을 볼 수 있습니다. ![volumes](./img/volumes.png) Volumes 탭은 [Kubernetes의 볼륨(Volume)](https://kubernetes.io/ko/docs/concepts/storage/volumes/), 정확히는 [퍼시스턴트 볼륨 클레임(Persistent Volume Claim, 이하 pvc)](https://kubernetes.io/ko/docs/concepts/storage/persistent-volumes/) 중 현재 user의 namespace에 속한 pvc를 관리하는 기능을 제공합니다. 위 스크린샷을 보면, [1. Notebooks](../kubeflow-dashboard-guide/notebooks) 페이지에서 생성한 Volume의 정보를 확인할 수 있습니다. 해당 Volume의 Storage Class는 쿠버네티스 클러스터 설치 당시 설치한 Default Storage Class인 local-path로 설정되어있음을 확인할 수 있습니다. 이외에도 user namespace에 새로운 볼륨을 생성하거나, 조회하거나, 삭제하고 싶은 경우에 Volumes 페이지를 활용할 수 있습니다. --- ## 볼륨 생성하기 오른쪽 위의 `+ NEW VOLUME` 버튼을 클릭하면 다음과 같은 화면을 볼 수 있습니다. ![new-volume](./img/new-volume.png) name, size, storage class, access mode를 지정하여 생성할 수 있습니다. 원하는 리소스 스펙을 지정하여 생성하면 다음과 같이 볼륨의 Status가 `Pending`으로 조회됩니다. `Status` 아이콘에 마우스 커서를 가져다 대면 *해당 볼륨은 mount하여 사용하는 first consumer가 나타날 때 실제로 생성을 진행한다(This volume will be bound when its first consumer is created.)*는 메시지를 확인할 수 있습니다. 이는 실습을 진행하는 [StorageClass](https://kubernetes.io/ko/docs/concepts/storage/storage-classes/)인 `local-path`의 볼륨 생성 정책에 해당하며, **문제 상황이 아닙니다.** 해당 페이지에서 Status가 `Pending` 으로 보이더라도 해당 볼륨을 사용하길 원하는 노트북 서버 혹은 파드(Pod)에서는 해당 볼륨의 이름을 지정하여 사용할 수 있으며, 그때 실제로 볼륨 생성이 진행됩니다. 
![creating-volume](./img/creating-volume.png) ================================================ FILE: versioned_docs/version-1.0/prerequisites/_category_.json ================================================ { "label": "Prerequisites", "position": 1, "link": { "type": "generated-index" } } ================================================ FILE: versioned_docs/version-1.0/prerequisites/docker/_category_.json ================================================ { "label": "Docker", "position": 1, "link": { "type": "generated-index" } } ================================================ FILE: versioned_docs/version-1.0/prerequisites/docker/advanced.md ================================================ --- title : "[Practice] Docker Advanced" description: "Practice to use docker more advanced way." sidebar_position: 6 contributors: ["Jongseob Jeon", "Jaeyeon Kim"] --- ## 도커 이미지 잘 만들기 ### 도커 이미지를 만들 때 고려해야 될 점 Dockerfile 을 활용하여 도커 이미지를 만들 때는 명령어의 **순서**가 중요합니다. 그 이유는 도커 이미지는 여러 개의 Read-Only Layer 로 구성되어있고, 이미지를 빌드할 때 이미 존재하는 레이어는 **캐시되어** 재사용되기 때문에, 이를 생각해서 Dockerfile 을 구성한다면 **빌드 시간을 줄일 수 있습니다.** Dockerfile에서 `RUN`, `ADD`, `COPY` 명령어 하나가 하나의 레이어로 저장됩니다. 예를 들어서 다음과 같은 `Dockerfile`이 있습니다. ```docker # Layer 1 FROM ubuntu:latest # Layer 2 RUN apt-get update && apt-get install python3 pip3 -y # Layer 3 RUN pip3 install -U pip && pip3 install torch # Layer 4 COPY src/ src/ # Layer 5 CMD python src/app.py ``` 위의 `Dockerfile`로 빌드된 이미지를 `docker run -it app:latest /bin/bash` 명령어로 실행하면 다음과 같은 레이어로 표현할 수 있습니다. ![layers.png](./img/layers.png) 최상단의 R/W Layer 는 이미지에 영향을 주지 않습니다. 즉, 컨테이너 내부에서 작업한 내역은 모두 휘발성입니다. 하단의 레이어가 변경되면, 그 위의 레이어는 모두 새로 빌드됩니다. 그래서 Dockerfile 내장 명령어의 순서가 중요합니다. 예를 들면, **자주 변경**되는 부분은 **최대한 뒤쪽으로** 정렬하는 것을 추천합니다. (ex. `COPY src/ app/src/`) 그렇기 때문에 반대로 변경되지 않는 부분은 최대한 앞쪽으로 정렬하는게 좋습니다. 만약 거의 **변경되지 않지만**, 여러 곳에서 **자주** 쓰이는 부분을 공통화할 수도 있습니다. 해당 공통부분만 묶어서 별도의 이미지는 미리 만들어둔 다음, **베이스 이미지** 로 활용하는 것이 좋습니다. 
**tensorflow cpu 버전과 gpu 버전이** 설치된 이미지를 새로 만들 때는,
| | No ENTRYPOINT | ENTRYPOINT a b | ENTRYPOINT ["a", "b"] | | ------------------ | -------------- | -------------- | --------------------- | | **NO CMD** | Error! | /bin/sh -c a b | a b | | **CMD ["x", "y"]** | x y | /bin/sh -c a b | a b x y | | **CMD x y** | /bin/sh -c x y | /bin/sh -c a b | a b /bin/sh -c x y | - In Kubernetes pod - `ENTRYPOINT` → command - `CMD` → args ### Docker tag 이름 짓기 도커 이미지의 tag 로 **latest 는 사용하지 않는 것을 권장**합니다. 이유는 latest 는 default tag name 이므로 **의도치 않게 overwritten** 되는 경우가 너무 많이 발생하기 때문입니다. 하나의 이미지는 하나의 태그를 가짐(**uniqueness**)을 보장해야 추후 Production 단계에서 **협업/디버깅**에 용이합니다. 내용은 다르지만, 동일한 tag 를 사용하게 되면 추후 dangling image 로 취급되어 관리하기 어려워집니다. dangling image는 `docker images`에는 나오지 않지만 계속해서 저장소를 차지하고 있습니다. ### ETC 1. log 등의 정보는 container 내부가 아닌 곳에 따로 저장합니다. container 내부에서 write 한 data 는 언제든지 사라질 수 있기 때문입니다. 2. secret 한 정보, 환경(dev/prod) dependent 한 정보 등은 Dockerfile 에 직접 적는 게 아니라, env var 또는 .env config file 을 사용합니다. 3. Dockerfile **linter** 도 존재하므로, 협업 시에는 활용하면 좋습니다. [https://github.com/hadolint/hadolint](https://github.com/hadolint/hadolint) ## docker run 의 다양한 옵션 ### docker run with volume Docker container 사용 시 불편한 점이 있습니다. 바로 Docker는 기본적으로 Docker **container 내부에서 작업한 모든 사항은 저장되지 않습니다.** 이유는 Docker container 는 각각 격리된 파일시스템을 사용합니다. 따라서, **여러 docker container 끼리 데이터를 공유하기 어렵습니다.** 이 문제를 해결하기 위해서 Docker에서 제공하는 방식은 **2 가지**가 있습니다. ![storage.png](./img/storage.png) #### Docker volume - docker cli 를 사용해 `volume` 이라는 리소스를 직접 관리 - host 에서 Docker area(`/var/lib/docker`) 아래에 특정 디렉토리를 생성한 다음, 해당 경로를 docker container 에 mount #### Bind mount - host 의 특정 경로를 docker container 에 mount #### How to use? 사용 방식은 **동일한 인터페이스**로 `-v` 옵션을 통해 사용할 수 있습니다. 다만, volume 을 사용할 때에는 `docker volume create`, `docker volume ls`, `docker volume rm` 등을 수행하여 직접 관리해주어야 합니다. 
``` - Bind mount
### docker run as a background process 도커 컨테이너를 실행할 때는 기본적으로 foreground process 로 실행됩니다. 즉, 컨테이너를 실행한 터미널이 해당 컨테이너에 자동으로 attach 되어 있어, 다른 명령을 실행할 수 없습니다. 다음과 같은 예시를 수행해봅니다. 우선 터미널 2 개를 열어, 하나의 터미널에서는 `docker ps` 를 지켜보고, 다른 하나의 터미널에서는 다음과 같은 명령을 차례로 실행해보며 동작을 지켜봅니다. #### First Practice ```bash docker run -it ubuntu sleep 10 ``` 10 초동안 멈춰 있어야 하고, 해당 컨테이너에서 다른 명령을 수행할 수 없습니다. 10초 뒤에는 docker ps 에서 container 가 종료되는 것을 확인할 수 있습니다. #### Second Practice ```bash docker run -it ubuntu sleep 10 ``` 이후, `ctrl + p` -> `ctrl + q` 해당 터미널에서 이제 다른 명령을 수행할 수 있게 되었으며, docker ps 로도 10초까지는 해당 컨테이너가 살아있는 것을 확인할 수 있습니다. 이렇게 docker container 내부에서 빠져나온 상황을 detached 라고 부릅니다. 도커에서는 run 을 실행함과 동시에 detached mode 로 실행시킬 수 있는 옵션을 제공합니다. #### Third Practice ```bash docker run -d ubuntu sleep 10 ``` detached mode 이므로 해당 명령을 실행시킨 터미널에서 다른 액션을 수행시킬 수 있습니다. 상황에 따라 detached mode 를 적절히 활용하면 좋습니다. 예를 들어, DB 와 통신하는 Backend API server 를 개발할 때 Backend API server 는 source code 를 변경시켜가면서 hot-loading 으로 계속해서 로그를 확인해봐야 하지만, DB 는 로그를 지켜볼 필요는 없는 경우라면 다음과 같이 실행할 수 있습니다. DB 는 docker container 를 detached mode 로 실행시키고, Backend API server 는 attached mode 로 log 를 following 하면서 실행시키면 효율적입니다. 
## References - [https://towardsdatascience.com/docker-storage-598e385f4efe](https://towardsdatascience.com/docker-storage-598e385f4efe) - [https://vsupalov.com/docker-latest-tag/](https://vsupalov.com/docker-latest-tag/) - [https://docs.microsoft.com/ko-kr/azure/container-registry/container-registry-image-tag-version](https://docs.microsoft.com/ko-kr/azure/container-registry/container-registry-image-tag-version) - [https://stevelasker.blog/2018/03/01/docker-tagging-best-practices-for-tagging-and-versioning-docker-images/](https://stevelasker.blog/2018/03/01/docker-tagging-best-practices-for-tagging-and-versioning-docker-images/) ================================================ FILE: versioned_docs/version-1.0/prerequisites/docker/command.md ================================================ --- title : "[Practice] Docker command" description: "Practice to use docker command." sidebar_position: 4 contributors: ["Jongseob Jeon", "Jaeyeon Kim"] --- ## 1. 정상 설치 확인 ```bash docker run hello-world ``` 정상적으로 설치된 경우 다음과 같은 메시지를 확인할 수 있습니다. ```bash Hello from Docker! This message shows that your installation appears to be working correctly. .... ``` **(For ubuntu)** sudo 없이 사용하고 싶다면 아래 사이트를 참고합니다. - [https://docs.docker.com/engine/install/linux-postinstall/#manage-docker-as-a-non-root-user](https://docs.docker.com/engine/install/linux-postinstall/#manage-docker-as-a-non-root-user) ## 2. Docker Pull docker image registry(도커 이미지를 저장하고 공유할 수 있는 저장소)로부터 Docker image 를 로컬에 다운로드 받는 커맨드입니다. 아래 커맨드를 통해 docker pull에서 사용 가능한 argument들을 확인할 수 있습니다. ```bash docker pull --help ``` 정상적으로 수행되면 아래와 같이 출력됩니다. 
help에서 나온 `-a`, `-q` 옵션을 사용하기 위해서는 NAME 앞에서 사용해야 합니다.
```bash Usage: docker images [OPTIONS] [REPOSITORY[:TAG]] List images Options: -a, --all Show all images (default hides intermediate images) --digests Show digests -f, --filter filter Filter output based on conditions provided --format string Pretty-print images using a Go template --no-trunc Don't truncate output -q, --quiet Only show image IDs ``` 아래 명령어를 이용해 직접 실행해 보겠습니다. ```bash docker images ``` 만약 도커를 최초 설치 후 이 실습을 진행한다면 다음과 비슷하게 출력됩니다. ```bash REPOSITORY TAG IMAGE ID CREATED SIZE ubuntu 18.04 29e70752d7b2 2 days ago 56.7MB ``` 줄 수 있는 argument중 `-q`를 사용하면 `IMAGE ID` 만 출력됩니다. ```bash docker images -q ``` ```bash 29e70752d7b2 ``` ## 4. Docker ps 현재 실행 중인 도커 컨테이너 리스트를 출력하는 커맨드입니다. ```bash docker ps --help ``` docker ps에서 사용할 수 있는 argument는 다음과 같습니다. ```bash Usage: docker ps [OPTIONS] List containers Options: -a, --all Show all containers (default shows just running) -f, --filter filter Filter output based on conditions provided --format string Pretty-print containers using a Go template -n, --last int Show n last created containers (includes all states) (default -1) -l, --latest Show the latest created container (includes all states) --no-trunc Don't truncate output -q, --quiet Only display container IDs -s, --size Display total file sizes ``` 아래 명령어를 이용해 직접 실행해 보겠습니다. ```bash docker ps ``` 현재 실행 중인 컨테이너가 없다면 다음과 같이 나옵니다. ```bash CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES ``` 만약 실행되는 컨테이너가 있다면 다음과 비슷하게 나옵니다. ```bash CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES c1e8f5e89d8d ubuntu "sleep 3600" 13 seconds ago Up 12 seconds trusting_newton ``` ## 5. Docker run 도커 컨테이너를 실행시키는 커맨드입니다. ```bash docker run --help ``` docker run을 실행하는 명령어는 다음과 같습니다. ```bash Usage: docker run [OPTIONS] IMAGE [COMMAND] [ARG...] Run a command in a new container ``` 여기서 우리가 확인해야 하는 것은 바로 docker run은 세 개 타입의 argument를 받는다는 것을 알 수 있습니다. 1. `[OPTIONS]` 2. `[COMMAND]` 3. `[ARG...]` 직접 도커 컨테이너를 실행해 보겠습니다. ```bash ## Usage: docker run [OPTIONS] IMAGE [COMMAND] [ARG...] 
위 명령어를 통해서 demo3 라는 이름의 busybox 컨테이너를 백그라운드에서 도커 컨테이너로 실행하여, 1초에 한 번씩 현재 시간을 출력하도록 했습니다.
이 때 `-f` 옵션을 이용해 계속 watch 하며 출력할 수 있습니다. ```bash docker logs demo3 -f ``` ## 8. Docker stop 실행 중인 도커 컨테이너를 중단시키는 커맨드입니다. ```bash docker stop --help ``` `docker ps`를 통해 현재 실행 중인 컨테이너를 확인하면 다음과 같습니다. ```bash CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES 730391669c39 busybox "sh -c 'while true; …" About a minute ago Up About a minute demo3 fc88a83e90f0 ubuntu:18.04 "sleep 3600" 4 minutes ago Up 4 minutes demo2 ``` 이제 `docker stop` 을 통해 도커를 정지해 보겠습니다. ```bash docker stop demo2 ``` 실행 후 `docker ps`를 다시 입력합니다. ```bash CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES 730391669c39 busybox "sh -c 'while true; …" 2 minutes ago Up 2 minutes demo3 ``` 위의 결과와 비교했을 때 demo2 컨테이너가 현재 실행 중인 컨테이너 목록에서 사라진 것을 확인할 수 있습니다. 나머지 컨테이너도 정지합니다. ```bash docker stop demo3 ``` ## 9. Docker rm 도커 컨테이너를 삭제하는 커맨드입니다. ```bash docker rm --help ``` 도커 컨테이너는 기본적으로 종료가 된 상태로 있습니다. 그래서 `docker ps -a`를 통해서 종료된 컨테이너도 볼 수 있습니다. 그런데 종료된 컨테이너는 왜 지워야 할까요? 종료되어 있는 도커에는 이전에 사용한 데이터가 아직 컨테이너 내부에 남아있습니다. 그래서 restart 등을 통해서 컨테이너를 재시작할 수 있습니다. 그런데 이 과정에서 disk를 사용하게 됩니다. 그래서 완전히 사용하지 않는 컨테이너를 지우기 위해서는 `docker rm` 명령어를 사용해야 합니다. 우선 현재 컨테이너들을 확인합니다. ```bash docker ps -a ``` 다음과 같이 3개의 컨테이너가 있습니다. ```bash CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES 730391669c39 busybox "sh -c 'while true; …" 4 minutes ago Exited (137) About a minute ago demo3 fc88a83e90f0 ubuntu:18.04 "sleep 3600" 7 minutes ago Exited (137) 2 minutes ago demo2 4c1aa74a382a ubuntu:18.04 "/bin/bash" 10 minutes ago Exited (0) 10 minutes ago demo1 ``` 아래 명령어를 통해 `demo3` 컨테이너를 삭제해 보겠습니다. ```bash docker rm demo3 ``` `docker ps -a` 명령어를 치면 다음과 같이 2개로 줄었습니다. ```bash CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES fc88a83e90f0 ubuntu:18.04 "sleep 3600" 13 minutes ago Exited (137) 8 minutes ago demo2 4c1aa74a382a ubuntu:18.04 "/bin/bash" 16 minutes ago Exited (0) 16 minutes ago demo1 ``` 나머지 컨테이너들도 삭제합니다. ```bash docker rm demo2 docker rm demo1 ``` ## 10. Docker rmi 도커 이미지를 삭제하는 커맨드입니다. 
```bash docker rmi --help ``` 아래 명령어를 통해 현재 어떤 이미지들이 로컬에 있는지 확인합니다. ```bash docker images ``` 다음과 같이 출력됩니다. ```bash REPOSITORY TAG IMAGE ID CREATED SIZE busybox latest a8440bba1bc0 32 hours ago 1.41MB ubuntu 18.04 29e70752d7b2 2 days ago 56.7MB ``` `busybox` 이미지를 삭제해 보겠습니다. ```bash docker rmi busybox ``` 다시 `docker images`를 칠 경우 다음과 같이 나옵니다. ```bash REPOSITORY TAG IMAGE ID CREATED SIZE ubuntu 18.04 29e70752d7b2 2 days ago 56.7MB ``` ## References - [https://docs.github.com/en/packages/working-with-a-github-packages-registry/working-with-the-container-registry](https://docs.github.com/en/packages/working-with-a-github-packages-registry/working-with-the-container-registry) ================================================ FILE: versioned_docs/version-1.0/prerequisites/docker/docker.md ================================================ --- title : "What is Docker?" description: "Introduction to Docker." sidebar_position: 3 contributors: ["Jongseob Jeon", "Jaeyeon Kim"] --- ## 컨테이너 - 컨테이너 가상화 - 어플리케이션을 어디에서나 동일하게 실행하는 기술 - 컨테이너 이미지 - 어플리케이션을 실행시키기 위해 필요한 모든 파일들의 집합 - → 붕어빵 틀 - 컨테이너란? - 컨테이너 이미지를 기반으로 실행된 한 개의 프로세스 - → 붕어빵 틀로 찍어낸 붕어빵 ## 도커 도커는 **컨테이너를 관리**하고 사용할 수 있게 해주는 플랫폼입니다. 이러한 도커의 슬로건은 바로 **Build Once, Run Anywhere** 로 어디에서나 동일한 실행 결과를 보장합니다. 도커 내부에서 동작하는 과정을 보자면 실제로 container 를 위한 리소스를 분리하고, lifecycle 을 제어하는 기능은 linux kernel 의 cgroup 등이 수행합니다. 하지만 이러한 인터페이스를 바로 사용하는 것은 **너무 어렵기 때문에** 다음과 같은 추상화 layer를 만들게 됩니다. ![docker-layer.png](./img/docker-layer.png) 이를 통해 사용자는 사용자 친화적인 API 인 **Docker CLI** 만으로 쉽게 컨테이너를 제어할 수 있습니다. ## Layer 해석 위에서 나온 layer들의 역할은 다음과 같습니다. 1. runC: linux kernel 의 기능을 직접 사용해서, container 라는 하나의 프로세스가 사용할 네임스페이스와 cpu, memory, filesystem 등을 격리시켜주는 기능을 수행합니다. 2. containerd: runC(OCI layer) 에게 명령을 내리기 위한 추상화 단계이며, 표준화된 인터페이스(OCI)를 사용합니다. 3. dockerd: containerd 에게 명령을 내리는 역할만 합니다. 4. docker cli: 사용자는 docker cli 로 dockerd (Docker daemon)에게 명령을 내리기만 하면 됩니다. 
- 이 통신 과정에서 unix socket 을 사용하기 때문에 가끔 도커 관련 에러가 나면 `/var/run/docker.sock` 가 사용 중이다, 권한이 없다 등등의 에러 메시지가 나오는 것입니다. 이처럼 도커는 많은 단계를 감싸고 있지만, 흔히 도커라는 용어를 사용할 때는 Docker CLI 를 말할 때도 있고, Dockerd 를 말할 때도 있고 Docker Container 하나를 말할 때도 있어서 혼란이 생길 수 있습니다. 앞으로 나오는 글에서도 도커가 여러가지 의미로 쓰일 수 있습니다. ## For ML Engineer 머신러닝 엔지니어가 도커를 사용하는 이유는 다음과 같습니다. 1. 나의 ML 학습/추론 코드를 OS, python version, python 환경, 특정 python package 버전에 independent 하도록 해야 한다. 2. 그래서 코드 뿐만이 아닌 **해당 코드가 실행되기 위해 필요한 모든 종속적인 패키지, 환경 변수, 폴더명 등등을 하나의 패키지로** 묶을 수 있는 기술이 컨테이너화 기술이다. 3. 이 기술을 쉽게 사용하고 관리할 수 있는 소프트웨어 중 하나가 도커이며, 패키지를 도커 이미지라고 부른다. ================================================ FILE: versioned_docs/version-1.0/prerequisites/docker/images.md ================================================ --- title : "[Practice] Docker images" description: "Practice to use docker image." sidebar_position: 5 contributors: ["Jongseob Jeon", "Jaeyeon Kim"] --- ## 1. Dockerfile 만들기 도커 이미지를 만드는 가장 쉬운 방법은 도커에서 제공하는 템플릿인 Dockerfile을 사용하는 것입니다. 이외에는 running container 를 docker image 로 만드는 `docker commit` 등을 활용하는 방법이 있습니다. - `Dockerfile` - 사용자가 도커 이미지를 쉽게 만들 수 있도록, 제공하는 템플릿 - 파일명은 꼭 `Dockerfile` 이 아니어도 상관없지만, `docker build` 수행 시, default 로 사용하는 파일명이 `Dockerfile` 입니다. - 도커 이미지를 만드는 `docker build` 를 수행할 때, `-f` 옵션을 주면 다른 파일명으로도 사용 가능합니다. - ex) `docker build -f dockerfile-asdf .` 도 가능 1. 실습을 위해서 편한 디렉토리로 이동합니다. ```bash cd ``` 2. docker-practice 라는 이름의 폴더를 생성합니다. ```bash mkdir docker-practice ``` 3. docker-practice 폴더로 이동합니다. ```bash cd docker-practice ``` 4. Dockerfile 이라는 빈 파일을 생성합니다. ```bash touch Dockerfile ``` 5. 정상적으로 생성되었는지 확인합니다. ```bash ls ``` ## 2. Dockerfile 내장 명령어 Dockerfile 에서 사용할 수 있는 기본적인 명령어에 대해서 하나씩 알아보겠습니다. ### FROM Dockerfile 이 base image 로 어떠한 이미지를 사용할 것인지를 명시하는 명령어입니다. 도커 이미지를 만들 때, 아무것도 없는 빈 환경에서부터 하나하나씩 제가 의도한 환경을 만들어가는게 아니라, python 3.9 버전이 설치된 환경을 베이스로해두고, 저는 pytorch 를 설치하고, 제 소스코드만 넣어두는 형태로 활용할 수가 있습니다. 이러한 경우에는 `python:3.9`, `python-3.9-alpine`, ... 등의 잘 만들어진 이미지를 베이스로 활용합니다. 
```docker FROM <image>[:<tag>] [AS <name>] # 예시
Docker build from Dockerfile `docker build` 명령어로 Dockerfile 로부터 Docker Image 를 만들어봅니다. ```bash docker build --help ``` Dockerfile 이 있는 경로에서 다음 명령을 실행합니다. ```bash docker build -t my-image:v1.0.0 . ``` 위 커맨드를 설명하면 다음과 같습니다. - `.` : **현재 경로**에 있는 Dockerfile 로부터 - `-t` : my-image 라는 **이름**과 v1.0.0 이라는 **태그**로 **이미지**를 - 빌드하겠다라는 명령어 정상적으로 이미지 빌드되었는지 확인해 보겠습니다. ```bash # grep : my-image 가 있는지를 잡아내는 (grep) 하는 명령어 docker images | grep my-image ``` 정상적으로 수행된다면 다음과 같이 출력됩니다. ```bash my-image v1.0.0 143114710b2d 3 seconds ago 87.9MB ``` ## 5. Docker run from Dockerfile 그럼 이제 방금 빌드한 `my-image:v1.0.0` 이미지로 docker 컨테이너를 **run** 해보겠습니다. ```bash docker run my-image:v1.0.0 ``` 정상적으로 수행된다면 다음과 같이 나옵니다. ```bash hello ``` ## 6. Docker run with env 이번에는 방금 빌드한 `my-image:v1.0.0` 이미지를 실행하는 시점에, `TEST` env var 의 값을 변경하여 docker 컨테이너를 run 해보겠습니다. ```bash docker run -e TEST=bye my-image:v1.0.0 ``` 정상적으로 수행된다면 다음과 같이 나옵니다. ```bash bye ``` ================================================ FILE: versioned_docs/version-1.0/prerequisites/docker/install.md ================================================ --- title : "Install Docker" description: "Install docker to start." sidebar_position: 1 contributors: ["Jongseob Jeon", "Jaeyeon Kim"] --- ## Docker 도커 실습을 위해 도커를 설치해야 합니다. 도커 설치는 어떤 OS를 사용하는지에 따라 달라집니다. 각 환경에 맞는 도커 설치는 공식 홈페이지를 참고해주세요. - [ubuntu](https://docs.docker.com/engine/install/ubuntu/) - [mac](https://docs.docker.com/desktop/mac/install/) - [windows](https://docs.docker.com/desktop/windows/install/) ## 설치 확인 `docker run hello-world` 가 정상적으로 수행되는 OS, 터미널 환경이 필요합니다. | OS | Docker Engine | Terminal | | ------- | -------------- | ------------------ | | MacOS | Docker Desktop | zsh | | Windows | Docker Desktop | Powershell | | Windows | Docker Desktop | WSL2 | | Ubuntu | Docker Engine | bash | ## 들어가기 앞서서.. MLOps를 사용하기 위해 필요한 도커 사용법을 설명하니 많은 비유와 예시가 MLOps 쪽으로 치중되어 있을 수 있습니다. 
================================================ FILE: versioned_docs/version-1.0/prerequisites/docker/introduction.md ================================================ --- title : "Why Docker & Kubernetes ?" description: "Introduction to Docker." sidebar_position: 2 contributors: ["Jongseob Jeon", "Jaeyeon Kim"] --- ## Why Kubernetes ? 머신러닝 모델을 서비스화하기 위해서는 모델 개발 외에도 많은 **부가적인** 기능들이 필요합니다. 1. 학습 단계 - 모델 학습 명령의 스케줄 관리 - 학습된 모델의 Reproducibility 보장 2. 배포 단계 - 트래픽 분산 - 서비스 장애 모니터링 - 장애 시 트러블슈팅 다행히도 이런 기능들에 대한 needs는 소프트웨어 개발 쪽에서 이미 많은 고민을 거쳐 발전되어 왔습니다. 따라서 머신러닝 모델을 배포할 때도 이런 고민의 결과물들을 활용하면 큰 도움을 받을 수 있습니다. MLOps에서 대표적으로 활용하는 소프트웨어 제품이 바로 도커와 쿠버네티스입니다. ## 도커와 쿠버네티스 ### 기술 이름이 아니라 제품 이름 도커와 쿠버네티스는 각각 컨테이너라이제이션(Containerization) 기능과 컨테이너 오케스트레이션(Container Orchestration) 기능을 제공하는 대표 소프트웨어(제품)입니다. #### 도커 도커는 과거에 대세였지만 유료화 관련 정책들을 하나씩 추가하면서 점점 사용 빈도가 하락세입니다. 하지만 2022년 3월 기준으로 아직까지도 가장 일반적으로 사용되는 컨테이너 가상화 소프트웨어입니다. ![sysdig-2019.png](./img/sysdig-2019.png)
[from sysdig 2019]
![sysdig-2021.png](./img/sysdig-2021.png)
[from sysdig 2021]
#### 쿠버네티스 쿠버네티스는 지금까지는 비교 대상조차 거의 없는 제품입니다. ![cncf-survey.png](./img/cncf-survey.png)
[from cncf survey]
![t4-ai.png](./img/t4-ai.png)
[from t4.ai]
그래서 쿠버네티스 v1.23 부터는 도커를 native 하게 쓸 수 없습니다.
### References - [*https://www.linkedin.com/pulse/containerd는-무엇이고-왜-중요할까-sean-lee/?originalSubdomain=kr*](https://www.linkedin.com/pulse/containerd%EB%8A%94-%EB%AC%B4%EC%97%87%EC%9D%B4%EA%B3%A0-%EC%99%9C-%EC%A4%91%EC%9A%94%ED%95%A0%EA%B9%8C-sean-lee/?originalSubdomain=kr) - [https://kubernetes.io/blog/2021/12/07/kubernetes-1-23-release-announcement/](https://kubernetes.io/blog/2021/12/07/kubernetes-1-23-release-announcement/) - [https://kubernetes.io/blog/2020/12/02/dockershim-faq/](https://kubernetes.io/blog/2020/12/02/dockershim-faq/) - [https://kubernetes.io/blog/2020/12/02/dont-panic-kubernetes-and-docker/](https://kubernetes.io/blog/2020/12/02/dont-panic-kubernetes-and-docker/) - [https://kubernetes.io/ko/blog/2020/12/02/dont-panic-kubernetes-and-docker/](https://kubernetes.io/ko/blog/2020/12/02/dont-panic-kubernetes-and-docker/) ================================================ FILE: versioned_docs/version-1.0/setup-components/_category_.json ================================================ { "label": "Setup Components", "position": 3, "link": { "type": "generated-index" } } ================================================ FILE: versioned_docs/version-1.0/setup-components/install-components-kf.md ================================================ --- title : "1. Kubeflow" description: "구성요소 설치 - Kubeflow" sidebar_position: 1 date: 2021-12-13 lastmod: 2021-12-20 contributors: ["Jaeyeon Kim", "SeungTae Kim"] --- ## 설치 파일 준비 Kubeflow **v1.4.0** 버전을 설치하기 위해서, 설치에 필요한 manifests 파일들을 준비합니다. [kubeflow/manifests Repository](https://github.com/kubeflow/manifests) 를 **v1.4.0** 태그로 깃 클론한 뒤, 해당 폴더로 이동합니다. ```bash git clone -b v1.4.0 https://github.com/kubeflow/manifests.git cd manifests ``` ## 각 구성 요소별 설치 kubeflow/manifests Repository 에 각 구성 요소별 설치 커맨드가 적혀져 있지만, 설치하며 발생할 수 있는 이슈 혹은 정상적으로 설치되었는지 확인하는 방법이 적혀져 있지 않아 처음 설치하는 경우 어려움을 겪는 경우가 많습니다. 따라서, 각 구성 요소별로 정상적으로 설치되었는지 확인하는 방법을 함께 작성합니다. 
또한, 본 문서에서는 **모두의 MLOps** 에서 다루지 않는 구성요소인 Knative, KFServing, MPI Operator 의 설치는 리소스의 효율적 사용을 위해 따로 설치하지 않습니다. ### Cert-manager 1. cert-manager 를 설치합니다. ```bash kustomize build common/cert-manager/cert-manager/base | kubectl apply -f - ``` 정상적으로 설치되면 다음과 같이 출력됩니다. ```bash namespace/cert-manager created customresourcedefinition.apiextensions.k8s.io/certificaterequests.cert-manager.io created customresourcedefinition.apiextensions.k8s.io/certificates.cert-manager.io created customresourcedefinition.apiextensions.k8s.io/challenges.acme.cert-manager.io created customresourcedefinition.apiextensions.k8s.io/clusterissuers.cert-manager.io created customresourcedefinition.apiextensions.k8s.io/issuers.cert-manager.io created customresourcedefinition.apiextensions.k8s.io/orders.acme.cert-manager.io created serviceaccount/cert-manager created serviceaccount/cert-manager-cainjector created serviceaccount/cert-manager-webhook created role.rbac.authorization.k8s.io/cert-manager-webhook:dynamic-serving created role.rbac.authorization.k8s.io/cert-manager-cainjector:leaderelection created role.rbac.authorization.k8s.io/cert-manager:leaderelection created clusterrole.rbac.authorization.k8s.io/cert-manager-cainjector created clusterrole.rbac.authorization.k8s.io/cert-manager-controller-approve:cert-manager-io created clusterrole.rbac.authorization.k8s.io/cert-manager-controller-certificates created clusterrole.rbac.authorization.k8s.io/cert-manager-controller-challenges created clusterrole.rbac.authorization.k8s.io/cert-manager-controller-clusterissuers created clusterrole.rbac.authorization.k8s.io/cert-manager-controller-ingress-shim created clusterrole.rbac.authorization.k8s.io/cert-manager-controller-issuers created clusterrole.rbac.authorization.k8s.io/cert-manager-controller-orders created clusterrole.rbac.authorization.k8s.io/cert-manager-edit created clusterrole.rbac.authorization.k8s.io/cert-manager-view created 
clusterrole.rbac.authorization.k8s.io/cert-manager-webhook:subjectaccessreviews created rolebinding.rbac.authorization.k8s.io/cert-manager-webhook:dynamic-serving created rolebinding.rbac.authorization.k8s.io/cert-manager-cainjector:leaderelection created rolebinding.rbac.authorization.k8s.io/cert-manager:leaderelection created clusterrolebinding.rbac.authorization.k8s.io/cert-manager-cainjector created clusterrolebinding.rbac.authorization.k8s.io/cert-manager-controller-approve:cert-manager-io created clusterrolebinding.rbac.authorization.k8s.io/cert-manager-controller-certificates created clusterrolebinding.rbac.authorization.k8s.io/cert-manager-controller-challenges created clusterrolebinding.rbac.authorization.k8s.io/cert-manager-controller-clusterissuers created clusterrolebinding.rbac.authorization.k8s.io/cert-manager-controller-ingress-shim created clusterrolebinding.rbac.authorization.k8s.io/cert-manager-controller-issuers created clusterrolebinding.rbac.authorization.k8s.io/cert-manager-controller-orders created clusterrolebinding.rbac.authorization.k8s.io/cert-manager-webhook:subjectaccessreviews created service/cert-manager created service/cert-manager-webhook created deployment.apps/cert-manager created deployment.apps/cert-manager-cainjector created deployment.apps/cert-manager-webhook created mutatingwebhookconfiguration.admissionregistration.k8s.io/cert-manager-webhook created validatingwebhookconfiguration.admissionregistration.k8s.io/cert-manager-webhook created ``` cert-manager namespace 의 3 개의 pod 가 모두 Running 이 될 때까지 기다립니다. ```bash kubectl get pod -n cert-manager ``` 모두 Running 이 되면 다음과 비슷한 결과가 출력됩니다. ```bash NAME READY STATUS RESTARTS AGE cert-manager-7dd5854bb4-7nmpd 1/1 Running 0 2m10s cert-manager-cainjector-64c949654c-2scxr 1/1 Running 0 2m10s cert-manager-webhook-6b57b9b886-7q6g2 1/1 Running 0 2m10s ``` 2. kubeflow-issuer 를 설치합니다. 
```bash kustomize build common/cert-manager/kubeflow-issuer/base | kubectl apply -f - ``` 정상적으로 설치되면 다음과 같이 출력됩니다. ```bash clusterissuer.cert-manager.io/kubeflow-self-signing-issuer created ``` - cert-manager-webhook 이슈 cert-manager-webhook deployment 가 Running 이 아닌 경우, 다음과 비슷한 에러가 발생하며 kubeflow-issuer가 설치되지 않을 수 있음에 주의하시기 바랍니다. 해당 에러가 발생한 경우, cert-manager 의 3개의 pod 가 모두 Running 이 되는 것을 확인한 이후 다시 명령어를 수행하시기 바랍니다. ```bash Error from server: error when retrieving current configuration of: Resource: "cert-manager.io/v1alpha2, Resource=clusterissuers", GroupVersionKind: "cert-manager.io/v1alpha2, Kind=ClusterIssuer" Name: "kubeflow-self-signing-issuer", Namespace: "" from server for: "STDIN": conversion webhook for cert-manager.io/v1, Kind=ClusterIssuer failed: Post "https://cert-manager-webhook.cert-manager.svc:443/convert?timeout=30s": dial tcp 10.101.177.157:443: connect: connection refused ``` ### Istio 1. istio 관련 Custom Resource Definition(CRD) 를 설치합니다. ```bash kustomize build common/istio-1-9/istio-crds/base | kubectl apply -f - ``` 정상적으로 수행되면 다음과 같이 출력됩니다. 
```bash customresourcedefinition.apiextensions.k8s.io/authorizationpolicies.security.istio.io created customresourcedefinition.apiextensions.k8s.io/destinationrules.networking.istio.io created customresourcedefinition.apiextensions.k8s.io/envoyfilters.networking.istio.io created customresourcedefinition.apiextensions.k8s.io/gateways.networking.istio.io created customresourcedefinition.apiextensions.k8s.io/istiooperators.install.istio.io created customresourcedefinition.apiextensions.k8s.io/peerauthentications.security.istio.io created customresourcedefinition.apiextensions.k8s.io/requestauthentications.security.istio.io created customresourcedefinition.apiextensions.k8s.io/serviceentries.networking.istio.io created customresourcedefinition.apiextensions.k8s.io/sidecars.networking.istio.io created customresourcedefinition.apiextensions.k8s.io/virtualservices.networking.istio.io created customresourcedefinition.apiextensions.k8s.io/workloadentries.networking.istio.io created customresourcedefinition.apiextensions.k8s.io/workloadgroups.networking.istio.io created ``` 2. istio namespace 를 설치합니다. ```bash kustomize build common/istio-1-9/istio-namespace/base | kubectl apply -f - ``` 정상적으로 수행되면 다음과 같이 출력됩니다. ```bash namespace/istio-system created ``` 3. istio 를 설치합니다. ```bash kustomize build common/istio-1-9/istio-install/base | kubectl apply -f - ``` 정상적으로 수행되면 다음과 같이 출력됩니다. 
```bash serviceaccount/istio-ingressgateway-service-account created serviceaccount/istio-reader-service-account created serviceaccount/istiod-service-account created role.rbac.authorization.k8s.io/istio-ingressgateway-sds created role.rbac.authorization.k8s.io/istiod-istio-system created clusterrole.rbac.authorization.k8s.io/istio-reader-istio-system created clusterrole.rbac.authorization.k8s.io/istiod-istio-system created rolebinding.rbac.authorization.k8s.io/istio-ingressgateway-sds created rolebinding.rbac.authorization.k8s.io/istiod-istio-system created clusterrolebinding.rbac.authorization.k8s.io/istio-reader-istio-system created clusterrolebinding.rbac.authorization.k8s.io/istiod-istio-system created configmap/istio created configmap/istio-sidecar-injector created service/istio-ingressgateway created service/istiod created deployment.apps/istio-ingressgateway created deployment.apps/istiod created envoyfilter.networking.istio.io/metadata-exchange-1.8 created envoyfilter.networking.istio.io/metadata-exchange-1.9 created envoyfilter.networking.istio.io/stats-filter-1.8 created envoyfilter.networking.istio.io/stats-filter-1.9 created envoyfilter.networking.istio.io/tcp-metadata-exchange-1.8 created envoyfilter.networking.istio.io/tcp-metadata-exchange-1.9 created envoyfilter.networking.istio.io/tcp-stats-filter-1.8 created envoyfilter.networking.istio.io/tcp-stats-filter-1.9 created envoyfilter.networking.istio.io/x-forwarded-host created gateway.networking.istio.io/istio-ingressgateway created authorizationpolicy.security.istio.io/global-deny-all created authorizationpolicy.security.istio.io/istio-ingressgateway created mutatingwebhookconfiguration.admissionregistration.k8s.io/istio-sidecar-injector created validatingwebhookconfiguration.admissionregistration.k8s.io/istiod-istio-system created ``` istio-system namespace 의 2 개의 pod 가 모두 Running 이 될 때까지 기다립니다. ```bash kubectl get po -n istio-system ``` 모두 Running 이 되면 다음과 비슷한 결과가 출력됩니다. 
```bash NAME READY STATUS RESTARTS AGE istio-ingressgateway-79b665c95-xm22l 1/1 Running 0 16s istiod-86457659bb-5h58w 1/1 Running 0 16s ``` ### Dex dex 를 설치합니다. ```bash kustomize build common/dex/overlays/istio | kubectl apply -f - ``` 정상적으로 수행되면 다음과 같이 출력됩니다. ```bash namespace/auth created customresourcedefinition.apiextensions.k8s.io/authcodes.dex.coreos.com created serviceaccount/dex created clusterrole.rbac.authorization.k8s.io/dex created clusterrolebinding.rbac.authorization.k8s.io/dex created configmap/dex created secret/dex-oidc-client created service/dex created deployment.apps/dex created virtualservice.networking.istio.io/dex created ``` auth namespace 의 1 개의 pod 가 모두 Running 이 될 때까지 기다립니다. ```bash kubectl get po -n auth ``` 모두 Running 이 되면 다음과 비슷한 결과가 출력됩니다. ```bash NAME READY STATUS RESTARTS AGE dex-5ddf47d88d-458cs 1/1 Running 1 12s ``` ### OIDC AuthService OIDC AuthService 를 설치합니다. ```bash kustomize build common/oidc-authservice/base | kubectl apply -f - ``` 정상적으로 수행되면 다음과 같이 출력됩니다. ```bash configmap/oidc-authservice-parameters created secret/oidc-authservice-client created service/authservice created persistentvolumeclaim/authservice-pvc created statefulset.apps/authservice created envoyfilter.networking.istio.io/authn-filter created ``` istio-system namespace 에 authservice-0 pod 가 Running 이 될 때까지 기다립니다. ```bash kubectl get po -n istio-system -w ``` 모두 Running 이 되면 다음과 비슷한 결과가 출력됩니다. ```bash NAME READY STATUS RESTARTS AGE authservice-0 1/1 Running 0 14s istio-ingressgateway-79b665c95-xm22l 1/1 Running 0 2m37s istiod-86457659bb-5h58w 1/1 Running 0 2m37s ``` ### Kubeflow Namespace kubeflow namespace 를 생성합니다. ```bash kustomize build common/kubeflow-namespace/base | kubectl apply -f - ``` 정상적으로 수행되면 다음과 같이 출력됩니다. ```bash namespace/kubeflow created ``` kubeflow namespace 를 조회합니다. ```bash kubectl get ns kubeflow ``` 정상적으로 생성되면 다음과 비슷한 결과가 출력됩니다. ```bash NAME STATUS AGE kubeflow Active 8s ``` ### Kubeflow Roles kubeflow-roles 를 설치합니다. 
```bash kustomize build common/kubeflow-roles/base | kubectl apply -f - ``` 정상적으로 수행되면 다음과 같이 출력됩니다. ```bash clusterrole.rbac.authorization.k8s.io/kubeflow-admin created clusterrole.rbac.authorization.k8s.io/kubeflow-edit created clusterrole.rbac.authorization.k8s.io/kubeflow-kubernetes-admin created clusterrole.rbac.authorization.k8s.io/kubeflow-kubernetes-edit created clusterrole.rbac.authorization.k8s.io/kubeflow-kubernetes-view created clusterrole.rbac.authorization.k8s.io/kubeflow-view created ``` 방금 생성한 kubeflow roles 를 조회합니다. ```bash kubectl get clusterrole | grep kubeflow ``` 다음과 같이 총 6개의 clusterrole 이 출력됩니다. ```bash kubeflow-admin 2021-12-03T08:51:36Z kubeflow-edit 2021-12-03T08:51:36Z kubeflow-kubernetes-admin 2021-12-03T08:51:36Z kubeflow-kubernetes-edit 2021-12-03T08:51:36Z kubeflow-kubernetes-view 2021-12-03T08:51:36Z kubeflow-view 2021-12-03T08:51:36Z ``` ### Kubeflow Istio Resources kubeflow-istio-resources 를 설치합니다. ```bash kustomize build common/istio-1-9/kubeflow-istio-resources/base | kubectl apply -f - ``` 정상적으로 수행되면 다음과 같이 출력됩니다. ```bash clusterrole.rbac.authorization.k8s.io/kubeflow-istio-admin created clusterrole.rbac.authorization.k8s.io/kubeflow-istio-edit created clusterrole.rbac.authorization.k8s.io/kubeflow-istio-view created gateway.networking.istio.io/kubeflow-gateway created ``` 방금 생성한 kubeflow roles 를 조회합니다. ```bash kubectl get clusterrole | grep kubeflow-istio ``` 다음과 같이 총 3개의 clusterrole 이 출력됩니다. ```bash kubeflow-istio-admin 2021-12-03T08:53:17Z kubeflow-istio-edit 2021-12-03T08:53:17Z kubeflow-istio-view 2021-12-03T08:53:17Z ``` Kubeflow namespace 에 gateway 가 정상적으로 설치되었는지 확인합니다. ```bash kubectl get gateway -n kubeflow ``` 정상적으로 생성되면 다음과 비슷한 결과가 출력됩니다. ```bash NAME AGE kubeflow-gateway 31s ``` ### Kubeflow Pipelines kubeflow pipelines 를 설치합니다. ```bash kustomize build apps/pipeline/upstream/env/platform-agnostic-multi-user | kubectl apply -f - ``` 정상적으로 수행되면 다음과 같이 출력됩니다. 
```bash customresourcedefinition.apiextensions.k8s.io/clusterworkflowtemplates.argoproj.io created customresourcedefinition.apiextensions.k8s.io/cronworkflows.argoproj.io created customresourcedefinition.apiextensions.k8s.io/workfloweventbindings.argoproj.io created ...(생략) authorizationpolicy.security.istio.io/ml-pipeline-visualizationserver created authorizationpolicy.security.istio.io/mysql created authorizationpolicy.security.istio.io/service-cache-server created ``` 위 명령어는 여러 resources 를 한 번에 설치하고 있지만, 설치 순서의 의존성이 있는 리소스가 존재합니다. 따라서 때에 따라 다음과 비슷한 에러가 발생할 수 있습니다. ```bash "error: unable to recognize "STDIN": no matches for kind "CompositeController" in version "metacontroller.k8s.io/v1alpha1"" ``` 위와 비슷한 에러가 발생한다면, 10 초 정도 기다린 뒤 다시 위의 명령을 수행합니다. ```bash kustomize build apps/pipeline/upstream/env/platform-agnostic-multi-user | kubectl apply -f - ``` 정상적으로 설치되었는지 확인합니다. ```bash kubectl get po -n kubeflow ``` 다음과 같이 총 16개의 pod 가 모두 Running 이 될 때까지 기다립니다. ```bash NAME READY STATUS RESTARTS AGE cache-deployer-deployment-79fdf9c5c9-bjnbg 2/2 Running 1 5m3s cache-server-5bdf4f4457-48gbp 2/2 Running 0 5m3s kubeflow-pipelines-profile-controller-7b947f4748-8d26b 1/1 Running 0 5m3s metacontroller-0 1/1 Running 0 5m3s metadata-envoy-deployment-5b4856dd5-xtlkd 1/1 Running 0 5m3s metadata-grpc-deployment-6b5685488-kwvv7 2/2 Running 3 5m3s metadata-writer-548bd879bb-zjkcn 2/2 Running 1 5m3s minio-5b65df66c9-k5gzg 2/2 Running 0 5m3s ml-pipeline-8c4b99589-85jw6 2/2 Running 1 5m3s ml-pipeline-persistenceagent-d6bdc77bd-ssxrv 2/2 Running 0 5m3s ml-pipeline-scheduledworkflow-5db54d75c5-zk2cw 2/2 Running 0 5m2s ml-pipeline-ui-5bd8d6dc84-j7wqr 2/2 Running 0 5m2s ml-pipeline-viewer-crd-68fb5f4d58-mbcbg 2/2 Running 1 5m2s ml-pipeline-visualizationserver-8476b5c645-wljfm 2/2 Running 0 5m2s mysql-f7b9b7dd4-xfnw4 2/2 Running 0 5m2s workflow-controller-5cbbb49bd8-5zrwx 2/2 Running 1 5m2s ``` 추가로 ml-pipeline UI가 정상적으로 접속되는지 확인합니다. 
```bash kubectl port-forward svc/ml-pipeline-ui -n kubeflow 8888:80 ``` 웹 브라우저를 열어 [http://localhost:8888/#/pipelines/](http://localhost:8888/#/pipelines/) 경로에 접속합니다. 다음과 같은 화면이 출력되는 것을 확인합니다. ![pipeline-ui](./img/pipeline-ui.png) - localhost 연결 거부 이슈 ![localhost-reject](./img/localhost-reject.png) 만약 다음과 같이 `localhost에서 연결을 거부했습니다` 라는 에러가 출력될 경우, 커맨드로 address 설정을 통해 접근하는 것이 가능합니다. **보안상의 문제가 되지 않는다면,** 아래와 같이 `0.0.0.0` 로 모든 주소의 bind를 열어주는 방향으로 ml-pipeline UI가 정상적으로 접속되는지 확인합니다. ```bash kubectl port-forward --address 0.0.0.0 svc/ml-pipeline-ui -n kubeflow 8888:80 ``` - 위의 옵션으로 실행했음에도 여전히 연결 거부 이슈가 발생할 경우 방화벽 설정에서 모든 TCP 포트에 대한 접속을 허용하거나, 8888번 포트에 대한 접속 허용 규칙을 추가해 접근 권한을 부여합니다. 웹 브라우저를 열어 `http://<당신의 가상 인스턴스 공인 ip 주소>:8888/#/pipelines/` 경로에 접속하면, ml-pipeline UI 화면이 출력되는 것을 확인할 수 있습니다. 하단에서 진행되는 다른 포트의 경로에 접속할 때도 위의 절차와 동일하게 커맨드를 실행하고, 방화벽에 포트 번호를 추가해주면 접속할 수 있습니다. ### Katib Katib 를 설치합니다. ```bash kustomize build apps/katib/upstream/installs/katib-with-kubeflow | kubectl apply -f - ``` 정상적으로 수행되면 다음과 같이 출력됩니다. 
```bash customresourcedefinition.apiextensions.k8s.io/experiments.kubeflow.org created customresourcedefinition.apiextensions.k8s.io/suggestions.kubeflow.org created customresourcedefinition.apiextensions.k8s.io/trials.kubeflow.org created serviceaccount/katib-controller created serviceaccount/katib-ui created clusterrole.rbac.authorization.k8s.io/katib-controller created clusterrole.rbac.authorization.k8s.io/katib-ui created clusterrole.rbac.authorization.k8s.io/kubeflow-katib-admin created clusterrole.rbac.authorization.k8s.io/kubeflow-katib-edit created clusterrole.rbac.authorization.k8s.io/kubeflow-katib-view created clusterrolebinding.rbac.authorization.k8s.io/katib-controller created clusterrolebinding.rbac.authorization.k8s.io/katib-ui created configmap/katib-config created configmap/trial-templates created secret/katib-mysql-secrets created service/katib-controller created service/katib-db-manager created service/katib-mysql created service/katib-ui created persistentvolumeclaim/katib-mysql created deployment.apps/katib-controller created deployment.apps/katib-db-manager created deployment.apps/katib-mysql created deployment.apps/katib-ui created certificate.cert-manager.io/katib-webhook-cert created issuer.cert-manager.io/katib-selfsigned-issuer created virtualservice.networking.istio.io/katib-ui created mutatingwebhookconfiguration.admissionregistration.k8s.io/katib.kubeflow.org created validatingwebhookconfiguration.admissionregistration.k8s.io/katib.kubeflow.org created ``` 정상적으로 설치되었는지 확인합니다. ```bash kubectl get po -n kubeflow | grep katib ``` 다음과 같이 총 4 개의 pod 가 Running 이 될 때까지 기다립니다. ```bash katib-controller-68c47fbf8b-b985z 1/1 Running 0 82s katib-db-manager-6c948b6b76-2d9gr 1/1 Running 0 82s katib-mysql-7894994f88-scs62 1/1 Running 0 82s katib-ui-64bb96d5bf-d89kp 1/1 Running 0 82s ``` 추가로 katib UI가 정상적으로 접속되는지 확인합니다. 
```bash kubectl port-forward svc/katib-ui -n kubeflow 8081:80 ``` 웹 브라우저를 열어 [http://localhost:8081/katib/](http://localhost:8081/katib/) 경로에 접속합니다. 다음과 같은 화면이 출력되는 것을 확인합니다. ![katib-ui](./img/katib-ui.png) ### Central Dashboard Dashboard 를 설치합니다. ```bash kustomize build apps/centraldashboard/upstream/overlays/istio | kubectl apply -f - ``` 정상적으로 수행되면 다음과 같이 출력됩니다. ```bash serviceaccount/centraldashboard created role.rbac.authorization.k8s.io/centraldashboard created clusterrole.rbac.authorization.k8s.io/centraldashboard created rolebinding.rbac.authorization.k8s.io/centraldashboard created clusterrolebinding.rbac.authorization.k8s.io/centraldashboard created configmap/centraldashboard-config created configmap/centraldashboard-parameters created service/centraldashboard created deployment.apps/centraldashboard created virtualservice.networking.istio.io/centraldashboard created ``` 정상적으로 설치되었는지 확인합니다. ```bash kubectl get po -n kubeflow | grep centraldashboard ``` kubeflow namespace 에 centraldashboard 관련 1 개의 pod 가 Running 이 될 때까지 기다립니다. ```bash centraldashboard-8fc7d8cc-xl7ts 1/1 Running 0 52s ``` 추가로 Central Dashboard UI가 정상적으로 접속되는지 확인합니다. ```bash kubectl port-forward svc/centraldashboard -n kubeflow 8082:80 ``` 웹 브라우저를 열어 [http://localhost:8082/](http://localhost:8082/) 경로에 접속합니다. 다음과 같은 화면이 출력되는 것을 확인합니다. ![central-dashboard](./img/central-dashboard.png) ### Admission Webhook ```bash kustomize build apps/admission-webhook/upstream/overlays/cert-manager | kubectl apply -f - ``` 정상적으로 수행되면 다음과 같이 출력됩니다. 
```bash customresourcedefinition.apiextensions.k8s.io/poddefaults.kubeflow.org created serviceaccount/admission-webhook-service-account created clusterrole.rbac.authorization.k8s.io/admission-webhook-cluster-role created clusterrole.rbac.authorization.k8s.io/admission-webhook-kubeflow-poddefaults-admin created clusterrole.rbac.authorization.k8s.io/admission-webhook-kubeflow-poddefaults-edit created clusterrole.rbac.authorization.k8s.io/admission-webhook-kubeflow-poddefaults-view created clusterrolebinding.rbac.authorization.k8s.io/admission-webhook-cluster-role-binding created service/admission-webhook-service created deployment.apps/admission-webhook-deployment created certificate.cert-manager.io/admission-webhook-cert created issuer.cert-manager.io/admission-webhook-selfsigned-issuer created mutatingwebhookconfiguration.admissionregistration.k8s.io/admission-webhook-mutating-webhook-configuration created ``` 정상적으로 설치되었는지 확인합니다. ```bash kubectl get po -n kubeflow | grep admission-webhook ``` 1 개의 pod 가 Running 이 될 때까지 기다립니다. ```bash admission-webhook-deployment-667bd68d94-2hhrx 1/1 Running 0 11s ``` ### Notebooks & Jupyter Web App 1. Notebook controller 를 설치합니다. ```bash kustomize build apps/jupyter/notebook-controller/upstream/overlays/kubeflow | kubectl apply -f - ``` 정상적으로 수행되면 다음과 같이 출력됩니다. 
```bash customresourcedefinition.apiextensions.k8s.io/notebooks.kubeflow.org created serviceaccount/notebook-controller-service-account created role.rbac.authorization.k8s.io/notebook-controller-leader-election-role created clusterrole.rbac.authorization.k8s.io/notebook-controller-kubeflow-notebooks-admin created clusterrole.rbac.authorization.k8s.io/notebook-controller-kubeflow-notebooks-edit created clusterrole.rbac.authorization.k8s.io/notebook-controller-kubeflow-notebooks-view created clusterrole.rbac.authorization.k8s.io/notebook-controller-role created rolebinding.rbac.authorization.k8s.io/notebook-controller-leader-election-rolebinding created clusterrolebinding.rbac.authorization.k8s.io/notebook-controller-role-binding created configmap/notebook-controller-config-m44cmb547t created service/notebook-controller-service created deployment.apps/notebook-controller-deployment created ``` 정상적으로 설치되었는지 확인합니다. ```bash kubectl get po -n kubeflow | grep notebook-controller ``` 1 개의 pod 가 Running 이 될 때까지 기다립니다. ```bash notebook-controller-deployment-75b4f7b578-w4d4l 1/1 Running 0 105s ``` 2. Jupyter Web App 을 설치합니다. ```bash kustomize build apps/jupyter/jupyter-web-app/upstream/overlays/istio | kubectl apply -f - ``` 정상적으로 수행되면 다음과 같이 출력됩니다. 
```bash serviceaccount/jupyter-web-app-service-account created role.rbac.authorization.k8s.io/jupyter-web-app-jupyter-notebook-role created clusterrole.rbac.authorization.k8s.io/jupyter-web-app-cluster-role created clusterrole.rbac.authorization.k8s.io/jupyter-web-app-kubeflow-notebook-ui-admin created clusterrole.rbac.authorization.k8s.io/jupyter-web-app-kubeflow-notebook-ui-edit created clusterrole.rbac.authorization.k8s.io/jupyter-web-app-kubeflow-notebook-ui-view created rolebinding.rbac.authorization.k8s.io/jupyter-web-app-jupyter-notebook-role-binding created clusterrolebinding.rbac.authorization.k8s.io/jupyter-web-app-cluster-role-binding created configmap/jupyter-web-app-config-76844k4cd7 created configmap/jupyter-web-app-logos created configmap/jupyter-web-app-parameters-chmg88cm48 created service/jupyter-web-app-service created deployment.apps/jupyter-web-app-deployment created virtualservice.networking.istio.io/jupyter-web-app-jupyter-web-app created ``` 정상적으로 설치되었는지 확인합니다. ```bash kubectl get po -n kubeflow | grep jupyter-web-app ``` 1개의 pod 가 Running 이 될 때까지 기다립니다. ```bash jupyter-web-app-deployment-6f744fbc54-p27ts 1/1 Running 0 2m ``` ### Profiles + KFAM Profile Controller를 설치합니다. ```bash kustomize build apps/profiles/upstream/overlays/kubeflow | kubectl apply -f - ``` 정상적으로 수행되면 다음과 같이 출력됩니다. ```bash customresourcedefinition.apiextensions.k8s.io/profiles.kubeflow.org created serviceaccount/profiles-controller-service-account created role.rbac.authorization.k8s.io/profiles-leader-election-role created rolebinding.rbac.authorization.k8s.io/profiles-leader-election-rolebinding created clusterrolebinding.rbac.authorization.k8s.io/profiles-cluster-role-binding created configmap/namespace-labels-data-48h7kd55mc created configmap/profiles-config-46c7tgh6fd created service/profiles-kfam created deployment.apps/profiles-deployment created virtualservice.networking.istio.io/profiles-kfam created ``` 정상적으로 설치되었는지 확인합니다. 
```bash kubectl get po -n kubeflow | grep profiles-deployment ``` 1 개의 pod 가 Running 이 될 때까지 기다립니다. ```bash profiles-deployment-89f7d88b-qsnrd 2/2 Running 0 42s ``` ### Volumes Web App Volumes Web App 을 설치합니다. ```bash kustomize build apps/volumes-web-app/upstream/overlays/istio | kubectl apply -f - ``` 정상적으로 수행되면 다음과 같이 출력됩니다. ```bash serviceaccount/volumes-web-app-service-account created clusterrole.rbac.authorization.k8s.io/volumes-web-app-cluster-role created clusterrole.rbac.authorization.k8s.io/volumes-web-app-kubeflow-volume-ui-admin created clusterrole.rbac.authorization.k8s.io/volumes-web-app-kubeflow-volume-ui-edit created clusterrole.rbac.authorization.k8s.io/volumes-web-app-kubeflow-volume-ui-view created clusterrolebinding.rbac.authorization.k8s.io/volumes-web-app-cluster-role-binding created configmap/volumes-web-app-parameters-4gg8cm2gmk created service/volumes-web-app-service created deployment.apps/volumes-web-app-deployment created virtualservice.networking.istio.io/volumes-web-app-volumes-web-app created ``` 정상적으로 설치되었는지 확인합니다. ```bash kubectl get po -n kubeflow | grep volumes-web-app ``` 1개의 pod가 Running 이 될 때까지 기다립니다. ```bash volumes-web-app-deployment-8589d664cc-62svl 1/1 Running 0 27s ``` ### Tensorboard & Tensorboard Web App 1. Tensorboard Web App 를 설치합니다. ```bash kustomize build apps/tensorboard/tensorboards-web-app/upstream/overlays/istio | kubectl apply -f - ``` 정상적으로 수행되면 다음과 같이 출력됩니다. 
```bash serviceaccount/tensorboards-web-app-service-account created clusterrole.rbac.authorization.k8s.io/tensorboards-web-app-cluster-role created clusterrole.rbac.authorization.k8s.io/tensorboards-web-app-kubeflow-tensorboard-ui-admin created clusterrole.rbac.authorization.k8s.io/tensorboards-web-app-kubeflow-tensorboard-ui-edit created clusterrole.rbac.authorization.k8s.io/tensorboards-web-app-kubeflow-tensorboard-ui-view created clusterrolebinding.rbac.authorization.k8s.io/tensorboards-web-app-cluster-role-binding created configmap/tensorboards-web-app-parameters-g28fbd6cch created service/tensorboards-web-app-service created deployment.apps/tensorboards-web-app-deployment created virtualservice.networking.istio.io/tensorboards-web-app-tensorboards-web-app created ``` 정상적으로 설치되었는지 확인합니다. ```bash kubectl get po -n kubeflow | grep tensorboards-web-app ``` 1 개의 pod 가 Running 이 될 때까지 기다립니다. ```bash tensorboards-web-app-deployment-6ff79b7f44-qbzmw 1/1 Running 0 22s ``` 2. Tensorboard Controller 를 설치합니다. ```bash kustomize build apps/tensorboard/tensorboard-controller/upstream/overlays/kubeflow | kubectl apply -f - ``` 정상적으로 수행되면 다음과 같이 출력됩니다. 
```bash customresourcedefinition.apiextensions.k8s.io/tensorboards.tensorboard.kubeflow.org created serviceaccount/tensorboard-controller created role.rbac.authorization.k8s.io/tensorboard-controller-leader-election-role created clusterrole.rbac.authorization.k8s.io/tensorboard-controller-manager-role created clusterrole.rbac.authorization.k8s.io/tensorboard-controller-proxy-role created rolebinding.rbac.authorization.k8s.io/tensorboard-controller-leader-election-rolebinding created clusterrolebinding.rbac.authorization.k8s.io/tensorboard-controller-manager-rolebinding created clusterrolebinding.rbac.authorization.k8s.io/tensorboard-controller-proxy-rolebinding created configmap/tensorboard-controller-config-bf88mm96c8 created service/tensorboard-controller-controller-manager-metrics-service created deployment.apps/tensorboard-controller-controller-manager created ``` 정상적으로 설치되었는지 확인합니다. ```bash kubectl get po -n kubeflow | grep tensorboard-controller ``` 1 개의 pod 가 Running 이 될 때까지 기다립니다. ```bash tensorboard-controller-controller-manager-954b7c544-vjpzj 3/3 Running 1 73s ``` ### Training Operator Training Operator 를 설치합니다. ```bash kustomize build apps/training-operator/upstream/overlays/kubeflow | kubectl apply -f - ``` 정상적으로 수행되면 다음과 같이 출력됩니다. 
```bash customresourcedefinition.apiextensions.k8s.io/mxjobs.kubeflow.org created customresourcedefinition.apiextensions.k8s.io/pytorchjobs.kubeflow.org created customresourcedefinition.apiextensions.k8s.io/tfjobs.kubeflow.org created customresourcedefinition.apiextensions.k8s.io/xgboostjobs.kubeflow.org created serviceaccount/training-operator created clusterrole.rbac.authorization.k8s.io/kubeflow-training-admin created clusterrole.rbac.authorization.k8s.io/kubeflow-training-edit created clusterrole.rbac.authorization.k8s.io/kubeflow-training-view created clusterrole.rbac.authorization.k8s.io/training-operator created clusterrolebinding.rbac.authorization.k8s.io/training-operator created service/training-operator created deployment.apps/training-operator created ``` 정상적으로 설치되었는지 확인합니다. ```bash kubectl get po -n kubeflow | grep training-operator ``` 1 개의 pod 가 Running 이 될 때까지 기다립니다. ```bash training-operator-7d98f9dd88-6887f 1/1 Running 0 28s ``` ### User Namespace Kubeflow 사용을 위해, 사용할 User의 Kubeflow Profile 을 생성합니다. ```bash kustomize build common/user-namespace/base | kubectl apply -f - ``` 정상적으로 수행되면 다음과 같이 출력됩니다. ```bash configmap/default-install-config-9h2h2b6hbk created profile.kubeflow.org/kubeflow-user-example-com created ``` kubeflow-user-example-com profile 이 생성된 것을 확인합니다. ```bash kubectl get profile ``` ```bash kubeflow-user-example-com 37s ``` ## 정상 설치 확인 Kubeflow central dashboard에 web browser로 접속하기 위해 포트 포워딩합니다. ```bash kubectl port-forward svc/istio-ingressgateway -n istio-system 8080:80 ``` Web Browser 를 열어 [http://localhost:8080](http://localhost:8080) 으로 접속하여, 다음과 같은 화면이 출력되는 것을 확인합니다. ![login-ui](./img/login-after-install.png) 다음 접속 정보를 입력하여 접속합니다. - Email Address: `user@example.com` - Password: `12341234` ![central-dashboard](./img/after-login.png) ================================================ FILE: versioned_docs/version-1.0/setup-components/install-components-mlflow.md ================================================ --- title : "2. 
MLflow Tracking Server" description: "구성요소 설치 - MLflow" sidebar_position: 2 date: 2021-12-13 lastmod: 2021-12-13 contributors: ["Jaeyeon Kim"] --- ## Install MLflow Tracking Server MLflow는 대표적인 오픈소스 ML 실험 관리 도구입니다. MLflow는 [실험 관리 용도](https://mlflow.org/docs/latest/tracking.html#tracking) 외에도 [ML Model 패키징](https://mlflow.org/docs/latest/projects.html#projects), [ML 모델 배포 관리](https://mlflow.org/docs/latest/models.html#models), [ML 모델 저장](https://mlflow.org/docs/latest/model-registry.html#registry)과 같은 기능도 제공하고 있습니다. *모두의 MLOps*에서는 MLflow를 실험 관리 용도로 사용합니다. 그래서 MLflow에서 관리하는 데이터를 저장하고 UI를 제공하는 MLflow Tracking Server를 쿠버네티스 클러스터에 배포하여 사용할 예정입니다. ## Before Install MLflow Tracking Server ### PostgreSQL DB 설치 MLflow Tracking Server가 Backend Store로 사용할 용도의 PostgreSQL DB를 쿠버네티스 클러스터에 배포합니다. 먼저 `mlflow-system`이라는 namespace 를 생성합니다. ```bash kubectl create ns mlflow-system ``` 다음과 같은 메시지가 출력되면 정상적으로 생성된 것을 의미합니다. ```bash namespace/mlflow-system created ``` postgresql DB를 `mlflow-system` namespace 에 생성합니다. ```bash kubectl -n mlflow-system apply -f https://raw.githubusercontent.com/mlops-for-all/helm-charts/b94b5fe4133f769c04b25068b98ccfa7a505aa60/mlflow/manifests/postgres.yaml ``` 정상적으로 수행되면 다음과 같이 출력됩니다. ```bash service/postgresql-mlflow-service created deployment.apps/postgresql-mlflow created persistentvolumeclaim/postgresql-mlflow-pvc created ``` mlflow-system namespace 에 1개의 postgresql 관련 pod 가 Running 이 될 때까지 기다립니다. ```bash kubectl get pod -n mlflow-system | grep postgresql ``` 다음과 비슷하게 출력되면 정상적으로 실행된 것입니다. ```bash postgresql-mlflow-7b9bc8c79f-srkh7 1/1 Running 0 38s ``` ### Minio 설정 MLflow Tracking Server가 Artifacts Store로 사용할 용도의 Minio는 이전 Kubeflow 설치 단계에서 설치한 Minio를 활용합니다. 단, kubeflow 용도와 mlflow 용도를 분리하기 위해, mlflow 전용 버킷(bucket)을 생성하겠습니다. minio 에 접속하여 버킷을 생성하기 위해, 우선 minio-service 를 포트포워딩합니다. ```bash kubectl port-forward svc/minio-service -n kubeflow 9000:9000 ``` 웹 브라우저를 열어 [localhost:9000](http://localhost:9000)으로 접속하면 다음과 같은 화면이 출력됩니다. 
![minio-install](./img/minio-install.png) 다음과 같은 접속 정보를 입력하여 로그인합니다. - Username: `minio` - Password: `minio123` 우측 하단의 **`+`** 버튼을 클릭하여, `Create Bucket`를 클릭합니다. ![create-bucket](./img/create-bucket.png) `Bucket Name`에 `mlflow`를 입력하여 버킷을 생성합니다. 정상적으로 생성되면 다음과 같이 왼쪽에 `mlflow`라는 이름의 버킷이 생성됩니다. ![mlflow-bucket](./img/mlflow-bucket.png) --- ## Let's Install MLflow Tracking Server ### Helm Repository 추가 ```bash helm repo add mlops-for-all https://mlops-for-all.github.io/helm-charts ``` 다음과 같은 메시지가 출력되면 정상적으로 추가된 것을 의미합니다. ```bash "mlops-for-all" has been added to your repositories ``` ### Helm Repository 업데이트 ```bash helm repo update ``` 다음과 같은 메시지가 출력되면 정상적으로 업데이트된 것을 의미합니다. ```bash Hang tight while we grab the latest from your chart repositories... ...Successfully got an update from the "mlops-for-all" chart repository Update Complete. ⎈Happy Helming!⎈ ``` ### Helm Install mlflow-server Helm Chart 0.2.0 버전을 설치합니다. ```bash helm install mlflow-server mlops-for-all/mlflow-server \ --namespace mlflow-system \ --version 0.2.0 ``` - **주의**: 위의 helm chart는 MLflow 의 backend store 와 artifacts store 의 접속 정보를 kubeflow 설치 과정에서 생성한 minio와 위의 [PostgreSQL DB 설치](#postgresql-db-설치)에서 생성한 postgresql 정보를 default로 하여 설치합니다. - 별개로 생성한 DB 혹은 Object storage를 활용하고 싶은 경우, [Helm Chart Repo](https://github.com/mlops-for-all/helm-charts/tree/main/mlflow/chart)를 참고하여 helm install 시 value를 따로 설정하여 설치하시기 바랍니다. 다음과 같은 메시지가 출력되어야 합니다. ```bash NAME: mlflow-server LAST DEPLOYED: Sat Dec 18 22:02:13 2021 NAMESPACE: mlflow-system STATUS: deployed REVISION: 1 TEST SUITE: None ``` 정상적으로 설치되었는지 확인합니다. ```bash kubectl get pod -n mlflow-system | grep mlflow-server ``` mlflow-system namespace 에 1 개의 mlflow-server 관련 pod 가 Running 이 될 때까지 기다립니다. 다음과 비슷하게 출력되면 정상적으로 실행된 것입니다. ```bash mlflow-server-ffd66d858-6hm62 1/1 Running 0 74s ``` ### 정상 설치 확인 그럼 이제 MLflow Server에 정상적으로 접속되는지 확인해보겠습니다. 우선 클라이언트 노드에서 접속하기 위해, 포트포워딩을 수행합니다. 
```bash kubectl port-forward svc/mlflow-server-service -n mlflow-system 5000:5000 ``` 웹 브라우저를 열어 [localhost:5000](http://localhost:5000)으로 접속하면 다음과 같은 화면이 출력됩니다. ![mlflow-install](./img/mlflow-install.png) ================================================ FILE: versioned_docs/version-1.0/setup-components/install-components-pg.md ================================================ --- title : "4. Prometheus & Grafana" description: "구성요소 설치 - Prometheus & Grafana" sidebar_position: 4 date: 2021-12-13 lastmod: 2021-12-13 contributors: ["Jaeyeon Kim"] --- ## Prometheus & Grafana 프로메테우스(Prometheus) 와 그라파나(Grafana) 는 모니터링을 위한 도구입니다. 안정적인 서비스 운영을 위해서는 서비스와 서비스가 운영되고 있는 인프라의 상태를 지속해서 관찰하고, 관찰한 메트릭을 바탕으로 문제가 생길 때 빠르게 대응해야 합니다. 이러한 모니터링을 효율적으로 수행하기 위한 많은 도구 중 *모두의 MLOps*에서는 오픈소스인 프로메테우스와 그라파나를 사용할 예정입니다. 더 자세한 내용은 [Prometheus 공식 문서](https://prometheus.io/docs/introduction/overview/), [Grafana 공식 문서](https://grafana.com/docs/)를 확인해주시기를 바랍니다. 프로메테우스는 다양한 대상으로부터 Metric을 수집하는 도구이며, 그라파나는 모인 데이터를 시각화하는 것을 도와주는 도구입니다. 서로 간의 종속성은 없지만 상호 보완적으로 사용할 수 있어 함께 사용되는 경우가 많습니다. 이번 페이지에서는 쿠버네티스 클러스터에 프로메테우스와 그라파나를 설치한 뒤, Seldon-Core 로 생성한 SeldonDeployment 로 API 요청을 보내, 정상적으로 Metrics 이 수집되는지 확인해보겠습니다. 본 글에서는 seldonio/seldon-core-analytics Helm Chart 1.12.0 버전을 활용해 쿠버네티스 클러스터에 프로메테우스와 그라파나를 설치하고, Seldon-Core 에서 생성한 SeldonDeployment의 Metrics 을 효율적으로 확인하기 위한 대시보드도 함께 설치합니다. ### Helm Repository 추가 ```bash helm repo add seldonio https://storage.googleapis.com/seldon-charts ``` 다음과 같은 메시지가 출력되면 정상적으로 추가된 것을 의미합니다. ```bash "seldonio" has been added to your repositories ``` ### Helm Repository 업데이트 ```bash helm repo update ``` 다음과 같은 메시지가 출력되면 정상적으로 업데이트된 것을 의미합니다. ```bash Hang tight while we grab the latest from your chart repositories... ...Successfully got an update from the "seldonio" chart repository ...Successfully got an update from the "datawire" chart repository Update Complete. ⎈Happy Helming!⎈ ``` ### Helm Install seldon-core-analytics Helm Chart 1.12.0 버전을 설치합니다. 
```bash helm install seldon-core-analytics seldonio/seldon-core-analytics \ --namespace seldon-system \ --version 1.12.0 ``` 다음과 같은 메시지가 출력되어야 합니다. ```bash 생략... NAME: seldon-core-analytics LAST DEPLOYED: Tue Dec 14 18:29:38 2021 NAMESPACE: seldon-system STATUS: deployed REVISION: 1 ``` 정상적으로 설치되었는지 확인합니다. ```bash kubectl get pod -n seldon-system | grep seldon-core-analytics ``` seldon-system namespace 에 6개의 seldon-core-analytics 관련 pod 가 Running 이 될 때까지 기다립니다. ```bash seldon-core-analytics-grafana-657c956c88-ng8wn 2/2 Running 0 114s seldon-core-analytics-kube-state-metrics-94bb6cb9-svs82 1/1 Running 0 114s seldon-core-analytics-prometheus-alertmanager-64cf7b8f5-nxbl8 2/2 Running 0 114s seldon-core-analytics-prometheus-node-exporter-5rrj5 1/1 Running 0 114s seldon-core-analytics-prometheus-pushgateway-8476474cff-sr4n6 1/1 Running 0 114s seldon-core-analytics-prometheus-seldon-685c664894-7cr45 2/2 Running 0 114s ``` ### 정상 설치 확인 그럼 이제 그라파나에 정상적으로 접속되는지 확인해보겠습니다. 우선 클라이언트 노드에서 접속하기 위해, 포트포워딩을 수행합니다. ```bash kubectl port-forward svc/seldon-core-analytics-grafana -n seldon-system 8090:80 ``` 웹 브라우저를 열어 [localhost:8090](http://localhost:8090)으로 접속하면 다음과 같은 화면이 출력됩니다. ![grafana-install](./img/grafana-install.png) 다음과 같은 접속정보를 입력하여 접속합니다. - Email or username : `admin` - Password : `password` 로그인하면 다음과 같은 화면이 출력됩니다. ![grafana-login](./img/grafana-login.png) 좌측의 대시보드 아이콘을 클릭하여, `Manage` 버튼을 클릭합니다. ![dashboard-click](./img/dashboard-click.png) 기본적인 그라파나 대시보드가 포함되어있는 것을 확인할 수 있습니다. 이 중 `Prediction Analytics` 대시보드를 클릭합니다. ![dashboard](./img/dashboard.png) Seldon Core API Dashboard 가 보이고, 다음과 같이 출력되는 것을 확인할 수 있습니다. 
![seldon-dashboard](./img/seldon-dashboard.png) ## References - [Seldon-Core-Analytics Helm Chart](https://github.com/SeldonIO/seldon-core/tree/master/helm-charts/seldon-core-analytics) ================================================ FILE: versioned_docs/version-1.0/setup-components/install-components-seldon.md ================================================ --- title : "3. Seldon-Core" description: "구성요소 설치 - Seldon-Core" sidebar_position: 3 date: 2021-12-13 lastmod: 2021-12-13 contributors: ["Jaeyeon Kim"] --- ## Seldon-Core Seldon-Core는 쿠버네티스 환경에 수많은 머신러닝 모델을 배포하고 관리할 수 있는 오픈소스 프레임워크 중 하나입니다. 더 자세한 내용은 Seldon-Core 의 공식 [제품 설명 페이지](https://www.seldon.io/tech/products/core/) 와 [깃헙](https://github.com/SeldonIO/seldon-core) 그리고 API Deployment 파트를 참고해주시기를 바랍니다. ## Seldon-Core 설치 Seldon-Core를 사용하기 위해서는 쿠버네티스의 인그레스(Ingress)를 담당하는 Ambassador 와 Istio 와 같은 [모듈이 필요합니다](https://docs.seldon.io/projects/seldon-core/en/latest/workflow/install.html). Seldon-Core 에서는 Ambassador 와 Istio 만을 공식적으로 지원하며, *모두의 MLOps*에서는 Ambassador를 사용해 Seldon-core를 사용하므로 Ambassador를 설치하겠습니다. ### Ambassador - Helm Repository 추가 ```bash helm repo add datawire https://www.getambassador.io ``` 다음과 같은 메시지가 출력되면 정상적으로 추가된 것을 의미합니다. ```bash "datawire" has been added to your repositories ``` ### Ambassador - Helm Repository 업데이트 ```bash helm repo update ``` 다음과 같은 메시지가 출력되면 정상적으로 업데이트된 것을 의미합니다. ```bash Hang tight while we grab the latest from your chart repositories... ...Successfully got an update from the "datawire" chart repository Update Complete. ⎈Happy Helming!⎈ ``` ### Ambassador - Helm Install ambassador Chart 6.9.3 버전을 설치합니다. ```bash helm install ambassador datawire/ambassador \ --namespace seldon-system \ --create-namespace \ --set image.repository=quay.io/datawire/ambassador \ --set enableAES=false \ --set crds.keep=false \ --version 6.9.3 ``` 다음과 같은 메시지가 출력되어야 합니다. ```bash 생략... 
W1206 17:01:36.026326 26635 warnings.go:70] rbac.authorization.k8s.io/v1beta1 Role is deprecated in v1.17+, unavailable in v1.22+; use rbac.authorization.k8s.io/v1 Role W1206 17:01:36.029764 26635 warnings.go:70] rbac.authorization.k8s.io/v1beta1 RoleBinding is deprecated in v1.17+, unavailable in v1.22+; use rbac.authorization.k8s.io/v1 RoleBinding NAME: ambassador LAST DEPLOYED: Mon Dec 6 17:01:34 2021 NAMESPACE: seldon-system STATUS: deployed REVISION: 1 NOTES: ------------------------------------------------------------------------------- Congratulations! You've successfully installed Ambassador! ------------------------------------------------------------------------------- To get the IP address of Ambassador, run the following commands: NOTE: It may take a few minutes for the LoadBalancer IP to be available. You can watch the status of by running 'kubectl get svc -w --namespace seldon-system ambassador' On GKE/Azure: export SERVICE_IP=$(kubectl get svc --namespace seldon-system ambassador -o jsonpath='{.status.loadBalancer.ingress[0].ip}') On AWS: export SERVICE_IP=$(kubectl get svc --namespace seldon-system ambassador -o jsonpath='{.status.loadBalancer.ingress[0].hostname}') echo http://$SERVICE_IP: For help, visit our Slack at http://a8r.io/Slack or view the documentation online at https://www.getambassador.io. ``` seldon-system 에 4 개의 pod 가 Running 이 될 때까지 기다립니다. ```bash kubectl get pod -n seldon-system ``` ```bash ambassador-7f596c8b57-4s9xh 1/1 Running 0 7m15s ambassador-7f596c8b57-dt6lr 1/1 Running 0 7m15s ambassador-7f596c8b57-h5l6f 1/1 Running 0 7m15s ambassador-agent-77bccdfcd5-d5jxj 1/1 Running 0 7m15s ``` ### Seldon-Core - Helm Install seldon-core-operator Chart 1.11.2 버전을 설치합니다. ```bash helm install seldon-core seldon-core-operator \ --repo https://storage.googleapis.com/seldon-charts \ --namespace seldon-system \ --set usageMetrics.enabled=true \ --set ambassador.enabled=true \ --version 1.11.2 ``` 다음과 같은 메시지가 출력되어야 합니다. ```bash 생략... 
W1206 17:05:38.336391 28181 warnings.go:70] admissionregistration.k8s.io/v1beta1 ValidatingWebhookConfiguration is deprecated in v1.16+, unavailable in v1.22+; use admissionregistration.k8s.io/v1 ValidatingWebhookConfiguration NAME: seldon-core LAST DEPLOYED: Mon Dec 6 17:05:34 2021 NAMESPACE: seldon-system STATUS: deployed REVISION: 1 TEST SUITE: None ``` seldon-system namespace 에 1 개의 seldon-controller-manager pod 가 Running 이 될 때까지 기다립니다. ```bash kubectl get pod -n seldon-system | grep seldon-controller ``` ```bash seldon-controller-manager-8457b8b5c7-r2frm 1/1 Running 0 2m22s ``` ## References - [Example Model Servers with Seldon](https://docs.seldon.io/projects/seldon-core/en/latest/examples/server_examples.html#examples-server-examples--page-root) ================================================ FILE: versioned_docs/version-1.0/setup-kubernetes/_category_.json ================================================ { "label": "Setup Kubernetes", "position": 2, "link": { "type": "generated-index" } } ================================================ FILE: versioned_docs/version-1.0/setup-kubernetes/install-kubernetes/_category_.json ================================================ { "label": "4. Install Kubernetes", "position": 4, "link": { "type": "generated-index" } } ================================================ FILE: versioned_docs/version-1.0/setup-kubernetes/install-kubernetes/kubernetes-with-k3s.md ================================================ --- title: "4.1. K3s" description: "" sidebar_position: 1 date: 2021-12-13 lastmod: 2021-12-20 draft: false weight: 221 contributors: ["Jongseob Jeon"] menu: docs: parent: "../setup-kubernetes" images: [] --- ## 1. Prerequisite 쿠버네티스 클러스터를 구축하기에 앞서, 필요한 구성 요소들을 **클러스터에** 설치합니다. [Install Prerequisite](../../setup-kubernetes/install-prerequisite.md)을 참고하여 Kubernetes를 설치하기 전에 필요한 요소들을 **클러스터에** 설치해 주시기 바랍니다. k3s 에서는 기본값으로 containerd를 백엔드로 이용해 설치합니다. 
하지만 저희는 GPU를 사용하기 위해서 docker를 백엔드로 사용해야 하므로 `--docker` 옵션을 통해 백엔드를 docker로 설치하겠습니다. ```bash curl -sfL https://get.k3s.io | INSTALL_K3S_VERSION=v1.21.7+k3s1 sh -s - server --disable traefik --disable servicelb --disable local-storage --docker ``` k3s를 설치 후 k3s config를 확인합니다 ```bash sudo cat /etc/rancher/k3s/k3s.yaml ``` 정상적으로 설치되면 다음과 같은 항목이 출력됩니다. (보안 문제와 관련된 키들은 <...>로 가렸습니다.) ```bash apiVersion: v1 clusters: - cluster: certificate-authority-data: <...> server: https://127.0.0.1:6443 name: default contexts: - context: cluster: default user: default name: default current-context: default kind: Config preferences: {} users: - name: default user: client-certificate-data: <...> client-key-data: <...> ``` ## 2. 쿠버네티스 클러스터 셋업 k3s config를 클러스터의 kubeconfig로 사용하기 위해서 복사합니다. ```bash mkdir .kube sudo cp /etc/rancher/k3s/k3s.yaml .kube/config ``` 복사된 config 파일에 user가 접근할 수 있는 권한을 줍니다. ```bash sudo chown $USER:$USER .kube/config ``` ## 3. 쿠버네티스 클라이언트 셋업 이제 클러스터에서 설정한 kubeconfig를 로컬로 이동합니다. 로컬에서는 경로를 `~/.kube/config`로 설정합니다. 처음 복사한 config 파일에는 server ip가 `https://127.0.0.1:6443` 으로 되어 있습니다. 이 값을 클러스터의 ip에 맞게 수정합니다. (이번 페이지에서 사용하는 클러스터의 ip에 맞춰서 `https://192.168.0.19:6443` 으로 수정했습니다.) ```bash apiVersion: v1 clusters: - cluster: certificate-authority-data: <...> server: https://192.168.0.19:6443 name: default contexts: - context: cluster: default user: default name: default current-context: default kind: Config preferences: {} users: - name: default user: client-certificate-data: <...> client-key-data: <...> ``` ## 4. 쿠버네티스 기본 모듈 설치 [Setup Kubernetes Modules](../../setup-kubernetes/install-kubernetes-module.md)을 참고하여 다음 컴포넌트들을 설치해 주시기 바랍니다. - helm - kustomize - CSI plugin - [Optional] nvidia-docker, nvidia-device-plugin ## 5. 정상 설치 확인 최종적으로 node가 Ready 인지, OS, Docker, Kubernetes 버전을 확인합니다. ```bash kubectl get nodes -o wide ``` 다음과 같은 메시지가 보이면 정상적으로 설치된 것을 의미합니다. 
```bash NAME STATUS ROLES AGE VERSION INTERNAL-IP EXTERNAL-IP OS-IMAGE KERNEL-VERSION CONTAINER-RUNTIME ubuntu Ready control-plane,master 11m v1.21.7+k3s1 192.168.0.19 Ubuntu 20.04.3 LTS 5.4.0-91-generic docker://20.10.11 ``` ## 6. References - [https://rancher.com/docs/k3s/latest/en/installation/install-options/](https://rancher.com/docs/k3s/latest/en/installation/install-options/) ================================================ FILE: versioned_docs/version-1.0/setup-kubernetes/install-kubernetes/kubernetes-with-kubeadm.md ================================================ --- title: "4.3. Kubeadm" description: "" sidebar_position: 3 date: 2021-12-13 lastmod: 2021-12-20 contributors: ["Youngcheol Jang"] --- ## 1. Prerequisite 쿠버네티스 클러스터를 구축하기에 앞서, 필요한 구성 요소들을 **클러스터에** 설치합니다. [Install Prerequisite](../../setup-kubernetes/install-prerequisite.md)을 참고하여 Kubernetes를 설치하기 전에 필요한 요소들을 **클러스터에** 설치해 주시기 바랍니다. 쿠버네티스를 위한 네트워크의 설정을 변경합니다. ```bash sudo modprobe br_netfilter cat < Ubuntu 20.04.3 LTS 5.4.0-91-generic docker://20.10.11 ``` ================================================ FILE: versioned_docs/version-1.0/setup-kubernetes/install-kubernetes-module.md ================================================ --- title: "5. Install Kubernetes Modules" description: "Install Helm, Kustomize" sidebar_position: 5 date: 2021-12-13 lastmod: 2021-12-20 contributors: ["Jaeyeon Kim"] --- ## Setup Kubernetes Modules 이번 페이지에서는 클러스터에서 사용할 모듈을 클라이언트 노드에서 설치하는 과정에 관해서 설명합니다. 앞으로 소개되는 과정은 모두 **클라이언트 노드**에서 진행됩니다. ## Helm Helm은 쿠버네티스 패키지와 관련된 자원을 한 번에 배포하고 관리할 수 있게 도와주는 패키지 매니징 도구 중 하나입니다. 1. 현재 폴더에 Helm v3.7.1 버전을 내려받습니다. - For Linux amd64 ```bash wget https://get.helm.sh/helm-v3.7.1-linux-amd64.tar.gz ``` - 다른 OS는 [공식 홈페이지](https://github.com/helm/helm/releases/tag/v3.7.1)를 참고하시어, 클라이언트 노드의 OS와 CPU에 맞는 바이너리의 다운 경로를 확인하시기 바랍니다. 2. helm을 사용할 수 있도록 압축을 풀고, 파일의 위치를 변경합니다. ```bash tar -zxvf helm-v3.7.1-linux-amd64.tar.gz sudo mv linux-amd64/helm /usr/local/bin/helm ``` 3. 정상적으로 설치되었는지 확인합니다. 
```bash helm help ``` 다음과 같은 메시지가 보이면 정상적으로 설치된 것을 의미합니다. ```bash The Kubernetes package manager Common actions for Helm: - helm search: search for charts - helm pull: download a chart to your local directory to view - helm install: upload the chart to Kubernetes - helm list: list releases of charts Environment variables: | Name | Description | |--------------------------|---------------------------------------------------------------------| | $HELM_CACHE_HOME | set an alternative location for storing cached files. | | $HELM_CONFIG_HOME | set an alternative location for storing Helm configuration. | | $HELM_DATA_HOME | set an alternative location for storing Helm data. | ... ``` ## Kustomize kustomize 또한 여러 쿠버네티스 리소스를 한 번에 배포하고 관리할 수 있게 도와주는 패키지 매니징 도구 중 하나입니다. 1. 현재 폴더에 kustomize v3.10.0 버전의 바이너리를 다운받습니다. - For Linux amd64 ```bash wget https://github.com/kubernetes-sigs/kustomize/releases/download/kustomize%2Fv3.10.0/kustomize_v3.10.0_linux_amd64.tar.gz ``` - 다른 OS는 [kustomize/v3.10.0](https://github.com/kubernetes-sigs/kustomize/releases/tag/kustomize%2Fv3.10.0)에서 확인 후 다운로드 받습니다. 2. kustomize 를 사용할 수 있도록 압축을 풀고, 파일의 위치를 변경합니다. ```bash tar -zxvf kustomize_v3.10.0_linux_amd64.tar.gz sudo mv kustomize /usr/local/bin/kustomize ``` 3. 정상적으로 설치되었는지 확인합니다. ```bash kustomize help ``` 다음과 같은 메시지가 보이면 정상적으로 설치된 것을 의미합니다. ```bash Manages declarative configuration of Kubernetes. See https://sigs.k8s.io/kustomize Usage: kustomize [command] Available Commands: build Print configuration per contents of kustomization.yaml cfg Commands for reading and writing configuration. completion Generate shell completion script create Create a new kustomization in the current directory edit Edits a kustomization file fn Commands for running functions against configuration. ... ``` ## CSI Plugin : Local Path Provisioner 1. CSI Plugin은 kubernetes 내의 스토리지를 담당하는 모듈입니다. 단일 노드 클러스터에서 쉽게 사용할 수 있는 CSI Plugin인 Local Path Provisioner를 설치합니다. 
```bash kubectl apply -f https://raw.githubusercontent.com/rancher/local-path-provisioner/v0.0.20/deploy/local-path-storage.yaml ``` 다음과 같은 메시지가 보이면 정상적으로 설치된 것을 의미합니다. ```bash namespace/local-path-storage created serviceaccount/local-path-provisioner-service-account created clusterrole.rbac.authorization.k8s.io/local-path-provisioner-role created clusterrolebinding.rbac.authorization.k8s.io/local-path-provisioner-bind created deployment.apps/local-path-provisioner created storageclass.storage.k8s.io/local-path created configmap/local-path-config created ``` 2. 또한, 다음과 같이 local-path-storage namespace 에 provisioner pod이 Running 인지 확인합니다. ```bash kubectl -n local-path-storage get pod ``` 정상적으로 수행되면 아래와 같이 출력됩니다. ```bash NAME READY STATUS RESTARTS AGE local-path-provisioner-d744ccf98-xfcbk 1/1 Running 0 7m ``` 3. 다음을 수행하여 default storage class로 변경합니다. ```bash kubectl patch storageclass local-path -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}' ``` 정상적으로 수행되면 아래와 같이 출력됩니다. ```bash storageclass.storage.k8s.io/local-path patched ``` 4. default storage class로 설정되었는지 확인합니다. ```bash kubectl get sc ``` 다음과 같이 NAME에 `local-path (default)` 인 storage class가 존재하는 것을 확인합니다. ```bash NAME PROVISIONER RECLAIMPOLICY VOLUMEBINDINGMODE ALLOWVOLUMEEXPANSION AGE local-path (default) rancher.io/local-path Delete WaitForFirstConsumer false 2h ``` ================================================ FILE: versioned_docs/version-1.0/setup-kubernetes/install-prerequisite.md ================================================ --- title: "3. Install Prerequisite" description: "Install docker" sidebar_position: 3 date: 2021-12-13 lastmod: 2021-12-20 contributors: ["Jaeyeon Kim", "Jongsun Shinn", "Sangwoo Shim"] --- 이 페이지에서는 쿠버네티스를 설치하기에 앞서, **클러스터**와 **클라이언트**에 설치 혹은 설정해두어야 하는 컴포넌트들에 대한 매뉴얼을 설명합니다. ## Install apt packages 추후 클라이언트와 클러스터의 원활한 통신을 위해서는 Port-Forwarding을 수행해야 할 일이 있습니다. Port-Forwarding을 위해서는 **클러스터**에 다음 패키지를 설치해 주어야 합니다. 
```bash sudo apt-get update sudo apt-get install -y socat ``` ## Install Docker 1. 도커 설치에 필요한 APT 패키지들을 설치합니다. ```bash sudo apt-get update && sudo apt-get install -y ca-certificates curl gnupg lsb-release ``` 2. 도커의 공식 GPG key를 추가합니다. ```bash curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg ``` 3. apt 패키지 매니저로 도커를 설치할 때, stable Repository에서 받아오도록 설정합니다. ```bash echo \ "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu \ $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null ``` 4. 현재 설치할 수 있는 도커 버전을 확인합니다. ```bash sudo apt-get update && apt-cache madison docker-ce ``` 출력되는 버전 중 `5:20.10.11~3-0~ubuntu-focal` 버전이 있는지 확인합니다. ```bash apt-cache madison docker-ce | grep 5:20.10.11~3-0~ubuntu-focal ``` 정상적으로 추가가 된 경우 다음과 같이 출력됩니다. ```bash docker-ce | 5:20.10.11~3-0~ubuntu-focal | https://download.docker.com/linux/ubuntu focal/stable amd64 Packages ``` 5. `5:20.10.11~3-0~ubuntu-focal` 버전의 도커를 설치합니다. ```bash sudo apt-get install -y containerd.io docker-ce=5:20.10.11~3-0~ubuntu-focal docker-ce-cli=5:20.10.11~3-0~ubuntu-focal ``` 6. 도커가 정상적으로 설치된 것을 확인합니다. ```bash sudo docker run hello-world ``` 명령어 실행 후 다음과 같은 메시지가 보이면 정상적으로 설치된 것을 의미합니다. ```bash mlops@ubuntu:~$ sudo docker run hello-world Hello from Docker! This message shows that your installation appears to be working correctly. To generate this message, Docker took the following steps: 1. The Docker client contacted the Docker daemon. 2. The Docker daemon pulled the "hello-world" image from the Docker Hub. (amd64) 3. The Docker daemon created a new container from that image which runs the executable that produces the output you are currently reading. 4. The Docker daemon streamed that output to the Docker client, which sent it to your terminal. 
To try something more ambitious, you can run an Ubuntu container with: $ docker run -it ubuntu bash Share images, automate workflows, and more with a free Docker ID: https://hub.docker.com/ For more examples and ideas, visit: https://docs.docker.com/get-started/ ``` 7. docker 관련 command를 sudo 키워드 없이 사용할 수 있게 하도록 다음 명령어를 통해 권한을 추가합니다. ```bash sudo groupadd docker sudo usermod -aG docker $USER newgrp docker ``` 8. sudo 키워드 없이 docker command를 사용할 수 있게 된 것을 확인하기 위해, 다시 한번 docker run을 실행합니다. ```bash docker run hello-world ``` 명령어 실행 후 다음과 같은 메시지가 보이면 정상적으로 권한이 추가된 것을 의미합니다. ```bash mlops@ubuntu:~$ docker run hello-world Hello from Docker! This message shows that your installation appears to be working correctly. To generate this message, Docker took the following steps: 1. The Docker client contacted the Docker daemon. 2. The Docker daemon pulled the "hello-world" image from the Docker Hub. (amd64) 3. The Docker daemon created a new container from that image which runs the executable that produces the output you are currently reading. 4. The Docker daemon streamed that output to the Docker client, which sent it to your terminal. To try something more ambitious, you can run an Ubuntu container with: $ docker run -it ubuntu bash Share images, automate workflows, and more with a free Docker ID: https://hub.docker.com/ For more examples and ideas, visit: https://docs.docker.com/get-started/ ``` ## Turn off Swap Memory kubelet 이 정상적으로 동작하게 하기 위해서는 **클러스터** 노드에서 swap이라고 불리는 가상메모리를 꺼 두어야 합니다. 다음 명령어를 통해 swap을 꺼 둡니다. **(클러스터와 클라이언트를 같은 데스크톱에서 사용할 때 swap 메모리를 종료하면 속도의 저하가 있을 수 있습니다)** ```bash sudo sed -i '/ swap / s/^\(.*\)$/#\1/g' /etc/fstab sudo swapoff -a ``` ## Install Kubectl kubectl 은 쿠버네티스 클러스터에 API를 요청할 때 사용하는 클라이언트 툴입니다. **클라이언트** 노드에 설치해두어야 합니다. 1. 현재 폴더에 kubectl v1.21.7 버전을 다운받습니다. ```bash curl -LO https://dl.k8s.io/release/v1.21.7/bin/linux/amd64/kubectl ``` 2. kubectl 을 사용할 수 있도록 파일의 권한과 위치를 변경합니다. 
```bash sudo install -o root -g root -m 0755 kubectl /usr/local/bin/kubectl ``` 3. 정상적으로 설치되었는지 확인합니다. ```bash kubectl version --client ``` 다음과 같은 메시지가 보이면 정상적으로 설치된 것을 의미합니다. ```bash Client Version: version.Info{Major:"1", Minor:"21", GitVersion:"v1.21.7", GitCommit:"1f86634ff08f37e54e8bfcd86bc90b61c98f84d4", GitTreeState:"clean", BuildDate:"2021-11-17T14:41:19Z", GoVersion:"go1.16.10", Compiler:"gc", Platform:"linux/amd64"} ``` 4. 여러 개의 쿠버네티스 클러스터를 사용하는 경우, 여러 개의 kubeconfig 파일을 관리해야 하는 경우가 있습니다. 여러 개의 kubeconfig 파일 혹은 여러 개의 kube-context를 효율적으로 관리하는 방법은 다음과 같은 문서를 참고하시기 바랍니다. - [https://dev.to/aabiseverywhere/configuring-multiple-kubeconfig-on-your-machine-59eo](https://dev.to/aabiseverywhere/configuring-multiple-kubeconfig-on-your-machine-59eo) - [https://github.com/ahmetb/kubectx](https://github.com/ahmetb/kubectx) ## References - [Install Docker Engine on Ubuntu](https://docs.docker.com/engine/install/ubuntu/) - [리눅스에 kubectl 설치 및 설정](https://kubernetes.io/ko/docs/tasks/tools/install-kubectl-linux/) ================================================ FILE: versioned_docs/version-1.0/setup-kubernetes/intro.md ================================================ --- title: "1. Introduction" description: "Setup Introduction" sidebar_position: 1 date: 2021-12-13 lastmod: 2021-12-13 contributors: ["Jaeyeon Kim", "Jongsun Shinn", "Youngdon Tae", "SeungTae Kim"] --- ## MLOps 시스템 구축해보기 MLOps를 공부하는 데 있어서 가장 큰 장벽은 MLOps 시스템을 구성해보고 사용해보기가 어렵다는 점입니다. AWS, GCP 등의 퍼블릭 클라우드 혹은 Weight & Bias, neptune.ai 등의 상용 툴을 사용해보기에는 과금에 대한 부담이 존재하고, 처음부터 모든 환경을 혼자서 구성하기에는 어디서부터 시작해야 할지 막막하게 느껴질 수밖에 없습니다. 이런 이유들로 MLOps를 선뜻 시작해보지 못하시는 분들을 위해, *모두의 MLOps*에서는 우분투가 설치되는 데스크톱 하나만 준비되어 있다면 MLOps 시스템을 밑바닥부터 구축하고 사용해 볼 수 있는 방법을 다룰 예정입니다. 우분투 데스크탑 환경을 준비할 수 없는 경우, 가상머신을 활용하여 환경을 구성하기 >Windows 혹은 Intel Mac을 사용해 `모두의 MLops` 실습을 진행 중인 분들은 `Virtual Box`, `VMware` 등의 가상머신 소프트웨어를 이용하여 우분투 데스크탑 환경을 준비할 수 있습니다. 이 때, 권장 사양을 맞춰 가상 머신을 생성해주시기 바랍니다. 
>또한, M1 Mac을 사용하시는 분들은 작성일(2022년 2월) 기준으로는 Virtual Box, VMware 는 이용할 수 없습니다. ([M1 Apple Silicone Mac에 최적화된 macOS 앱 지원 확인하기](https://isapplesiliconready.com/kr)) >따라서, 클라우드 환경을 이용해 실습하는 것이 아니라면, [UTM , Virtual machines for Mac](https://mac.getutm.app/)을 설치하여 가상 머신을 이용해주세요. >(앱스토어에서 구매하여 다운로드 받는 소프트웨어는 일종의 Donation 개념의 비용 지불입니다. 무료 버전과 자동 업데이트 정도의 차이가 있어, 무료버전을 사용해도 무방합니다.) >해당 가상머신 소프트웨어는 `Ubuntu 20.04.3 LTS` 실습 운영체제를 지원하고 있어, M1 Mac에서 실습을 수행하는 것을 가능하게 합니다. 하지만 [MLOps의 구성요소](../introduction/component.md)에서 설명하는 요소들을 모두 사용해볼 수는 없기에, *모두의 MLOps*에서는 대표적인 오픈소스만을 설치한 뒤, 서로 연동하여 사용하는 부분을 주로 다룰 예정입니다. *모두의 MLOps*에서 설치하는 오픈소스가 표준을 의미하는 것은 아니며, 여러분의 상황에 맞게 적절한 툴을 취사선택하는 것을 권장합니다. ## 구성 요소 이 글에서 만들어 볼 MLOps 시스템의 구성 요소들과 각 버전은 아래와 같은 환경에서 검증되었습니다. 원활한 환경에서 테스트하기 위해 **싱글 노드 클러스터 (혹은 클러스터)** 와 **클라이언트**를 분리하여 설명해 드릴 예정입니다. **클러스터** 는 우분투가 설치되어 있는 데스크톱 하나를 의미합니다. **클라이언트** 는 노트북 혹은 클러스터가 설치되어 있는 데스크톱 외의 클라이언트로 사용할 수 있는 다른 데스크톱을 사용하는 것을 권장합니다. 하지만 두 대의 머신을 준비할 수 없다면 데스크톱 하나를 동시에 클러스터와 클라이언트 용도로 사용하셔도 괜찮습니다. ### 클러스터 #### 1. Software 아래는 클러스터에 설치해야 할 소프트웨어 목록입니다. | Software | Version | | --------------- | ----------- | | Ubuntu | 20.04.3 LTS | | Docker (Server) | 20.10.11 | | NVIDIA-Driver | 470.86 | | Kubernetes | v1.21.7 | | Kubeflow | v1.4.0 | | MLFlow | v1.21.0 | #### 2. Helm Chart 아래는 Helm을 이용해 설치되어야 할 써드파티 소프트웨어 목록입니다. | Helm Chart Repo Name | Version | | ----------------------------- | ------- | | datawire/ambassador | 6.9.3 | | seldonio/seldon-core-operator | 1.11.2 | ### 클라이언트 클라이언트는 MacOS (Intel CPU), Ubuntu 20.04 에서 검증되었습니다. | Software | Version | | --------------- | ----------- | | kubectl | v1.21.7 | | helm | v3.7.1 | | kustomize | v3.10.0 | ### Minimum System Requirements 모두의 MLOps를 설치할 클러스터는 다음과 같은 사양을 만족시키는 것을 권장합니다. 이는 Kubernetes 및 Kubeflow 의 권장 사양에 의존합니다. 
- CPU : 6 core - RAM : 12GB - DISK : 50GB - GPU : NVIDIA GPU (Optional) ================================================ FILE: versioned_docs/version-1.0/setup-kubernetes/kubernetes.md ================================================ --- title : "2. Setup Kubernetes" description: "Setup Kubernetes" sidebar_position: 2 date: 2021-12-13 lastmod: 2021-12-13 contributors: ["Jaeyeon Kim"] --- ## Setup Kubernetes Cluster 쿠버네티스를 처음 배우시는 분들에게 첫 진입 장벽은 쿠버네티스 실습 환경을 구축하는 것입니다. 프로덕션 레벨의 쿠버네티스 클러스터를 구축할 수 있게 공식적으로 지원하는 도구는 kubeadm 이지만, 사용자들이 조금 더 쉽게 구축할 수 있도록 도와주는 kubespray, kops 등의 도구도 존재하며, 학습 목적을 위해서 컴팩트한 쿠버네티스 클러스터를 정말 쉽게 구축할 수 있도록 도와주는 k3s, minikube, microk8s, kind 등의 도구도 존재합니다. 각각의 도구는 장단점이 다르기에 사용자마다 선호하는 도구가 다른 점을 고려하여, 본 글에서는 kubeadm, k3s, minikube의 3가지 도구를 활용하여 쿠버네티스 클러스터를 구축하는 방법을 다룹니다. 각 도구에 대한 자세한 비교는 다음 쿠버네티스 [공식 문서](https://kubernetes.io/ko/docs/tasks/tools/)를 확인해주시기를 바랍니다. *모두의 MLOps*에서 권장하는 툴은 **k3s**로 쿠버네티스 클러스터를 구축할 때 쉽게 할 수 있다는 장점이 있습니다. 만약 쿠버네티스의 모든 기능을 사용하고 노드 구성까지 활용하고 싶다면 **kubeadm**을 권장해 드립니다. **minikube** 는 저희가 설명하는 컴포넌트 외에도 다른 쿠버네티스를 add-on 형식으로 쉽게 설치할 수 있다는 장점이 있습니다. 본 *모두의 MLOps*에서는 구축하게 될 MLOps 구성 요소들을 원활히 사용하기 위해, 각각의 도구를 활용해 쿠버네티스 클러스터를 구축할 때, 추가로 설정해 주어야 하는 부분이 추가되어 있습니다. Ubuntu OS까지는 설치되어 있는 데스크탑을 k8s cluster로 구축한 뒤, 외부 클라이언트 노드에서 쿠버네티스 클러스터에 접근하는 것을 확인하는 것까지가 본 **Setup Kubernetes**단원의 범위입니다. 자세한 구축 방법은 3가지 도구마다 다르기에 다음과 같은 흐름으로 구성되어 있습니다. ```bash 3. Setup Prerequisite 4. Setup Kubernetes 4.1. with k3s 4.2. with minikube 4.3. with kubeadm 5. Setup Kubernetes Modules ``` 그럼 이제 각각의 도구를 활용해 쿠버네티스 클러스터를 구축해보겠습니다. 반드시 모든 도구를 사용해 볼 필요는 없으며, 이 중 여러분이 익숙하신 도구를 활용해주시면 충분합니다. ================================================ FILE: versioned_docs/version-1.0/setup-kubernetes/setup-nvidia-gpu.md ================================================ --- title: "6. 
(Optional) Setup GPU" description: "Install nvidia docker, nvidia device plugin" sidebar_position: 6 date: 2021-12-13 lastmod: 2021-12-13 contributors: ["Jaeyeon Kim"] --- 쿠버네티스 및 Kubeflow 등에서 GP 를 사용하기 위해서는 다음 작업이 필요합니다. ## 1. Install NVIDIA Driver `nvidia-smi` 수행 시 다음과 같은 화면이 출력된다면 이 단계는 생략해 주시기 바랍니다. ```bash mlops@ubuntu:~$ nvidia-smi +-----------------------------------------------------------------------------+ | NVIDIA-SMI 470.86 Driver Version: 470.86 CUDA Version: 11.4 | |-------------------------------+----------------------+----------------------+ | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |===============================+======================+======================| | 0 NVIDIA GeForce ... Off | 00000000:01:00.0 Off | N/A | | 25% 32C P8 4W / 120W | 211MiB / 6078MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ | 1 NVIDIA GeForce ... Off | 00000000:02:00.0 Off | N/A | | 0% 34C P8 7W / 175W | 5MiB / 7982MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ +-----------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=============================================================================| | 0 N/A N/A 1644 G /usr/lib/xorg/Xorg 198MiB | | 0 N/A N/A 1893 G /usr/bin/gnome-shell 10MiB | | 1 N/A N/A 1644 G /usr/lib/xorg/Xorg 4MiB | +-----------------------------------------------------------------------------+ ``` `nvidia-smi`의 출력 결과가 위와 같지 않다면 장착된 GPU에 맞는 nvidia driver를 설치해 주시기 바랍니다. 만약 nvidia driver의 설치에 익숙하지 않다면 아래 명령어를 통해 설치하시기 바랍니다. ```bash sudo add-apt-repository ppa:graphics-drivers/ppa sudo apt update && sudo apt install -y ubuntu-drivers-common sudo ubuntu-drivers autoinstall sudo reboot ``` ## 2. 
NVIDIA-Docker 설치 NVIDIA-Docker를 설치합니다. ```bash curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | \ sudo apt-key add - distribution=$(. /etc/os-release;echo $ID$VERSION_ID) curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list sudo apt-get update sudo apt-get install -y nvidia-docker2 && sudo systemctl restart docker ``` 정상적으로 설치되었는지 확인하기 위해, GPU를 사용하는 도커 컨테이너를 실행해봅니다. ```bash sudo docker run --rm --gpus all nvidia/cuda:11.0-base nvidia-smi ``` 다음과 같은 메시지가 보이면 정상적으로 설치된 것을 의미합니다. ```bash mlops@ubuntu:~$ sudo docker run --rm --gpus all nvidia/cuda:11.0-base nvidia-smi +-----------------------------------------------------------------------------+ | NVIDIA-SMI 470.86 Driver Version: 470.86 CUDA Version: 11.4 | |-------------------------------+----------------------+----------------------+ | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |===============================+======================+======================| | 0 NVIDIA GeForce ... Off | 00000000:01:00.0 Off | N/A | | 25% 32C P8 4W / 120W | 211MiB / 6078MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ | 1 NVIDIA GeForce ... Off | 00000000:02:00.0 Off | N/A | | 0% 34C P8 6W / 175W | 5MiB / 7982MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ +-----------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=============================================================================| +-----------------------------------------------------------------------------+ ``` ## 3. NVIDIA-Docker를 Default Container Runtime으로 설정 쿠버네티스는 기본적으로 Docker-CE를 Default Container Runtime으로 사용합니다. 
따라서, Docker Container 내에서 NVIDIA GPU를 사용하기 위해서는 NVIDIA-Docker 를 Container Runtime 으로 사용하여 pod를 생성할 수 있도록 Default Runtime을 수정해 주어야 합니다. 1. `/etc/docker/daemon.json` 파일을 열어 다음과 같이 수정합니다. ```bash sudo vi /etc/docker/daemon.json { "default-runtime": "nvidia", "runtimes": { "nvidia": { "path": "nvidia-container-runtime", "runtimeArgs": [] } } } ``` 2. 파일이 변경된 것을 확인한 후, Docker를 재시작합니다. ```bash sudo systemctl daemon-reload sudo service docker restart ``` 3. 변경 사항이 반영되었는지 확인합니다. ```bash sudo docker info | grep nvidia ``` 다음과 같은 메시지가 보이면 정상적으로 설치된 것을 의미합니다. ```bash mlops@ubuntu:~$ docker info | grep nvidia Runtimes: io.containerd.runc.v2 io.containerd.runtime.v1.linux nvidia runc Default Runtime: nvidia ``` ## 4. Nvidia-Device-Plugin 1. nvidia-device-plugin daemonset을 생성합니다. ```bash kubectl create -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.10.0/nvidia-device-plugin.yml ``` 2. nvidia-device-plugin pod이 RUNNING 상태로 생성되었는지 확인합니다. ```bash kubectl get pod -n kube-system | grep nvidia ``` 다음과 같은 결과가 출력되어야 합니다. ```bash kube-system nvidia-device-plugin-daemonset-nlqh2 1/1 Running 0 1h ``` 3. node 정보에 gpu가 사용가능하도록 설정되었는지 확인합니다. ```bash kubectl get nodes "-o=custom-columns=NAME:.metadata.name,GPU:.status.allocatable.nvidia\.com/gpu" ``` 다음과 같은 메시지가 보이면 정상적으로 설정된 것을 의미합니다. (*모두의 MLOps* 에서 실습을 진행한 클러스터는 2개의 GPU가 있어서 2가 출력됩니다. 본인의 클러스터의 GPU 개수와 맞는 숫자가 출력된다면 됩니다.) ```bash NAME GPU ubuntu 2 ``` 설정되지 않은 경우, GPU의 value가 `` 으로 표시됩니다. ================================================ FILE: versioned_sidebars/version-1.0-sidebars.json ================================================ { "tutorialSidebar": [ { "type": "category", "label": "Introduction", "items": [ "introduction/intro", "introduction/levels", "introduction/component", "introduction/why_kubernetes" ] }, { "type": "category", "label": "Setup Kubernetes", "items": [ "setup-kubernetes/intro", "setup-kubernetes/kubernetes", "setup-kubernetes/install-prerequisite", { "type": "category", "label": "4. 
Install Kubernetes", "items": [ "setup-kubernetes/install-kubernetes/kubernetes-with-k3s", "setup-kubernetes/install-kubernetes/kubernetes-with-kubeadm", "setup-kubernetes/install-kubernetes/kubernetes-with-minikube" ] }, "setup-kubernetes/install-kubernetes-module", "setup-kubernetes/setup-nvidia-gpu" ] }, { "type": "category", "label": "Setup Components", "items": [ "setup-components/install-components-kf", "setup-components/install-components-mlflow", "setup-components/install-components-seldon", "setup-components/install-components-pg" ] }, { "type": "category", "label": "Kubeflow UI Guide", "items": [ "kubeflow-dashboard-guide/intro", "kubeflow-dashboard-guide/notebooks", "kubeflow-dashboard-guide/tensorboards", "kubeflow-dashboard-guide/volumes", "kubeflow-dashboard-guide/experiments", "kubeflow-dashboard-guide/experiments-and-others" ] }, { "type": "category", "label": "Kubeflow", "items": [ "kubeflow/kubeflow-intro", "kubeflow/kubeflow-concepts", "kubeflow/basic-requirements", "kubeflow/basic-component", "kubeflow/basic-pipeline", "kubeflow/basic-pipeline-upload", "kubeflow/basic-run", "kubeflow/advanced-component", "kubeflow/advanced-environment", "kubeflow/advanced-pipeline", "kubeflow/advanced-run", "kubeflow/advanced-mlflow", "kubeflow/how-to-debug" ] }, { "type": "category", "label": "API Deployment", "items": [ "api-deployment/what-is-api-deployment", "api-deployment/seldon-iris", "api-deployment/seldon-pg", "api-deployment/seldon-fields", "api-deployment/seldon-mlflow", "api-deployment/seldon-children" ] }, { "type": "category", "label": "Appendix", "items": [ "appendix/pyenv", "appendix/metallb" ] }, { "type": "category", "label": "Further Readings", "items": [ "further-readings/info" ] } ], "preSidebar": [ { "type": "category", "label": "Docker", "items": [ "prerequisites/docker/install", "prerequisites/docker/introduction", "prerequisites/docker/docker", "prerequisites/docker/command", "prerequisites/docker/images", "prerequisites/docker/advanced" 
] } ] } ================================================ FILE: versions.json ================================================ [ "1.0" ]