Repository: aws/aws-k8s-tester Branch: main Commit: 2b0b6c2b51b7 Files: 270 Total size: 1.4 MB Directory structure: gitextract_r74v3ht2/ ├── .dockerignore ├── .github/ │ ├── PULL_REQUEST_TEMPLATE.md │ └── workflows/ │ ├── build-neuron-ci.yaml │ ├── build-nvidia-ci.yaml │ ├── ci.yaml │ ├── update-go-dependencies.yaml │ ├── update-image-tags.yaml │ ├── update-neuron-dependencies.yaml │ └── update-nvidia-dependencies.yaml ├── .gitignore ├── .vscode/ │ └── settings.json ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Config ├── Dockerfile ├── LICENSE ├── Makefile ├── NOTICE ├── README.md ├── bmg.json ├── cmd/ │ ├── kubetest2-eksapi/ │ │ └── main.go │ ├── kubetest2-eksapi-janitor/ │ │ └── main.go │ ├── kubetest2-eksctl/ │ │ └── main.go │ ├── kubetest2-tester-ginkgo-v1/ │ │ └── main.go │ └── kubetest2-tester-multi/ │ └── main.go ├── external/ │ └── tools.go ├── go.mod ├── go.sum ├── hack/ │ ├── download-kubernetes-binaries.sh │ ├── free-disk-space.sh │ ├── update-go-dependencies.sh │ ├── update-image-tags.sh │ ├── update-neuron-dependencies.sh │ └── update-nvidia-dependencies.sh ├── internal/ │ ├── awssdk/ │ │ └── config.go │ ├── deployers/ │ │ ├── eksapi/ │ │ │ ├── addons.go │ │ │ ├── ami_resolver.go │ │ │ ├── ami_resolver_test.go │ │ │ ├── auth_map_role.go │ │ │ ├── auth_map_role_test.go │ │ │ ├── aws.go │ │ │ ├── cluster.go │ │ │ ├── common.go │ │ │ ├── common_test.go │ │ │ ├── deployer.go │ │ │ ├── infra.go │ │ │ ├── janitor.go │ │ │ ├── k8s.go │ │ │ ├── kubeconfig.go │ │ │ ├── logs.go │ │ │ ├── logs_ssm_doc.json │ │ │ ├── metrics.go │ │ │ ├── node.go │ │ │ ├── static_cluster.go │ │ │ ├── templates/ │ │ │ │ ├── auth_map_role.yaml.template │ │ │ │ ├── busybox_deployment.yaml.template │ │ │ │ ├── cloudwatch-infra.yaml.template │ │ │ │ ├── cloudwatch_agent_infra.yaml │ │ │ │ ├── infra.yaml │ │ │ │ ├── nvidia_static_cluster_nodepool.yaml.template │ │ │ │ ├── templates.go │ │ │ │ ├── templates_test.go │ │ │ │ ├── unmanaged-nodegroup.yaml.template │ │ │ │ ├── 
userdata_bootstrap.sh.mimepart.template │ │ │ │ ├── userdata_bottlerocket.toml.template │ │ │ │ └── userdata_nodeadm.yaml.mimepart.template │ │ │ ├── userdata.go │ │ │ ├── userdata_test.go │ │ │ ├── vpccni.go │ │ │ └── vpccni_test.go │ │ └── eksctl/ │ │ ├── build.go │ │ ├── cluster_config.go │ │ ├── deployer.go │ │ ├── down.go │ │ └── up.go │ ├── e2e/ │ │ ├── client.go │ │ ├── conditions.go │ │ ├── doc.go │ │ ├── ec2.go │ │ ├── health.go │ │ ├── logs.go │ │ ├── mpijobs/ │ │ │ ├── conditions.go │ │ │ ├── conditions_test.go │ │ │ └── types.go │ │ └── resources.go │ ├── metrics/ │ │ ├── cloudwatch.go │ │ ├── noop.go │ │ └── registry.go │ ├── testers/ │ │ ├── ginkgov1/ │ │ │ ├── LICENSE.original │ │ │ ├── README.md │ │ │ ├── ginkgo.go │ │ │ ├── kubectl/ │ │ │ │ └── kubectl.go │ │ │ └── package.go │ │ └── multi/ │ │ └── cmd.go │ ├── util/ │ │ ├── cloudformation.go │ │ ├── exec.go │ │ ├── http.go │ │ ├── http_test.go │ │ ├── lang.go │ │ ├── path.go │ │ └── version.go │ └── version.go └── test/ ├── cases/ │ ├── disruptive/ │ │ ├── graceful_reboot_test.go │ │ ├── graceful_shutdown_test.go │ │ └── main_test.go │ ├── dra/ │ │ ├── dra_example_driver_test.go │ │ └── main_test.go │ ├── efa/ │ │ ├── commons.go │ │ ├── main_test.go │ │ ├── pingpong_test.go │ │ └── unit_test.go │ ├── fips/ │ │ ├── README.md │ │ ├── fips_test.go │ │ ├── main_test.go │ │ └── manifests/ │ │ ├── registry-fips.yaml │ │ ├── registry-nonfips.yaml │ │ └── test-pods.yaml │ ├── netpol/ │ │ ├── main_test.go │ │ └── np_test.go │ ├── neuron/ │ │ ├── main_test.go │ │ ├── manifests/ │ │ │ ├── multi-node-test-neuron.yaml │ │ │ └── single-node-test-neuronx.yaml │ │ └── neuron_test.go │ ├── neuron-dra/ │ │ ├── main_test.go │ │ ├── neuron_dra_test.go │ │ ├── rcts/ │ │ │ └── trn1/ │ │ │ ├── rct-2-efas-4-neurons-wrong-match.yaml │ │ │ └── rct-all-efas-all-neurons.yaml │ │ ├── templates/ │ │ │ └── nccom-test-mpijob.yaml.tmpl │ │ ├── testcases/ │ │ │ └── trn1/ │ │ │ ├── 2-efas-4-neurons-wrong-match.yaml │ │ │ └── 
all-efas-all-neurons.yaml │ │ └── topology.go │ ├── neuron-inference/ │ │ ├── bert_inference_test.go │ │ ├── main_test.go │ │ ├── manifests/ │ │ │ └── neuron-bert-inference.yaml │ │ └── vars.go │ ├── neuron-training/ │ │ ├── bert_training_test.go │ │ ├── main_test.go │ │ ├── manifests/ │ │ │ ├── bert-training.yaml │ │ │ └── training-comm-service.yaml │ │ └── vars.go │ ├── nvidia/ │ │ ├── capabilities_test.go │ │ ├── containerd_test.go │ │ ├── main_test.go │ │ ├── manifests/ │ │ │ ├── daemonset-containerd-check.yaml │ │ │ ├── job-hpc-benchmarks.yaml │ │ │ ├── job-unit-test-single-node.yaml │ │ │ ├── mpi-job-nccl-test-multi-node.yaml │ │ │ ├── mpi-job-pytorch-training-single-node.yaml │ │ │ └── nvidia-driver-capabilities-check.yaml │ │ ├── mpi_test.go │ │ └── unit_test.go │ ├── nvidia-dra/ │ │ ├── main_test.go │ │ ├── nvidia_dra_test.go │ │ ├── rcts/ │ │ │ └── p5/ │ │ │ ├── rct-all-efas.yaml │ │ │ ├── rct-all-gpus.yaml │ │ │ └── rct-five-efas-one-gpu.yaml │ │ ├── templates/ │ │ │ └── nccl-test-mpijob.yaml.tmpl │ │ ├── testcases/ │ │ │ └── p5/ │ │ │ ├── all-efas-all-gpus.yaml │ │ │ └── five-efas-one-gpu-negative-test.yaml │ │ └── topology.go │ ├── nvidia-inference/ │ │ ├── bert_inference_test.go │ │ ├── main_test.go │ │ └── manifests/ │ │ └── bert-inference.yaml │ ├── nvidia-training/ │ │ ├── bert_training_test.go │ │ ├── main_test.go │ │ ├── manifests/ │ │ │ └── bert-training.yaml │ │ └── vars.go │ ├── quick/ │ │ ├── io_uring_test.go │ │ ├── limit_test.go │ │ ├── main_test.go │ │ ├── manifests/ │ │ │ └── ulimit.yaml │ │ └── node_topology_test.go │ └── workload/ │ ├── main_test.go │ └── workload_test.go ├── common/ │ ├── dra.go │ ├── dra_features.go │ ├── dra_types.go │ ├── flags.go │ └── resources.go ├── images/ │ ├── efa/ │ │ ├── Dockerfile │ │ └── scripts/ │ │ └── unit-test.sh │ ├── neuron/ │ │ ├── Dockerfile │ │ ├── hack/ │ │ │ └── install-test-resources.sh │ │ └── tests/ │ │ ├── singleNodeTest.sh │ │ ├── testNeuronMlp.py │ │ ├── testNeuronParallelState.py │ │ └── 
testNeuronSingleAllReduce.py │ ├── neuron-inference/ │ │ ├── Dockerfile │ │ └── infer.py │ ├── neuron-training/ │ │ ├── Dockerfile │ │ └── train.py │ ├── nvidia/ │ │ ├── Dockerfile │ │ └── gpu_unit_tests/ │ │ ├── README.md │ │ ├── bash_unit │ │ ├── tests/ │ │ │ ├── common.sh │ │ │ ├── test_basic.sh │ │ │ ├── test_sysinfo.sh │ │ │ └── test_sysinfo.sh.data/ │ │ │ ├── g5.48xlarge/ │ │ │ │ ├── gpu_count.txt │ │ │ │ ├── numa_topo.txt │ │ │ │ ├── nvidia_persistence_status.txt │ │ │ │ └── nvidia_smi_topo.txt │ │ │ ├── g5.8xlarge/ │ │ │ │ ├── gpu_count.txt │ │ │ │ ├── numa_topo.txt │ │ │ │ ├── nvidia_persistence_status.txt │ │ │ │ └── nvidia_smi_topo.txt │ │ │ ├── g5g.2xlarge/ │ │ │ │ ├── gpu_count.txt │ │ │ │ ├── numa_topo.txt │ │ │ │ ├── nvidia_persistence_status.txt │ │ │ │ └── nvidia_smi_topo.txt │ │ │ ├── g6f.2xlarge/ │ │ │ │ ├── efa_count.txt │ │ │ │ ├── gpu_count.txt │ │ │ │ ├── numa_topo.txt │ │ │ │ ├── nvidia_persistence_status.txt │ │ │ │ ├── nvidia_smi_topo.txt │ │ │ │ └── nvidia_vgpu_license_status.txt │ │ │ ├── g6f.4xlarge/ │ │ │ │ ├── efa_count.txt │ │ │ │ ├── gpu_count.txt │ │ │ │ ├── numa_topo.txt │ │ │ │ ├── nvidia_persistence_status.txt │ │ │ │ ├── nvidia_smi_topo.txt │ │ │ │ └── nvidia_vgpu_license_status.txt │ │ │ ├── g6f.large/ │ │ │ │ ├── efa_count.txt │ │ │ │ ├── gpu_count.txt │ │ │ │ ├── numa_topo.txt │ │ │ │ ├── nvidia_persistence_status.txt │ │ │ │ ├── nvidia_smi_topo.txt │ │ │ │ └── nvidia_vgpu_license_status.txt │ │ │ ├── g6f.xlarge/ │ │ │ │ ├── efa_count.txt │ │ │ │ ├── gpu_count.txt │ │ │ │ ├── numa_topo.txt │ │ │ │ ├── nvidia_persistence_status.txt │ │ │ │ ├── nvidia_smi_topo.txt │ │ │ │ └── nvidia_vgpu_license_status.txt │ │ │ ├── p3.16xlarge/ │ │ │ │ ├── gpu_count.txt │ │ │ │ ├── numa_topo.txt │ │ │ │ ├── nvidia_persistence_status.txt │ │ │ │ └── nvidia_smi_topo.txt │ │ │ ├── p3.2xlarge/ │ │ │ │ ├── gpu_count.txt │ │ │ │ ├── numa_topo.txt │ │ │ │ ├── nvidia_persistence_status.txt │ │ │ │ └── nvidia_smi_topo.txt │ │ │ ├── p4d.24xlarge/ │ │ │ 
│ ├── gpu_count.txt │ │ │ │ ├── numa_topo.txt │ │ │ │ ├── nvidia_persistence_status.txt │ │ │ │ └── nvidia_smi_topo.txt │ │ │ ├── p4de.24xlarge/ │ │ │ │ ├── gpu_count.txt │ │ │ │ ├── numa_topo.txt │ │ │ │ ├── nvidia_persistence_status.txt │ │ │ │ └── nvidia_smi_topo.txt │ │ │ └── p5.48xlarge/ │ │ │ ├── gpu_count.txt │ │ │ ├── numa_topo.txt │ │ │ ├── nvidia_persistence_status.txt │ │ │ └── nvidia_smi_topo.txt │ │ └── unit_test │ ├── nvidia-inference/ │ │ ├── Dockerfile │ │ ├── infer.py │ │ └── requirements.txt │ └── nvidia-training/ │ ├── Dockerfile │ ├── requirements.txt │ └── train.py └── manifests/ ├── assets/ │ ├── cloudwatch-agent.yaml │ ├── dcgm-exporter.yaml │ ├── dranet.yaml │ ├── efa-device-plugin.yaml │ ├── k8s-neuron-device-plugin-rbac.yml │ ├── k8s-neuron-device-plugin.yml │ ├── mpi-operator.yaml │ └── nvidia-device-plugin.yaml ├── raw.go └── rendered.go ================================================ FILE CONTENTS ================================================ ================================================ FILE: .dockerignore ================================================ .git/ .github/ bin/ CHANGELOG/ Dockerfile Makefile aws-k8s-tester ================================================ FILE: .github/PULL_REQUEST_TEMPLATE.md ================================================ *Issue #, if available:* *Description of changes:* By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license. ================================================ FILE: .github/workflows/build-neuron-ci.yaml ================================================ name: "Neuron Images CI" on: pull_request: types: - opened - reopened - synchronize paths: - 'test/images/neuron**' jobs: build-image-neuronx: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - run: ./hack/free-disk-space.sh - run: docker build --file test/images/neuron/Dockerfile . 
build-image-neuron-training: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - run: ./hack/free-disk-space.sh - run: docker build --file test/images/neuron-training/Dockerfile test/images/neuron-training build-image-neuron-inference: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - run: ./hack/free-disk-space.sh - run: docker build --file test/images/neuron-inference/Dockerfile test/images/neuron-inference ================================================ FILE: .github/workflows/build-nvidia-ci.yaml ================================================ name: "Nvidia Images CI" on: pull_request: types: - opened - reopened - synchronize paths: - 'test/images/nvidia**' jobs: build-image-nvidia: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - run: ./hack/free-disk-space.sh - run: docker build --file test/images/nvidia/Dockerfile . build-image-nvidia-training: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - run: ./hack/free-disk-space.sh - run: | docker build --file test/images/nvidia-training/Dockerfile test/images/nvidia-training \ --build-arg PYTORCH_BUILD_ENV="MAX_JOBS=$(($(nproc) - 2)) USE_MKLDNN=0 USE_DISTRIBUTED=0 USE_CUDA=0 USE_ROCM=0 USE_CAFFE2=0 USE_QNNPACK=0 USE_NNPACK=0 USE_XNNPACK=0 USE_MPS=0 BUILD_SHARED_LIBS=OFF USE_FLASH_ATTENTION=0 USE_MEM_EFF_ATTENTION=0 BUILD_TEST=0" build-image-nvidia-inference: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - run: ./hack/free-disk-space.sh - run: | docker build --file test/images/nvidia-inference/Dockerfile test/images/nvidia-inference \ --build-arg PYTORCH_BUILD_ENV="MAX_JOBS=$(($(nproc) - 2)) USE_MKLDNN=0 USE_DISTRIBUTED=0 USE_CUDA=0 USE_ROCM=0 USE_CAFFE2=0 USE_QNNPACK=0 USE_NNPACK=0 USE_XNNPACK=0 USE_MPS=0 BUILD_SHARED_LIBS=OFF USE_FLASH_ATTENTION=0 USE_MEM_EFF_ATTENTION=0 BUILD_TEST=0" ================================================ FILE: .github/workflows/ci.yaml ================================================ name: "CI" on: pull_request: types: - opened - 
reopened - synchronize paths-ignore: - 'test/images/nvidia**' - 'test/images/neuron**' jobs: build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - run: go build ./... - run: go test ./... build-test: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - run: go test -c -tags=e2e ./test/... build-image: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - run: ./hack/free-disk-space.sh - run: docker build --build-arg=KUBERNETES_MINOR_VERSION=latest --file Dockerfile . build-image-efa: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - run: ./hack/free-disk-space.sh - run: docker build --file test/images/efa/Dockerfile . ================================================ FILE: .github/workflows/update-go-dependencies.yaml ================================================ name: "[CI] update-go-dependencies" on: workflow_dispatch: schedule: # once a week - cron: "0 0 * * 0" permissions: id-token: write contents: write pull-requests: write jobs: update-dependencies: runs-on: ubuntu-latest if: github.repository == 'aws/aws-k8s-tester' steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 4.2.2 - uses: actions/setup-go@d35c59abb061a4a6fb18e82ac0862c26744d6ab5 # 5.5.0 - run: | ./hack/update-go-dependencies.sh - uses: peter-evans/create-pull-request@271a8d0340265f705b14b6d32b9829c1cb33d45e # 7.0.8 with: branch: update-go-dependencies base: main add-paths: | . 
commit-message: "chore: update go dependencies" committer: "GitHub " author: "GitHub " title: "chore: update go dependencies" body: | Generated by: ``` ./hack/update-go-dependencies.sh ``` ================================================ FILE: .github/workflows/update-image-tags.yaml ================================================ name: "[CI] update-image-tags" on: workflow_dispatch: schedule: # once a week - cron: "0 0 * * 0" permissions: id-token: write contents: write pull-requests: write jobs: update-dependencies: runs-on: ubuntu-latest if: github.repository == 'aws/aws-k8s-tester' steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 4.2.2 - run: ./hack/update-image-tags.sh - uses: peter-evans/create-pull-request@271a8d0340265f705b14b6d32b9829c1cb33d45e # 7.0.8 with: branch: update-image-tags base: main add-paths: | test/images/ commit-message: "chore: update image tags" committer: "GitHub " author: "GitHub " title: "chore: update image tags" body: | Generated by: ``` ./hack/update-image-tags.sh ``` ================================================ FILE: .github/workflows/update-neuron-dependencies.yaml ================================================ name: "[CI] update-neuron-dependencies" on: workflow_dispatch: schedule: # once a week - cron: "0 0 * * 0" permissions: id-token: write contents: write pull-requests: write jobs: update-dependencies: runs-on: ubuntu-latest if: github.repository == 'aws/aws-k8s-tester' steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 4.2.2 - run: | ./hack/update-neuron-dependencies.sh - uses: peter-evans/create-pull-request@271a8d0340265f705b14b6d32b9829c1cb33d45e # 7.0.8 with: branch: update-neuron-dependencies base: main add-paths: | test/images/ commit-message: "chore: update neuron dependencies" committer: "GitHub " author: "GitHub " title: "chore: update neuron dependencies" body: | Generated by: ``` ./hack/update-neuron-dependencies.sh ``` See the following URL for artifacts in 
the latest Neuron SDK release: https://awsdocs-neuron.readthedocs-hosted.com/en/latest/release-notes/releasecontent.html#latest-neuron-release-artifacts ================================================ FILE: .github/workflows/update-nvidia-dependencies.yaml ================================================ name: "[CI] update-nvidia-dependencies" on: workflow_dispatch: schedule: # once a week - cron: "0 0 * * 0" permissions: id-token: write contents: write pull-requests: write jobs: update-dependencies: runs-on: ubuntu-latest if: github.repository == 'aws/aws-k8s-tester' steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # 4.2.2 - run: ./hack/update-nvidia-dependencies.sh - uses: peter-evans/create-pull-request@271a8d0340265f705b14b6d32b9829c1cb33d45e # 7.0.8 with: branch: update-nvidia-dependencies base: main add-paths: | test/images/ commit-message: "chore: update nvidia test dependencies" committer: "GitHub " author: "GitHub " title: "chore: update nvidia test dependencies" body: | Generated by: ``` ./hack/update-nvidia-dependencies.sh ``` ================================================ FILE: .gitignore ================================================ /.DS_Store /bin /_tmp .idea *.swp /aws-k8s-tester */*/.DS_Store */.DS_Store /_artifacts /_rundir ================================================ FILE: .vscode/settings.json ================================================ { "git.ignoreLimitWarning": true } ================================================ FILE: CODE_OF_CONDUCT.md ================================================ ## Code of Conduct This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact opensource-codeofconduct@amazon.com with any additional questions or comments. 
================================================ FILE: CONTRIBUTING.md ================================================ # Contributing Guidelines Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional documentation, we greatly value feedback and contributions from our community. Please read through this document before submitting any issues or pull requests to ensure we have all the necessary information to effectively respond to your bug report or contribution. ## Reporting Bugs/Feature Requests We welcome you to use the GitHub issue tracker to report bugs or suggest features. When filing an issue, please check [existing open](https://github.com/aws/aws-k8s-tester/issues), or [recently closed](https://github.com/aws/aws-k8s-tester/issues?utf8=%E2%9C%93&q=is%3Aissue%20is%3Aclosed%20), issues to make sure somebody else hasn't already reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: * A reproducible test case or series of steps * The version of our code being used * Any modifications you've made relevant to the bug * Anything unusual about your environment or deployment ## Contributing via Pull Requests Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 1. You are working against the latest source on the *main* branch. 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. To send us a pull request, please: 1. Fork the repository. 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 3. Ensure local tests pass. 4. Commit to your fork using clear commit messages. 5. 
Send us a pull request, answering any default questions in the pull request interface. 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). ## Finding contributions to work on Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any ['help wanted'](https://github.com/aws/aws-k8s-tester/labels/help%20wanted) issues is a great place to start. ## Code of Conduct This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact opensource-codeofconduct@amazon.com with any additional questions or comments. ## Security issue notifications If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. ## Licensing See the [LICENSE](https://github.com/aws/aws-k8s-tester/blob/main/LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes. ================================================ FILE: Config ================================================ # This file is for Amazon internal build processes # Copyright 2025 Amazon.com, Inc. or its affiliates. 
# SPDX-License-Identifier: Apache-2.0 package.Aws-k8s-tester-mirror = { interfaces = (1.0); build-system = bgo-wrap-make; build-tools = { 1.0 = { BrazilMakeGo = 3.0; GoLang = 1.x; }; }; }; ================================================ FILE: Dockerfile ================================================ FROM public.ecr.aws/amazonlinux/amazonlinux:2023 AS builder ARG TARGETOS ARG TARGETARCH RUN dnf install -y git tar gzip make unzip gcc rsync wget jq ARG GO_MINOR_VERSION=1.25 RUN curl https://go.dev/dl/?mode=json | jq -r .[].version | grep "^go${GO_MINOR_VERSION}" | head -n1 > go-version.txt RUN wget -O go.tar.gz https://go.dev/dl/$(cat go-version.txt).${TARGETOS}-${TARGETARCH}.tar.gz && \ rm -rf /usr/local/go && \ tar -C /usr/local -xzf go.tar.gz ENV GOPATH=/usr/local/go ENV PATH=$PATH:$GOPATH/bin ENV GOPROXY=direct WORKDIR $GOPATH/src/github.com/aws/aws-k8s-tester COPY . . RUN go install ./... RUN go test -c -tags=e2e ./test/... -o $GOPATH/bin/ RUN go install sigs.k8s.io/kubetest2 && \ go install sigs.k8s.io/kubetest2/kubetest2-tester-exec && \ go install sigs.k8s.io/kubetest2/kubetest2-tester-ginkgo && \ go install sigs.k8s.io/hydrophone@latest FROM public.ecr.aws/amazonlinux/amazonlinux:2023 ARG TARGETOS ARG TARGETARCH WORKDIR /workdir RUN dnf install -y tar gzip unzip wget openssh diffutils RUN wget -O awscli.zip https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip && \ unzip awscli.zip && \ ./aws/install # we need gsutil from the gcloud CLI for kubetest-tester-ginkgo RUN dnf install -y python3.13 ARG GCLOUD_SDK_URL=https://dl.google.com/dl/cloudsdk/channels/rapid/google-cloud-sdk.tar.gz RUN wget -O google-cloud-sdk.tar.gz -q $GCLOUD_SDK_URL && \ tar xzf google-cloud-sdk.tar.gz -C / && \ rm google-cloud-sdk.tar.gz && \ /google-cloud-sdk/install.sh \ --disable-installation-options \ --bash-completion=false \ --path-update=false \ --usage-reporting=false ENV PATH=$PATH:/google-cloud-sdk/bin ARG EKSCTL_VERSION=latest RUN wget -O eksctl.tar.gz 
"https://github.com/eksctl-io/eksctl/releases/${EKSCTL_VERSION}/download/eksctl_Linux_${TARGETARCH}.tar.gz" && \ tar xzf eksctl.tar.gz -C /bin/ && \ rm eksctl.tar.gz ARG HELM_VERSION=v4.1.4 RUN wget -O helm.tar.gz "https://get.helm.sh/helm-${HELM_VERSION}-${TARGETOS}-${TARGETARCH}.tar.gz" && \ tar xzf helm.tar.gz --strip-components=1 -C /bin/ "${TARGETOS}-${TARGETARCH}/helm" && \ rm helm.tar.gz ARG KUBERNETES_MINOR_VERSION COPY hack/download-kubernetes-binaries.sh . RUN ./download-kubernetes-binaries.sh "${KUBERNETES_MINOR_VERSION}" "${TARGETOS}" "${TARGETARCH}" RUN mkdir /info ENV PATH=$PATH:/info RUN cp kubernetes-version.txt /info/ RUN mv kubernetes/*/bin/* /bin/ RUN rm -rf /workdir COPY --from=builder /usr/local/go/bin/* /bin/ ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. 
"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. 
Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. 
Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: Makefile ================================================ include ${BGO_MAKEFILE} pre-release:: go test -c -tags=e2e ./test/... -o $(GOBIN) go install sigs.k8s.io/kubetest2/...@latest update-deps: for SCRIPT in ./hack/update-*.sh; do \ "$$SCRIPT" ; \ done .PHONY: test-integration test-integration: ## Run unit and integration tests go test -v -tags=integration ./... ================================================ FILE: NOTICE ================================================ Awstester Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. ================================================ FILE: README.md ================================================ # Tools for testing Kubernetes on AWS ## Installation This project will use rolling releases going forward; we recommend fetching the latest commit: ``` go install github.com/aws/aws-k8s-tester/...@HEAD ``` You'll need the standard `kubetest2` tools as well: ``` go install sigs.k8s.io/kubetest2/...@latest ``` ## `kubetest2` deployers and testers for EKS ### Usage **Auto-detect cluster version** The deployers will search for a file called `kubernetes-version.txt` on your `PATH`. This file should contain a valid tag for a Kubernetes release. The `--kubernetes-version` flag can be omitted if this file exists. --- ### `eksctl` deployer This deployer is a thin wrapper around `eksctl`. 
The simplest usage is: ``` kubetest2 \ eksctl \ --kubernetes-version=X.XX \ --up \ --down \ --test=exec \ -- echo "Hello world" ``` **Additional flags** - `--instance-types` - comma-separated list of instance types to use for nodes - `--ami` - AMI ID for nodes - `--nodes` - number of nodes - `--region` - AWS region - `--config-file` - Path to eksctl config file (**if provided, other flags are ignored**) - `--availability-zones` - Node availability zones - `--ami-family` - AMI family to use: `AmazonLinux2023` | `Bottlerocket` - `--efa-enabled` - Enable Elastic Fabric Adapter for the nodegroup - `--volume-size` - Size of the node root volume in GB - `--private-networking` - Use private networking for nodes - `--with-oidc` - Enable OIDC provider for IAM roles for service accounts - `--deploy-target` - The target to deploy: `cluster` | `nodegroup` (defaults to `cluster`) - `--cluster-name` - Name of the EKS cluster (defaults to RunID if not specified) - `--unmanaged-nodegroup` - Use unmanaged nodegroup instead of managed nodegroup - `--nodegroup-name` - Name of the nodegroup (defaults to `ng-1`) --- ### `eksapi` deployer This deployer calls the EKS API directly, instead of using CloudFormation for EKS resources. The simplest usage is: ``` kubetest2 \ eksapi \ --kubernetes-version=X.XX \ --up \ --down \ --test=exec \ -- echo "Hello world" ``` **Additional flags** - `--instance-types` - comma-separated list of instance types to use for nodes - `--ami` - AMI ID for nodes - `--nodes` - number of nodes - `--region` - AWS region - `--endpoint-url` - Override the EKS endpoint URL - `--cluster-role-service-principal` - Additional service principal that can assume the cluster IAM role. --- ### `multi` tester This tester wraps multiple executions of other testers. Tester argument groups are separated by `--`, with the first group being passed to the `multi` tester itself. The first positional argument of each subsequent group should be the name of a tester. 
``` kubetest2 \ noop \ --test=multi \ -- \ --fail-fast=true \ -- \ ginkgo \ --focus-regex='\[Conformance\]' \ --parallel=4 \ -- \ exec \ go test ./my/test/package ``` ================================================ FILE: bmg.json ================================================ { "binary_artifacts_only": true } ================================================ FILE: cmd/kubetest2-eksapi/main.go ================================================ package main import ( "github.com/aws/aws-k8s-tester/internal/deployers/eksapi" "sigs.k8s.io/kubetest2/pkg/app" ) func main() { app.Main(eksapi.DeployerName, eksapi.NewDeployer) } ================================================ FILE: cmd/kubetest2-eksapi-janitor/main.go ================================================ package main import ( "context" "flag" "log/slog" "os" "time" "github.com/aws/aws-k8s-tester/internal/deployers/eksapi" ) func main() { var maxResourceAge time.Duration flag.DurationVar(&maxResourceAge, "max-resource-age", time.Hour*3, "Maximum resource age") var workers int flag.IntVar(&workers, "workers", 1, "number of workers to processes resources in parallel") var stackStatus string flag.StringVar(&stackStatus, "stack-status", "", "only process stacks with a specific status") var emitMetrics bool flag.BoolVar(&emitMetrics, "emit-metrics", false, "Send metrics to CloudWatch") flag.Parse() j := eksapi.NewJanitor(maxResourceAge, emitMetrics, workers, stackStatus) if err := j.Sweep(context.Background()); err != nil { slog.Error("failed to sweep resources", "error", err) os.Exit(1) } } ================================================ FILE: cmd/kubetest2-eksctl/main.go ================================================ package main import ( "github.com/aws/aws-k8s-tester/internal/deployers/eksctl" "sigs.k8s.io/kubetest2/pkg/app" ) func main() { app.Main(eksctl.DeployerName, eksctl.NewDeployer) } ================================================ FILE: cmd/kubetest2-tester-ginkgo-v1/main.go 
================================================
package main

import (
	"github.com/aws/aws-k8s-tester/internal/testers/ginkgov1"
)

// main delegates directly to ginkgov1.Main from the internal
// testers/ginkgov1 package; this file contains no other logic.
func main() {
	ginkgov1.Main()
}
================================================ FILE: cmd/kubetest2-tester-multi/main.go ================================================
package main

import "github.com/aws/aws-k8s-tester/internal/testers/multi"

// main delegates directly to multi.Main from the internal testers/multi
// package; this file contains no other logic.
func main() {
	multi.Main()
}
================================================ FILE: external/tools.go ================================================ //go:build tools // +build tools package external // this file allows us to declare direct dependencies on our required external tools. // this file will not compile! that's expected. import ( _ "sigs.k8s.io/kubetest2" _ "sigs.k8s.io/kubetest2/kubetest2-tester-exec" _ "sigs.k8s.io/kubetest2/kubetest2-tester-ginkgo" ) ================================================ FILE: go.mod ================================================ module github.com/aws/aws-k8s-tester go 1.25.5 require ( github.com/aws/aws-sdk-go v1.55.8 github.com/aws/aws-sdk-go-v2 v1.41.1 github.com/aws/aws-sdk-go-v2/config v1.32.7 github.com/aws/aws-sdk-go-v2/service/autoscaling v1.62.5 github.com/aws/aws-sdk-go-v2/service/cloudformation v1.71.5 github.com/aws/aws-sdk-go-v2/service/cloudwatch v1.53.1 github.com/aws/aws-sdk-go-v2/service/ec2 v1.279.1 github.com/aws/aws-sdk-go-v2/service/eks v1.76.4 github.com/aws/aws-sdk-go-v2/service/iam v1.53.2 github.com/aws/aws-sdk-go-v2/service/s3 v1.95.1 github.com/aws/smithy-go v1.24.0 github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 github.com/spf13/pflag v1.0.10 github.com/stretchr/testify v1.11.1 golang.org/x/exp v0.0.0-20251219203646-944ab1f22d93 k8s.io/api v0.35.0 k8s.io/apimachinery v0.35.0 k8s.io/client-go v0.35.0 k8s.io/klog v1.0.0 k8s.io/klog/v2 v2.130.1 sigs.k8s.io/controller-runtime v0.22.4 sigs.k8s.io/karpenter v1.8.0 sigs.k8s.io/kubetest2 v0.0.0-20260108084739-2f9a9397f033 ) require ( 
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.4 // indirect github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.17 // indirect github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.8 // indirect github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.17 // indirect github.com/awslabs/operatorpkg v0.0.0-20250909182303-e8e550b6f339 // indirect github.com/evanphx/json-patch/v5 v5.9.11 // indirect github.com/fxamacker/cbor/v2 v2.9.0 // indirect github.com/go-logr/zapr v1.3.0 // indirect github.com/mitchellh/hashstructure/v2 v2.0.2 // indirect github.com/pkg/errors v0.9.1 github.com/robfig/cron/v3 v3.0.1 // indirect github.com/samber/lo v1.51.0 // indirect github.com/x448/float16 v0.8.4 // indirect golang.org/x/crypto v0.46.0 // indirect gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect k8s.io/apiextensions-apiserver v0.34.1 // indirect ) require ( cloud.google.com/go v0.121.2 // indirect cloud.google.com/go/compute/metadata v0.9.0 // indirect cloud.google.com/go/iam v1.5.2 // indirect cloud.google.com/go/storage v1.53.0 // indirect cuelabs.dev/go/oci/ociregistry v0.0.0-20240404174027-a39bec0462d2 // indirect cuelang.org/go v0.9.2 // indirect dario.cat/mergo v1.0.2 // indirect filippo.io/edwards25519 v1.1.0 // indirect github.com/Azure/azure-sdk-for-go v68.0.0+incompatible // indirect github.com/Azure/go-autorest v14.2.0+incompatible // indirect github.com/Azure/go-autorest/autorest v0.11.29 // indirect github.com/Azure/go-autorest/autorest/adal v0.9.23 // indirect github.com/Azure/go-autorest/autorest/azure/auth v0.5.12 // indirect github.com/Azure/go-autorest/autorest/azure/cli v0.4.6 // indirect github.com/Azure/go-autorest/autorest/date v0.3.0 // indirect github.com/Azure/go-autorest/logger v0.2.1 // indirect github.com/Azure/go-autorest/tracing v0.6.0 // indirect github.com/MakeNowJust/heredoc/v2 v2.0.1 // indirect github.com/Microsoft/go-winio v0.6.2 // indirect github.com/ProtonMail/go-crypto v1.1.6 // indirect github.com/ThalesIgnite/crypto11 
v1.2.5 // indirect github.com/agnivade/levenshtein v1.2.1 // indirect github.com/alibabacloud-go/alibabacloud-gateway-spi v0.0.4 // indirect github.com/alibabacloud-go/cr-20160607 v1.0.1 // indirect github.com/alibabacloud-go/cr-20181201 v1.0.10 // indirect github.com/alibabacloud-go/darabonba-openapi v0.2.1 // indirect github.com/alibabacloud-go/debug v1.0.0 // indirect github.com/alibabacloud-go/endpoint-util v1.1.1 // indirect github.com/alibabacloud-go/openapi-util v0.1.0 // indirect github.com/alibabacloud-go/tea v1.2.2 // indirect github.com/alibabacloud-go/tea-utils v1.4.5 // indirect github.com/alibabacloud-go/tea-xml v1.1.3 // indirect github.com/aliyun/credentials-go v1.3.2 // indirect github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 // indirect github.com/aws/aws-sdk-go-v2/credentials v1.19.7 // indirect github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.17 // indirect github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.17 // indirect github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.17 // indirect github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4 // indirect github.com/aws/aws-sdk-go-v2/service/ecr v1.36.2 // indirect github.com/aws/aws-sdk-go-v2/service/ecrpublic v1.27.2 // indirect github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.4 // indirect github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.17 // indirect github.com/aws/aws-sdk-go-v2/service/ssm v1.67.8 github.com/aws/aws-sdk-go-v2/service/sso v1.30.9 // indirect github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.13 // indirect github.com/aws/aws-sdk-go-v2/service/sts v1.41.6 // indirect github.com/awslabs/amazon-ecr-credential-helper/ecr-login v0.0.0-20240318154307-a1a918375412 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/blang/semver v3.5.1+incompatible // indirect github.com/blang/semver/v4 v4.0.0 // indirect github.com/buildkite/agent/v3 v3.81.0 // indirect github.com/buildkite/go-pipeline v0.13.1 // indirect 
github.com/buildkite/interpolate v0.1.3 // indirect github.com/buildkite/roko v1.2.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/chrismellard/docker-credential-acr-env v0.0.0-20230304212654-82a0ddb27589 // indirect github.com/clbanning/mxj/v2 v2.7.0 // indirect github.com/cloudflare/circl v1.6.3 // indirect github.com/cockroachdb/apd/v3 v3.2.1 // indirect github.com/common-nighthawk/go-figure v0.0.0-20210622060536-734e95fb86be // indirect github.com/containerd/stargz-snapshotter/estargz v0.18.1 // indirect github.com/coreos/go-oidc/v3 v3.17.0 // indirect github.com/cyberphone/json-canonicalization v0.0.0-20231217050601-ba74d44ecf5f // indirect github.com/cyphar/filepath-securejoin v0.4.1 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/digitorus/pkcs7 v0.0.0-20230818184609-3a137a874352 // indirect github.com/digitorus/timestamp v0.0.0-20231217203849-220c5c2851b7 // indirect github.com/dimchansky/utfbom v1.1.1 // indirect github.com/docker/cli v29.0.3+incompatible // indirect github.com/docker/distribution v2.8.3+incompatible // indirect github.com/docker/docker-credential-helpers v0.9.3 // indirect github.com/dustin/go-humanize v1.0.1 // indirect github.com/emicklei/go-restful/v3 v3.12.2 // indirect github.com/emicklei/proto v1.13.2 // indirect github.com/emirpasic/gods v1.18.1 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect github.com/fsnotify/fsnotify v1.9.0 // indirect github.com/glebarez/go-sqlite v1.22.0 // indirect github.com/go-chi/chi v4.1.2+incompatible // indirect github.com/go-git/gcfg v1.5.1-0.20230307220236-3a3c6141e376 // indirect github.com/go-git/go-billy/v5 v5.8.0 // indirect github.com/go-git/go-git/v5 v5.17.1 // indirect github.com/go-ini/ini v1.67.0 // indirect github.com/go-jose/go-jose/v3 v3.0.4 // indirect github.com/go-logr/logr v1.4.3 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/go-ole/go-ole v1.3.0 // indirect 
github.com/go-openapi/analysis v0.23.0 // indirect github.com/go-openapi/errors v0.22.1 // indirect github.com/go-openapi/jsonpointer v0.21.0 // indirect github.com/go-openapi/jsonreference v0.21.0 // indirect github.com/go-openapi/loads v0.22.0 // indirect github.com/go-openapi/runtime v0.28.0 // indirect github.com/go-openapi/spec v0.21.0 // indirect github.com/go-openapi/strfmt v0.23.0 // indirect github.com/go-openapi/swag v0.23.0 // indirect github.com/go-openapi/validate v0.24.0 // indirect github.com/go-piv/piv-go v1.11.0 // indirect github.com/gobwas/glob v0.2.3 // indirect github.com/gogo/protobuf v1.3.2 // indirect github.com/golang-jwt/jwt/v4 v4.5.2 // indirect github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 // indirect github.com/golang/snappy v0.0.4 // indirect github.com/google/certificate-transparency-go v1.3.2-0.20250507091337-0eddb39e94f8 // indirect github.com/google/gnostic-models v0.7.0 // indirect github.com/google/go-cmp v0.7.0 // indirect github.com/google/go-containerregistry v0.20.7 // indirect github.com/google/go-github/v55 v55.0.0 // indirect github.com/google/go-querystring v1.1.0 // indirect github.com/google/licenseclassifier/v2 v2.0.0 // indirect github.com/google/s2a-go v0.1.9 // indirect github.com/google/uuid v1.6.0 // indirect github.com/googleapis/enterprise-certificate-proxy v0.3.6 // indirect github.com/googleapis/gax-go/v2 v2.15.0 // indirect github.com/gorilla/mux v1.8.1 // indirect github.com/hashicorp/go-cleanhttp v0.5.2 // indirect github.com/hashicorp/go-retryablehttp v0.7.7 // indirect github.com/in-toto/in-toto-golang v0.9.0 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99 // indirect github.com/jedisct1/go-minisign v0.0.0-20230811132847-661be99b8267 // indirect github.com/jellydator/ttlcache/v3 v3.3.0 // indirect github.com/jmespath/go-jmespath v0.4.1-0.20220621161143-b0104c826a24 // indirect github.com/josharian/intern 
v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/kevinburke/ssh_config v1.2.0 // indirect github.com/klauspost/compress v1.18.1 // indirect github.com/knqyf263/go-rpmdb v0.1.0 // indirect github.com/mailru/easyjson v0.9.0 // indirect github.com/mattn/go-isatty v0.0.20 // indirect github.com/miekg/pkcs11 v1.1.1 // indirect github.com/mitchellh/go-homedir v1.1.0 // indirect github.com/mitchellh/go-wordwrap v1.0.1 // indirect github.com/mitchellh/mapstructure v1.5.0 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/mozillazg/docker-credential-acr-helper v0.4.0 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/ncruces/go-strftime v0.1.9 // indirect github.com/nozzle/throttler v0.0.0-20180817012639-2ea982251481 // indirect github.com/oklog/ulid v1.3.1 // indirect github.com/oleiade/reflections v1.1.0 // indirect github.com/open-policy-agent/opa v1.4.0 // indirect github.com/opencontainers/go-digest v1.0.0 // indirect github.com/opencontainers/image-spec v1.1.1 // indirect github.com/opentracing/opentracing-go v1.2.0 // indirect github.com/package-url/packageurl-go v0.1.2 // indirect github.com/pborman/uuid v1.2.1 // indirect github.com/pelletier/go-toml/v2 v2.2.4 // indirect github.com/pjbgf/sha1cd v0.3.2 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 // indirect github.com/prometheus/client_golang v1.23.2 // indirect github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.66.1 // indirect github.com/prometheus/procfs v0.17.0 // indirect github.com/protocolbuffers/txtpbfmt v0.0.0-20240116145035-ef3ab179eed6 // indirect github.com/rcrowley/go-metrics v0.0.0-20201227073835-cf1acfcdf475 // indirect 
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect github.com/rogpeppe/go-internal v1.14.1 // indirect github.com/sagikazarmark/locafero v0.9.0 // indirect github.com/sassoftware/relic v7.2.1+incompatible // indirect github.com/secure-systems-lab/go-securesystemslib v0.10.0 // indirect github.com/sergi/go-diff v1.4.0 // indirect github.com/shibumi/go-pathspec v1.3.0 // indirect github.com/shirou/gopsutil/v3 v3.24.5 // indirect github.com/sigstore/cosign/v2 v2.4.1 // indirect github.com/sigstore/fulcio v1.6.3 // indirect github.com/sigstore/rekor v1.3.9 // indirect github.com/sigstore/sigstore v1.10.3 // indirect github.com/sigstore/timestamp-authority v1.2.2 // indirect github.com/sirupsen/logrus v1.9.3 // indirect github.com/skeema/knownhosts v1.3.1 // indirect github.com/sourcegraph/conc v0.3.0 // indirect github.com/spf13/afero v1.15.0 // indirect github.com/spf13/cast v1.9.2 // indirect github.com/spf13/cobra v1.10.2 // indirect github.com/spf13/viper v1.20.1 // indirect github.com/spiffe/go-spiffe/v2 v2.6.0 // indirect github.com/subosito/gotenv v1.6.0 // indirect github.com/syndtr/goleveldb v1.0.1-0.20220721030215-126854af5e6d // indirect github.com/tchap/go-patricia/v2 v2.3.2 // indirect github.com/thales-e-security/pool v0.0.2 // indirect github.com/theupdateframework/go-tuf v0.7.0 // indirect github.com/tjfoc/gmsm v1.4.1 // indirect github.com/transparency-dev/merkle v0.0.2 // indirect github.com/vbatts/tar-split v0.12.2 // indirect github.com/xanzy/go-gitlab v0.109.0 // indirect github.com/xanzy/ssh-agent v0.3.3 // indirect github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb // indirect github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415 // indirect github.com/yashtewari/glob-intersection v0.2.0 // indirect github.com/yusufpapurcu/wmi v1.2.4 // indirect gitlab.alpinelinux.org/alpine/go v0.10.0 // indirect go.mongodb.org/mongo-driver v1.17.2 // indirect 
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.63.0 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 // indirect go.opentelemetry.io/otel v1.39.0 // indirect go.opentelemetry.io/otel/metric v1.39.0 // indirect go.opentelemetry.io/otel/sdk v1.39.0 // indirect go.opentelemetry.io/otel/trace v1.39.0 // indirect go.step.sm/crypto v0.57.0 // indirect go.uber.org/multierr v1.11.0 // indirect go.uber.org/zap v1.27.0 // indirect golang.org/x/mod v0.31.0 // indirect golang.org/x/net v0.48.0 // indirect golang.org/x/oauth2 v0.34.0 // indirect golang.org/x/sync v0.19.0 // indirect golang.org/x/sys v0.39.0 // indirect golang.org/x/term v0.38.0 // indirect golang.org/x/text v0.32.0 // indirect golang.org/x/time v0.13.0 // indirect golang.org/x/tools/go/vcs v0.1.0-deprecated // indirect golang.org/x/xerrors v0.0.0-20240716161551-93cc26a95ae9 // indirect google.golang.org/api v0.242.0 // indirect google.golang.org/genproto v0.0.0-20250603155806-513f23925822 // indirect google.golang.org/genproto/googleapis/api v0.0.0-20251202230838-ff82c1b0f217 // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217 // indirect google.golang.org/grpc v1.79.3 // indirect google.golang.org/protobuf v1.36.10 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/ini.v1 v1.67.0 // indirect gopkg.in/warnings.v0 v0.1.2 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 // indirect k8s.io/release v0.18.0 // indirect k8s.io/utils v0.0.0-20260108192941-914a6e750570 modernc.org/libc v1.45.2 // indirect modernc.org/mathutil v1.6.0 // indirect modernc.org/memory v1.7.2 // indirect modernc.org/sqlite v1.29.5 // indirect sigs.k8s.io/bom v0.6.0 // indirect sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect sigs.k8s.io/promo-tools/v3 v3.6.0 // indirect sigs.k8s.io/release-sdk v0.12.2 // indirect 
sigs.k8s.io/release-utils v0.12.0 // indirect sigs.k8s.io/yaml v1.6.0 ) require ( github.com/urfave/sflags v0.4.1 github.com/weaveworks/eksctl v0.221.0 k8s.io/cli-runtime v0.35.0 k8s.io/cloud-provider-aws v1.35.0 sigs.k8s.io/e2e-framework v0.6.1-0.20250909060333-8677714ff9a6 // bump version once https://github.com/kubernetes-sigs/e2e-framework/pull/517 gets released ) require ( cel.dev/expr v0.25.1 // indirect cloud.google.com/go/auth v0.16.5 // indirect cloud.google.com/go/auth/oauth2adapt v0.2.8 // indirect cloud.google.com/go/monitoring v1.24.2 // indirect github.com/AliyunContainerService/ack-ram-tool/pkg/credentials/provider v0.14.0 // indirect github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.30.0 // indirect github.com/GoogleCloudPlatform/opentelemetry-operations-go/exporter/metric v0.51.0 // indirect github.com/GoogleCloudPlatform/opentelemetry-operations-go/internal/resourcemapping v0.51.0 // indirect github.com/avast/retry-go/v4 v4.6.1 // indirect github.com/aws/aws-sdk-go-v2/service/cloudtrail v1.55.1 // indirect github.com/aws/aws-sdk-go-v2/service/cloudwatchlogs v1.61.1 // indirect github.com/aws/aws-sdk-go-v2/service/elasticloadbalancing v1.33.15 // indirect github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2 v1.54.2 // indirect github.com/aws/aws-sdk-go-v2/service/kms v1.47.1 // indirect github.com/aws/aws-sdk-go-v2/service/outposts v1.57.8 // indirect github.com/aws/aws-sdk-go-v2/service/signin v1.0.5 // indirect github.com/cenkalti/backoff/v5 v5.0.3 // indirect github.com/cncf/xds/go v0.0.0-20251210132809-ee656c7534f5 // indirect github.com/envoyproxy/go-control-plane/envoy v1.36.0 // indirect github.com/envoyproxy/protoc-gen-validate v1.3.0 // indirect github.com/fatih/color v1.18.0 // indirect github.com/go-errors/errors v1.5.1 // indirect github.com/go-jose/go-jose/v4 v4.1.3 // indirect github.com/go-resty/resty/v2 v2.16.5 // indirect github.com/go-viper/mapstructure/v2 v2.4.0 // indirect github.com/google/btree 
v1.1.3 // indirect github.com/google/go-github/v60 v60.0.0 // indirect github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 // indirect github.com/hashicorp/go-version v1.7.0 // indirect github.com/in-toto/attestation v1.1.0 // indirect github.com/kris-nova/logger v0.2.2 // indirect github.com/mattn/go-colorable v0.1.14 // indirect github.com/mattn/go-runewidth v0.0.16 // indirect github.com/miekg/dns v1.1.61 // indirect github.com/moby/spdystream v0.5.0 // indirect github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 // indirect github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f // indirect github.com/octago/sflags v0.3.1 // indirect github.com/olekukonko/errors v0.0.0-20250405072817-4e6d85265da6 // indirect github.com/olekukonko/ll v0.0.8 // indirect github.com/olekukonko/tablewriter v1.0.8 // indirect github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c // indirect github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect github.com/rivo/uniseg v0.4.7 // indirect github.com/sigstore/protobuf-specs v0.5.0 // indirect github.com/sigstore/sigstore-go v0.6.1 // indirect github.com/stretchr/objx v0.5.2 // indirect github.com/theupdateframework/go-tuf/v2 v2.3.1 // indirect github.com/vladimirvivien/gexe v0.5.0 // indirect github.com/xlab/treeprint v1.2.0 // indirect go.opentelemetry.io/auto/sdk v1.2.1 // indirect go.opentelemetry.io/contrib/detectors/gcp v1.39.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0 // indirect go.opentelemetry.io/otel/sdk/metric v1.39.0 // indirect go.yaml.in/yaml/v2 v2.4.3 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/gcfg.v1 v1.2.3 // indirect k8s.io/cloud-provider v0.35.0 // indirect k8s.io/component-base v0.35.0 // indirect k8s.io/kubelet v0.35.0 // indirect sigs.k8s.io/kustomize/api v0.20.1 // 
indirect sigs.k8s.io/kustomize/kyaml v0.20.1 // indirect sigs.k8s.io/randfill v1.0.0 // indirect sigs.k8s.io/structured-merge-diff/v6 v6.3.0 // indirect ) ================================================ FILE: go.sum ================================================ cel.dev/expr v0.25.1 h1:1KrZg61W6TWSxuNZ37Xy49ps13NUovb66QLprthtwi4= cel.dev/expr v0.25.1/go.mod h1:hrXvqGP6G6gyx8UAHSHJ5RGk//1Oj5nXQ2NI02Nrsg4= cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw= cloud.google.com/go v0.121.2 h1:v2qQpN6Dx9x2NmwrqlesOt3Ys4ol5/lFZ6Mg1B7OJCg= cloud.google.com/go v0.121.2/go.mod h1:nRFlrHq39MNVWu+zESP2PosMWA0ryJw8KUBZ2iZpxbw= cloud.google.com/go/auth v0.16.5 h1:mFWNQ2FEVWAliEQWpAdH80omXFokmrnbDhUS9cBywsI= cloud.google.com/go/auth v0.16.5/go.mod h1:utzRfHMP+Vv0mpOkTRQoWD2q3BatTOoWbA7gCc2dUhQ= cloud.google.com/go/auth/oauth2adapt v0.2.8 h1:keo8NaayQZ6wimpNSmW5OPc283g65QNIiLpZnkHRbnc= cloud.google.com/go/auth/oauth2adapt v0.2.8/go.mod h1:XQ9y31RkqZCcwJWNSx2Xvric3RrU88hAYYbjDWYDL+c= cloud.google.com/go/compute/metadata v0.9.0 h1:pDUj4QMoPejqq20dK0Pg2N4yG9zIkYGdBtwLoEkH9Zs= cloud.google.com/go/compute/metadata v0.9.0/go.mod h1:E0bWwX5wTnLPedCKqk3pJmVgCBSM6qQI1yTBdEb3C10= cloud.google.com/go/iam v1.5.2 h1:qgFRAGEmd8z6dJ/qyEchAuL9jpswyODjA2lS+w234g8= cloud.google.com/go/iam v1.5.2/go.mod h1:SE1vg0N81zQqLzQEwxL2WI6yhetBdbNQuTvIKCSkUHE= cloud.google.com/go/kms v1.22.0 h1:dBRIj7+GDeeEvatJeTB19oYZNV0aj6wEqSIT/7gLqtk= cloud.google.com/go/kms v1.22.0/go.mod h1:U7mf8Sva5jpOb4bxYZdtw/9zsbIjrklYwPcvMk34AL8= cloud.google.com/go/logging v1.13.0 h1:7j0HgAp0B94o1YRDqiqm26w4q1rDMH7XNRU34lJXHYc= cloud.google.com/go/logging v1.13.0/go.mod h1:36CoKh6KA/M0PbhPKMq6/qety2DCAErbhXT62TuXALA= cloud.google.com/go/longrunning v0.6.7 h1:IGtfDWHhQCgCjwQjV9iiLnUta9LBCo8R9QmAFsS/PrE= cloud.google.com/go/longrunning v0.6.7/go.mod h1:EAFV3IZAKmM56TyiE6VAP3VoTzhZzySwI/YI1s/nRsY= cloud.google.com/go/monitoring v1.24.2 h1:5OTsoJ1dXYIiMiuL+sYscLc9BumrL3CarVLL7dd7lHM= 
cloud.google.com/go/monitoring v1.24.2/go.mod h1:x7yzPWcgDRnPEv3sI+jJGBkwl5qINf+6qY4eq0I9B4U= cloud.google.com/go/storage v1.53.0 h1:gg0ERZwL17pJ+Cz3cD2qS60w1WMDnwcm5YPAIQBHUAw= cloud.google.com/go/storage v1.53.0/go.mod h1:7/eO2a/srr9ImZW9k5uufcNahT2+fPb8w5it1i5boaA= cloud.google.com/go/trace v1.11.6 h1:2O2zjPzqPYAHrn3OKl029qlqG6W8ZdYaOWRyr8NgMT4= cloud.google.com/go/trace v1.11.6/go.mod h1:GA855OeDEBiBMzcckLPE2kDunIpC72N+Pq8WFieFjnI= cuelabs.dev/go/oci/ociregistry v0.0.0-20240404174027-a39bec0462d2 h1:BnG6pr9TTr6CYlrJznYUDj6V7xldD1W+1iXPum0wT/w= cuelabs.dev/go/oci/ociregistry v0.0.0-20240404174027-a39bec0462d2/go.mod h1:pK23AUVXuNzzTpfMCA06sxZGeVQ/75FdVtW249de9Uo= cuelang.org/go v0.9.2 h1:pfNiry2PdRBr02G/aKm5k2vhzmqbAOoaB4WurmEbWvs= cuelang.org/go v0.9.2/go.mod h1:qpAYsLOf7gTM1YdEg6cxh553uZ4q9ZDWlPbtZr9q1Wk= dario.cat/mergo v1.0.2 h1:85+piFYR1tMbRrLcDwR18y4UKJ3aH1Tbzi24VRW1TK8= dario.cat/mergo v1.0.2/go.mod h1:E/hbnu0NxMFBjpMIE34DRGLWqDy0g5FuKDhCb31ngxA= filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA= filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4= github.com/AdamKorcz/go-fuzz-headers-1 v0.0.0-20230919221257-8b5d3ce2d11d h1:zjqpY4C7H15HjRPEenkS4SAn3Jy2eRRjkjZbGR30TOg= github.com/AdamKorcz/go-fuzz-headers-1 v0.0.0-20230919221257-8b5d3ce2d11d/go.mod h1:XNqJ7hv2kY++g8XEHREpi+JqZo3+0l+CH2egBVN4yqM= github.com/AliyunContainerService/ack-ram-tool/pkg/credentials/provider v0.14.0 h1:kcnfY4vljxXliXDBrA9K9lwF8IoEZ4Up6Eg9kWTIm28= github.com/AliyunContainerService/ack-ram-tool/pkg/credentials/provider v0.14.0/go.mod h1:tlqp9mUGbsP+0z3Q+c0Q5MgSdq/OMwQhm5bffR3Q3ss= github.com/Azure/azure-sdk-for-go v68.0.0+incompatible h1:fcYLmCpyNYRnvJbPerq7U0hS+6+I79yEDJBqVNcqUzU= github.com/Azure/azure-sdk-for-go v68.0.0+incompatible/go.mod h1:9XXNKU+eRnpl9moKnB4QOLf1HestfXbmab5FXxiDBjc= github.com/Azure/azure-sdk-for-go/sdk/azcore v1.18.0 h1:Gt0j3wceWMwPmiazCa8MzMA0MfhmPIz0Qp0FJ6qcM0U= 
github.com/Azure/azure-sdk-for-go/sdk/azcore v1.18.0/go.mod h1:Ot/6aikWnKWi4l9QB7qVSwa8iMphQNqkWALMoNT3rzM= github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.10.1 h1:B+blDbyVIG3WaikNxPnhPiJ1MThR03b3vKGtER95TP4= github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.10.1/go.mod h1:JdM5psgjfBf5fo2uWOZhflPWyDBZ/O/CNAH9CtsuZE4= github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.1 h1:FPKJS1T+clwv+OLGt13a8UjqeRuh0O4SJ3lUriThc+4= github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.1/go.mod h1:j2chePtV91HrC22tGoRX3sGY42uF13WzmmV80/OdVAA= github.com/Azure/azure-sdk-for-go/sdk/security/keyvault/azkeys v1.3.0 h1:7rKG7UmnrxX4N53TFhkYqjc+kVUZuw0fL8I3Fh+Ld9E= github.com/Azure/azure-sdk-for-go/sdk/security/keyvault/azkeys v1.3.0/go.mod h1:Wjo+24QJVhhl/L7jy6w9yzFF2yDOf3cKECAa8ecf9vE= github.com/Azure/azure-sdk-for-go/sdk/security/keyvault/internal v1.1.0 h1:eXnN9kaS8TiDwXjoie3hMRLuwdUBUMW9KRgOqB3mCaw= github.com/Azure/azure-sdk-for-go/sdk/security/keyvault/internal v1.1.0/go.mod h1:XIpam8wumeZ5rVMuhdDQLMfIPDf1WO3IzrCRO3e3e3o= github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c h1:udKWzYgxTojEKWjV8V+WSxDXJ4NFATAsZjh8iIbsQIg= github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E= github.com/Azure/go-autorest v14.2.0+incompatible h1:V5VMDjClD3GiElqLWO7mz2MxNAK/vTfRHdAubSIPRgs= github.com/Azure/go-autorest v14.2.0+incompatible/go.mod h1:r+4oMnoxhatjLLJ6zxSWATqVooLgysK6ZNox3g/xq24= github.com/Azure/go-autorest/autorest v0.11.24/go.mod h1:G6kyRlFnTuSbEYkQGawPfsCswgme4iYf6rfSKUDzbCc= github.com/Azure/go-autorest/autorest v0.11.29 h1:I4+HL/JDvErx2LjyzaVxllw2lRDB5/BT2Bm4g20iqYw= github.com/Azure/go-autorest/autorest v0.11.29/go.mod h1:ZtEzC4Jy2JDrZLxvWs8LrBWEBycl1hbT1eknI8MtfAs= github.com/Azure/go-autorest/autorest/adal v0.9.18/go.mod h1:XVVeme+LZwABT8K5Lc3hA4nAe8LDBVle26gTrguhhPQ= github.com/Azure/go-autorest/autorest/adal v0.9.22/go.mod h1:XuAbAEUv2Tta//+voMI038TrJBqjKam0me7qR+L8Cmk= 
github.com/Azure/go-autorest/autorest/adal v0.9.23 h1:Yepx8CvFxwNKpH6ja7RZ+sKX+DWYNldbLiALMC3BTz8= github.com/Azure/go-autorest/autorest/adal v0.9.23/go.mod h1:5pcMqFkdPhviJdlEy3kC/v1ZLnQl0MH6XA5YCcMhy4c= github.com/Azure/go-autorest/autorest/azure/auth v0.5.12 h1:wkAZRgT/pn8HhFyzfe9UnqOjJYqlembgCTi72Bm/xKk= github.com/Azure/go-autorest/autorest/azure/auth v0.5.12/go.mod h1:84w/uV8E37feW2NCJ08uT9VBfjfUHpgLVnG2InYD6cg= github.com/Azure/go-autorest/autorest/azure/cli v0.4.5/go.mod h1:ADQAXrkgm7acgWVUNamOgh8YNrv4p27l3Wc55oVfpzg= github.com/Azure/go-autorest/autorest/azure/cli v0.4.6 h1:w77/uPk80ZET2F+AfQExZyEWtn+0Rk/uw17m9fv5Ajc= github.com/Azure/go-autorest/autorest/azure/cli v0.4.6/go.mod h1:piCfgPho7BiIDdEQ1+g4VmKyD5y+p/XtSNqE6Hc4QD0= github.com/Azure/go-autorest/autorest/date v0.3.0 h1:7gUk1U5M/CQbp9WoqinNzJar+8KY+LPI6wiWrP/myHw= github.com/Azure/go-autorest/autorest/date v0.3.0/go.mod h1:BI0uouVdmngYNUzGWeSYnokU+TrmwEsOqdt8Y6sso74= github.com/Azure/go-autorest/autorest/mocks v0.4.1/go.mod h1:LTp+uSrOhSkaKrUy935gNZuuIPPVsHlr9DSOxSayd+k= github.com/Azure/go-autorest/autorest/mocks v0.4.2 h1:PGN4EDXnuQbojHbU0UWoNvmu9AGVwYHG9/fkDYhtAfw= github.com/Azure/go-autorest/autorest/mocks v0.4.2/go.mod h1:Vy7OitM9Kei0i1Oj+LvyAWMXJHeKH1MVlzFugfVrmyU= github.com/Azure/go-autorest/logger v0.2.1 h1:IG7i4p/mDa2Ce4TRyAO8IHnVhAVF3RFU+ZtXWSmf4Tg= github.com/Azure/go-autorest/logger v0.2.1/go.mod h1:T9E3cAhj2VqvPOtCYAvby9aBXkZmbF5NWuPV8+WeEW8= github.com/Azure/go-autorest/tracing v0.6.0 h1:TYi4+3m5t6K48TGI9AUdb+IzbnSxvnvUMfuitfgcfuo= github.com/Azure/go-autorest/tracing v0.6.0/go.mod h1:+vhtPC754Xsa23ID7GlGsrdKBpUA79WCAKPPZVC2DeU= github.com/AzureAD/microsoft-authentication-library-for-go v1.4.2 h1:oygO0locgZJe7PpYPXT5A29ZkwJaPqcva7BVeemZOZs= github.com/AzureAD/microsoft-authentication-library-for-go v1.4.2/go.mod h1:wP83P5OoQ5p6ip3ScPr0BAq0BvuPAvacpEuSzyouqAI= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= 
github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.30.0 h1:sBEjpZlNHzK1voKq9695PJSX2o5NEXl7/OL3coiIY0c= github.com/GoogleCloudPlatform/opentelemetry-operations-go/detectors/gcp v1.30.0/go.mod h1:P4WPRUkOhJC13W//jWpyfJNDAIpvRbAUIYLX/4jtlE0= github.com/GoogleCloudPlatform/opentelemetry-operations-go/exporter/metric v0.51.0 h1:fYE9p3esPxA/C0rQ0AHhP0drtPXDRhaWiwg1DPqO7IU= github.com/GoogleCloudPlatform/opentelemetry-operations-go/exporter/metric v0.51.0/go.mod h1:BnBReJLvVYx2CS/UHOgVz2BXKXD9wsQPxZug20nZhd0= github.com/GoogleCloudPlatform/opentelemetry-operations-go/internal/cloudmock v0.51.0 h1:OqVGm6Ei3x5+yZmSJG1Mh2NwHvpVmZ08CB5qJhT9Nuk= github.com/GoogleCloudPlatform/opentelemetry-operations-go/internal/cloudmock v0.51.0/go.mod h1:SZiPHWGOOk3bl8tkevxkoiwPgsIl6CwrWcbwjfHZpdM= github.com/GoogleCloudPlatform/opentelemetry-operations-go/internal/resourcemapping v0.51.0 h1:6/0iUd0xrnX7qt+mLNRwg5c0PGv8wpE8K90ryANQwMI= github.com/GoogleCloudPlatform/opentelemetry-operations-go/internal/resourcemapping v0.51.0/go.mod h1:otE2jQekW/PqXk1Awf5lmfokJx4uwuqcj1ab5SpGeW0= github.com/MakeNowJust/heredoc/v2 v2.0.1 h1:rlCHh70XXXv7toz95ajQWOWQnN4WNLt0TdpZYIR/J6A= github.com/MakeNowJust/heredoc/v2 v2.0.1/go.mod h1:6/2Abh5s+hc3g9nbWLe9ObDIOhaRrqsyY9MWy+4JdRM= github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0= github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= github.com/Microsoft/go-winio v0.5.2/go.mod h1:WpS1mjBmmwHBEWmogvA2mj8546UReBk4v8QkMxJ6pZY= github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY= github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU= github.com/Pallinder/go-randomdata v1.2.0 h1:DZ41wBchNRb/0GfsePLiSwb0PHZmT67XY00lCDlaYPg= github.com/Pallinder/go-randomdata v1.2.0/go.mod h1:yHmJgulpD2Nfrm0cR9tI/+oAgRqCQQixsA8HyRZfV9Y= github.com/ProtonMail/go-crypto v1.1.6 
h1:ZcV+Ropw6Qn0AX9brlQLAUXfqLBc7Bl+f/DmNxpLfdw= github.com/ProtonMail/go-crypto v1.1.6/go.mod h1:rA3QumHc/FZ8pAHreoekgiAbzpNsfQAosU5td4SnOrE= github.com/ThalesIgnite/crypto11 v1.2.5 h1:1IiIIEqYmBvUYFeMnHqRft4bwf/O36jryEUpY+9ef8E= github.com/ThalesIgnite/crypto11 v1.2.5/go.mod h1:ILDKtnCKiQ7zRoNxcp36Y1ZR8LBPmR2E23+wTQe/MlE= github.com/agnivade/levenshtein v1.2.1 h1:EHBY3UOn1gwdy/VbFwgo4cxecRznFk7fKWN1KOX7eoM= github.com/agnivade/levenshtein v1.2.1/go.mod h1:QVVI16kDrtSuwcpd0p1+xMC6Z/VfhtCyDIjcwga4/DU= github.com/alessio/shellescape v1.4.1 h1:V7yhSDDn8LP4lc4jS8pFkt0zCnzVJlG5JXy9BVKJUX0= github.com/alessio/shellescape v1.4.1/go.mod h1:PZAiSCk0LJaZkiCSkPv8qIobYglO3FPpyFjDCtHLS30= github.com/alibabacloud-go/alibabacloud-gateway-spi v0.0.2/go.mod h1:sCavSAvdzOjul4cEqeVtvlSaSScfNsTQ+46HwlTL1hc= github.com/alibabacloud-go/alibabacloud-gateway-spi v0.0.4 h1:iC9YFYKDGEy3n/FtqJnOkZsene9olVspKmkX5A2YBEo= github.com/alibabacloud-go/alibabacloud-gateway-spi v0.0.4/go.mod h1:sCavSAvdzOjul4cEqeVtvlSaSScfNsTQ+46HwlTL1hc= github.com/alibabacloud-go/cr-20160607 v1.0.1 h1:WEnP1iPFKJU74ryUKh/YDPHoxMZawqlPajOymyNAkts= github.com/alibabacloud-go/cr-20160607 v1.0.1/go.mod h1:QHeKZtZ3F3FOE+/uIXCBAp8POwnUYekpLwr1dtQa5r0= github.com/alibabacloud-go/cr-20181201 v1.0.10 h1:B60f6S1imsgn2fgC6X6FrVNrONDrbCT0NwYhsJ0C9/c= github.com/alibabacloud-go/cr-20181201 v1.0.10/go.mod h1:VN9orB/w5G20FjytoSpZROqu9ZqxwycASmGqYUJSoDc= github.com/alibabacloud-go/darabonba-openapi v0.1.12/go.mod h1:sTAjsFJmVsmcVeklL9d9uDBlFsgl43wZ6jhI6BHqHqU= github.com/alibabacloud-go/darabonba-openapi v0.1.14/go.mod h1:w4CosR7O/kapCtEEMBm3JsQqWBU/CnZ2o0pHorsTWDI= github.com/alibabacloud-go/darabonba-openapi v0.2.1 h1:WyzxxKvhdVDlwpAMOHgAiCJ+NXa6g5ZWPFEzaK/ewwY= github.com/alibabacloud-go/darabonba-openapi v0.2.1/go.mod h1:zXOqLbpIqq543oioL9IuuZYOQgHQ5B8/n5OPrnko8aY= github.com/alibabacloud-go/darabonba-string v1.0.0/go.mod h1:93cTfV3vuPhhEwGGpKKqhVW4jLe7tDpo3LUM0i0g6mA= github.com/alibabacloud-go/debug 
v0.0.0-20190504072949-9472017b5c68/go.mod h1:6pb/Qy8c+lqua8cFpEy7g39NRRqOWc3rOwAy8m5Y2BY= github.com/alibabacloud-go/debug v1.0.0 h1:3eIEQWfay1fB24PQIEzXAswlVJtdQok8f3EVN5VrBnA= github.com/alibabacloud-go/debug v1.0.0/go.mod h1:8gfgZCCAC3+SCzjWtY053FrOcd4/qlH6IHTI4QyICOc= github.com/alibabacloud-go/endpoint-util v1.1.0/go.mod h1:O5FuCALmCKs2Ff7JFJMudHs0I5EBgecXXxZRyswlEjE= github.com/alibabacloud-go/endpoint-util v1.1.1 h1:ZkBv2/jnghxtU0p+upSU0GGzW1VL9GQdZO3mcSUTUy8= github.com/alibabacloud-go/endpoint-util v1.1.1/go.mod h1:O5FuCALmCKs2Ff7JFJMudHs0I5EBgecXXxZRyswlEjE= github.com/alibabacloud-go/openapi-util v0.0.9/go.mod h1:sQuElr4ywwFRlCCberQwKRFhRzIyG4QTP/P4y1CJ6Ws= github.com/alibabacloud-go/openapi-util v0.0.10/go.mod h1:sQuElr4ywwFRlCCberQwKRFhRzIyG4QTP/P4y1CJ6Ws= github.com/alibabacloud-go/openapi-util v0.0.11/go.mod h1:sQuElr4ywwFRlCCberQwKRFhRzIyG4QTP/P4y1CJ6Ws= github.com/alibabacloud-go/openapi-util v0.1.0 h1:0z75cIULkDrdEhkLWgi9tnLe+KhAFE/r5Pb3312/eAY= github.com/alibabacloud-go/openapi-util v0.1.0/go.mod h1:sQuElr4ywwFRlCCberQwKRFhRzIyG4QTP/P4y1CJ6Ws= github.com/alibabacloud-go/tea v1.1.0/go.mod h1:IkGyUSX4Ba1V+k4pCtJUc6jDpZLFph9QMy2VUPTwukg= github.com/alibabacloud-go/tea v1.1.7/go.mod h1:/tmnEaQMyb4Ky1/5D+SE1BAsa5zj/KeGOFfwYm3N/p4= github.com/alibabacloud-go/tea v1.1.8/go.mod h1:/tmnEaQMyb4Ky1/5D+SE1BAsa5zj/KeGOFfwYm3N/p4= github.com/alibabacloud-go/tea v1.1.11/go.mod h1:/tmnEaQMyb4Ky1/5D+SE1BAsa5zj/KeGOFfwYm3N/p4= github.com/alibabacloud-go/tea v1.1.17/go.mod h1:nXxjm6CIFkBhwW4FQkNrolwbfon8Svy6cujmKFUq98A= github.com/alibabacloud-go/tea v1.1.19/go.mod h1:nXxjm6CIFkBhwW4FQkNrolwbfon8Svy6cujmKFUq98A= github.com/alibabacloud-go/tea v1.2.2 h1:aTsR6Rl3ANWPfqeQugPglfurloyBJY85eFy7Gc1+8oU= github.com/alibabacloud-go/tea v1.2.2/go.mod h1:CF3vOzEMAG+bR4WOql8gc2G9H3EkH3ZLAQdpmpXMgwk= github.com/alibabacloud-go/tea-utils v1.3.1/go.mod h1:EI/o33aBfj3hETm4RLiAxF/ThQdSngxrpF8rKUDJjPE= github.com/alibabacloud-go/tea-utils v1.3.9/go.mod 
h1:EI/o33aBfj3hETm4RLiAxF/ThQdSngxrpF8rKUDJjPE= github.com/alibabacloud-go/tea-utils v1.4.3/go.mod h1:KNcT0oXlZZxOXINnZBs6YvgOd5aYp9U67G+E3R8fcQw= github.com/alibabacloud-go/tea-utils v1.4.5 h1:h0/6Xd2f3bPE4XHTvkpjwxowIwRCJAJOqY6Eq8f3zfA= github.com/alibabacloud-go/tea-utils v1.4.5/go.mod h1:KNcT0oXlZZxOXINnZBs6YvgOd5aYp9U67G+E3R8fcQw= github.com/alibabacloud-go/tea-xml v1.1.2/go.mod h1:Rq08vgCcCAjHyRi/M7xlHKUykZCEtyBy9+DPF6GgEu8= github.com/alibabacloud-go/tea-xml v1.1.3 h1:7LYnm+JbOq2B+T/B0fHC4Ies4/FofC4zHzYtqw7dgt0= github.com/alibabacloud-go/tea-xml v1.1.3/go.mod h1:Rq08vgCcCAjHyRi/M7xlHKUykZCEtyBy9+DPF6GgEu8= github.com/aliyun/credentials-go v1.1.2/go.mod h1:ozcZaMR5kLM7pwtCMEpVmQ242suV6qTJya2bDq4X1Tw= github.com/aliyun/credentials-go v1.3.2 h1:L4WppI9rctC8PdlMgyTkF8bBsy9pyKQEzBD1bHMRl+g= github.com/aliyun/credentials-go v1.3.2/go.mod h1:tlpz4uys4Rn7Ik4/piGRrTbXy2uLKvePgQJJduE+Y5c= github.com/anmitsu/go-shlex v0.0.0-20200514113438-38f4b401e2be h1:9AeTilPcZAjCFIImctFaOjnTIavg87rW78vTPkQqLI8= github.com/anmitsu/go-shlex v0.0.0-20200514113438-38f4b401e2be/go.mod h1:ySMOLuWl6zY27l47sB3qLNK6tF2fkHG55UZxx8oIVo4= github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0 h1:jfIu9sQUG6Ig+0+Ap1h4unLjW6YQJpKZVmUzxsD4E/Q= github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0/go.mod h1:t2tdKJDJF9BV14lnkjHmOQgcvEKgtqs5a1N3LNdJhGE= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs= github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 h1:DklsrG3dyBCFEj5IhUbnKptjxatkF07cF2ak3yi77so= github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2/go.mod h1:WaHUgvxTVq04UNunO+XhnAqY/wQc+bxr74GqbsZ/Jqw= github.com/atotto/clipboard v0.1.4 h1:EH0zSVneZPSuFR11BlR9YppQTVDbh5+16AmcJi4g1z4= github.com/atotto/clipboard v0.1.4/go.mod 
h1:ZY9tmq7sm5xIbd9bOK4onWV4S6X0u6GY7Vn0Yu86PYI= github.com/avast/retry-go v3.0.0+incompatible h1:4SOWQ7Qs+oroOTQOYnAHqelpCO0biHSxpiH9JdtuBj0= github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY= github.com/avast/retry-go/v4 v4.6.1 h1:VkOLRubHdisGrHnTu89g08aQEWEgRU7LVEop3GbIcMk= github.com/avast/retry-go/v4 v4.6.1/go.mod h1:V6oF8njAwxJ5gRo1Q7Cxab24xs5NCWZBeaHHBklR8mA= github.com/aws/amazon-ec2-instance-selector/v3 v3.1.2 h1:F8GBspJo+RmR4rYyw75XywEEQHQxBbF7QYKaMMnYREc= github.com/aws/amazon-ec2-instance-selector/v3 v3.1.2/go.mod h1:wdlMRtz9G4IO6H1yZPsqfGBxR8E6B/bdxHlGkls4kGQ= github.com/aws/aws-sdk-go v1.55.8 h1:JRmEUbU52aJQZ2AjX4q4Wu7t4uZjOu71uyNmaWlUkJQ= github.com/aws/aws-sdk-go v1.55.8/go.mod h1:ZkViS9AqA6otK+JBBNH2++sx1sgxrPKcSzPPvQkUtXk= github.com/aws/aws-sdk-go-v2 v1.41.1 h1:ABlyEARCDLN034NhxlRUSZr4l71mh+T5KAeGh6cerhU= github.com/aws/aws-sdk-go-v2 v1.41.1/go.mod h1:MayyLB8y+buD9hZqkCW3kX1AKq07Y5pXxtgB+rRFhz0= github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.4 h1:489krEF9xIGkOaaX3CE/Be2uWjiXrkCH6gUX+bZA/BU= github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.7.4/go.mod h1:IOAPF6oT9KCsceNTvvYMNHy0+kMF8akOjeDvPENWxp4= github.com/aws/aws-sdk-go-v2/config v1.32.7 h1:vxUyWGUwmkQ2g19n7JY/9YL8MfAIl7bTesIUykECXmY= github.com/aws/aws-sdk-go-v2/config v1.32.7/go.mod h1:2/Qm5vKUU/r7Y+zUk/Ptt2MDAEKAfUtKc1+3U1Mo3oY= github.com/aws/aws-sdk-go-v2/credentials v1.19.7 h1:tHK47VqqtJxOymRrNtUXN5SP/zUTvZKeLx4tH6PGQc8= github.com/aws/aws-sdk-go-v2/credentials v1.19.7/go.mod h1:qOZk8sPDrxhf+4Wf4oT2urYJrYt3RejHSzgAquYeppw= github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.17 h1:I0GyV8wiYrP8XpA70g1HBcQO1JlQxCMTW9npl5UbDHY= github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.18.17/go.mod h1:tyw7BOl5bBe/oqvoIeECFJjMdzXoa/dfVz3QQ5lgHGA= github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.17 h1:xOLELNKGp2vsiteLsvLPwxC+mYmO6OZ8PYgiuPJzF8U= github.com/aws/aws-sdk-go-v2/internal/configsources v1.4.17/go.mod 
h1:5M5CI3D12dNOtH3/mk6minaRwI2/37ifCURZISxA/IQ= github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.17 h1:WWLqlh79iO48yLkj1v3ISRNiv+3KdQoZ6JWyfcsyQik= github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.7.17/go.mod h1:EhG22vHRrvF8oXSTYStZhJc1aUgKtnJe+aOiFEV90cM= github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4 h1:WKuaxf++XKWlHWu9ECbMlha8WOEGm0OUEZqm4K/Gcfk= github.com/aws/aws-sdk-go-v2/internal/ini v1.8.4/go.mod h1:ZWy7j6v1vWGmPReu0iSGvRiise4YI5SkR3OHKTZ6Wuc= github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.17 h1:JqcdRG//czea7Ppjb+g/n4o8i/R50aTBHkA7vu0lK+k= github.com/aws/aws-sdk-go-v2/internal/v4a v1.4.17/go.mod h1:CO+WeGmIdj/MlPel2KwID9Gt7CNq4M65HUfBW97liM0= github.com/aws/aws-sdk-go-v2/service/autoscaling v1.62.5 h1:3maqUQlVW7C6zAdSknv6V/LInH/RJaDW0kTFcy7dkOw= github.com/aws/aws-sdk-go-v2/service/autoscaling v1.62.5/go.mod h1:8O5Pj92iNpfw/Fa7WdHbn6YiEjDoVdutz+9PGRNoP3Y= github.com/aws/aws-sdk-go-v2/service/cloudformation v1.71.5 h1:UNllAzfiRvz9il9s0yHJkySMJbxWqEVDfyLdDblnuT4= github.com/aws/aws-sdk-go-v2/service/cloudformation v1.71.5/go.mod h1:d6XSvIZM3pSKyXNbezwYT3nAcJeUzsJIXtZMNuQ9K2k= github.com/aws/aws-sdk-go-v2/service/cloudtrail v1.55.1 h1:fRFvc/mgSPujB9JrKuPt+HGnJE9I+nDwXMhEAwHI/GM= github.com/aws/aws-sdk-go-v2/service/cloudtrail v1.55.1/go.mod h1:XSNDmicqamWtX6yg5lisFAiFaf56PErQo/cMQvUQWX0= github.com/aws/aws-sdk-go-v2/service/cloudwatch v1.53.1 h1:ElB5x0nrBHgQs+XcpQ1XJpSJzMFCq6fDTpT6WQCWOtQ= github.com/aws/aws-sdk-go-v2/service/cloudwatch v1.53.1/go.mod h1:Cj+LUEvAU073qB2jInKV6Y0nvHX0k7bL7KAga9zZ3jw= github.com/aws/aws-sdk-go-v2/service/cloudwatchlogs v1.61.1 h1:1Ci283hJE+S3XC4n5b2peV/wlcAo5rTVDb6j6JJ1aTo= github.com/aws/aws-sdk-go-v2/service/cloudwatchlogs v1.61.1/go.mod h1:WXcA3mYRgWVIzjD+kxzap0axltmt4zBVDZaRX0S86gk= github.com/aws/aws-sdk-go-v2/service/ec2 v1.279.1 h1:hnNVFVOYrzJjkqI+mxc1M4ztgcVw986n0t0TCPlnDPY= github.com/aws/aws-sdk-go-v2/service/ec2 v1.279.1/go.mod h1:Uy+C+Sc58jozdoL1McQr8bDsEvNFx+/nBY+vpO1HVUY= 
github.com/aws/aws-sdk-go-v2/service/ecr v1.36.2 h1:VDQaVwGOokbd3VUbHF+wupiffdrbAZPdQnr5XZMJqrs= github.com/aws/aws-sdk-go-v2/service/ecr v1.36.2/go.mod h1:lvUlMghKYmSxSfv0vU7pdU/8jSY+s0zpG8xXhaGKCw0= github.com/aws/aws-sdk-go-v2/service/ecrpublic v1.27.2 h1:Zru9Iy2JPM5+uRnFnoqeOZzi8JIVIHJ0ua6JdeDHcyg= github.com/aws/aws-sdk-go-v2/service/ecrpublic v1.27.2/go.mod h1:PtQC3XjutCYFCn1+i8+wtpDaXvEK+vXF2gyLIKAmh4A= github.com/aws/aws-sdk-go-v2/service/eks v1.76.4 h1:5f9jIMcEd0wvRpEoo925Ltfw/2Yalcf+amFm3e1tRd8= github.com/aws/aws-sdk-go-v2/service/eks v1.76.4/go.mod h1:Qg678m+87sCuJhcsZojenz8mblYG+Tq86V4m3hjVz0s= github.com/aws/aws-sdk-go-v2/service/elasticloadbalancing v1.33.15 h1:dJtNm4/eMx8nczyN3P4iAARXMj2rAvOJnj608zCqCmw= github.com/aws/aws-sdk-go-v2/service/elasticloadbalancing v1.33.15/go.mod h1:QEbuU4eh8HGdv4uvld0Jth+KW8L0lOSYlyPcW6+JJo8= github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2 v1.54.2 h1:xJkfrBzq4b4JxnxwNNzjUKmbQj1hPa4uUikSeXQFBYk= github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2 v1.54.2/go.mod h1:DpGMmFhQwV/HH9zugLT5Ovf9HMKdQ+6ejfJybqEC9i4= github.com/aws/aws-sdk-go-v2/service/iam v1.53.2 h1:62G6btFUwAa5uR5iPlnlNVAM0zJSLbWgDfKOfUC7oW4= github.com/aws/aws-sdk-go-v2/service/iam v1.53.2/go.mod h1:av9clChrbZbJ5E21msSsiT2oghl2BJHfQGhCkXmhyu8= github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.4 h1:0ryTNEdJbzUCEWkVXEXoqlXV72J5keC1GvILMOuD00E= github.com/aws/aws-sdk-go-v2/service/internal/accept-encoding v1.13.4/go.mod h1:HQ4qwNZh32C3CBeO6iJLQlgtMzqeG17ziAA/3KDJFow= github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.8 h1:Z5EiPIzXKewUQK0QTMkutjiaPVeVYXX7KIqhXu/0fXs= github.com/aws/aws-sdk-go-v2/service/internal/checksum v1.9.8/go.mod h1:FsTpJtvC4U1fyDXk7c71XoDv3HlRm8V3NiYLeYLh5YE= github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.17 h1:RuNSMoozM8oXlgLG/n6WLaFGoea7/CddrCfIiSA+xdY= github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.13.17/go.mod 
h1:F2xxQ9TZz5gDWsclCtPQscGpP0VUOc8RqgFM3vDENmU= github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.17 h1:bGeHBsGZx0Dvu/eJC0Lh9adJa3M1xREcndxLNZlve2U= github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.19.17/go.mod h1:dcW24lbU0CzHusTE8LLHhRLI42ejmINN8Lcr22bwh/g= github.com/aws/aws-sdk-go-v2/service/kms v1.47.1 h1:6+C0RoGF4HJQALrsecOXN7cm/l5rgNHCw2xbcvFgpH4= github.com/aws/aws-sdk-go-v2/service/kms v1.47.1/go.mod h1:VJcNH6BLr+3VJwinRKdotLOMglHO8mIKlD3ea5c7hbw= github.com/aws/aws-sdk-go-v2/service/outposts v1.57.8 h1:zB9Q/dG0NkURC5E1g4qL/lsUp7aOqilfb7Ru9EOigDU= github.com/aws/aws-sdk-go-v2/service/outposts v1.57.8/go.mod h1:3osURGv9q/2wxP1qYnB15GWYgr6w2AbQkSxYtE6vTaY= github.com/aws/aws-sdk-go-v2/service/pricing v1.34.3 h1:vAv0hi3SWcc8cotkWRP4mPkmRbp/XqWKFyPW4Nwpzv0= github.com/aws/aws-sdk-go-v2/service/pricing v1.34.3/go.mod h1:giTP9ufzBQJRB6bc7P30PO8s35hCp6au5uM70zkohU4= github.com/aws/aws-sdk-go-v2/service/s3 v1.95.1 h1:C2dUPSnEpy4voWFIq3JNd8gN0Y5vYGDo44eUE58a/p8= github.com/aws/aws-sdk-go-v2/service/s3 v1.95.1/go.mod h1:5jggDlZ2CLQhwJBiZJb4vfk4f0GxWdEDruWKEJ1xOdo= github.com/aws/aws-sdk-go-v2/service/signin v1.0.5 h1:VrhDvQib/i0lxvr3zqlUwLwJP4fpmpyD9wYG1vfSu+Y= github.com/aws/aws-sdk-go-v2/service/signin v1.0.5/go.mod h1:k029+U8SY30/3/ras4G/Fnv/b88N4mAfliNn08Dem4M= github.com/aws/aws-sdk-go-v2/service/ssm v1.67.8 h1:31Llf5VfrZ78YvYs7sWcS7L2m3waikzRc6q1nYenVS4= github.com/aws/aws-sdk-go-v2/service/ssm v1.67.8/go.mod h1:/jgaDlU1UImoxTxhRNxXHvBAPqPZQ8oCjcPbbkR6kac= github.com/aws/aws-sdk-go-v2/service/sso v1.30.9 h1:v6EiMvhEYBoHABfbGB4alOYmCIrcgyPPiBE1wZAEbqk= github.com/aws/aws-sdk-go-v2/service/sso v1.30.9/go.mod h1:yifAsgBxgJWn3ggx70A3urX2AN49Y5sJTD1UQFlfqBw= github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.13 h1:gd84Omyu9JLriJVCbGApcLzVR3XtmC4ZDPcAI6Ftvds= github.com/aws/aws-sdk-go-v2/service/ssooidc v1.35.13/go.mod h1:sTGThjphYE4Ohw8vJiRStAcu3rbjtXRsdNB0TvZ5wwo= github.com/aws/aws-sdk-go-v2/service/sts v1.41.6 
h1:5fFjR/ToSOzB2OQ/XqWpZBmNvmP/pJ1jOWYlFDJTjRQ= github.com/aws/aws-sdk-go-v2/service/sts v1.41.6/go.mod h1:qgFDZQSD/Kys7nJnVqYlWKnh0SSdMjAi0uSwON4wgYQ= github.com/aws/smithy-go v1.24.0 h1:LpilSUItNPFr1eY85RYgTIg5eIEPtvFbskaFcmmIUnk= github.com/aws/smithy-go v1.24.0/go.mod h1:LEj2LM3rBRQJxPZTB4KuzZkaZYnZPnvgIhb4pu07mx0= github.com/awslabs/amazon-ecr-credential-helper/ecr-login v0.0.0-20240318154307-a1a918375412 h1:tfbmGNeOidVXzO1I7zo/WsT5QX7Aa0BGTbnEAE4FG3E= github.com/awslabs/amazon-ecr-credential-helper/ecr-login v0.0.0-20240318154307-a1a918375412/go.mod h1:kcUkjB9HwuV7PSck2b60kJtgDy+eTHWuAP0kb93FXsk= github.com/awslabs/amazon-eks-ami/nodeadm v0.0.0-20251001043626-89ce6578d960 h1:F/q1AN14KuY3I6HyEJxEUuQmEo5cDRpbXptP7UlB8GQ= github.com/awslabs/amazon-eks-ami/nodeadm v0.0.0-20251001043626-89ce6578d960/go.mod h1:cOBzmLe5lF+1C3h0SNnbl2LvMi+Gm8EXGlPxdXoucio= github.com/awslabs/operatorpkg v0.0.0-20250909182303-e8e550b6f339 h1:p4oSlQ9IaT7/DHfgcrs9zdNhdIp37VIMujZLuxSgECk= github.com/awslabs/operatorpkg v0.0.0-20250909182303-e8e550b6f339/go.mod h1:tNmCf0qIjaGbODGbm3DM8GIKBUvvxM7iW3KHbpSnVgw= github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k= github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/blang/semver v3.5.1+incompatible h1:cQNTCjp13qL8KC3Nbxr/y2Bqb63oX6wdnnjpJbkM4JQ= github.com/blang/semver v3.5.1+incompatible/go.mod h1:kRBLl5iJ+tD4TcOOxsy/0fnwebNt5EWlYSAyrTnjyyk= github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM= github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= github.com/buildkite/agent/v3 v3.81.0 h1:JVfkng2XnsXesFXwiFwLJFkuzVu4zvoJCvedfoIXD6E= github.com/buildkite/agent/v3 v3.81.0/go.mod h1:edJeyycODRxaFvpT22rDGwaQ5oa4eB8GjtbjgX5VpFw= 
github.com/buildkite/go-pipeline v0.13.1 h1:Y9p8pQIwPtauVwNrcmTDH6+XK7jE1nLuvWVaK8oymA8= github.com/buildkite/go-pipeline v0.13.1/go.mod h1:2HHqlSFTYgHFhzedJu0LhLs9n5c9XkYnHiQFVN5HE4U= github.com/buildkite/interpolate v0.1.3 h1:OFEhqji1rNTRg0u9DsSodg63sjJQEb1uWbENq9fUOBM= github.com/buildkite/interpolate v0.1.3/go.mod h1:UNVe6A+UfiBNKbhAySrBbZFZFxQ+DXr9nWen6WVt/A8= github.com/buildkite/roko v1.2.0 h1:hbNURz//dQqNl6Eo9awjQOVOZwSDJ8VEbBDxSfT9rGQ= github.com/buildkite/roko v1.2.0/go.mod h1:23R9e6nHxgedznkwwfmqZ6+0VJZJZ2Sg/uVcp2cP46I= github.com/bytecodealliance/wasmtime-go/v3 v3.0.2 h1:3uZCA/BLTIu+DqCfguByNMJa2HVHpXvjfy0Dy7g6fuA= github.com/bytecodealliance/wasmtime-go/v3 v3.0.2/go.mod h1:RnUjnIXxEJcL6BgCvNyzCCRzZcxCgsZCi+RNlvYor5Q= github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM= github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/charmbracelet/bubbles v0.20.0 h1:jSZu6qD8cRQ6k9OMfR1WlM+ruM8fkPWkHvQWD9LIutE= github.com/charmbracelet/bubbles v0.20.0/go.mod h1:39slydyswPy+uVOHZ5x/GjwVAFkCsV8IIVy+4MhzwwU= github.com/charmbracelet/bubbletea v1.3.10 h1:otUDHWMMzQSB0Pkc87rm691KZ3SWa4KUlvF9nRvCICw= github.com/charmbracelet/bubbletea v1.3.10/go.mod h1:ORQfo0fk8U+po9VaNvnV95UPWA1BitP1E0N6xJPlHr4= github.com/charmbracelet/colorprofile v0.2.3-0.20250311203215-f60798e515dc h1:4pZI35227imm7yK2bGPcfpFEmuY1gc2YSTShr4iJBfs= github.com/charmbracelet/colorprofile v0.2.3-0.20250311203215-f60798e515dc/go.mod 
h1:X4/0JoqgTIPSFcRA/P6INZzIuyqdFY5rm8tb41s9okk= github.com/charmbracelet/lipgloss v1.1.0 h1:vYXsiLHVkK7fp74RkV7b2kq9+zDLoEU4MZoFqR/noCY= github.com/charmbracelet/lipgloss v1.1.0/go.mod h1:/6Q8FR2o+kj8rz4Dq0zQc3vYf7X+B0binUUBwA0aL30= github.com/charmbracelet/x/ansi v0.10.1 h1:rL3Koar5XvX0pHGfovN03f5cxLbCF2YvLeyz7D2jVDQ= github.com/charmbracelet/x/ansi v0.10.1/go.mod h1:3RQDQ6lDnROptfpWuUVIUG64bD2g2BgntdxH0Ya5TeE= github.com/charmbracelet/x/cellbuf v0.0.13-0.20250311204145-2c3ea96c31dd h1:vy0GVL4jeHEwG5YOXDmi86oYw2yuYUGqz6a8sLwg0X8= github.com/charmbracelet/x/cellbuf v0.0.13-0.20250311204145-2c3ea96c31dd/go.mod h1:xe0nKWGd3eJgtqZRaN9RjMtK7xUYchjzPr7q6kcvCCs= github.com/charmbracelet/x/term v0.2.1 h1:AQeHeLZ1OqSXhrAWpYUtZyX1T3zVxfpZuEQMIQaGIAQ= github.com/charmbracelet/x/term v0.2.1/go.mod h1:oQ4enTYFV7QN4m0i9mzHrViD7TQKvNEEkHUMCmsxdUg= github.com/chrismellard/docker-credential-acr-env v0.0.0-20230304212654-82a0ddb27589 h1:krfRl01rzPzxSxyLyrChD+U+MzsBXbm0OwYYB67uF+4= github.com/chrismellard/docker-credential-acr-env v0.0.0-20230304212654-82a0ddb27589/go.mod h1:OuDyvmLnMCwa2ep4Jkm6nyA0ocJuZlGyk2gGseVzERM= github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI= github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= github.com/clbanning/mxj/v2 v2.5.5/go.mod h1:hNiWqW14h+kc+MdF9C6/YoRfjEJoR3ou6tn/Qo+ve2s= github.com/clbanning/mxj/v2 v2.7.0 h1:WA/La7UGCanFe5NpHF0Q3DNtnCsVoxbPKuyBNHWRyME= github.com/clbanning/mxj/v2 v2.7.0/go.mod h1:hNiWqW14h+kc+MdF9C6/YoRfjEJoR3ou6tn/Qo+ve2s= github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= github.com/cloudflare/cfssl v1.6.5 h1:46zpNkm6dlNkMZH/wMW22ejih6gIaJbzL2du6vD7ZeI= github.com/cloudflare/cfssl v1.6.5/go.mod h1:Bk1si7sq8h2+yVEDrFJiz3d7Aw+pfjjJSZVaD+Taky4= github.com/cloudflare/circl v1.6.3 
h1:9GPOhQGF9MCYUeXyMYlqTR6a5gTrgR/fBLXvUgtVcg8= github.com/cloudflare/circl v1.6.3/go.mod h1:2eXP6Qfat4O/Yhh8BznvKnJ+uzEoTQ6jVKJRn81BiS4= github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= github.com/cncf/xds/go v0.0.0-20251210132809-ee656c7534f5 h1:6xNmx7iTtyBRev0+D/Tv1FZd4SCg8axKApyNyRsAt/w= github.com/cncf/xds/go v0.0.0-20251210132809-ee656c7534f5/go.mod h1:KdCmV+x/BuvyMxRnYBlmVaq4OLiKW6iRQfvC62cvdkI= github.com/cockroachdb/apd/v3 v3.2.1 h1:U+8j7t0axsIgvQUqthuNm82HIrYXodOV2iWLWtEaIwg= github.com/cockroachdb/apd/v3 v3.2.1/go.mod h1:klXJcjp+FffLTHlhIG69tezTDvdP065naDsHzKhYSqc= github.com/codahale/rfc6979 v0.0.0-20141003034818-6a90f24967eb h1:EDmT6Q9Zs+SbUoc7Ik9EfrFqcylYqgPZ9ANSbTAntnE= github.com/codahale/rfc6979 v0.0.0-20141003034818-6a90f24967eb/go.mod h1:ZjrT6AXHbDs86ZSdt/osfBi5qfexBrKUdONk989Wnk4= github.com/common-nighthawk/go-figure v0.0.0-20210622060536-734e95fb86be h1:J5BL2kskAlV9ckgEsNQXscjIaLiOYiZ75d4e94E6dcQ= github.com/common-nighthawk/go-figure v0.0.0-20210622060536-734e95fb86be/go.mod h1:mk5IQ+Y0ZeO87b858TlA645sVcEcbiX6YqP98kt+7+w= github.com/containerd/stargz-snapshotter/estargz v0.18.1 h1:cy2/lpgBXDA3cDKSyEfNOFMA/c10O1axL69EU7iirO8= github.com/containerd/stargz-snapshotter/estargz v0.18.1/go.mod h1:ALIEqa7B6oVDsrF37GkGN20SuvG/pIMm7FwP7ZmRb0Q= github.com/coreos/go-oidc/v3 v3.17.0 h1:hWBGaQfbi0iVviX4ibC7bk8OKT5qNr4klBaCHVNvehc= github.com/coreos/go-oidc/v3 v3.17.0/go.mod h1:wqPbKFrVnE90vty060SB40FCJ8fTHTxSwyXJqZH+sI8= github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/cyberphone/json-canonicalization v0.0.0-20231217050601-ba74d44ecf5f h1:eHnXnuK47UlSTOQexbzxAZfekVz6i+LKRdj1CU5DPaM= github.com/cyberphone/json-canonicalization v0.0.0-20231217050601-ba74d44ecf5f/go.mod h1:uzvlm1mxhHkdfqitSA92i7Se+S9ksOn3a3qmv/kyOCw= github.com/cyphar/filepath-securejoin v0.4.1 h1:JyxxyPEaktOD+GAnqIqTf9A8tHyAG22rowi7HkoSU1s= 
github.com/cyphar/filepath-securejoin v0.4.1/go.mod h1:Sdj7gXlvMcPZsbhwhQ33GguGLDGQL7h7bg04C/+u9jI= github.com/danieljoos/wincred v1.2.2 h1:774zMFJrqaeYCK2W57BgAem/MLi6mtSE47MB6BOJ0i0= github.com/danieljoos/wincred v1.2.2/go.mod h1:w7w4Utbrz8lqeMbDAK0lkNJUv5sAOkFi7nd/ogr0Uh8= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/depcheck-test/depcheck-test v0.0.0-20220607135614-199033aaa936 h1:foGzavPWwtoyBvjWyKJYDYsyzy+23iBV7NKTwdk+LRY= github.com/depcheck-test/depcheck-test v0.0.0-20220607135614-199033aaa936/go.mod h1:ttKPnOepYt4LLzD+loXQ1rT6EmpyIYHro7TAJuIIlHo= github.com/dgraph-io/badger/v4 v4.7.0 h1:Q+J8HApYAY7UMpL8d9owqiB+odzEc0zn/aqOD9jhc6Y= github.com/dgraph-io/badger/v4 v4.7.0/go.mod h1:He7TzG3YBy3j4f5baj5B7Zl2XyfNe5bl4Udl0aPemVA= github.com/dgraph-io/ristretto/v2 v2.2.0 h1:bkY3XzJcXoMuELV8F+vS8kzNgicwQFAaGINAEJdWGOM= github.com/dgraph-io/ristretto/v2 v2.2.0/go.mod h1:RZrm63UmcBAaYWC1DotLYBmTvgkrs0+XhBd7Npn7/zI= github.com/dgryski/trifles v0.0.0-20230903005119-f50d829f2e54 h1:SG7nF6SRlWhcT7cNTs5R6Hk4V2lcmLz2NsG2VnInyNo= github.com/dgryski/trifles v0.0.0-20230903005119-f50d829f2e54/go.mod h1:if7Fbed8SFyPtHLHbg49SI7NAdJiC5WIA09pe59rfAA= github.com/digitorus/pkcs7 v0.0.0-20230713084857-e76b763bdc49/go.mod h1:SKVExuS+vpu2l9IoOc0RwqE7NYnb0JlcFHFnEJkVDzc= github.com/digitorus/pkcs7 v0.0.0-20230818184609-3a137a874352 h1:ge14PCmCvPjpMQMIAH7uKg0lrtNSOdpYsRXlwk3QbaE= github.com/digitorus/pkcs7 v0.0.0-20230818184609-3a137a874352/go.mod h1:SKVExuS+vpu2l9IoOc0RwqE7NYnb0JlcFHFnEJkVDzc= github.com/digitorus/timestamp v0.0.0-20231217203849-220c5c2851b7 h1:lxmTCgmHE1GUYL7P0MlNa00M67axePTq+9nBSGddR8I= 
github.com/digitorus/timestamp v0.0.0-20231217203849-220c5c2851b7/go.mod h1:GvWntX9qiTlOud0WkQ6ewFm0LPy5JUR1Xo0Ngbd1w6Y= github.com/dimchansky/utfbom v1.1.1 h1:vV6w1AhK4VMnhBno/TPVCoK9U/LP0PkLCS9tbxHdi/U= github.com/dimchansky/utfbom v1.1.1/go.mod h1:SxdoEBH5qIqFocHMyGOXVAybYJdr71b1Q/j0mACtrfE= github.com/docker/cli v29.0.3+incompatible h1:8J+PZIcF2xLd6h5sHPsp5pvvJA+Sr2wGQxHkRl53a1E= github.com/docker/cli v29.0.3+incompatible/go.mod h1:JLrzqnKDaYBop7H2jaqPtU4hHvMKP+vjCwu2uszcLI8= github.com/docker/distribution v2.8.3+incompatible h1:AtKxIZ36LoNK51+Z6RpzLpddBirtxJnzDrHLEKxTAYk= github.com/docker/distribution v2.8.3+incompatible/go.mod h1:J2gT2udsDAN96Uj4KfcMRqY0/ypR+oyYUYmja8H+y+w= github.com/docker/docker-credential-helpers v0.9.3 h1:gAm/VtF9wgqJMoxzT3Gj5p4AqIjCBS4wrsOh9yRqcz8= github.com/docker/docker-credential-helpers v0.9.3/go.mod h1:x+4Gbw9aGmChi3qTLZj8Dfn0TD20M/fuWy0E5+WDeCo= github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= github.com/elazarl/goproxy v1.7.2 h1:Y2o6urb7Eule09PjlhQRGNsqRfPmYI3KKQLFpCAV3+o= github.com/elazarl/goproxy v1.7.2/go.mod h1:82vkLNir0ALaW14Rc399OTTjyNREgmdL2cVoIbS6XaE= github.com/emicklei/go-restful/v3 v3.12.2 h1:DhwDP0vY3k8ZzE0RunuJy8GhNpPL6zqLkDf9B/a0/xU= github.com/emicklei/go-restful/v3 v3.12.2/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= github.com/emicklei/proto v1.13.2 h1:z/etSFO3uyXeuEsVPzfl56WNgzcvIr42aQazXaQmFZY= github.com/emicklei/proto v1.13.2/go.mod h1:rn1FgRS/FANiZdD2djyH7TMA9jdRDcYQ9IEN9yvjX0A= github.com/emirpasic/gods v1.18.1 h1:FXtiHYKDGKCW2KzwZKx0iC0PQmdlorYgdFG9jPXJ1Bc= github.com/emirpasic/gods v1.18.1/go.mod h1:8tpGGwCnJ5H4r6BWwaV6OrWmMoPhUl5jm/FMNAnJvWQ= github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= 
github.com/envoyproxy/go-control-plane v0.14.0 h1:hbG2kr4RuFj222B6+7T83thSPqLjwBIfQawTkC++2HA= github.com/envoyproxy/go-control-plane v0.14.0/go.mod h1:NcS5X47pLl/hfqxU70yPwL9ZMkUlwlKxtAohpi2wBEU= github.com/envoyproxy/go-control-plane/envoy v1.36.0 h1:yg/JjO5E7ubRyKX3m07GF3reDNEnfOboJ0QySbH736g= github.com/envoyproxy/go-control-plane/envoy v1.36.0/go.mod h1:ty89S1YCCVruQAm9OtKeEkQLTb+Lkz0k8v9W0Oxsv98= github.com/envoyproxy/go-control-plane/ratelimit v0.1.0 h1:/G9QYbddjL25KvtKTv3an9lx6VBE2cnb8wp1vEGNYGI= github.com/envoyproxy/go-control-plane/ratelimit v0.1.0/go.mod h1:Wk+tMFAFbCXaJPzVVHnPgRKdUdwW/KdbRt94AzgRee4= github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= github.com/envoyproxy/protoc-gen-validate v1.3.0 h1:TvGH1wof4H33rezVKWSpqKz5NXWg5VPuZ0uONDT6eb4= github.com/envoyproxy/protoc-gen-validate v1.3.0/go.mod h1:HvYl7zwPa5mffgyeTUHA9zHIH36nmrm7oCbo4YKoSWA= github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f h1:Y/CXytFA4m6baUTXGLOoWe4PQhGxaX0KpnayAqC48p4= github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f/go.mod h1:vw97MGsxSvLiUE2X8qFplwetxpGLQrlU1Q9AUEIzCaM= github.com/evanphx/json-patch v5.9.11+incompatible h1:ixHHqfcGvxhWkniF1tWxBHA0yb4Z+d1UQi45df52xW8= github.com/evanphx/json-patch v5.9.11+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/evanphx/json-patch/v5 v5.9.11 h1:/8HVnzMq13/3x9TPvjG08wUGqBTmZBsCWzjTM0wiaDU= github.com/evanphx/json-patch/v5 v5.9.11/go.mod h1:3j+LviiESTElxA4p3EMKAB9HXj3/XEtnUf6OZxqIQTM= github.com/evertras/bubble-table v0.17.1 h1:HJwq3iQrZulXDE93ZcqJNiUVQCBbN4IJ2CkB/IxO3kk= github.com/evertras/bubble-table v0.17.1/go.mod h1:ifHujS1YxwnYSOgcR2+m3GnJ84f7CVU/4kUOxUCjEbQ= github.com/fatih/color v1.10.0/go.mod h1:ELkj/draVOlAH/xkhN6mQ50Qd0MPOk5AAr3maGEBuJM= github.com/fatih/color v1.18.0 h1:S8gINlzdQ840/4pfAwic/ZE0djQEH3wM94VfqLTZcOM= github.com/fatih/color v1.18.0/go.mod h1:4FelSpRwEGDpQ12mAdzqdOukCy4u8WUtOY6lkT/6HfU= 
github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= github.com/fortytw2/leaktest v1.3.0 h1:u8491cBMTQ8ft8aeV+adlcytMZylmA5nnwwkRZjI8vw= github.com/fortytw2/leaktest v1.3.0/go.mod h1:jDsjWgpAGjm2CA7WthBh/CdZYEPF31XHquHwclZch5g= github.com/foxcpp/go-mockdns v1.1.0 h1:jI0rD8M0wuYAxL7r/ynTrCQQq0BVqfB99Vgk7DlmewI= github.com/foxcpp/go-mockdns v1.1.0/go.mod h1:IhLeSFGed3mJIAXPH2aiRQB+kqz7oqu8ld2qVbOu7Wk= github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8= github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ= github.com/fsnotify/fsnotify v1.5.4/go.mod h1:OVB6XrOHzAwXMpEM7uPOzcehqUV2UqJxmVXmkdnm1bU= github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM= github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= github.com/glebarez/go-sqlite v1.22.0 h1:uAcMJhaA6r3LHMTFgP0SifzgXg46yJkgxqyuyec+ruQ= github.com/glebarez/go-sqlite v1.22.0/go.mod h1:PlBIdHe0+aUEFn+r2/uthrWq4FxbzugL0L8Li6yQJbc= github.com/gliderlabs/ssh v0.3.8 h1:a4YXD1V7xMF9g5nTkdfnja3Sxy1PVDCj1Zg4Wb8vY6c= github.com/gliderlabs/ssh v0.3.8/go.mod h1:xYoytBv1sV0aL3CavoDuJIQNURXkkfPA/wxQ1pL1fAU= github.com/go-chi/chi v4.1.2+incompatible h1:fGFk2Gmi/YKXk0OmGfBh0WgmN3XB8lVnEyNz34tQRec= github.com/go-chi/chi v4.1.2+incompatible/go.mod h1:eB3wogJHnLi3x/kFX2A+IbTBlXxmMeXJVKy9tTv1XzQ= github.com/go-errors/errors v1.5.1 h1:ZwEMSLRCapFLflTpT7NKaAc7ukJ8ZPEjzlxt8rPN8bk= github.com/go-errors/errors v1.5.1/go.mod 
h1:sIVyrIiJhuEF+Pj9Ebtd6P/rEYROXFi3BopGUQ5a5Og= github.com/go-git/gcfg v1.5.1-0.20230307220236-3a3c6141e376 h1:+zs/tPmkDkHx3U66DAb0lQFJrpS6731Oaa12ikc+DiI= github.com/go-git/gcfg v1.5.1-0.20230307220236-3a3c6141e376/go.mod h1:an3vInlBmSxCcxctByoQdvwPiA7DTK7jaaFDBTtu0ic= github.com/go-git/go-billy/v5 v5.8.0 h1:I8hjc3LbBlXTtVuFNJuwYuMiHvQJDq1AT6u4DwDzZG0= github.com/go-git/go-billy/v5 v5.8.0/go.mod h1:RpvI/rw4Vr5QA+Z60c6d6LXH0rYJo0uD5SqfmrrheCY= github.com/go-git/go-git-fixtures/v4 v4.3.2-0.20231010084843-55a94097c399 h1:eMje31YglSBqCdIqdhKBW8lokaMrL3uTkpGYlE2OOT4= github.com/go-git/go-git-fixtures/v4 v4.3.2-0.20231010084843-55a94097c399/go.mod h1:1OCfN199q1Jm3HZlxleg+Dw/mwps2Wbk9frAWm+4FII= github.com/go-git/go-git/v5 v5.17.1 h1:WnljyxIzSj9BRRUlnmAU35ohDsjRK0EKmL0evDqi5Jk= github.com/go-git/go-git/v5 v5.17.1/go.mod h1:pW/VmeqkanRFqR6AljLcs7EA7FbZaN5MQqO7oZADXpo= github.com/go-ini/ini v1.67.0 h1:z6ZrTEZqSWOTyH2FlglNbNgARyHG8oLW9gMELqKr06A= github.com/go-ini/ini v1.67.0/go.mod h1:ByCAeIL28uOIIG0E3PJtZPDL8WnHpFKFOtgjp+3Ies8= github.com/go-jose/go-jose/v3 v3.0.4 h1:Wp5HA7bLQcKnf6YYao/4kpRpVMp/yf6+pJKV8WFSaNY= github.com/go-jose/go-jose/v3 v3.0.4/go.mod h1:5b+7YgP7ZICgJDBdfjZaIt+H/9L9T/YQrVfLAMboGkQ= github.com/go-jose/go-jose/v4 v4.1.3 h1:CVLmWDhDVRa6Mi/IgCgaopNosCaHz7zrMeF9MlZRkrs= github.com/go-jose/go-jose/v4 v4.1.3/go.mod h1:x4oUasVrzR7071A4TnHLGSPpNOm2a21K9Kf04k1rs08= github.com/go-logr/logr v0.1.0/go.mod h1:ixOQHD9gLJUVQQ2ZOR7zLEifBX6tGkNJF4QyIY7sIas= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ= github.com/go-logr/zapr 
v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg= github.com/go-ole/go-ole v1.2.6/go.mod h1:pprOEPIfldk/42T2oK7lQ4v4JSDwmV0As9GaiUsvbm0= github.com/go-ole/go-ole v1.3.0 h1:Dt6ye7+vXGIKZ7Xtk4s6/xVdGDQynvom7xCFEdWr6uE= github.com/go-ole/go-ole v1.3.0/go.mod h1:5LS6F96DhAwUc7C+1HLexzMXY1xGRSryjyPPKW6zv78= github.com/go-openapi/analysis v0.23.0 h1:aGday7OWupfMs+LbmLZG4k0MYXIANxcuBTYUC03zFCU= github.com/go-openapi/analysis v0.23.0/go.mod h1:9mz9ZWaSlV8TvjQHLl2mUW2PbZtemkE8yA5v22ohupo= github.com/go-openapi/errors v0.22.1 h1:kslMRRnK7NCb/CvR1q1VWuEQCEIsBGn5GgKD9e+HYhU= github.com/go-openapi/errors v0.22.1/go.mod h1:+n/5UdIqdVnLIJ6Q9Se8HNGUXYaY6CN8ImWzfi/Gzp0= github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ= github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY= github.com/go-openapi/jsonreference v0.21.0 h1:Rs+Y7hSXT83Jacb7kFyjn4ijOuVGSvOdF2+tg1TRrwQ= github.com/go-openapi/jsonreference v0.21.0/go.mod h1:LmZmgsrTkVg9LG4EaHeY8cBDslNPMo06cago5JNLkm4= github.com/go-openapi/loads v0.22.0 h1:ECPGd4jX1U6NApCGG1We+uEozOAvXvJSF4nnwHZ8Aco= github.com/go-openapi/loads v0.22.0/go.mod h1:yLsaTCS92mnSAZX5WWoxszLj0u+Ojl+Zs5Stn1oF+rs= github.com/go-openapi/runtime v0.28.0 h1:gpPPmWSNGo214l6n8hzdXYhPuJcGtziTOgUpvsFWGIQ= github.com/go-openapi/runtime v0.28.0/go.mod h1:QN7OzcS+XuYmkQLw05akXk0jRH/eZ3kb18+1KwW9gyc= github.com/go-openapi/spec v0.21.0 h1:LTVzPc3p/RzRnkQqLRndbAzjY0d0BCL72A6j3CdL9ZY= github.com/go-openapi/spec v0.21.0/go.mod h1:78u6VdPw81XU44qEWGhtr982gJ5BWg2c0I5XwVMotYk= github.com/go-openapi/strfmt v0.23.0 h1:nlUS6BCqcnAk0pyhi9Y+kdDVZdZMHfEKQiS4HaMgO/c= github.com/go-openapi/strfmt v0.23.0/go.mod h1:NrtIpfKtWIygRkKVsxh7XQMDQW5HKQl6S5ik2elW+K4= github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE= github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ= github.com/go-openapi/validate v0.24.0 
h1:LdfDKwNbpB6Vn40xhTdNZAnfLECL81w+VX3BumrGD58= github.com/go-openapi/validate v0.24.0/go.mod h1:iyeX1sEufmv3nPbBdX3ieNviWnOZaJ1+zquzJEf2BAQ= github.com/go-piv/piv-go v1.11.0 h1:5vAaCdRTFSIW4PeqMbnsDlUZ7odMYWnHBDGdmtU/Zhg= github.com/go-piv/piv-go v1.11.0/go.mod h1:NZ2zmjVkfFaL/CF8cVQ/pXdXtuj110zEKGdJM6fJZZM= github.com/go-quicktest/qt v1.101.0 h1:O1K29Txy5P2OK0dGo59b7b0LR6wKfIhttaAhHUyn7eI= github.com/go-quicktest/qt v1.101.0/go.mod h1:14Bz/f7NwaXPtdYEgzsx46kqSxVwTbzVZsDC26tQJow= github.com/go-resty/resty/v2 v2.16.5 h1:hBKqmWrr7uRc3euHVqmh1HTHcKn99Smr7o5spptdhTM= github.com/go-resty/resty/v2 v2.16.5/go.mod h1:hkJtXbA2iKHzJheXYvQ8snQES5ZLGKMwQ07xAwp/fiA= github.com/go-rod/rod v0.116.2 h1:A5t2Ky2A+5eD/ZJQr1EfsQSe5rms5Xof/qj296e+ZqA= github.com/go-rod/rod v0.116.2/go.mod h1:H+CMO9SCNc2TJ2WfrG+pKhITz57uGNYU43qYHh438Mg= github.com/go-sql-driver/mysql v1.9.1 h1:FrjNGn/BsJQjVRuSa8CBrM5BWA9BWoXXat3KrtSb/iI= github.com/go-sql-driver/mysql v1.9.1/go.mod h1:qn46aNg1333BRMNU69Lq93t8du/dwxI64Gl8i5p1WMU= github.com/go-task/slim-sprig v0.0.0-20210107165309-348f09dbbbc0 h1:p104kn46Q8WdvHunIJ9dAyjPVtrBPhSr3KT2yUst43I= github.com/go-task/slim-sprig v0.0.0-20210107165309-348f09dbbbc0/go.mod h1:fyg7847qk6SyHyPtNmDHnmrv/HOrqktSC+C9fM+CJOE= github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= github.com/go-viper/mapstructure/v2 v2.4.0 h1:EBsztssimR/CONLSZZ04E8qAkxNYq4Qp9LvH92wZUgs= github.com/go-viper/mapstructure/v2 v2.4.0/go.mod h1:oJDH3BJKyqBA2TXFhDsKDGDTlndYOZ6rGS0BRZIxGhM= github.com/gobwas/glob v0.2.3 h1:A4xDbljILXROh+kObIiy5kIaPYD8e96x1tgBhUI5J+Y= github.com/gobwas/glob v0.2.3/go.mod h1:d3Ez4x06l9bZtSvzIay5+Yzi0fmZzPgnTbPcKjJAkT8= github.com/godbus/dbus/v5 v5.1.0 h1:4KLkAxT3aOY8Li4FRJe/KvhoNFFxo0m6fNuFUO8QJUk= github.com/godbus/dbus/v5 v5.1.0/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/gofrs/flock v0.13.0 
h1:95JolYOvGMqeH31+FC7D2+uULf6mG61mEZ/A8dRYMzw= github.com/gofrs/flock v0.13.0/go.mod h1:jxeyy9R1auM5S6JYDBhDt+E2TCo7DkratH4Pgi8P+Z0= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= github.com/golang-jwt/jwt v3.2.2+incompatible h1:IfV12K8xAKAnZqdXVzCZ+TOjboZ2keLg81eXfW3O+oY= github.com/golang-jwt/jwt/v4 v4.0.0/go.mod h1:/xlHOz8bRuivTWchD4jCa+NbatV+wEUSzwAxVc6locg= github.com/golang-jwt/jwt/v4 v4.2.0/go.mod h1:/xlHOz8bRuivTWchD4jCa+NbatV+wEUSzwAxVc6locg= github.com/golang-jwt/jwt/v4 v4.5.0/go.mod h1:m21LjoU+eqJr34lmDMbreY2eSTRJ1cv77w39/MY0Ch0= github.com/golang-jwt/jwt/v4 v4.5.2 h1:YtQM7lnr8iZ+j5q71MGKkNw9Mn7AjHM68uc9g5fXeUI= github.com/golang-jwt/jwt/v4 v4.5.2/go.mod h1:m21LjoU+eqJr34lmDMbreY2eSTRJ1cv77w39/MY0Ch0= github.com/golang-jwt/jwt/v5 v5.2.2 h1:Rl4B7itRWVtYIHFrSNd7vhTiz9UpLdi6gZhZ3wEeDy8= github.com/golang-jwt/jwt/v5 v5.2.2/go.mod h1:pqrtFR0X4osieyHYxtmOUWsAWrfe1Q5UVIyoH402zdk= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8 h1:f+oWsMOmNPc8JmEHVZIycC7hBoQxHH9pNKQORJNozsQ= github.com/golang/groupcache v0.0.0-20241129210726-2c02b8208cf8/go.mod h1:wcDNUvekVysuuOpQKo3191zZyTpiI6se1N1ULghS0sw= github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8= github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA= github.com/golang/protobuf v1.4.0-rc.2/go.mod 
h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs= github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= github.com/golang/snappy v0.0.4 h1:yAGX7huGHXlcLOEtBnF4w7FQwA26wojNCwOYAEhLjQM= github.com/golang/snappy v0.0.4/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q= github.com/google/btree v1.1.3 h1:CVpQJjYgC4VbzxeGVHfvZrv1ctoYCAI8vbl07Fcxlyg= github.com/google/btree v1.1.3/go.mod h1:qOPhT0dTNdNzV6Z/lhRX0YXUafgPLFUh+gZMl761Gm4= github.com/google/certificate-transparency-go v1.3.2-0.20250507091337-0eddb39e94f8 h1:1RSWsOSxq2gk4pD/63bhsPwoOXgz2yXVadxXPbwZ0ec= github.com/google/certificate-transparency-go v1.3.2-0.20250507091337-0eddb39e94f8/go.mod h1:6Rm5w0Mlv87LyBNOCgfKYjdIBBpF42XpXGsbQvQGomQ= github.com/google/flatbuffers v25.2.10+incompatible h1:F3vclr7C3HpB1k9mxCGRMXq6FdUalZ6H/pNX4FP1v0Q= github.com/google/flatbuffers v25.2.10+incompatible/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8= github.com/google/gnostic-models v0.7.0 h1:qwTtogB15McXDaNqTZdzPJRHvaVJlAl+HVQnLmJEJxo= github.com/google/gnostic-models v0.7.0/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7OUGxBlw57miDrQ= github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.4.0/go.mod 
h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/go-containerregistry v0.20.7 h1:24VGNpS0IwrOZ2ms2P1QE3Xa5X9p4phx0aUgzYzHW6I= github.com/google/go-containerregistry v0.20.7/go.mod h1:Lx5LCZQjLH1QBaMPeGwsME9biPeo1lPx6lbGj/UmzgM= github.com/google/go-github/v55 v55.0.0 h1:4pp/1tNMB9X/LuAhs5i0KQAE40NmiR/y6prLNb9x9cg= github.com/google/go-github/v55 v55.0.0/go.mod h1:JLahOTA1DnXzhxEymmFF5PP2tSS9JVNj68mSZNDwskA= github.com/google/go-github/v60 v60.0.0 h1:oLG98PsLauFvvu4D/YPxq374jhSxFYdzQGNCyONLfn8= github.com/google/go-github/v60 v60.0.0/go.mod h1:ByhX2dP9XT9o/ll2yXAu2VD8l5eNVg8hD4Cr0S/LmQk= github.com/google/go-querystring v1.1.0 h1:AnCroh3fv4ZBgVIf1Iwtovgjaw/GiKJo8M8yD/fhyJ8= github.com/google/go-querystring v1.1.0/go.mod h1:Kcdr2DB4koayq7X8pmAG4sNG59So17icRSOU623lUBU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.2.1-0.20210504230335-f78f29fc09ea h1:VcIYpAGBae3Z6BVncE0OnTE/ZjlDXqtYhOZky88neLM= github.com/google/gofuzz v1.2.1-0.20210504230335-f78f29fc09ea/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/licenseclassifier/v2 v2.0.0 h1:1Y57HHILNf4m0ABuMVb6xk4vAJYEUO0gDxNpog0pyeA= github.com/google/licenseclassifier/v2 v2.0.0/go.mod h1:cOjbdH0kyC9R22sdQbYsFkto4NGCAc+ZSwbeThazEtM= github.com/google/martian/v3 v3.3.3 h1:DIhPTQrbPkgs2yJYdXU/eNACCG5DVQjySNRNlflZ9Fc= github.com/google/martian/v3 v3.3.3/go.mod h1:iEPrYcgCF7jA9OtScMFQyAlZZ4YXTKEtJ1E6RWzmBA0= github.com/google/pprof v0.0.0-20210407192527-94a9f03dee38/go.mod 
h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= github.com/google/pprof v0.0.0-20250820193118-f64d9cf942d6 h1:EEHtgt9IwisQ2AZ4pIsMjahcegHh6rmhqxzIRQIyepY= github.com/google/pprof v0.0.0-20250820193118-f64d9cf942d6/go.mod h1:I6V7YzU0XDpsHqbsyrghnFZLO1gwK6NPTNvmetQIk9U= github.com/google/s2a-go v0.1.9 h1:LGD7gtMgezd8a/Xak7mEWL0PjoTQFvpRudN895yqKW0= github.com/google/s2a-go v0.1.9/go.mod h1:YA0Ei2ZQL3acow2O62kdp9UlnvMmU7kA6Eutn0dXayM= github.com/google/tink/go v1.7.0 h1:6Eox8zONGebBFcCBqkVmt60LaWZa6xg1cl/DwAh/J1w= github.com/google/tink/go v1.7.0/go.mod h1:GAUOd+QE3pgj9q8VKIGTCP33c/B7eb4NhxLcgTJZStM= github.com/google/trillian v1.7.1 h1:+zX8jLM3524bAMPS+VxaDIDgsMv3/ty6DuLWerHXcek= github.com/google/trillian v1.7.1/go.mod h1:E1UMAHqpZCA8AQdrKdWmHmtUfSeiD0sDWD1cv00Xa+c= github.com/google/uuid v1.0.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/googleapis/enterprise-certificate-proxy v0.3.6 h1:GW/XbdyBFQ8Qe+YAmFU9uHLo7OnF5tL52HFAgMmyrf4= github.com/googleapis/enterprise-certificate-proxy v0.3.6/go.mod h1:MkHOF77EYAE7qfSuSS9PU6g4Nt4e11cnsDUowfwewLA= github.com/googleapis/gax-go/v2 v2.15.0 h1:SyjDc1mGgZU5LncH8gimWo9lW1DtIfPibOG81vgd/bo= github.com/googleapis/gax-go/v2 v2.15.0/go.mod h1:zVVkkxAQHa1RQpg9z2AUCMnKhi0Qld9rcmyfL1OZhoc= github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY= github.com/gopherjs/gopherjs v0.0.0-20200217142428-fce0ec30dd00/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY= github.com/gorilla/mux v1.8.1 h1:TuBL49tXwgrFYWhqrNgrUNEY92u81SPhu7sTdzQEiWY= github.com/gorilla/mux v1.8.1/go.mod h1:AKf9I4AEqPTmMytcMc0KkNouC66V3BtZ4qD5fmWSiMQ= github.com/gorilla/websocket v1.5.4-0.20250319132907-e064f32e3674 h1:JeSE6pjso5THxAzdVpqr6/geYxZytqFMBCOtn/ujyeo= github.com/gorilla/websocket 
v1.5.4-0.20250319132907-e064f32e3674/go.mod h1:r4w70xmWCQKmi1ONH4KIaBptdivuRPyosB9RmPlGEwA= github.com/grpc-ecosystem/grpc-gateway v1.16.0 h1:gmcG1KaJ57LophUzW0Hy8NmPhnMZb4M0+kPpLofRdBo= github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2 h1:8Tjv8EJ+pM1xP8mK6egEbD1OgnVTyacbefKhmbLhIhU= github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.2/go.mod h1:pkJQ2tZHJ0aFOVEEot6oZmaVEZcRme73eIFmhiVuRWs= github.com/hashicorp/errwrap v1.1.0 h1:OxrOeh75EUXMY8TBjag2fzXGZ40LB6IKw45YeGUDY2I= github.com/hashicorp/errwrap v1.1.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4= github.com/hashicorp/go-cleanhttp v0.5.2 h1:035FKYIWjmULyFRBKPs8TBQoi0x6d9G4xc9neXJWAZQ= github.com/hashicorp/go-cleanhttp v0.5.2/go.mod h1:kO/YDlP8L1346E6Sodw+PrpBSV4/SoxCXGY6BqNFT48= github.com/hashicorp/go-hclog v1.6.3 h1:Qr2kF+eVWjTiYmU7Y31tYlP1h0q/X3Nl3tPGdaB11/k= github.com/hashicorp/go-hclog v1.6.3/go.mod h1:W4Qnvbt70Wk/zYJryRzDRU/4r0kIg0PVHBcfoyhpF5M= github.com/hashicorp/go-multierror v1.1.1 h1:H5DkEtf6CXdFp0N0Em5UCwQpXMWke8IA0+lD48awMYo= github.com/hashicorp/go-multierror v1.1.1/go.mod h1:iw975J/qwKPdAO1clOe2L8331t/9/fmwbPZ6JB6eMoM= github.com/hashicorp/go-retryablehttp v0.7.7 h1:C8hUCYzor8PIfXHa4UrZkU4VvK8o9ISHxT2Q8+VepXU= github.com/hashicorp/go-retryablehttp v0.7.7/go.mod h1:pkQpWZeYWskR+D1tR2O5OcBFOxfA7DoAO6xtkuQnHTk= github.com/hashicorp/go-rootcerts v1.0.2 h1:jzhAVGtqPKbwpyCPELlgNWhE1znq+qwJtW5Oi2viEzc= github.com/hashicorp/go-rootcerts v1.0.2/go.mod h1:pqUvnprVnM5bf7AOirdbb01K4ccR319Vf4pU3K5EGc8= github.com/hashicorp/go-secure-stdlib/parseutil v0.1.7 h1:UpiO20jno/eV1eVZcxqWnUohyKRe1g8FPV/xH1s/2qs= github.com/hashicorp/go-secure-stdlib/parseutil v0.1.7/go.mod h1:QmrqtbKuxxSWTN3ETMPuB+VtEiBJ/A9XhoYGv8E1uD8= github.com/hashicorp/go-secure-stdlib/strutil v0.1.2 h1:kes8mmyCpxJsI7FTwtzRqEy9CdjCtrXrXGuOpxEA7Ts= github.com/hashicorp/go-secure-stdlib/strutil v0.1.2/go.mod h1:Gou2R9+il93BqX25LAKCLuM+y9U2T4hlwvT1yprcna4= github.com/hashicorp/go-sockaddr v1.0.5 
h1:dvk7TIXCZpmfOlM+9mlcrWmWjw/wlKT+VDq2wMvfPJU= github.com/hashicorp/go-sockaddr v1.0.5/go.mod h1:uoUUmtwU7n9Dv3O4SNLeFvg0SxQ3lyjsj6+CCykpaxI= github.com/hashicorp/go-version v1.7.0 h1:5tqGy27NaOTB8yJKUZELlFAS/LTKJkrmONwQKeRZfjY= github.com/hashicorp/go-version v1.7.0/go.mod h1:fltr4n8CU8Ke44wwGCBoEymUuxUHl09ZGVZPK5anwXA= github.com/hashicorp/golang-lru v1.0.2 h1:dV3g9Z/unq5DpblPpw+Oqcv4dU/1omnb4Ok8iPY6p1c= github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs4luLUK2k= github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= github.com/hashicorp/hcl v1.0.1-vault-5 h1:kI3hhbbyzr4dldA8UdTb7ZlVVlI2DACdCfz31RPDgJM= github.com/hashicorp/hcl v1.0.1-vault-5/go.mod h1:XYhtn6ijBSAj6n4YqAaf7RBPS4I06AItNorpy+MoQNM= github.com/hashicorp/vault/api v1.15.0 h1:O24FYQCWwhwKnF7CuSqP30S51rTV7vz1iACXE/pj5DA= github.com/hashicorp/vault/api v1.15.0/go.mod h1:+5YTO09JGn0u+b6ySD/LLVf8WkJCPLAL2Vkmrn2+CM8= github.com/howeyc/gopass v0.0.0-20210920133722-c8aef6fb66ef h1:A9HsByNhogrvm9cWb28sjiS3i7tcKCkflWFEkHfuAgM= github.com/howeyc/gopass v0.0.0-20210920133722-c8aef6fb66ef/go.mod h1:lADxMC39cJJqL93Duh1xhAs4I2Zs8mKS89XWXFGp9cs= github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= github.com/imdario/mergo v0.3.16 h1:wwQJbIsHYGMUyLSPrEq1CT16AhnhNJQ51+4fdHUnCl4= github.com/imdario/mergo v0.3.16/go.mod h1:WBLT9ZmE3lPoWsEzCh9LPo3TiwVN+ZKEjmz+hD27ysY= github.com/in-toto/attestation v1.1.0 h1:oRWzfmZPDSctChD0VaQV7MJrywKOzyNrtpENQFq//2Q= github.com/in-toto/attestation v1.1.0/go.mod h1:DB59ytd3z7cIHgXxwpSX2SABrU6WJUKg/grpdgHVgVs= github.com/in-toto/in-toto-golang v0.9.0 h1:tHny7ac4KgtsfrG6ybU8gVOZux2H8jN05AXJ9EBM1XU= github.com/in-toto/in-toto-golang v0.9.0/go.mod h1:xsBVrVsHNsB61++S6Dy2vWosKhuA3lUTQd+eF9HdeMo= github.com/inconshreveable/mousetrap v1.1.0 
h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/jackc/pgpassfile v1.0.0 h1:/6Hmqy13Ss2zCq62VdNG8tM1wchn8zjSGOBJ6icpsIM= github.com/jackc/pgpassfile v1.0.0/go.mod h1:CEx0iS5ambNFdcRtxPj5JhEz+xB6uRky5eyVu/W2HEg= github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 h1:iCEnooe7UlwOQYpKFhBabPMi4aNAfoODPEFNiAnClxo= github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761/go.mod h1:5TJZWKEWniPve33vlWYSoGYefn3gLQRzjfDlhSJ9ZKM= github.com/jackc/pgx/v5 v5.7.4 h1:9wKznZrhWa2QiHL+NjTSPP6yjl3451BX3imWDnokYlg= github.com/jackc/pgx/v5 v5.7.4/go.mod h1:ncY89UGWxg82EykZUwSpUKEfccBGGYq1xjrOpsbsfGQ= github.com/jackc/puddle/v2 v2.2.2 h1:PR8nw+E/1w0GLuRFSmiioY6UooMp6KJv0/61nB7icHo= github.com/jackc/puddle/v2 v2.2.2/go.mod h1:vriiEXHvEE654aYKXXjOvZM39qJ0q+azkZFrfEOc3H4= github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99 h1:BQSFePA1RWJOlocH6Fxy8MmwDt+yVQYULKfN0RoTN8A= github.com/jbenet/go-context v0.0.0-20150711004518-d14ea06fba99/go.mod h1:1lJo3i6rXxKeerYnT8Nvf0QmHCRC1n8sfWVwXF2Frvo= github.com/jedisct1/go-minisign v0.0.0-20230811132847-661be99b8267 h1:TMtDYDHKYY15rFihtRfck/bfFqNfvcabqvXAFQfAUpY= github.com/jedisct1/go-minisign v0.0.0-20230811132847-661be99b8267/go.mod h1:h1nSAbGFqGVzn6Jyl1R/iCcBUHN4g+gW1u9CoBTrb9E= github.com/jellydator/ttlcache/v3 v3.3.0 h1:BdoC9cE81qXfrxeb9eoJi9dWrdhSuwXMAnHTbnBm4Wc= github.com/jellydator/ttlcache/v3 v3.3.0/go.mod h1:bj2/e0l4jRnQdrnSTaGTsh4GSXvMjQcy41i7th0GVGw= github.com/jmespath/go-jmespath v0.4.1-0.20220621161143-b0104c826a24 h1:liMMTbpW34dhU4az1GN0pTPADwNmvoRSeoZ6PItiqnY= github.com/jmespath/go-jmespath v0.4.1-0.20220621161143-b0104c826a24/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo= github.com/jmespath/go-jmespath/internal/testify v1.5.1 h1:shLQSRRSCCPj3f2gpwzGwWFoC7ycTf1rcQZHOlsJ6N8= github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U= 
github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= github.com/json-iterator/go v1.1.10/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU= github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51 h1:Z9n2FFNUXsshfwJMBgNA0RU6/i7WVaAegv3PtuIHPMs= github.com/kballard/go-shellquote v0.0.0-20180428030007-95032a82bc51/go.mod h1:CzGEWj7cYgsdH8dAjBGEr58BoE7ScuLd+fwFZ44+/x8= github.com/kevinburke/ssh_config v1.2.0 h1:x584FjTGwHzMwvHx18PXxbBVzfnxogHaAReU4gf13a4= github.com/kevinburke/ssh_config v1.2.0/go.mod h1:CT57kijsi8u/K/BOFA39wgDQJ9CxiF4nAY/ojJ6r6mM= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/klauspost/compress v1.18.1 h1:bcSGx7UbpBqMChDtsF28Lw6v/G94LPrrbMbdC3JH2co= github.com/klauspost/compress v1.18.1/go.mod h1:ZQFFVG+MdnR0P+l6wpXgIL4NTtwiKIdBnrBd8Nrxr+0= github.com/knqyf263/go-rpmdb v0.1.0 h1:pOgjtOGtW0B+ibY905hP3ETrYFmLZsHiReKsplcs+to= github.com/knqyf263/go-rpmdb v0.1.0/go.mod h1:9LQcoMCMQ9vrF7HcDtXfvqGO4+ddxFQ8+YF/0CVGDww= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text 
v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/kris-nova/logger v0.2.2 h1:qdWg2fNr4Bni4obkgehwOSbCoxaX+wDGGrzQ1T2mA20= github.com/kris-nova/logger v0.2.2/go.mod h1:uOTzfb9ssx0XYb3UpeAjKsys8KByjD12OMN4szmym4w= github.com/kris-nova/lolgopher v0.0.0-20210112022122-73f0047e8b65/go.mod h1:V0HF/ZBlN86HqewcDC/cVxMmYDiRukWjSrgKLUAn9Js= github.com/kubicorn/kubicorn v0.0.0-20191114212505-a2c64ce430b9 h1:HgzA4yC4kPQfNIya55o4yA1WiKCXXA5wXvwoBKgIwXI= github.com/kubicorn/kubicorn v0.0.0-20191114212505-a2c64ce430b9/go.mod h1:Z/PU7XQicaZV6QFTAvm8EaWyfNbAb4a76kmR4Am4KA8= github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw= github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de h1:9TO3cAIGXtEhnIaL+V+BEER86oLrvS+kWobKpbJuye0= github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de/go.mod h1:zAbeS9B/r2mtpb6U+EI2rYA5OAXxsYw6wTamcNW+zcE= github.com/lucasb-eyer/go-colorful v1.2.0 h1:1nnpGOrhyZZuNyfu1QjKiUICQ74+3FNCN69Aj6K7nkY= github.com/lucasb-eyer/go-colorful v1.2.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0= github.com/mailru/easyjson v0.9.0 h1:PrnmzHw7262yW8sTBwxi1PdJA3Iw/EKBa8psRf7d9a4= github.com/mailru/easyjson v0.9.0/go.mod h1:1+xMtQp2MRNVL/V1bOzuP3aP8VNwRW55fQUto+XFtTU= github.com/mattn/go-colorable v0.1.8/go.mod h1:u6P/XSegPjTcexA+o6vUJrdnUu04hMope9wVRipJSqc= github.com/mattn/go-colorable v0.1.14 h1:9A9LHSqF/7dyVVX6g0U9cwm9pG3kP9gSzcuIPHPsaIE= github.com/mattn/go-colorable v0.1.14/go.mod h1:6LmQG8QLFO4G5z1gPvYEzlUgJ2wF+stgPZH1UqBm1s8= github.com/mattn/go-isatty v0.0.12/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Kysco4FUpU= github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= github.com/mattn/go-isatty v0.0.20/go.mod 
h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= github.com/mattn/go-localereader v0.0.1 h1:ygSAOl7ZXTx4RdPYinUpg6W99U8jWvWi9Ye2JC/oIi4= github.com/mattn/go-localereader v0.0.1/go.mod h1:8fBrzywKY7BI3czFoHkuzRoWE9C+EiG4R1k4Cjx5p88= github.com/mattn/go-runewidth v0.0.16 h1:E5ScNMtiwvlvB5paMFdw9p4kSQzbXFikJ5SQO6TULQc= github.com/mattn/go-runewidth v0.0.16/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= github.com/miekg/dns v1.1.61 h1:nLxbwF3XxhwVSm8g9Dghm9MHPaUZuqhPiGL+675ZmEs= github.com/miekg/dns v1.1.61/go.mod h1:mnAarhS3nWaW+NVP2wTkYVIZyHNJ098SJZUki3eykwQ= github.com/miekg/pkcs11 v1.0.3-0.20190429190417-a667d056470f/go.mod h1:XsNlhZGX73bx86s2hdc/FuaLm2CPZJemRLMA+WTFxgs= github.com/miekg/pkcs11 v1.1.1 h1:Ugu9pdy6vAYku5DEpVWVFPYnzV+bxB+iRdbuFSu7TvU= github.com/miekg/pkcs11 v1.1.1/go.mod h1:XsNlhZGX73bx86s2hdc/FuaLm2CPZJemRLMA+WTFxgs= github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y= github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= github.com/mitchellh/go-wordwrap v1.0.1 h1:TLuKupo69TCn6TQSyGxwI1EblZZEsQ0vMlAFQflz0v0= github.com/mitchellh/go-wordwrap v1.0.1/go.mod h1:R62XHJLzvMFRBbcrT7m7WgmE1eOyTSsCt+hzestvNj0= github.com/mitchellh/hashstructure/v2 v2.0.2 h1:vGKWl0YJqUNxE8d+h8f6NJLcCJrgbhC4NcD46KavDd4= github.com/mitchellh/hashstructure/v2 v2.0.2/go.mod h1:MG3aRVU/N29oo/V/IhBX8GR/zz4kQkprJgF2EVszyDE= github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY= github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo= github.com/moby/spdystream v0.5.0 h1:7r0J1Si3QO/kjRitvSLVVFUjxMEb/YLj6S9FF62JBCU= github.com/moby/spdystream v0.5.0/go.mod h1:xBAYlnt/ay+11ShkdFKNAG7LsyK/tmNBVvVOwrfMgdI= github.com/moby/term v0.5.2 h1:6qk3FJAFDs6i/q3W/pQ97SX192qKfZgGjCQqfCJkgzQ= github.com/moby/term v0.5.2/go.mod h1:d3djjFCrjnB+fl8NJux+EJzu0msscUP+f8it8hPkFLc= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod 
h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFdJifH4BDsTlE89Zl93FEloxaWZfGcifgq8= github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00 h1:n6/2gBQ3RWajuToeY6ZtZTIKv2v7ThUy5KKusIT0yc0= github.com/monochromegane/go-gitignore v0.0.0-20200626010858-205db1a8cc00/go.mod h1:Pm3mSP3c5uWn86xMLZ5Sa7JB9GsEZySvHYXCTK4E9q4= github.com/mozillazg/docker-credential-acr-helper v0.4.0 h1:Uoh3Z9CcpEDnLiozDx+D7oDgRq7X+R296vAqAumnOcw= github.com/mozillazg/docker-credential-acr-helper v0.4.0/go.mod h1:2kiicb3OlPytmlNC9XGkLvVC+f0qTiJw3f/mhmeeQBg= github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 h1:ZK8zHtRHOkbHy6Mmr5D264iyp3TiX5OmNcI5cIARiQI= github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6/go.mod h1:CJlz5H+gyd6CUWT45Oy4q24RdLyn7Md9Vj2/ldJBSIo= github.com/muesli/cancelreader v0.2.2 h1:3I4Kt4BQjOR54NavqnDogx/MIoWBFa0StPA8ELUXHmA= github.com/muesli/cancelreader v0.2.2/go.mod h1:3XuTXfFS2VjM+HTLZY9Ak0l6eUKfijIfMUZ4EgX0QYo= github.com/muesli/reflow v0.3.0 h1:IFsN6K9NfGtjeggFP+68I4chLZV2yIKsXJFNZ+eWh6s= github.com/muesli/reflow v0.3.0/go.mod h1:pbwTDkVPibjO2kyvBQRBxTWEEGDGq0FlB1BIKtnHY/8= github.com/muesli/termenv v0.16.0 h1:S5AlUN9dENB57rsbnkPyfdGuWIlkmzJjbFf0Tf5FWUc= github.com/muesli/termenv v0.16.0/go.mod 
h1:ZRfOIKPFDYQoDFF4Olj7/QJbW60Ol/kL1pU3VfY/Cnk= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus= github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= github.com/ncruces/go-strftime v0.1.9 h1:bY0MQC28UADQmHmaF5dgpLmImcShSi2kHU9XLdhx/f4= github.com/ncruces/go-strftime v0.1.9/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls= github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= github.com/nozzle/throttler v0.0.0-20180817012639-2ea982251481 h1:Up6+btDp321ZG5/zdSLo48H9Iaq0UQGthrhWC6pCxzE= github.com/nozzle/throttler v0.0.0-20180817012639-2ea982251481/go.mod h1:yKZQO8QE2bHlgozqWDiRVqTFlLQSj30K/6SAK8EeYFw= github.com/nxadm/tail v1.4.4/go.mod h1:kenIhsEOeOJmVchQTgglprH7qJGnHDVpk1VPCcaMI8A= github.com/nxadm/tail v1.4.8/go.mod h1:+ncqLTQzXmGhMZNUePPaPqPvBxHAIsmXswZKocGu+AU= github.com/nxadm/tail v1.4.11 h1:8feyoE3OzPrcshW5/MJ4sGESc5cqmGkGCWlco4l0bqY= github.com/nxadm/tail v1.4.11/go.mod h1:OTaG3NK980DZzxbRq6lEuzgU+mug70nY11sMd4JXXHc= github.com/octago/sflags v0.3.1 h1:LW65z20iAQKteEyjsnnc+/lyoCUnIoRuAocggr6RB6A= github.com/octago/sflags v0.3.1/go.mod h1:hVUkbnYwMU9kZiZJyOAIVN56YiVMMPxgJ46kRZ19jh0= github.com/oklog/ulid v1.3.1 h1:EGfNDEx6MqHz8B3uNV6QAib1UR2Lm97sHi3ocA6ESJ4= github.com/oklog/ulid v1.3.1/go.mod h1:CirwcVhetQ6Lv90oh/F+FBtV6XMibvdAFo93nm5qn4U= github.com/oleiade/reflections v1.1.0 h1:D+I/UsXQB4esMathlt0kkZRJZdUDmhv5zGi/HOwYTWo= github.com/oleiade/reflections v1.1.0/go.mod h1:mCxx0QseeVCHs5Um5HhJeCKVC7AwS8kO67tky4rdisA= github.com/olekukonko/errors v0.0.0-20250405072817-4e6d85265da6 h1:r3FaAI0NZK3hSmtTDrBVREhKULp8oUeqLT5Eyl2mSPo= 
github.com/olekukonko/errors v0.0.0-20250405072817-4e6d85265da6/go.mod h1:ppzxA5jBKcO1vIpCXQ9ZqgDh8iwODz6OXIGKU8r5m4Y= github.com/olekukonko/ll v0.0.8 h1:sbGZ1Fx4QxJXEqL/6IG8GEFnYojUSQ45dJVwN2FH2fc= github.com/olekukonko/ll v0.0.8/go.mod h1:En+sEW0JNETl26+K8eZ6/W4UQ7CYSrrgg/EdIYT2H8g= github.com/olekukonko/tablewriter v1.0.8 h1:f6wJzHg4QUtJdvrVPKco4QTrAylgaU0+b9br/lJxEiQ= github.com/olekukonko/tablewriter v1.0.8/go.mod h1:H428M+HzoUXC6JU2Abj9IT9ooRmdq9CxuDmKMtrOCMs= github.com/oliveagle/jsonpath v0.0.0-20180606110733-2e52cf6e6852 h1:Yl0tPBa8QPjGmesFh1D0rDy+q1Twx6FyU7VWHi8wZbI= github.com/oliveagle/jsonpath v0.0.0-20180606110733-2e52cf6e6852/go.mod h1:eqOVx5Vwu4gd2mmMZvVZsgIqNSaW3xxRThUJ0k/TPk4= github.com/onsi/ginkgo v1.6.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+WWjE= github.com/onsi/ginkgo v1.12.1/go.mod h1:zj2OWP4+oCPe1qIXoGWkgMRwljMUYCdkwsT2108oapk= github.com/onsi/ginkgo v1.16.4/go.mod h1:dX+/inL/fNMqNlz0e9LfyB9TswhZpCVdJM/Z6Vvnwo0= github.com/onsi/ginkgo v1.16.5 h1:8xi0RTUf59SOSfEtZMvwTvXYMzG4gV23XVHOZiXNtnE= github.com/onsi/ginkgo v1.16.5/go.mod h1:+E8gABHa3K6zRBolWtd+ROzc/U5bkGt0FwiG042wbpU= github.com/onsi/ginkgo/v2 v2.1.3/go.mod h1:vw5CSIxN1JObi/U8gcbwft7ZxR2dgaR70JSE3/PpL4c= github.com/onsi/ginkgo/v2 v2.27.2 h1:LzwLj0b89qtIy6SSASkzlNvX6WktqurSHwkk2ipF/Ns= github.com/onsi/ginkgo/v2 v2.27.2/go.mod h1:ArE1D/XhNXBXCBkKOLkbsb2c81dQHCRcF5zwn/ykDRo= github.com/onsi/gomega v1.7.1/go.mod h1:XdKZgCCFLUoM/7CFJVPcG8C1xQ1AJ0vpAezJrB7JYyY= github.com/onsi/gomega v1.10.1/go.mod h1:iN09h71vgCQne3DLsj+A5owkum+a2tYe+TOCB1ybHNo= github.com/onsi/gomega v1.17.0/go.mod h1:HnhC7FXeEQY45zxNK3PPoIUhzk/80Xly9PcubAlGdZY= github.com/onsi/gomega v1.19.0/go.mod h1:LY+I3pBVzYsTBU1AnDwOSxaYi9WoWiqgwooUqq9yPro= github.com/onsi/gomega v1.38.2 h1:eZCjf2xjZAqe+LeWvKb5weQ+NcPwX84kqJ0cZNxok2A= github.com/onsi/gomega v1.38.2/go.mod h1:W2MJcYxRGV63b418Ai34Ud0hEdTVXq9NW9+Sx6uXf3k= github.com/open-policy-agent/opa v1.4.0 h1:IGO3xt5HhQKQq2axfa9memIFx5lCyaBlG+fXcgHpd3A= 
github.com/open-policy-agent/opa v1.4.0/go.mod h1:DNzZPKqKh4U0n0ANxcCVlw8lCSv2c+h5G/3QvSYdWZ8= github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= github.com/opencontainers/image-spec v1.1.1 h1:y0fUlFfIZhPF1W537XOLg0/fcx6zcHCJwooC2xJA040= github.com/opencontainers/image-spec v1.1.1/go.mod h1:qpqAh3Dmcf36wStyyWU+kCeDgrGnAve2nCC8+7h8Q0M= github.com/opentracing/opentracing-go v1.2.0 h1:uEJPy/1a5RIPAJ0Ov+OIO8OxWu77jEv+1B0VhjKrZUs= github.com/opentracing/opentracing-go v1.2.0/go.mod h1:GxEUsuufX4nBwe+T+Wl9TAgYrxe9dPLANfrWvHYVTgc= github.com/package-url/packageurl-go v0.1.2 h1:0H2DQt6DHd/NeRlVwW4EZ4oEI6Bn40XlNPRqegcxuo4= github.com/package-url/packageurl-go v0.1.2/go.mod h1:uQd4a7Rh3ZsVg5j0lNyAfyxIeGde9yrlhjF78GzeW0c= github.com/patrickmn/go-cache v2.1.0+incompatible h1:HRMgzkcYKYpi3C8ajMPV8OFXaaRUnok+kx1WdO15EQc= github.com/patrickmn/go-cache v2.1.0+incompatible/go.mod h1:3Qf8kWWT7OJRJbdiICTKqZju1ZixQ/KpMGzzAfe6+WQ= github.com/pborman/uuid v1.2.1 h1:+ZZIw58t/ozdjRaXh/3awHfmWRbzYxJoAdNJxe/3pvw= github.com/pborman/uuid v1.2.1/go.mod h1:X/NO0urCmaxf9VXbdlT7C2Yzkj2IKimNn4k+gtPdI/k= github.com/pelletier/go-toml v1.9.5 h1:4yBQzkHv+7BHq2PQUZF3Mx0IYxG7LsP222s7Agd3ve8= github.com/pelletier/go-toml v1.9.5/go.mod h1:u1nR/EPcESfeI/szUZKdtJ0xRNbUoANCkoOuaOx1Y+c= github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0t5Ec4= github.com/pelletier/go-toml/v2 v2.2.4/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8t9Stt+h56mCY= github.com/pjbgf/sha1cd v0.3.2 h1:a9wb0bp1oC2TGwStyn0Umc/IGKQnEgF0vVaZ8QF8eo4= github.com/pjbgf/sha1cd v0.3.2/go.mod h1:zQWigSxVmsHEZow5qaLtPYxpcKMMQpa09ixqBxuCS6A= github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c h1:+mdjkGKdHQG3305AYmdv1U2eRNDiU2ErMBj1gwrq8eQ= github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c/go.mod h1:7rwL4CYBLnjLxUqIJNnCWiEdr3bn6IUYi15bNlnbCCU= github.com/pkg/errors 
v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 h1:GFCKgmp0tecUJ0sJuv4pzYCqS9+RGSn52M3FUwPs+uo= github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10/go.mod h1:t/avpk3KcrXxUnYOhZhMXJlSEyie6gQbtLq5NM3loB8= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55 h1:o4JXh1EVt9k/+g42oCprj/FisM4qX9L3sZB3upGN2ZU= github.com/power-devops/perfstat v0.0.0-20240221224432-82ca36839d55/go.mod h1:OmDBASR4679mdNQnz2pUhc2G8CO2JrUAVFDRBDP/hJE= github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs= github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA= github.com/prometheus/procfs v0.17.0 h1:FuLQ+05u4ZI+SS/w9+BWEM2TXiHKsUQ9TADiRH7DuK0= github.com/prometheus/procfs v0.17.0/go.mod h1:oPQLaDAMRbA+u8H5Pbfq+dl3VDAvHxMUOVhe0wYB2zw= github.com/protocolbuffers/txtpbfmt v0.0.0-20240116145035-ef3ab179eed6 
h1:MAzmm+JtFxQwTPb1cVMLkemw2OxLy5AB/d/rxtAwGQQ= github.com/protocolbuffers/txtpbfmt v0.0.0-20240116145035-ef3ab179eed6/go.mod h1:jgxiZysxFPM+iWKwQwPR+y+Jvo54ARd4EisXxKYpB5c= github.com/rcrowley/go-metrics v0.0.0-20201227073835-cf1acfcdf475 h1:N/ElC8H3+5XpJzTSTfLsJV/mx9Q9g7kxmchpfZyxgzM= github.com/rcrowley/go-metrics v0.0.0-20201227073835-cf1acfcdf475/go.mod h1:bCqnVzQkZxMG4s8nGwiZ5l3QUCyqpo9Y+/ZMZ9VjZe4= github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE= github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo= github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ= github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs= github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro= github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/ryanuber/go-glob v1.0.0 h1:iQh3xXAumdQ+4Ufa5b25cRpC5TYKlno6hsv6Cb3pkBk= github.com/ryanuber/go-glob v1.0.0/go.mod h1:807d1WSdnB0XRJzKNil9Om6lcp/3a0v4qIHxIXzX/Yc= github.com/sagikazarmark/locafero v0.9.0 h1:GbgQGNtTrEmddYDSAH9QLRyfAHY12md+8YFTqyMTC9k= github.com/sagikazarmark/locafero v0.9.0/go.mod h1:UBUyz37V+EdMS3hDF3QWIiVr/2dPrx49OMO0Bn0hJqk= github.com/sahilm/fuzzy v0.1.1 h1:ceu5RHF8DGgoi+/dR5PsECjCDH1BE3Fnmpo7aVXOdRA= github.com/sahilm/fuzzy v0.1.1/go.mod h1:VFvziUEIMCrT6A6tw2RFIXPXXmzXbOsSHF0DOI8ZK9Y= github.com/samber/lo v1.51.0 h1:kysRYLbHy/MB7kQZf5DSN50JHmMsNEdeY24VzJFu7wI= github.com/samber/lo v1.51.0/go.mod 
h1:4+MXEGsJzbKGaUEQFKBq2xtfuznW9oz/WrgyzMzRoM0= github.com/sanathkr/go-yaml v0.0.0-20170819195128-ed9d249f429b h1:jUK33OXuZP/l6babJtnLo1qsGvq6G9so9KMflGAm4YA= github.com/sanathkr/go-yaml v0.0.0-20170819195128-ed9d249f429b/go.mod h1:8458kAagoME2+LN5//WxE71ysZ3B7r22fdgb7qVmXSY= github.com/sanathkr/yaml v0.0.0-20170819201035-0056894fa522 h1:fOCp11H0yuyAt2wqlbJtbyPzSgaxHTv8uN1pMpkG1t8= github.com/sanathkr/yaml v0.0.0-20170819201035-0056894fa522/go.mod h1:tQTYKOQgxoH3v6dEmdHiz4JG+nbxWwM5fgPQUpSZqVQ= github.com/sassoftware/relic v7.2.1+incompatible h1:Pwyh1F3I0r4clFJXkSI8bOyJINGqpgjJU3DYAZeI05A= github.com/sassoftware/relic v7.2.1+incompatible/go.mod h1:CWfAxv73/iLZ17rbyhIEq3K9hs5w6FpNMdUT//qR+zk= github.com/sassoftware/relic/v7 v7.6.2 h1:rS44Lbv9G9eXsukknS4mSjIAuuX+lMq/FnStgmZlUv4= github.com/sassoftware/relic/v7 v7.6.2/go.mod h1:kjmP0IBVkJZ6gXeAu35/KCEfca//+PKM6vTAsyDPY+k= github.com/secure-systems-lab/go-securesystemslib v0.10.0 h1:l+H5ErcW0PAehBNrBxoGv1jjNpGYdZ9RcheFkB2WI14= github.com/secure-systems-lab/go-securesystemslib v0.10.0/go.mod h1:MRKONWmRoFzPNQ9USRF9i1mc7MvAVvF1LlW8X5VWDvk= github.com/sergi/go-diff v1.1.0/go.mod h1:STckp+ISIX8hZLjrqAeVduY0gWCT9IjLuqbuNXdaHfM= github.com/sergi/go-diff v1.4.0 h1:n/SP9D5ad1fORl+llWyN+D6qoUETXNZARKjyY2/KVCw= github.com/sergi/go-diff v1.4.0/go.mod h1:A0bzQcvG0E7Rwjx0REVgAGH58e96+X0MeOfepqsbeW4= github.com/shibumi/go-pathspec v1.3.0 h1:QUyMZhFo0Md5B8zV8x2tesohbb5kfbpTi9rBnKh5dkI= github.com/shibumi/go-pathspec v1.3.0/go.mod h1:Xutfslp817l2I1cZvgcfeMQJG5QnU2lh5tVaaMCl3jE= github.com/shirou/gopsutil/v3 v3.24.5 h1:i0t8kL+kQTvpAYToeuiVk3TgDeKOFioZO3Ztz/iZ9pI= github.com/shirou/gopsutil/v3 v3.24.5/go.mod h1:bsoOS1aStSs9ErQ1WWfxllSeS1K5D+U30r2NfcubMVk= github.com/sigstore/cosign/v2 v2.4.1 h1:b8UXEfJFks3hmTwyxrRNrn6racpmccUycBHxDMkEPvU= github.com/sigstore/cosign/v2 v2.4.1/go.mod h1:GvzjBeUKigI+XYnsoVQDmMAsMMc6engxztRSuxE+x9I= github.com/sigstore/fulcio v1.6.3 h1:Mvm/bP6ELHgazqZehL8TANS1maAkRoM23CRAdkM4xQI= github.com/sigstore/fulcio 
v1.6.3/go.mod h1:5SDgLn7BOUVLKe1DwOEX3wkWFu5qEmhUlWm+SFf0GH8= github.com/sigstore/protobuf-specs v0.5.0 h1:F8YTI65xOHw70NrvPwJ5PhAzsvTnuJMGLkA4FIkofAY= github.com/sigstore/protobuf-specs v0.5.0/go.mod h1:+gXR+38nIa2oEupqDdzg4qSBT0Os+sP7oYv6alWewWc= github.com/sigstore/rekor v1.3.9 h1:sUjRpKVh/hhgqGMs0t+TubgYsksArZ6poLEC3MsGAzU= github.com/sigstore/rekor v1.3.9/go.mod h1:xThNUhm6eNEmkJ/SiU/FVU7pLY2f380fSDZFsdDWlcM= github.com/sigstore/sigstore v1.10.3 h1:s7fBYYOzW/2Vd0nND2ZdpWySb5vRF2u9eix/NZMHJm0= github.com/sigstore/sigstore v1.10.3/go.mod h1:T26vXIkpnGEg391v3TaZ8EERcXbnjtZb/1erh5jbIQk= github.com/sigstore/sigstore-go v0.6.1 h1:tGkkv1oDIER+QYU5MrjqlttQOVDWfSkmYwMqkJhB/cg= github.com/sigstore/sigstore-go v0.6.1/go.mod h1:Xe5GHmUeACRFbomUWzVkf/xYCn8xVifb9DgqJrV2dIw= github.com/sigstore/sigstore/pkg/signature/kms/aws v1.8.12 h1:EC3UmIaa7nV9sCgSpVevmvgvTYTkMqyrRbj5ojPp7tE= github.com/sigstore/sigstore/pkg/signature/kms/aws v1.8.12/go.mod h1:aw60vs3crnQdM/DYH+yF2P0MVKtItwAX34nuaMrY7Lk= github.com/sigstore/sigstore/pkg/signature/kms/azure v1.8.12 h1:FPpliDTywSy0woLHMAdmTSZ5IS/lVBZ0dY0I+2HmnSY= github.com/sigstore/sigstore/pkg/signature/kms/azure v1.8.12/go.mod h1:NkPiz4XA0JcBSXzJUrjMj7Xi7oSTew1Ip3Zmt56mHlw= github.com/sigstore/sigstore/pkg/signature/kms/gcp v1.8.12 h1:kweBChR6M9FEvmxN3BMEcl7SNnwxTwKF7THYFKLOE5U= github.com/sigstore/sigstore/pkg/signature/kms/gcp v1.8.12/go.mod h1:6+d+A6oYt1W5OgtzgEVb21V7tAZ/C2Ihtzc5MNJbayY= github.com/sigstore/sigstore/pkg/signature/kms/hashivault v1.8.12 h1:jvY1B9bjP+tKzdKDyuq5K7O19CG2IKzGJNTy5tuL2Gs= github.com/sigstore/sigstore/pkg/signature/kms/hashivault v1.8.12/go.mod h1:2uEeOb8xE2RC6OvzxKux1wkS39Zv8gA27z92m49xUTc= github.com/sigstore/timestamp-authority v1.2.2 h1:X4qyutnCQqJ0apMewFyx+3t7Tws00JQ/JonBiu3QvLE= github.com/sigstore/timestamp-authority v1.2.2/go.mod h1:nEah4Eq4wpliDjlY342rXclGSO7Kb9hoRrl9tqLW13A= github.com/sirupsen/logrus v1.7.0/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= github.com/sirupsen/logrus v1.9.3 
h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= github.com/skeema/knownhosts v1.3.1 h1:X2osQ+RAjK76shCbvhHHHVl3ZlgDm8apHEHFqRjnBY8= github.com/skeema/knownhosts v1.3.1/go.mod h1:r7KTdC8l4uxWRyK2TpQZ/1o5HaSzh06ePQNxPwTcfiY= github.com/smallstep/assert v0.0.0-20200723003110-82e2b9b3b262 h1:unQFBIznI+VYD1/1fApl1A+9VcBk+9dcqGfnePY87LY= github.com/smallstep/assert v0.0.0-20200723003110-82e2b9b3b262/go.mod h1:MyOHs9Po2fbM1LHej6sBUT8ozbxmMOFG+E+rx/GSGuc= github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc= github.com/smartystreets/assertions v1.1.0/go.mod h1:tcbTF8ujkAEcZ8TElKY+i30BzYlVhC/LOxJk7iOWnoo= github.com/smartystreets/goconvey v1.6.4/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA= github.com/sourcegraph/conc v0.3.0 h1:OQTbbt6P72L20UqAkXXuLOj79LfEanQ+YQFNpLA9ySo= github.com/sourcegraph/conc v0.3.0/go.mod h1:Sdozi7LEKbFPqYX2/J+iBAM6HpqSLTASQIKqDmF7Mt0= github.com/spf13/afero v1.15.0 h1:b/YBCLWAJdFWJTN9cLhiXXcD7mzKn9Dm86dNnfyQw1I= github.com/spf13/afero v1.15.0/go.mod h1:NC2ByUVxtQs4b3sIUphxK0NioZnmxgyCrfzeuq8lxMg= github.com/spf13/cast v1.9.2 h1:SsGfm7M8QOFtEzumm7UZrZdLLquNdzFYfIbEXntcFbE= github.com/spf13/cast v1.9.2/go.mod h1:jNfB8QC9IA6ZuY2ZjDp0KtFO2LZZlg4S/7bzP6qqeHo= github.com/spf13/cobra v1.10.2 h1:DMTTonx5m65Ic0GOoRY2c16WCbHxOOw6xxezuLaBpcU= github.com/spf13/cobra v1.10.2/go.mod h1:7C1pvHqHw5A4vrJfjNwvOdzYu0Gml16OCs2GRiTUUS4= github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/spf13/pflag v1.0.10 h1:4EBh2KAYBwaONj6b2Ye1GiHfwjqyROoF4RwYO+vPwFk= github.com/spf13/pflag v1.0.10/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/spf13/viper v1.20.1 h1:ZMi+z/lvLyPSCoNtFCpqjy0S4kPbirhpTMwl8BkW9X4= github.com/spf13/viper v1.20.1/go.mod h1:P9Mdzt1zoHIG8m2eZQinpiBjo6kCmZSKBClNNqjJvu4= github.com/spiffe/go-spiffe/v2 v2.6.0 
h1:l+DolpxNWYgruGQVV0xsfeya3CsC7m8iBzDnMpsbLuo= github.com/spiffe/go-spiffe/v2 v2.6.0/go.mod h1:gm2SeUoMZEtpnzPNs2Csc0D/gX33k1xIx7lEzqblHEs= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.2/go.mod h1:R6va5+xMeoiuVRoj+gSkQ7d3FALtqAAGI1FQKckRals= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8= github.com/subosito/gotenv v1.6.0/go.mod h1:Dk4QP5c2W3ibzajGcXpNraDfq2IrhjMIvMSWPKKo0FU= github.com/syndtr/goleveldb v1.0.1-0.20220721030215-126854af5e6d h1:vfofYNRScrDdvS342BElfbETmL1Aiz3i2t0zfRj16Hs= github.com/syndtr/goleveldb v1.0.1-0.20220721030215-126854af5e6d/go.mod 
h1:RRCYJbIwD5jmqPI9XoAFR0OcDxqUctll6zUj/+B4S48= github.com/tchap/go-patricia/v2 v2.3.2 h1:xTHFutuitO2zqKAQ5rCROYgUb7Or/+IC3fts9/Yc7nM= github.com/tchap/go-patricia/v2 v2.3.2/go.mod h1:VZRHKAb53DLaG+nA9EaYYiaEx6YztwDlLElMsnSHD4k= github.com/thales-e-security/pool v0.0.2 h1:RAPs4q2EbWsTit6tpzuvTFlgFRJ3S8Evf5gtvVDbmPg= github.com/thales-e-security/pool v0.0.2/go.mod h1:qtpMm2+thHtqhLzTwgDBj/OuNnMpupY8mv0Phz0gjhU= github.com/theupdateframework/go-tuf v0.7.0 h1:CqbQFrWo1ae3/I0UCblSbczevCCbS31Qvs5LdxRWqRI= github.com/theupdateframework/go-tuf v0.7.0/go.mod h1:uEB7WSY+7ZIugK6R1hiBMBjQftaFzn7ZCDJcp1tCUug= github.com/theupdateframework/go-tuf/v2 v2.3.1 h1:fReZUTLvPdqIL8Rd9xEKPmaxig8GIXe0kS4RSEaRfaM= github.com/theupdateframework/go-tuf/v2 v2.3.1/go.mod h1:9S0Srkf3c13FelsOyt5OyG3ZZDq9OJDA4IILavrt72Y= github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY= github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= github.com/tidwall/match v1.1.1 h1:+Ho715JplO36QYgwN9PGYNhgZvoUSc9X2c80KVTi+GA= github.com/tidwall/match v1.1.1/go.mod h1:eRSPERbgtNPcGhD8UCthc6PmLEQXEWd3PRB5JTxsfmM= github.com/tidwall/pretty v1.2.1 h1:qjsOFOWWQl+N3RsoF5/ssm1pHmJJwhjlSbZ51I6wMl4= github.com/tidwall/pretty v1.2.1/go.mod h1:ITEVvHYasfjBbM0u2Pg8T2nJnzm8xPwvNhhsoaGGjNU= github.com/tidwall/sjson v1.2.5 h1:kLy8mja+1c9jlljvWTlSazM7cKDRfJuR/bOJhcY5NcY= github.com/tidwall/sjson v1.2.5/go.mod h1:Fvgq9kS/6ociJEDnK0Fk1cpYF4FIW6ZF7LAe+6jwd28= github.com/tink-crypto/tink-go-awskms/v2 v2.1.0 h1:N9UxlsOzu5mttdjhxkDLbzwtEecuXmlxZVo/ds7JKJI= github.com/tink-crypto/tink-go-awskms/v2 v2.1.0/go.mod h1:PxSp9GlOkKL9rlybW804uspnHuO9nbD98V/fDX4uSis= github.com/tink-crypto/tink-go-gcpkms/v2 v2.2.0 h1:3B9i6XBXNTRspfkTC0asN5W0K6GhOSgcujNiECNRNb0= github.com/tink-crypto/tink-go-gcpkms/v2 v2.2.0/go.mod h1:jY5YN2BqD/KSCHM9SqZPIpJNG/u3zwfLXHgws4x2IRw= github.com/tink-crypto/tink-go/v2 v2.5.0 h1:B8KLF6AofxdBIE4UJIaFbmoj5/1ehEtt7/MmzfI4Zpw= github.com/tink-crypto/tink-go/v2 
v2.5.0/go.mod h1:2WbBA6pfNsAfBwDCggboaHeB2X29wkU8XHtGwh2YIk8= github.com/tjfoc/gmsm v1.3.2/go.mod h1:HaUcFuY0auTiaHB9MHFGCPx5IaLhTUd2atbCFBQXn9w= github.com/tjfoc/gmsm v1.4.1 h1:aMe1GlZb+0bLjn+cKTPEvvn9oUEBlJitaZiiBwsbgho= github.com/tjfoc/gmsm v1.4.1/go.mod h1:j4INPkHWMrhJb38G+J6W4Tw0AbuN8Thu3PbdVYhVcTE= github.com/transparency-dev/merkle v0.0.2 h1:Q9nBoQcZcgPamMkGn7ghV8XiTZ/kRxn1yCG81+twTK4= github.com/transparency-dev/merkle v0.0.2/go.mod h1:pqSy+OXefQ1EDUVmAJ8MUhHB9TXGuzVAT58PqBoHz1A= github.com/urfave/sflags v0.4.1 h1:9BKteZiMaLlgfMm8eYbFge3eRAUsrJXs4HsCemdDl+A= github.com/urfave/sflags v0.4.1/go.mod h1:NCIz2mBC+woyrkl88PeiKAuQUKJdEre2Y4at5SreAeU= github.com/vbatts/tar-split v0.12.2 h1:w/Y6tjxpeiFMR47yzZPlPj/FcPLpXbTUi/9H7d3CPa4= github.com/vbatts/tar-split v0.12.2/go.mod h1:eF6B6i6ftWQcDqEn3/iGFRFRo8cBIMSJVOpnNdfTMFA= github.com/vladimirvivien/gexe v0.5.0 h1:AWBVaYnrTsGYBktXvcO0DfWPeSiZxn6mnQ5nvL+A1/A= github.com/vladimirvivien/gexe v0.5.0/go.mod h1:3gjgTqE2c0VyHnU5UOIwk7gyNzZDGulPb/DJPgcw64E= github.com/weaveworks/eksctl v0.221.0 h1:sJEuVRU+8dia8rj/4VmB8DwKArmGhG7uwaqdUYJhqv0= github.com/weaveworks/eksctl v0.221.0/go.mod h1:fkWnFg8h/h24bl5DmyRgJIERB/7g5zqIeNgSklfeH5Q= github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= github.com/xanzy/go-gitlab v0.109.0 h1:RcRme5w8VpLXTSTTMZdVoQWY37qTJWg+gwdQl4aAttE= github.com/xanzy/go-gitlab v0.109.0/go.mod h1:wKNKh3GkYDMOsGmnfuX+ITCmDuSDWFO0G+C4AygL9RY= github.com/xanzy/ssh-agent v0.3.3 h1:+/15pJfg/RsTxqYcX6fHqOXZwwMP+2VyYWJeWM2qQFM= github.com/xanzy/ssh-agent v0.3.3/go.mod h1:6dzNDKs0J9rVPHPhaGCukekBHKqfl+L3KghI1Bc68Uw= github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb h1:zGWFAtiMcyryUHoUjUJX0/lt1H2+i2Ka2n+D3DImSNo= github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb/go.mod h1:N2zxlSyiKSe5eX1tZViRH5QA0qijqEDrYZiPEAiq3wU= github.com/xeipuuv/gojsonreference 
v0.0.0-20180127040603-bd5ef7bd5415 h1:EzJWgHovont7NscjpAxXsDA8S8BMYve8Y5+7cuRE7R0= github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415/go.mod h1:GwrjFmJcFw6At/Gs6z4yjiIwzuJ1/+UwLxMQDVQXShQ= github.com/xlab/treeprint v1.2.0 h1:HzHnuAF1plUN2zGlAFHbSQP2qJ0ZAD3XF5XD7OesXRQ= github.com/xlab/treeprint v1.2.0/go.mod h1:gj5Gd3gPdKtR1ikdDK6fnFLdmIS0X30kTTuNd/WEJu0= github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e h1:JVG44RsyaB9T2KIHavMF/ppJZNG9ZpyihvCd0w101no= github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e/go.mod h1:RbqR21r5mrJuqunuUZ/Dhy/avygyECGrLceyNeo4LiM= github.com/yashtewari/glob-intersection v0.2.0 h1:8iuHdN88yYuCzCdjt0gDe+6bAhUwBeEWqThExu54RFg= github.com/yashtewari/glob-intersection v0.2.0/go.mod h1:LK7pIC3piUjovexikBbJ26Yml7g8xa5bsjfx2v1fwok= github.com/ysmood/fetchup v0.2.3 h1:ulX+SonA0Vma5zUFXtv52Kzip/xe7aj4vqT5AJwQ+ZQ= github.com/ysmood/fetchup v0.2.3/go.mod h1:xhibcRKziSvol0H1/pj33dnKrYyI2ebIvz5cOOkYGns= github.com/ysmood/goob v0.4.0 h1:HsxXhyLBeGzWXnqVKtmT9qM7EuVs/XOgkX7T6r1o1AQ= github.com/ysmood/goob v0.4.0/go.mod h1:u6yx7ZhS4Exf2MwciFr6nIM8knHQIE22lFpWHnfql18= github.com/ysmood/got v0.40.0 h1:ZQk1B55zIvS7zflRrkGfPDrPG3d7+JOza1ZkNxcc74Q= github.com/ysmood/got v0.40.0/go.mod h1:W7DdpuX6skL3NszLmAsC5hT7JAhuLZhByVzHTq874Qg= github.com/ysmood/gson v0.7.3 h1:QFkWbTH8MxyUTKPkVWAENJhxqdBa4lYTQWqZCiLG6kE= github.com/ysmood/gson v0.7.3/go.mod h1:3Kzs5zDl21g5F/BlLTNcuAGAYLKt2lV5G8D1zF3RNmg= github.com/ysmood/leakless v0.9.0 h1:qxCG5VirSBvmi3uynXFkcnLMzkphdh3xx5FtrORwDCU= github.com/ysmood/leakless v0.9.0/go.mod h1:R8iAXPRaG97QJwqxs74RdwzcRHT1SWCGTNqY8q0JvMQ= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.1.30/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= 
github.com/yusufpapurcu/wmi v1.2.4 h1:zFUKzehAFReQwLys1b/iSMl+JQGSCSjtVqQn9bBrPo0= github.com/yusufpapurcu/wmi v1.2.4/go.mod h1:SBZ9tNy3G9/m5Oi98Zks0QjeHVDvuK0qfxQmPyzfmi0= github.com/zalando/go-keyring v0.2.3 h1:v9CUu9phlABObO4LPWycf+zwMG7nlbb3t/B5wa97yms= github.com/zalando/go-keyring v0.2.3/go.mod h1:HL4k+OXQfJUWaMnqyuSOc0drfGPX2b51Du6K+MRgZMk= gitlab.alpinelinux.org/alpine/go v0.10.0 h1:/ekBiNqDSXZpK+AfZx4lrtVwKTDrWz3N3ck0S+fCxwU= gitlab.alpinelinux.org/alpine/go v0.10.0/go.mod h1:LKzOqYjGTZNLwcHl+c2I5VNioQio7agzRFvlGB9Owk4= go.mongodb.org/mongo-driver v1.17.2 h1:gvZyk8352qSfzyZ2UMWcpDpMSGEr1eqE4T793SqyhzM= go.mongodb.org/mongo-driver v1.17.2/go.mod h1:Hy04i7O2kC4RS06ZrhPRqj/u4DTYkFDAAccj+rVKqgQ= go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= go.opentelemetry.io/contrib/detectors/gcp v1.39.0 h1:kWRNZMsfBHZ+uHjiH4y7Etn2FK26LAGkNFw7RHv1DhE= go.opentelemetry.io/contrib/detectors/gcp v1.39.0/go.mod h1:t/OGqzHBa5v6RHZwrDBJ2OirWc+4q/w2fTbLZwAKjTk= go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.63.0 h1:YH4g8lQroajqUwWbq/tr2QX1JFmEXaDLgG+ew9bLMWo= go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.63.0/go.mod h1:fvPi2qXDqFs8M4B4fmJhE92TyQs9Ydjlg3RvfUp+NbQ= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0 h1:RbKq8BG0FI8OiXhBfcRtqqHcZcka+gU3cskNuf05R18= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.63.0/go.mod h1:h06DGIukJOevXaj/xrNjhi/2098RZzcLTbc0jDAUbsg= go.opentelemetry.io/otel v1.39.0 h1:8yPrr/S0ND9QEfTfdP9V+SiwT4E0G7Y5MO7p85nis48= go.opentelemetry.io/otel v1.39.0/go.mod h1:kLlFTywNWrFyEdH0oj2xK0bFYZtHRYUdv1NklR/tgc8= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0 h1:GqRJVj7UmLjCVyVJ3ZFLdPRmhDUp2zFmQe3RHIOsw24= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.38.0/go.mod h1:ri3aaHSmCTVYu2AWv44YMauwAQc0aqI9gHKIcSbI1pU= 
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0 h1:lwI4Dc5leUqENgGuQImwLo4WnuXFPetmPpkLi2IrX54= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.38.0/go.mod h1:Kz/oCE7z5wuyhPxsXDuaPteSWqjSBD5YaSdbxZYGbGk= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.35.0 h1:xJ2qHD0C1BeYVTLLR9sX12+Qb95kfeD/byKj6Ky1pXg= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.35.0/go.mod h1:u5BF1xyjstDowA1R5QAO9JHzqK+ublenEW/dyqTjBVk= go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.35.0 h1:PB3Zrjs1sG1GBX51SXyTSoOTqcDglmsk7nT6tkKPb/k= go.opentelemetry.io/otel/exporters/stdout/stdoutmetric v1.35.0/go.mod h1:U2R3XyVPzn0WX7wOIypPuptulsMcPDPs/oiSVOMVnHY= go.opentelemetry.io/otel/metric v1.39.0 h1:d1UzonvEZriVfpNKEVmHXbdf909uGTOQjA0HF0Ls5Q0= go.opentelemetry.io/otel/metric v1.39.0/go.mod h1:jrZSWL33sD7bBxg1xjrqyDjnuzTUB0x1nBERXd7Ftcs= go.opentelemetry.io/otel/sdk v1.39.0 h1:nMLYcjVsvdui1B/4FRkwjzoRVsMK8uL/cj0OyhKzt18= go.opentelemetry.io/otel/sdk v1.39.0/go.mod h1:vDojkC4/jsTJsE+kh+LXYQlbL8CgrEcwmt1ENZszdJE= go.opentelemetry.io/otel/sdk/metric v1.39.0 h1:cXMVVFVgsIf2YL6QkRF4Urbr/aMInf+2WKg+sEJTtB8= go.opentelemetry.io/otel/sdk/metric v1.39.0/go.mod h1:xq9HEVH7qeX69/JnwEfp6fVq5wosJsY1mt4lLfYdVew= go.opentelemetry.io/otel/trace v1.39.0 h1:2d2vfpEDmCJ5zVYz7ijaJdOF59xLomrvj7bjt6/qCJI= go.opentelemetry.io/otel/trace v1.39.0/go.mod h1:88w4/PnZSazkGzz/w84VHpQafiU4EtqqlVdxWy+rNOA= go.opentelemetry.io/proto/otlp v1.7.1 h1:gTOMpGDb0WTBOP8JaO72iL3auEZhVmAQg4ipjOVAtj4= go.opentelemetry.io/proto/otlp v1.7.1/go.mod h1:b2rVh6rfI/s2pHWNlB7ILJcRALpcNDzKhACevjI+ZnE= go.step.sm/crypto v0.57.0 h1:YjoRQDaJYAxHLVwjst0Bl0xcnoKzVwuHCJtEo2VSHYU= go.step.sm/crypto v0.57.0/go.mod h1:+Lwp5gOVPaTa3H/Ul/TzGbxQPXZZcKIUGMS0lG6n9Go= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.11.0 
h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= go.uber.org/zap v1.27.0 h1:aJMhYGrd5QSmlpLMr2MftRKl7t8J8PTZPA732ud/XR8= go.uber.org/zap v1.27.0/go.mod h1:GB2qFLM7cTU87MWRP2mPIjqfIDnGu+VIO4V/SdhGo2E= go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8= go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20191219195013-becbf705a915/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20200510223506-06a226fb4e37/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20201012173705-84dcc777aaee/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20211215153901-e495a2d5b3d3/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.0.0-20220722155217-630584e8d5aa/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.6.0/go.mod h1:OFC/31mSvZgRz0V1QTNCzfAI1aIRzbiufJtkMIlEp58= golang.org/x/crypto v0.14.0/go.mod h1:MVFd36DqK4CsrnJYDkBA3VC4m2GkXAM0PvzMCn4JQf4= golang.org/x/crypto v0.18.0/go.mod h1:R0j02AL6hcrfOiy9T4ZYp/rcWeMxM3L6QYxlOuEG1mg= golang.org/x/crypto v0.19.0/go.mod 
h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= golang.org/x/crypto v0.46.0 h1:cKRW/pmt1pKAfetfu+RCEvjvZkA9RimPbh7bhFjGVBU= golang.org/x/crypto v0.46.0/go.mod h1:Evb/oLKmMraqjZ2iQTwDwvCtJkczlDuTmdJXoZVzqU0= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20251219203646-944ab1f22d93 h1:fQsdNF2N+/YewlRZiricy4P1iimyPKZ/xwniHj8Q2a0= golang.org/x/exp v0.0.0-20251219203646-944ab1f22d93/go.mod h1:EPRbTFwzwjXj9NpYyyrvenVh9Y+GFeEvMNh7Xuz7xgU= golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.31.0 h1:HaW9xtz0+kOcWKwli0ZXy79Ix+UW/vOfmWI5QVd2tgI= golang.org/x/mod v0.31.0/go.mod h1:43JraMp9cGx1Rx3AqioxrbrhNsLl2l/iNAvuBkrezpg= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180906233101-161cd47e91fd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= 
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200506145744-7e3656a0809f/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/net v0.0.0-20200520004742-59133d7f0dd7/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/net v0.0.0-20201010224723-4f7140c49acb/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= golang.org/x/net v0.0.0-20210428140749-89ef3d95e781/go.mod h1:OJAsFXCWl8Ukc7SiCT/9KSuxbyM7479/AVlXFRxuMCk= golang.org/x/net v0.0.0-20211112202133-69e39bad7dc2/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20220225172249-27dd8689420f/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= golang.org/x/net v0.0.0-20220607020251-c690dde0001d/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE= golang.org/x/net v0.20.0/go.mod h1:z8BVo6PvndSri0LbOE3hAn0apkU+1YvI6E70E9jsnvY= golang.org/x/net v0.48.0 h1:zyQRTTrjc33Lhh0fBgT/H3oZq9WuvRR5gPC70xpDiQU= golang.org/x/net v0.48.0/go.mod h1:+ndRgGjkh8FGtu1w1FGbEC31if4VrNVMuKTgcAAnQRY= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.34.0 h1:hqK/t4AKgbqWkdkcAeI8XLmbK+4m4G5YeQRrmiotGlw= golang.org/x/oauth2 v0.34.0/go.mod 
h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20200317015054-43a5402ce75a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4= golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190904154756-749cb33beabd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191005200804-aed5e4c7ecf9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys 
v0.0.0-20191120155948-bd437916bb0e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191204072324-ce4227a45e2e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200509044756-6aff5f38e54f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201204225414-ed752295db88/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210112080510-489259a85091/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220412211240-33da011f77ad/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/sys v0.39.0 h1:CvCKL8MeisomCi6qNZ+wbb0DN9E5AATixKsvNtMoMFk= golang.org/x/sys v0.39.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= golang.org/x/term v0.13.0/go.mod h1:LTmsnFJwVN6bCy1rVCoS+qHT1HhALEFxKncY3WNNh4U= golang.org/x/term v0.16.0/go.mod h1:yn7UURbUtPyrVJPGPq404EukNFxcm/foM+bV/bfcDsY= golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= golang.org/x/term v0.38.0 h1:PQ5pkm/rLO6HnxFR7N2lJHOZX6Kez5Y1gDSJla6jo7Q= golang.org/x/term v0.38.0/go.mod h1:bSEAKrOT1W+VSu9TSCMtoGEOUcKxOKgl3LE5QEF/xVg= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/text v0.13.0/go.mod 
h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/text v0.32.0 h1:ZD01bjUt1FQ9WJ0ClOL5vxgxOI/sVCNgX1YtKwcY0mU= golang.org/x/text v0.32.0/go.mod h1:o/rUWzghvpD5TXrTIBuJU77MTaN0ljMWE47kxGJQ7jY= golang.org/x/time v0.13.0 h1:eUlYslOIt32DgYD6utsuUeHs4d7AsEYLuIAdg7FlYgI= golang.org/x/time v0.13.0/go.mod h1:eL/Oa2bBBK0TkX57Fyni+NgnyQQN4LitPmob2Hjnqw4= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200509030707-2212a7e161a5/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20201224043029-2b0845dc783e/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= golang.org/x/tools v0.40.0 h1:yLkxfA+Qnul4cs9QA3KnlFu0lVmd8JJfoq+E41uSutA= golang.org/x/tools v0.40.0/go.mod h1:Ik/tzLRlbscWpqqMRjyWYDisX8bG13FrdXp3o4Sr9lc= golang.org/x/tools/go/vcs 
v0.1.0-deprecated h1:cOIJqWBl99H1dH5LWizPa+0ImeeJq3t3cJjaeOWUAL4= golang.org/x/tools/go/vcs v0.1.0-deprecated/go.mod h1:zUrvATBAvEI9535oC0yWYsLsHIV4Z7g63sNPVMtuBy8= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20220517211312-f3a8303e98df/go.mod h1:K8+ghG5WaK9qNqU5K3HdILfMLy1f3aNYFI/wnl100a8= golang.org/x/xerrors v0.0.0-20240716161551-93cc26a95ae9 h1:LLhsEBxRTBLuKlQxFBYUOU8xyFgXv6cOTp2HASDlsDk= golang.org/x/xerrors v0.0.0-20240716161551-93cc26a95ae9/go.mod h1:NDW/Ps6MPRej6fsCIbMTohpP40sJ/P/vI1MoTEGwX90= gomodules.xyz/jsonpatch/v2 v2.4.0 h1:Ci3iUJyx9UeRx7CeFN8ARgGbkESwJK+KB9lLcWxY/Zw= gomodules.xyz/jsonpatch/v2 v2.4.0/go.mod h1:AH3dM2RI6uoBZxn3LVrfvJ3E0/9dG4cSrbuBJT4moAY= gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= google.golang.org/api v0.242.0 h1:7Lnb1nfnpvbkCiZek6IXKdJ0MFuAZNAJKQfA1ws62xg= google.golang.org/api v0.242.0/go.mod h1:cOVEm2TpdAGHL2z+UwyS+kmlGr3bVWQQ6sYEqkKje50= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= google.golang.org/genproto v0.0.0-20250603155806-513f23925822 h1:rHWScKit0gvAPuOnu87KpaYtjK5zBMLcULh7gxkCXu4= google.golang.org/genproto 
v0.0.0-20250603155806-513f23925822/go.mod h1:HubltRL7rMh0LfnQPkMH4NPDFEWp0jw3vixw7jEM53s= google.golang.org/genproto/googleapis/api v0.0.0-20251202230838-ff82c1b0f217 h1:fCvbg86sFXwdrl5LgVcTEvNC+2txB5mgROGmRL5mrls= google.golang.org/genproto/googleapis/api v0.0.0-20251202230838-ff82c1b0f217/go.mod h1:+rXWjjaukWZun3mLfjmVnQi18E1AsFbDN9QdJ5YXLto= google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217 h1:gRkg/vSppuSQoDjxyiGfN4Upv/h/DQmIR10ZU8dh4Ww= google.golang.org/genproto/googleapis/rpc v0.0.0-20251202230838-ff82c1b0f217/go.mod h1:7i2o+ce6H/6BluujYR+kqX3GKH+dChPTQU19wjRPiGk= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY= google.golang.org/grpc v1.31.0/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM8pak= google.golang.org/grpc v1.79.3 h1:sybAEdRIEtvcD68Gx7dmnwjZKlyfuc61Dyo9pGXXkKE= google.golang.org/grpc v1.79.3/go.mod h1:KmT0Kjez+0dde/v2j9vzwoAScgEPx/Bw1CYChhHLrHQ= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE= google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo= google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= google.golang.org/protobuf v1.36.10 
h1:AYd7cD/uASjIL6Q9LiTjz8JLcrh/88q5UObnmY3aOOE= google.golang.org/protobuf v1.36.10/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/evanphx/json-patch.v4 v4.13.0 h1:czT3CmqEaQ1aanPc5SdlgQrrEIb8w/wwCvWWnfEbYzo= gopkg.in/evanphx/json-patch.v4 v4.13.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys= gopkg.in/gcfg.v1 v1.2.3 h1:m8OOJ4ccYHnx2f4gQwpno8nAX5OGOh7RLaaz0pj3Ogs= gopkg.in/gcfg.v1 v1.2.3/go.mod h1:yesOnuUOFQAhST5vPY4nbZsb/huCgGGXlipJsBn0b3o= gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= gopkg.in/ini.v1 v1.56.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA= gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= gopkg.in/warnings.v0 v0.1.2 h1:wFXVbFY8DY5/xOe1ECiWdKCzZlxgshcYVNkBHstARME= gopkg.in/warnings.v0 v0.1.2/go.mod h1:jksf8JmL6Qr/oQM2OXTHunEvvTAsrWBLb6OOjuVWRNI= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 
gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.3.0/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gotest.tools/v3 v3.5.1 h1:EENdUnS3pdur5nybKYIh2Vfgc8IUNBjxDPSjtiJcOzU= gotest.tools/v3 v3.5.1/go.mod h1:isy3WKz7GK6uNw/sbHzfKBLvlvXwUyV06n6brMxxopU= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= k8s.io/api v0.35.0 h1:iBAU5LTyBI9vw3L5glmat1njFK34srdLmktWwLTprlY= k8s.io/api v0.35.0/go.mod h1:AQ0SNTzm4ZAczM03QH42c7l3bih1TbAXYo0DkF8ktnA= k8s.io/apiextensions-apiserver v0.34.1 h1:NNPBva8FNAPt1iSVwIE0FsdrVriRXMsaWFMqJbII2CI= k8s.io/apiextensions-apiserver v0.34.1/go.mod h1:hP9Rld3zF5Ay2Of3BeEpLAToP+l4s5UlxiHfqRaRcMc= k8s.io/apimachinery v0.35.0 h1:Z2L3IHvPVv/MJ7xRxHEtk6GoJElaAqDCCU0S6ncYok8= k8s.io/apimachinery v0.35.0/go.mod h1:jQCgFZFR1F4Ik7hvr2g84RTJSZegBc8yHgFWKn//hns= k8s.io/cli-runtime v0.35.0 h1:PEJtYS/Zr4p20PfZSLCbY6YvaoLrfByd6THQzPworUE= k8s.io/cli-runtime v0.35.0/go.mod h1:VBRvHzosVAoVdP3XwUQn1Oqkvaa8facnokNkD7jOTMY= k8s.io/client-go v0.35.0 h1:IAW0ifFbfQQwQmga0UdoH0yvdqrbwMdq9vIFEhRpxBE= k8s.io/client-go v0.35.0/go.mod h1:q2E5AAyqcbeLGPdoRB+Nxe3KYTfPce1Dnu1myQdqz9o= k8s.io/cloud-provider v0.35.0 h1:syiBCQbKh2gho/S1BkIl006Dc44pV8eAtGZmv5NMe7M= k8s.io/cloud-provider v0.35.0/go.mod h1:7grN+/Nt5Hf7tnSGPT3aErt4K7aQpygyCrGpbrQbzNc= k8s.io/cloud-provider-aws v1.35.0 h1:jlMZmc4JjJ6lkYj41xeKqZ8nw1ais00xQi8Nnz2lqkI= k8s.io/cloud-provider-aws 
v1.35.0/go.mod h1:6R9TIgQ/ecysPukSmEUs4kZIwqvju80+FjMAhtJ22Q0= k8s.io/component-base v0.35.0 h1:+yBrOhzri2S1BVqyVSvcM3PtPyx5GUxCK2tinZz1G94= k8s.io/component-base v0.35.0/go.mod h1:85SCX4UCa6SCFt6p3IKAPej7jSnF3L8EbfSyMZayJR0= k8s.io/component-helpers v0.35.0 h1:wcXv7HJRksgVjM4VlXJ1CNFBpyDHruRI99RrBtrJceA= k8s.io/component-helpers v0.35.0/go.mod h1:ahX0m/LTYmu7fL3W8zYiIwnQ/5gT28Ex4o2pymF63Co= k8s.io/klog v1.0.0 h1:Pt+yjF5aB1xDSVbau4VsWe+dQNzA0qv1LlXdC2dF6Q8= k8s.io/klog v1.0.0/go.mod h1:4Bi6QPql/J/LkTDqv7R/cd3hPo4k2DG6Ptcz060Ez5I= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= k8s.io/kops v1.33.1 h1:MFrj3r6f+F9rL2DQQdfAXEyFJDdq0GAyu96woF6TOaQ= k8s.io/kops v1.33.1/go.mod h1:epTyN30uGaeRBmN1jmT993Kc4Wd/tti9snQDd5aivXc= k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 h1:Y3gxNAuB0OBLImH611+UDZcmKS3g6CthxToOb37KgwE= k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912/go.mod h1:kdmbQkyfwUagLfXIad1y2TdrjPFWp2Q89B3qkRwf/pQ= k8s.io/kubelet v0.35.0 h1:8cgJHCBCKLYuuQ7/Pxb/qWbJfX1LXIw7790ce9xHq7c= k8s.io/kubelet v0.35.0/go.mod h1:ciRzAXn7C4z5iB7FhG1L2CGPPXLTVCABDlbXt/Zz8YA= k8s.io/release v0.18.0 h1:xn+ZU/8bDmtAcSZMh0K2HMa2+dYrD3Qqq+yqv3Uuk9k= k8s.io/release v0.18.0/go.mod h1:PJ4HhnTcmTKSakE475b4e3xJEVw+EVB5ycZM9vWFcTU= k8s.io/utils v0.0.0-20260108192941-914a6e750570 h1:JT4W8lsdrGENg9W+YwwdLJxklIuKWdRm+BC+xt33FOY= k8s.io/utils v0.0.0-20260108192941-914a6e750570/go.mod h1:xDxuJ0whA3d0I4mf/C4ppKHxXynQ+fxnkmQH0vTHnuk= modernc.org/cc/v4 v4.19.3 h1:vE9kmJqUcyvNOf8F2Hn8od14SOMq34BiqcZ2tMzLk5c= modernc.org/cc/v4 v4.19.3/go.mod h1:HM7VJTZbUCR3rV8EYBi9wxnJ0ZBRiGE5OeGXNA0IsLQ= modernc.org/ccgo/v4 v4.11.0 h1:2uc2kRvZLC/oHylsrirRW6f1I4wljQST2BBbm+aKiXM= modernc.org/ccgo/v4 v4.11.0/go.mod h1:GwrfAtnU6PdZkCWD4XI8wB1T5Xj3fSw9lO/40H1ldys= modernc.org/fileutil v1.3.0 h1:gQ5SIzK3H9kdfai/5x41oQiKValumqNTDXMvKo62HvE= modernc.org/fileutil v1.3.0/go.mod 
h1:XatxS8fZi3pS8/hKG2GH/ArUogfxjpEKs3Ku3aK4JyQ= modernc.org/gc/v2 v2.4.1 h1:9cNzOqPyMJBvrUipmynX0ZohMhcxPtMccYgGOJdOiBw= modernc.org/gc/v2 v2.4.1/go.mod h1:wzN5dK1AzVGoH6XOzc3YZ+ey/jPgYHLuVckd62P0GYU= modernc.org/libc v1.45.2 h1:oRlBu8xlBen2awVAWuLOkvYNBPaIKFxFOj9wA/jaXHM= modernc.org/libc v1.45.2/go.mod h1:YkRHLoN4L70OdO1cVmM83KZhRbRvsc3XogfVzbTXBwE= modernc.org/mathutil v1.6.0 h1:fRe9+AmYlaej+64JsEEhoWuAYBkOtQiMEU7n/XgfYi4= modernc.org/mathutil v1.6.0/go.mod h1:Ui5Q9q1TR2gFm0AQRqQUaBWFLAhQpCwNcuhBOSedWPo= modernc.org/memory v1.7.2 h1:Klh90S215mmH8c9gO98QxQFsY+W451E8AnzjoE2ee1E= modernc.org/memory v1.7.2/go.mod h1:NO4NVCQy0N7ln+T9ngWqOQfi7ley4vpwvARR+Hjw95E= modernc.org/opt v0.1.3 h1:3XOZf2yznlhC+ibLltsDGzABUGVx8J6pnFMS3E4dcq4= modernc.org/opt v0.1.3/go.mod h1:WdSiB5evDcignE70guQKxYUl14mgWtbClRi5wmkkTX0= modernc.org/sortutil v1.2.0 h1:jQiD3PfS2REGJNzNCMMaLSp/wdMNieTbKX920Cqdgqc= modernc.org/sortutil v1.2.0/go.mod h1:TKU2s7kJMf1AE84OoiGppNHJwvB753OYfNl2WRb++Ss= modernc.org/sqlite v1.29.5 h1:8l/SQKAjDtZFo9lkJLdk8g9JEOeYRG4/ghStDCCTiTE= modernc.org/sqlite v1.29.5/go.mod h1:S02dvcmm7TnTRvGhv8IGYyLnIt7AS2KPaB1F/71p75U= modernc.org/strutil v1.2.0 h1:agBi9dp1I+eOnxXeiZawM8F4LawKv4NzGWSaLfyeNZA= modernc.org/strutil v1.2.0/go.mod h1:/mdcBmfOibveCTBxUl5B5l6W+TTH1FXPLHZE6bTosX0= modernc.org/token v1.1.0 h1:Xl7Ap9dKaEs5kLoOQeQmPWevfnk/DM5qcLcYlA8ys6Y= modernc.org/token v1.1.0/go.mod h1:UGzOrNV1mAFSEB63lOFHIpNRUVMvYTc6yu1SMY/XTDM= sigs.k8s.io/bom v0.6.0 h1:IPMPHx6XdmMeW2oEeF66DgNyP5d4RxfuXwiC1qn+n9o= sigs.k8s.io/bom v0.6.0/go.mod h1:MV0D3vdGlkaPgi5EwpwMBeQ8n8QS8Q2u1lJ5LyE7RLM= sigs.k8s.io/controller-runtime v0.22.4 h1:GEjV7KV3TY8e+tJ2LCTxUTanW4z/FmNB7l327UfMq9A= sigs.k8s.io/controller-runtime v0.22.4/go.mod h1:+QX1XUpTXN4mLoblf4tqr5CQcyHPAki2HLXqQMY6vh8= sigs.k8s.io/e2e-framework v0.6.1-0.20250909060333-8677714ff9a6 h1:5saOTCrwclRdFJLj5zDMJITisRmR0HuG8SU6ts9z5IY= sigs.k8s.io/e2e-framework v0.6.1-0.20250909060333-8677714ff9a6/go.mod 
h1:MUvWdQO9AGg4/yP9Y0kOcmX+KIOXI0UR6Xw6xz11ULw= sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 h1:IpInykpT6ceI+QxKBbEflcR5EXP7sU1kvOlxwZh5txg= sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730/go.mod h1:mdzfpAEoE6DHQEN0uh9ZbOCuHbLK5wOm7dK4ctXE9Tg= sigs.k8s.io/karpenter v1.8.0 h1:AmTHUPtnuL8IX9mbcD3NOohyk62idrBCBtM+8Wn6Jvk= sigs.k8s.io/karpenter v1.8.0/go.mod h1:nDDVB5873dVVuyTam3oJrllSv0sAgp6as6/5HRTcV4o= sigs.k8s.io/kubetest2 v0.0.0-20260108084739-2f9a9397f033 h1:+HmjjgPGGqvYRBErxVSbguBnp7hILyuwHHDKUXRCDA4= sigs.k8s.io/kubetest2 v0.0.0-20260108084739-2f9a9397f033/go.mod h1:pBd0cFaT0hDqmwQg+TIhyLgPMYaH66QMLcKd09XnKTI= sigs.k8s.io/kustomize/api v0.20.1 h1:iWP1Ydh3/lmldBnH/S5RXgT98vWYMaTUL1ADcr+Sv7I= sigs.k8s.io/kustomize/api v0.20.1/go.mod h1:t6hUFxO+Ph0VxIk1sKp1WS0dOjbPCtLJ4p8aADLwqjM= sigs.k8s.io/kustomize/kyaml v0.20.1 h1:PCMnA2mrVbRP3NIB6v9kYCAc38uvFLVs8j/CD567A78= sigs.k8s.io/kustomize/kyaml v0.20.1/go.mod h1:0EmkQHRUsJxY8Ug9Niig1pUMSCGHxQ5RklbpV/Ri6po= sigs.k8s.io/promo-tools/v3 v3.6.0 h1:C2L08ezrWm1aZI8Emd3iZPZQserLPRgzuqQVxvI0PUI= sigs.k8s.io/promo-tools/v3 v3.6.0/go.mod h1:XJ3jy0hJYs+hWKt8XsLHFzGQV8PUtvllvbxjN/E5RXI= sigs.k8s.io/randfill v1.0.0 h1:JfjMILfT8A6RbawdsK2JXGBR5AQVfd+9TbzrlneTyrU= sigs.k8s.io/randfill v1.0.0/go.mod h1:XeLlZ/jmk4i1HRopwe7/aU3H5n1zNUcX6TM94b3QxOY= sigs.k8s.io/release-sdk v0.12.2 h1:ncuHwUu8VWcZVVrNkjoUR8xGo6ibHg+AM6uMMD+IwuQ= sigs.k8s.io/release-sdk v0.12.2/go.mod h1:tlJgWPJLeRbWOvcyq1XrCZmLe8Yfn3H5U/LNtmBa0Nc= sigs.k8s.io/release-utils v0.12.0 h1:+Z8cEUAaxItrMcTOJ0jtUg3Fm1uNgPNol+VIL6XtQqQ= sigs.k8s.io/release-utils v0.12.0/go.mod h1:TveYRPK4Mq6qXA0PJiUMEOlWvvIQG0Mh5APQmHD5JpA= sigs.k8s.io/structured-merge-diff/v6 v6.3.0 h1:jTijUJbW353oVOd9oTlifJqOGEkUw2jB/fXCbTiQEco= sigs.k8s.io/structured-merge-diff/v6 v6.3.0/go.mod h1:M3W8sfWvn2HhQDIbGWj3S099YozAsymCo/wrT5ohRUE= sigs.k8s.io/yaml v1.6.0 h1:G8fkbMSAFqgEFgh4b1wmtzDnioxFCUgTZhlbj5P9QYs= sigs.k8s.io/yaml v1.6.0/go.mod h1:796bPqUfzR/0jLAl6XjHl3Ck7MiyVv8dbTdyT3/pMf4= 
software.sslmate.com/src/go-pkcs12 v0.4.0 h1:H2g08FrTvSFKUj+D309j1DPfk5APnIdAQAB8aEykJ5k= software.sslmate.com/src/go-pkcs12 v0.4.0/go.mod h1:Qiz0EyvDRJjjxGyUQa2cCNZn/wMyzrRJ/qcDXOQazLI= ================================================ FILE: hack/download-kubernetes-binaries.sh ================================================ #!/usr/bin/env bash set -o errexit set -o nounset BUNDLES=( "kubernetes-client" "kubernetes-test" ) if [ "$#" -ne 3 ]; then echo >&2 "usage: $0 (KUBERNETES_MINOR_VERSION|latest) OS ARCH" exit 1 fi if [ "$1" = "latest" ]; then RELEASE_MARKER="latest.txt" else RELEASE_MARKER="latest-$1.txt" fi echo >&2 "Release marker: ${RELEASE_MARKER}" OS="$2" ARCH="$3" function download_binaries() { local basePath=$1 local KUBERNETES_VERSION=$(curl --silent "${basePath}/${RELEASE_MARKER}") echo "Kubernetes version: ${KUBERNETES_VERSION}" echo "${KUBERNETES_VERSION}" > kubernetes-version.txt for BUNDLE in ${BUNDLES[@]}; do echo >&2 "Downloading bundle: ${BUNDLE}" local TARBALL="${BUNDLE}.tar.gz" if ! wget --quiet --output-document=${TARBALL} $basePath/${KUBERNETES_VERSION}/${BUNDLE}-${OS}-${ARCH}.tar.gz; then return 1 fi tar xzf ${TARBALL} rm ${TARBALL} done } if ! 
download_binaries https://storage.googleapis.com/kubernetes-release/release; then
  echo >&2 "binary download failed from release bucket, falling back to ci dev release"
  download_binaries https://storage.googleapis.com/k8s-release-dev/ci
fi

================================================
FILE: hack/free-disk-space.sh
================================================
#!/usr/bin/env bash

set -o nounset
set -o errexit
set -o pipefail

# hack to free up disk space for build
# ref: https://github.com/easimon/maximize-build-space/blob/master/action.yml

# storage before
sudo df -h

sudo rm -rf \
  /usr/share/dotnet \
  /usr/local/lib/android \
  /opt/ghc \
  /opt/hostedtoolcache/CodeQL

docker image prune --all --force
# --force: without it the builder prune asks for confirmation, which cannot be
# answered when this script runs unattended.
docker builder prune -a --force

# storage after
sudo df -h

================================================
FILE: hack/update-go-dependencies.sh
================================================
#!/usr/bin/env bash

set -o nounset
set -o errexit
set -o pipefail

echo "Updating go modules..."
# The command substitution is intentionally unquoted: every direct dependency
# path printed by `go list` must become its own argument to `go get`.
go get $(go list -f '{{if not (or .Main .Indirect)}}{{.Path}}{{end}}' -mod=mod -m all) && go mod tidy

echo "Updating kubetest2 image go version..."
# Keep only MAJOR.MINOR of the main module's go version.
MODULE_GO_VERSION=$(go list -m -f "{{if .Main}}{{.GoVersion}}{{end}}" | cut -d'.' -f1-2)
find . -type f -name Dockerfile -exec sed -i "s/\(GO_MINOR_VERSION\)=.*/\1=${MODULE_GO_VERSION}/g" {} +

================================================
FILE: hack/update-image-tags.sh
================================================
#!/usr/bin/env bash

set -o nounset
set -o errexit
set -o pipefail

ECR_PUBLIC_REGISTRY="public.ecr.aws"
EKS_CONTAINER_REGISTRY="602401143452.dkr.ecr.us-west-2.amazonaws.com"

# get_ecr_image_tags REGISTRY REPOSITORY
# e.g.
get_ecr_image_tags $ECR_PUBLIC_REGISTRY amazonlinux/amazonlinux get_ecr_image_tags() { set -e local REGISTRY=$1 local REPOSITORY=$2 local TOKEN # Get ECR public token if image is from a public registry, otherwise use a private token # An authorization token is required for every ECR HTTP request if [ "$REGISTRY" = "$ECR_PUBLIC_REGISTRY" ]; then TOKEN=$(aws ecr-public get-authorization-token --region us-east-1 --output=text --query 'authorizationData.authorizationToken') local AUTHORIZATION_TYPE="Bearer" else TOKEN=$(aws ecr get-authorization-token --output text --query 'authorizationData[].authorizationToken') local AUTHORIZATION_TYPE="Basic" fi curl -s -H "Authorization: ${AUTHORIZATION_TYPE} $TOKEN" "https://$REGISTRY/v2/$REPOSITORY/tags/list" | jq '.tags' } # update_image_uris REPOSITORY IMAGE_TAG update_image_uris() { local REPOSITORY=$1 local NEW_TAG=$2 PREFIX="image: ${REPOSITORY}" find ./test/manifests -type f -exec sed -i "s#$PREFIX:.*#$PREFIX:$NEW_TAG#g" {} + } # update the nvidia k8s device plugin echo "Updating Nvidia device plugin image" NVIDIA_DEVICE_PLUGIN_TAG=$(curl -s 'https://catalog.ngc.nvidia.com/api/containers/images?orgName=nvidia&name=k8s-device-plugin&isPublic=true' | jq -r '.images | sort_by(.updatedDate) | reverse | map(select(.tag | test("^v[0-9]+.[0-9]+.[0-9]+$"))) | first | .tag') update_image_uris nvcr.io/nvidia/k8s-device-plugin $NVIDIA_DEVICE_PLUGIN_TAG # below updates require authentication and should not exit early with a failure. # TODO: remove this once the aws credentials are setup and the paths are expected to succeed. set +e # update the neuron k8s device plugin echo "Updating Neuron device plugin image" NEURON_DEVICE_PLUGIN_REPOSITORY_NAME="neuron/neuron-device-plugin" NEURON_DEVICE_PLUGIN_TAGS=$(get_ecr_image_tags $ECR_PUBLIC_REGISTRY $NEURON_DEVICE_PLUGIN_REPOSITORY_NAME) if [ $? 
-eq 0 ]; then
  # Neuron tags are purely numeric dotted versions; pick the numerically largest.
  LATEST_NEURON_DEVICE_PLUGIN_TAG=$(echo $NEURON_DEVICE_PLUGIN_TAGS | jq -r 'max_by(split(".") | map(tonumber))')
  update_image_uris "${ECR_PUBLIC_REGISTRY}/${NEURON_DEVICE_PLUGIN_REPOSITORY_NAME}" $LATEST_NEURON_DEVICE_PLUGIN_TAG
fi

# update the efa k8s device plugin
echo "Updating EFA device plugin image"
EFA_DEVICE_PLUGIN_REPOSITORY_NAME="eks/aws-efa-k8s-device-plugin"
EFA_DEVICE_PLUGIN_TAGS=$(get_ecr_image_tags $EKS_CONTAINER_REGISTRY $EFA_DEVICE_PLUGIN_REPOSITORY_NAME)
if [ $? -eq 0 ]; then
  # Drop everything after the first "-" of each tag, strip the leading "v",
  # and pick the numerically largest version.
  LATEST_EFA_DEVICE_PLUGIN_TAG=$(echo $EFA_DEVICE_PLUGIN_TAGS | jq -r 'map(split("-") | .[0]) | max_by(sub("^v"; "") | split(".") | map(tonumber))')
  update_image_uris "${EKS_CONTAINER_REGISTRY}/${EFA_DEVICE_PLUGIN_REPOSITORY_NAME}" $LATEST_EFA_DEVICE_PLUGIN_TAG
fi

================================================
FILE: hack/update-neuron-dependencies.sh
================================================
#!/usr/bin/env bash

set -o nounset
set -o errexit
set -o pipefail

# pip_versionsearch takes exactly 1 argument and returns its latest available version from the neuron pip repo
# usage: pip_versionsearch PACKAGE
pip_versionsearch() {
  # The index URL uses dashes while the file names in the listing use underscores.
  local PACKAGE_INDEX_NAME=$(echo $1 | tr -s '_' '-')
  local PACKAGE_VERSION_NAME=$(echo $PACKAGE_INDEX_NAME | tr -s '-' '_')
  curl -s https://pip.repos.neuron.amazonaws.com/${PACKAGE_INDEX_NAME} |
    grep -o -G "${PACKAGE_VERSION_NAME}-[0-9\.]*+[a-f0-9]*" |
    sed "s/$PACKAGE_VERSION_NAME-//" |
    sort -V |
    tail -n 1
}

# versionsearch takes exactly 1 argument and returns its latest available version from the neuron amd64 apt repo
# usage: versionsearch PACKAGE
versionsearch() {
  local PACKAGE_NAME=$1
  curl -s https://apt.repos.neuron.amazonaws.com/dists/focal/main/binary-amd64/Packages |
    grep -o "${PACKAGE_NAME}_[0-9\.]*-*[a-f0-9]*" |
    sed "s/${PACKAGE_NAME}_//" |
    sort -V |
    tail -n 1
}

# update_arg ARG NEW_VALUE
# Sets ARG=NEW_VALUE in every Dockerfile below the current directory.
update_arg() {
  local ARG=$1
  local NEW_VALUE=$2
  echo "setting $ARG to $NEW_VALUE"
  find . \
-type f -name Dockerfile -exec sed -i "s/${ARG}=.*/${ARG}=$NEW_VALUE/g" {} + } update_arg NEURONX_RUNTIME_LIB_VERSION $(versionsearch aws-neuronx-runtime-lib) update_arg NEURONX_COLLECTIVES_LIB_VERSION $(versionsearch aws-neuronx-collectives) update_arg NEURONX_TOOLS_VERSION $(versionsearch aws-neuronx-tools) update_arg NEURONX_FRAMEWORK_VERSION $(pip_versionsearch torch-neuronx) update_arg NEURONX_CC_VERSION $(pip_versionsearch neuronx-cc) update_arg NEURONX_DISTRIBUTED_VERSION $(pip_versionsearch neuronx_distributed) ================================================ FILE: hack/update-nvidia-dependencies.sh ================================================ #!/usr/bin/env bash # following from the last updated dependency: # 1. get the latest release of aws-ofi-nccl # 2. get the supported version of libnccl # 3. get the latest correct cuda version used for libnccl set -o nounset set -o errexit set -o pipefail echo "Updating aws-ofi-nccl" AWS_OFI_NCCL_TAG=$(curl -s https://api.github.com/repos/aws/aws-ofi-nccl/releases/latest | jq -r .tag_name | sed 's/^v//') find . -type f -name Dockerfile -exec sed -i "s/AWS_OFI_NCCL_VERSION=.*/AWS_OFI_NCCL_VERSION=$AWS_OFI_NCCL_TAG/g" {} + echo "Updating nccl" LIB_NCCL_TAG=$(curl -s https://api.github.com/repos/aws/aws-ofi-nccl/releases/latest | jq -r .body | grep -oP '\[NCCL \K(\S*)(?=\])' | head -n 1 | sed 's/^v//') find . -type f -name Dockerfile -exec sed -i "s/LIBNCCL_VERSION=.*/LIBNCCL_VERSION=$LIB_NCCL_TAG/g" {} + echo "Updating nvbandwidth" NVBANDWIDTH_TAG=$(curl -s https://api.github.com/repos/NVIDIA/nvbandwidth/releases/latest | jq -r .tag_name) find . 
-type f -name Dockerfile -exec sed -i "s/NVBANDWIDTH_VERSION=.*/NVBANDWIDTH_VERSION=$NVBANDWIDTH_TAG/g" {} + ================================================ FILE: internal/awssdk/config.go ================================================ package awssdk import ( "context" "log/slog" "github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/config" ) // NewConfig returns an AWS SDK config // It will panic if the cnfig cannot be created func NewConfig() aws.Config { c, err := config.LoadDefaultConfig(context.TODO()) if err != nil { slog.Error("failed to create AWS SDK config", "error", err) panic(err) } return c } ================================================ FILE: internal/deployers/eksapi/addons.go ================================================ package eksapi import ( "context" "fmt" "log/slog" "strings" "time" "github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/service/eks" ) const ( addonCreationTimeout = 5 * time.Minute ) type AddonManager struct { clients *awsClients } func NewAddonManager(clients *awsClients) *AddonManager { return &AddonManager{ clients: clients, } } func (m *AddonManager) createAddons(infra *Infrastructure, cluster *Cluster, opts *deployerOptions) error { ctx := context.TODO() addonMap := map[string]string{} for _, addon := range opts.Addons { addonParts := strings.Split(addon, ":") if len(addonParts) != 2 { return fmt.Errorf("invalid addon format: %s", addon) } name := addonParts[0] version := addonParts[1] slog.Info("resolving addon version", "addon", name, "version", version) resolvedVersion, err := m.resolveAddonVersion(name, version, opts.KubernetesVersion) if err != nil { return err } // dedupe addons with the same name. last provided entry wins. 
addonMap[name] = resolvedVersion } for addonName, addonVersion := range addonMap { slog.Info("creating addon", "addon", addonName, "version", addonVersion) input := eks.CreateAddonInput{ AddonName: aws.String(addonName), AddonVersion: aws.String(addonVersion), ClusterName: aws.String(cluster.name), } _, err := m.clients.EKS().CreateAddon(ctx, &input) if err != nil { return fmt.Errorf("failed to create addon: %v", err) } slog.Info("waiting for addon to be active", "addon", addonName) err = eks.NewAddonActiveWaiter(m.clients.EKS()). Wait(ctx, &eks.DescribeAddonInput{ AddonName: aws.String(addonName), ClusterName: aws.String(cluster.name), }, addonCreationTimeout) if err != nil { return fmt.Errorf("failed to wait for addon to be active: %v", err) } } return nil } func (m *AddonManager) resolveAddonVersion(name string, versionMarker string, kubernetesVersion string) (string, error) { input := eks.DescribeAddonVersionsInput{ AddonName: aws.String(name), KubernetesVersion: aws.String(kubernetesVersion), } descOutput, err := m.clients.EKS().DescribeAddonVersions(context.TODO(), &input) if err != nil { return "", err } for _, addon := range descOutput.Addons { for _, versionInfo := range addon.AddonVersions { switch versionMarker { case "latest": return *versionInfo.AddonVersion, nil case "default": for _, compatibility := range versionInfo.Compatibilities { if compatibility.DefaultVersion { return *versionInfo.AddonVersion, nil } } default: if *versionInfo.AddonVersion == versionMarker { return *versionInfo.AddonVersion, nil } } } } return "", fmt.Errorf("failed to resolve addon version: %s=%s", name, versionMarker) } ================================================ FILE: internal/deployers/eksapi/ami_resolver.go ================================================ package eksapi import ( "context" "fmt" "log/slog" "github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/service/ec2" ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types" 
"github.com/aws/aws-sdk-go-v2/service/ssm" ) func NewAMIResolver(awsClients *awsClients) *amiResolver { return &amiResolver{ clients: awsClients, } } type amiResolver struct { clients *awsClients } func (r *amiResolver) Resolve(ctx context.Context, opts *deployerOptions) (string, error) { switch opts.UserDataFormat { case UserDataBootstrapSh: // TODO: AL2 is not a high priority, skipping for now. return "", fmt.Errorf("%s is not handled", opts.UserDataFormat) case UserDataNodeadm: return r.ResolveAL2023(ctx, opts) case UserDataBottlerocket: return r.ResolveBottlerocket(ctx, opts) default: return "", fmt.Errorf("unhandled userdata format: %s", opts.UserDataFormat) } } func (r *amiResolver) ResolveAL2023(ctx context.Context, opts *deployerOptions) (string, error) { describeInstanceTypesResponse, err := r.clients.EC2().DescribeInstanceTypes(ctx, &ec2.DescribeInstanceTypesInput{ InstanceTypes: []ec2types.InstanceType{ec2types.InstanceType(r.getInstance(opts))}, }) if err != nil { return "", err } instanceTypeInfo := describeInstanceTypesResponse.InstanceTypes[0] arch, err := r.resolveArch(instanceTypeInfo) if err != nil { return "", err } variant := "standard" if instanceTypeInfo.NeuronInfo != nil { if len(instanceTypeInfo.NeuronInfo.NeuronDevices) > 0 { variant = "neuron" } } else if instanceTypeInfo.GpuInfo != nil { for _, gpu := range instanceTypeInfo.GpuInfo.Gpus { if aws.ToString(gpu.Manufacturer) == "NVIDIA" { variant = "nvidia" break } } } getParameterReponse, err := r.clients.SSM().GetParameter(ctx, &ssm.GetParameterInput{ Name: aws.String(fmt.Sprintf("/aws/service/eks/optimized-ami/%s/amazon-linux-2023/%s/%s/recommended/image_id", opts.KubernetesVersion, arch, variant)), }) if err != nil { return "", err } return aws.ToString(getParameterReponse.Parameter.Value), nil } func (r *amiResolver) ResolveBottlerocket(ctx context.Context, opts *deployerOptions) (string, error) { describeInstanceTypesResponse, err := r.clients.EC2().DescribeInstanceTypes(ctx, 
&ec2.DescribeInstanceTypesInput{ InstanceTypes: []ec2types.InstanceType{ec2types.InstanceType(r.getInstance(opts))}, }) if err != nil { return "", err } instanceTypeInfo := describeInstanceTypesResponse.InstanceTypes[0] arch, err := r.resolveArch(instanceTypeInfo) if err != nil { return "", err } // TODO: enable fips flavorSuffix := "" if instanceTypeInfo.GpuInfo != nil { for _, gpu := range instanceTypeInfo.GpuInfo.Gpus { if aws.ToString(gpu.Manufacturer) == "NVIDIA" { flavorSuffix = "-nvidia" break } } } getParameterResponse, err := r.clients.SSM().GetParameter(ctx, &ssm.GetParameterInput{ Name: aws.String(fmt.Sprintf("/aws/service/bottlerocket/aws-k8s-%s%s/%s/latest/image_id", opts.KubernetesVersion, flavorSuffix, arch)), }) if err != nil { return "", err } return aws.ToString(getParameterResponse.Parameter.Value), nil } func (r *amiResolver) getInstance(opts *deployerOptions) string { instanceType := opts.InstanceTypes[0] if len(opts.InstanceTypes) > 1 { slog.Warn("only resolving AMI based on first instance type", "instanceType", instanceType) } return instanceType } func (r *amiResolver) resolveArch(instanceTypeInfo ec2types.InstanceTypeInfo) (string, error) { // TODO: the ordering might be weird because old instances might support // both i386 and x8664. 
switch arch := instanceTypeInfo.ProcessorInfo.SupportedArchitectures[0]; arch { case ec2types.ArchitectureTypeArm64, ec2types.ArchitectureTypeX8664: return string(arch), nil default: return "", fmt.Errorf("unhandled arch: %s", arch) } } ================================================ FILE: internal/deployers/eksapi/ami_resolver_test.go ================================================ //go:build integration package eksapi import ( "context" "testing" "github.com/aws/aws-sdk-go-v2/config" "github.com/stretchr/testify/assert" ) func TestAMIResolver(t *testing.T) { ctx := context.Background() awsCfg, err := config.LoadDefaultConfig(ctx) assert.NoError(t, err) amiResolver := NewAMIResolver(newAWSClients(awsCfg, "")) t.Run("AL2023-nvidia", func(t *testing.T) { opts := deployerOptions{ UserDataFormat: UserDataNodeadm, KubernetesVersion: "1.33", } t.Run("nvidia", func(t *testing.T) { opts := opts opts.InstanceTypes = []string{"g5.xlarge"} ami, err := amiResolver.Resolve(ctx, &opts) assert.NoError(t, err) assert.Regexp(t, "ami-.*", ami) }) t.Run("standard", func(t *testing.T) { opts := opts opts.InstanceTypes = []string{"m5.xlarge"} ami, err := amiResolver.Resolve(ctx, &opts) assert.NoError(t, err) assert.Regexp(t, "ami-.*", ami) }) }) t.Run("Bottlerocket", func(t *testing.T) { opts := deployerOptions{ UserDataFormat: UserDataBottlerocket, KubernetesVersion: "1.33", } t.Run("nvidia", func(t *testing.T) { opts := opts opts.InstanceTypes = []string{"g5.xlarge"} ami, err := amiResolver.Resolve(ctx, &opts) assert.NoError(t, err) assert.Regexp(t, "ami-.*", ami) }) t.Run("standard", func(t *testing.T) { opts := opts opts.InstanceTypes = []string{"m5.xlarge"} ami, err := amiResolver.Resolve(ctx, &opts) assert.NoError(t, err) assert.Regexp(t, "ami-.*", ami) }) }) } ================================================ FILE: internal/deployers/eksapi/auth_map_role.go ================================================ package eksapi import ( "bytes" 
"github.com/aws/aws-k8s-tester/internal/deployers/eksapi/templates" ) func generateAuthMapRole(nodeNameStrategy string, rolearn string) (string, error) { template := templates.AuthMapRole buf := bytes.Buffer{} if err := template.Execute(&buf, templates.AuthMapRoleTemplateData{ NodeNameStrategy: nodeNameStrategy, Rolearn: rolearn, }); err != nil { return "", err } return buf.String(), nil } ================================================ FILE: internal/deployers/eksapi/auth_map_role_test.go ================================================ package eksapi import ( "testing" "github.com/stretchr/testify/assert" ) const rolearn = "mock-role-arn" const sessionNamedAuthMapRole = ` - username: system:node:{{SessionName}} groups: - system:bootstrappers - system:nodes rolearn: mock-role-arn` const privateDNSNamedAuthMapRole = ` - username: system:node:{{EC2PrivateDNSName}} groups: - system:bootstrappers - system:nodes rolearn: mock-role-arn` func Test_generateAuthRoleMap(t *testing.T) { cases := []struct { nodeNameStrategy string expected string }{ { nodeNameStrategy: "SessionName", expected: sessionNamedAuthMapRole, }, { nodeNameStrategy: "EC2PrivateDNSName", expected: privateDNSNamedAuthMapRole, }, } for _, c := range cases { t.Run(c.nodeNameStrategy, func(t *testing.T) { actual, err := generateAuthMapRole(c.nodeNameStrategy, rolearn) if err != nil { t.Log(err) t.Error(err) } assert.Equal(t, c.expected, actual) }) } } ================================================ FILE: internal/deployers/eksapi/aws.go ================================================ package eksapi import ( "github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/service/autoscaling" "github.com/aws/aws-sdk-go-v2/service/cloudformation" "github.com/aws/aws-sdk-go-v2/service/ec2" "github.com/aws/aws-sdk-go-v2/service/eks" "github.com/aws/aws-sdk-go-v2/service/iam" "github.com/aws/aws-sdk-go-v2/service/s3" "github.com/aws/aws-sdk-go-v2/service/ssm" ) type awsClients struct { _eks *eks.Client _cfn 
*cloudformation.Client _ec2 *ec2.Client _asg *autoscaling.Client _ssm *ssm.Client _iam *iam.Client _s3 *s3.Client _s3Presign *s3.PresignClient } func newAWSClients(config aws.Config, eksEndpointURL string) *awsClients { clients := awsClients{ _cfn: cloudformation.NewFromConfig(config), _ec2: ec2.NewFromConfig(config), _asg: autoscaling.NewFromConfig(config), _ssm: ssm.NewFromConfig(config), _iam: iam.NewFromConfig(config), _s3: s3.NewFromConfig(config), } clients._s3Presign = s3.NewPresignClient(clients._s3) if eksEndpointURL != "" { clients._eks = eks.NewFromConfig(config, func(o *eks.Options) { o.BaseEndpoint = aws.String(eksEndpointURL) }) } else { clients._eks = eks.NewFromConfig(config) } return &clients } func (c *awsClients) EKS() *eks.Client { return c._eks } func (c *awsClients) CFN() *cloudformation.Client { return c._cfn } func (c *awsClients) EC2() *ec2.Client { return c._ec2 } func (c *awsClients) ASG() *autoscaling.Client { return c._asg } func (c *awsClients) SSM() *ssm.Client { return c._ssm } func (c *awsClients) IAM() *iam.Client { return c._iam } func (c *awsClients) S3() *s3.Client { return c._s3 } func (c *awsClients) S3Presign() *s3.PresignClient { return c._s3Presign } ================================================ FILE: internal/deployers/eksapi/cluster.go ================================================ package eksapi import ( "context" "errors" "fmt" "log/slog" "time" "github.com/aws/aws-k8s-tester/internal/util" "github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/service/eks" ekstypes "github.com/aws/aws-sdk-go-v2/service/eks/types" "github.com/aws/smithy-go/ptr" ) type ClusterManager struct { clients *awsClients resourceID string } func NewClusterManager(clients *awsClients, resourceID string) *ClusterManager { return &ClusterManager{ clients: clients, resourceID: resourceID, } } type Cluster struct { endpoint string certificateAuthorityData string securityGroupId string arn string name string cidr string } func (m 
*ClusterManager) getOrCreateCluster(infra *Infrastructure, opts *deployerOptions) (*Cluster, error) { targetClusterName := opts.StaticClusterName if targetClusterName == "" { slog.Info("creating cluster...") input := eks.CreateClusterInput{ Name: aws.String(m.resourceID), ResourcesVpcConfig: &ekstypes.VpcConfigRequest{ EndpointPrivateAccess: aws.Bool(true), EndpointPublicAccess: aws.Bool(true), SubnetIds: infra.subnets(), }, RoleArn: aws.String(infra.clusterRoleARN), KubernetesNetworkConfig: &ekstypes.KubernetesNetworkConfigRequest{ IpFamily: ekstypes.IpFamily(opts.IPFamily), }, Version: aws.String(opts.KubernetesVersion), } if opts.AutoMode { input.ComputeConfig = &ekstypes.ComputeConfigRequest{ // we don't enable any of the default node pools, we'll create our own Enabled: aws.Bool(true), NodeRoleArn: aws.String(infra.nodeRoleARN), // TODO: we can't currently enable managed compute without a default NodePool // the system NodePool is tainted for critical addons only, so will be ignored for our test workloads NodePools: []string{"system"}, } input.StorageConfig = &ekstypes.StorageConfigRequest{ BlockStorage: &ekstypes.BlockStorage{ Enabled: aws.Bool(true), }, } input.KubernetesNetworkConfig.ElasticLoadBalancing = &ekstypes.ElasticLoadBalancing{ Enabled: aws.Bool(true), } input.AccessConfig = &ekstypes.CreateAccessConfigRequest{ AuthenticationMode: ekstypes.AuthenticationModeApi, } input.BootstrapSelfManagedAddons = aws.Bool(false) } if opts.EnableClusterLogging { input.Logging = &ekstypes.Logging{ ClusterLogging: []ekstypes.LogSetup{ { Enabled: ptr.Bool(true), Types: ekstypes.LogTypeApi.Values(), }, }, } } apiOpts, err := util.NewHTTPHeaderAPIOptions(opts.UpClusterHeaders) if err != nil { return nil, fmt.Errorf("failed to create API options: %v", err) } createOutput, err := m.clients.EKS().CreateCluster(context.TODO(), &input, func(o *eks.Options) { o.APIOptions = apiOpts }) if err != nil { return nil, fmt.Errorf("failed to create cluster: %v", err) } 
targetClusterName = aws.ToString(createOutput.Cluster.Name) } else { slog.Info("reusing existing static cluster", "clusterName", opts.StaticClusterName) } cluster, waitErr := m.waitForClusterActive(targetClusterName, opts.ClusterCreationTimeout) if waitErr != nil { return nil, fmt.Errorf("failed to wait for cluster to become active: %v", waitErr) } return cluster, nil } func (m *ClusterManager) waitForClusterActive(clusterName string, timeout time.Duration) (*Cluster, error) { slog.Info("waiting for cluster to be active", "clusterName", clusterName) out, err := eks.NewClusterActiveWaiter(m.clients.EKS()).WaitForOutput(context.TODO(), &eks.DescribeClusterInput{ Name: aws.String(clusterName), }, timeout) // log when possible, whether there was an error or not if out != nil { slog.Info("cluster details", "cluster", out.Cluster) } if err != nil { return nil, fmt.Errorf("failed waiting for cluster be active: %v", err) } slog.Info("cluster is active", "arn", *out.Cluster.Arn) var cidr string switch out.Cluster.KubernetesNetworkConfig.IpFamily { case ekstypes.IpFamilyIpv4: cidr = *out.Cluster.KubernetesNetworkConfig.ServiceIpv4Cidr case ekstypes.IpFamilyIpv6: cidr = *out.Cluster.KubernetesNetworkConfig.ServiceIpv6Cidr default: return nil, fmt.Errorf("unknown cluster IP family: '%v'", out.Cluster.KubernetesNetworkConfig.IpFamily) } return &Cluster{ arn: *out.Cluster.Arn, certificateAuthorityData: *out.Cluster.CertificateAuthority.Data, cidr: cidr, endpoint: *out.Cluster.Endpoint, name: *out.Cluster.Name, securityGroupId: *out.Cluster.ResourcesVpcConfig.ClusterSecurityGroupId, }, nil } func (m *ClusterManager) isClusterActive() (bool, error) { result, err := m.clients.EKS().DescribeCluster(context.TODO(), &eks.DescribeClusterInput{ Name: aws.String(m.resourceID), }) if err != nil { return false, err } switch result.Cluster.Status { case ekstypes.ClusterStatusActive: return true, nil case ekstypes.ClusterStatusCreating: return false, nil default: return false, 
// AvailabilityZonePriorityEnv is the name of the environment variable that
// holds a comma-separated list of availability zones to prefer.
const AvailabilityZonePriorityEnv = "EKSAPI_AZ_PRIORITY"

// availabilityZoneHintedOrder returns availabilityZones reordered so that every
// zone listed in the AvailabilityZonePriorityEnv environment variable comes
// before every zone that is not listed. The sort is stable: zones keep their
// input order within the prioritized group and within the remaining group.
//
// Whitespace around entries in the environment variable is ignored and empty
// entries are dropped. When the variable is unset or names no zones, the input
// slice is returned unchanged. The input slice is never modified.
func availabilityZoneHintedOrder(availabilityZones []string) []string {
	var priorityAZs []string
	if value, ok := os.LookupEnv(AvailabilityZonePriorityEnv); ok {
		for _, az := range strings.Split(value, ",") {
			if az = strings.TrimSpace(az); az != "" {
				priorityAZs = append(priorityAZs, az)
			}
		}
	}
	if len(priorityAZs) == 0 {
		return availabilityZones
	}

	// sort a copy so the caller's slice keeps its order
	ordered := slices.Clone(availabilityZones)
	slices.SortStableFunc(ordered, func(az1, az2 string) int {
		prioritized1 := slices.Contains(priorityAZs, az1)
		prioritized2 := slices.Contains(priorityAZs, az2)
		// the comparator must be antisymmetric: if az1 sorts before az2 then
		// az2 must sort after az1, not compare as equal
		switch {
		case prioritized1 && !prioritized2:
			return -1
		case !prioritized1 && prioritized2:
			return 1
		default:
			return 0
		}
	})
	return ordered
}
================================================ FILE: internal/deployers/eksapi/common_test.go ================================================ package eksapi import ( "testing" "github.com/stretchr/testify/assert" ) func Test_AZ_PRIORITY(t *testing.T) { t.Setenv(AvailabilityZonePriorityEnv, "us-west-2d") assert.Equal(t, []string{"us-west-2d", "us-west-2b", "us-west-2c"}, availabilityZoneHintedOrder([]string{"us-west-2b", "us-west-2c", "us-west-2d"}), ) } ================================================ FILE: internal/deployers/eksapi/deployer.go ================================================ package eksapi import ( "context" "flag" "fmt" "log/slog" "os" "path/filepath" "strings" "time" "github.com/aws/aws-k8s-tester/internal" "github.com/aws/aws-k8s-tester/internal/awssdk" "github.com/aws/aws-k8s-tester/internal/deployers/eksapi/templates" fwext "github.com/aws/aws-k8s-tester/internal/e2e" "github.com/aws/aws-k8s-tester/internal/metrics" "github.com/aws/aws-k8s-tester/internal/util" "github.com/aws/aws-sdk-go-v2/service/cloudwatch" ekstypes "github.com/aws/aws-sdk-go-v2/service/eks/types" "github.com/spf13/pflag" "github.com/urfave/sflags/gen/gpflag" "golang.org/x/exp/slices" "sigs.k8s.io/kubetest2/pkg/types" ) // DeployerName is the name of the deployer const DeployerName = "eksapi" const ResourcePrefix = "kubetest2-" + DeployerName var SupportedNodeNameStrategy = []string{"SessionName", "EC2PrivateDNSName"} // assert that deployer implements optional interfaces var _ types.DeployerWithKubeconfig = &deployer{} var _ types.DeployerWithInit = &deployer{} var _ types.DeployerWithFinish = &deployer{} type deployer struct { commonOptions types.Options deployerOptions metrics metrics.MetricRegistry infraManager *InfrastructureManager clusterManager *ClusterManager addonManager *AddonManager nodeManager *nodeManager logManager *logManager staticClusterManager *StaticClusterManager awsClients *awsClients infra *Infrastructure cluster *Cluster k8sClient *k8sClient 
initTime time.Time } type deployerOptions struct { Addons []string `flag:"addons" desc:"Managed addons (name:version pairs) to create in the cluster. Use 'latest' for the most recent version, or 'default' for the default version."` AMI string `flag:"ami" desc:"AMI for unmanaged nodes"` AMIType string `flag:"ami-type" desc:"AMI type for managed nodes"` AutoMode bool `flag:"auto-mode" desc:"Enable EKS Auto Mode"` CapacityReservation bool `flag:"capacity-reservation" desc:"Use capacity reservation for the unmanaged nodegroup"` TargetCapacityReservationId string `flag:"target-capacity-reservation-id" desc:"CapacityReservation ID to use for targeted launches. Implies --capacity-reservation."` ClusterCreationTimeout time.Duration `flag:"cluster-creation-timeout" desc:"Time to wait for cluster to be created and become active."` ClusterRoleServicePrincipal string `flag:"cluster-role-service-principal" desc:"Additional service principal that can assume the cluster role"` DeployCloudwatchInfra bool `flag:"deploy-cloudwatch-infra" desc:"Deploy required infrastructure for emitting metrics to CloudWatch"` EFA bool `flag:"efa" desc:"Create EFA interfaces on the node of an unmanaged nodegroup. One instance type must be passed if set. 
Requires --unmanaged-nodes and --instance-types."` EKSEndpointURL string `flag:"endpoint-url" desc:"Endpoint URL for the EKS API"` EmitMetrics bool `flag:"emit-metrics" desc:"Record and emit metrics to CloudWatch"` EnableClusterLogging bool `flag:"enable-cluster-logging" desc:"Enable sending EKS control plane logs to an /aws/eks/ 0 && len(d.InstanceTypeArchs) > 0 { return fmt.Errorf("--instance-types and --instance-type-archs are mutually exclusive") } if d.TargetCapacityReservationId != "" { d.CapacityReservation = true } if d.UnmanagedNodes { if d.AMIType != "" { return fmt.Errorf("--ami-type should not be provided with --unmanaged-nodes") } if d.NodeNameStrategy == "" { d.NodeNameStrategy = "EC2PrivateDNSName" slog.Info("using default node name strategy", "strategy", "EC2PrivateDNSName") } else { if !slices.Contains(SupportedNodeNameStrategy, d.NodeNameStrategy) { return fmt.Errorf("--node-name-strategy must be one of the following values: ['SessionName', 'EC2PrivateDNSName']") } } if d.UserDataFormat == "" { d.UserDataFormat = UserDataBootstrapSh slog.Info("using default user data format", "format", d.UserDataFormat) } // AMI ID check must come after user-data format resolution because we // can try to infer the AMI type for unmanaged nodes. 
if d.AMI == "" { ami, err := NewAMIResolver(d.awsClients).Resolve(context.TODO(), &d.deployerOptions) if err != nil { return fmt.Errorf("failed to automatically resolve ami for unmanaged nodegroup (provide --ami to short circuit this): %w", err) } d.AMI = ami } if d.EFA && len(d.InstanceTypes) != 1 { return fmt.Errorf("--efa requires a single instance type") } } else { if d.AMI != "" { return fmt.Errorf("--ami should not be provided without --unmanaged-nodes") } if d.AMIType == "" { d.AMIType = "AL2023_x86_64_STANDARD" slog.Info("using default AMI type", "amiType", d.AMIType) } } if d.EKSEndpointURL != "" && d.ClusterRoleServicePrincipal == "" { spType := "beta" if strings.Contains(d.EKSEndpointURL, "gamma") { spType = "gamma" } d.ClusterRoleServicePrincipal = fmt.Sprintf("eks-%s.aws.internal", spType) } if d.DeployCloudwatchInfra { slog.Info("prepending pod identity agent to addons for cloudwatch infrastructure") // this must be prepended to the list in order to respect user overrides. 
d.deployerOptions.Addons = slices.Insert(d.deployerOptions.Addons, 0, "eks-pod-identity-agent:default") } return nil } func detectKubernetesVersion() (string, error) { detectedVersion, err := util.DetectKubernetesVersion() if err != nil { return "", err } minorVersion, err := util.ParseMinorVersion(detectedVersion) if err != nil { return "", err } return minorVersion, nil } func (d *deployer) IsUp() (up bool, err error) { return d.clusterManager.isClusterActive() } func (d *deployer) Down() error { if err := d.logManager.gatherLogsFromNodes(d.k8sClient, &d.deployerOptions, deployerPhaseDown); err != nil { slog.Warn("failed to gather logs from nodes", "error", err) // don't return err, this isn't critical } if d.deployerOptions.StaticClusterName != "" { return d.staticClusterManager.TearDownNodeForStaticCluster() } return deleteResources(d.infraManager, d.clusterManager, d.nodeManager, d.k8sClient, &d.deployerOptions) } func deleteResources(im *InfrastructureManager, cm *ClusterManager, nm *nodeManager, k8sClient *k8sClient /* nillable */, opts *deployerOptions /* nillable */) error { if err := im.deleteCloudWatchInfrastructureStack(); err != nil { return err } if err := nm.deleteNodes(k8sClient, opts); err != nil { return err } // the EKS-managed cluster security group may be associated with a leaked ENI // so we need to make sure we've deleted leaked ENIs before we delete the cluster // otherwise, the cluster security group will be left behind and will block deletion of our VPC if err := im.deleteLeakedENIs(); err != nil { return err } if err := cm.deleteCluster(); err != nil { return err } return im.deleteInfrastructureStack() } ================================================ FILE: internal/deployers/eksapi/infra.go ================================================ package eksapi import ( "context" _ "embed" "errors" "fmt" "log/slog" "path" "slices" "sort" "strings" "time" "github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/service/cloudformation" 
cloudformationtypes "github.com/aws/aws-sdk-go-v2/service/cloudformation/types" cloudwatchtypes "github.com/aws/aws-sdk-go-v2/service/cloudwatch/types" "github.com/aws/aws-sdk-go-v2/service/ec2" ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types" "github.com/aws/aws-sdk-go-v2/service/eks" "github.com/aws/aws-sdk-go-v2/service/iam" iamtypes "github.com/aws/aws-sdk-go-v2/service/iam/types" "github.com/aws/aws-sdk-go/aws/arn" "github.com/aws/aws-k8s-tester/internal/deployers/eksapi/templates" "github.com/aws/aws-k8s-tester/internal/metrics" "github.com/aws/aws-k8s-tester/internal/util" ) const ( infraStackCreationTimeout = time.Minute * 15 infraStackDeletionTimeout = time.Minute * 30 networkInterfaceDetachmentTimeout = time.Minute * 10 ) const ( // the VPC CNI will always add this tag to ENI's that it creates vpcCNIENITagKey = "node.k8s.amazonaws.com/createdAt" // the IPAM controller will add this tag to the ENI's that it creates ipamControllerENITagKey = "eks:kubernetes-cni-node-name" ) // eksEndpointURLTag is the key for an optional tag on the infrastructure CloudFormation stack, // which indicates which EKS environment is associated with the stack's resources. // The tag is only added when --endpoint-url is passed to the deployer. 
const eksEndpointURLTag = "eks-endpoint-url" var ( infraMetricNamespace = path.Join(DeployerMetricNamespace, "infrastructure") infraStackDeletionFailed = &metrics.MetricSpec{ Namespace: infraMetricNamespace, Metric: "StackDeletionFailed", Unit: cloudwatchtypes.StandardUnitCount, } infraLeakedENIs = &metrics.MetricSpec{ Namespace: infraMetricNamespace, Metric: "LeakedENIs", Unit: cloudwatchtypes.StandardUnitCount, } ) type InfrastructureManager struct { clients *awsClients resourceID string metrics metrics.MetricRegistry } func NewInfrastructureManager(clients *awsClients, resourceID string, metrics metrics.MetricRegistry) *InfrastructureManager { return &InfrastructureManager{ clients: clients, resourceID: resourceID, metrics: metrics, } } type Infrastructure struct { availabilityZones []string vpc string subnetsPublic []string subnetsPrivate []string clusterRoleARN string nodeRoleARN string nodeRoleName string cloudwatchRoleArn string } func (i *Infrastructure) subnets() []string { return append(i.subnetsPublic, i.subnetsPrivate...) } func (m *InfrastructureManager) createInfrastructureStack(opts *deployerOptions) (*Infrastructure, error) { var subnetAzs []string if opts.CapacityReservation { azs, err := m.getAZsWithCapacity(opts) if err != nil { return nil, err } subnetAzs = azs } else if len(opts.InstanceTypes) > 0 { azs, err := m.getRankedAZsForInstanceTypes(opts) if err != nil { return nil, err } if len(azs) == 0 { return nil, fmt.Errorf("no AZs support any of the provided instance types (%v)", opts.InstanceTypes) } subnetAzs = azs } // this value is not currently configurable, the infra stack is hardcoded to create 2 // TODO: create a subnet in every AZ. today we need exactly 2 AZs for the subnets. 
const numInfraAZs = 2 subnetAzs, err := m.normalizeAZs(opts, subnetAzs, numInfraAZs) if err != nil { return nil, err } slog.Info("creating infrastructure stack", "availabilityZones", subnetAzs) input := cloudformation.CreateStackInput{ StackName: aws.String(m.resourceID), TemplateBody: aws.String(templates.Infrastructure), Capabilities: []cloudformationtypes.Capability{cloudformationtypes.CapabilityCapabilityIam}, Parameters: []cloudformationtypes.Parameter{ { ParameterKey: aws.String("ResourceId"), ParameterValue: aws.String(m.resourceID), }, { ParameterKey: aws.String("Subnet01AZ"), ParameterValue: aws.String(subnetAzs[0]), }, { ParameterKey: aws.String("Subnet02AZ"), ParameterValue: aws.String(subnetAzs[1]), }, { ParameterKey: aws.String("AutoMode"), ParameterValue: aws.String(fmt.Sprintf("%t", opts.AutoMode)), }, }, } if opts.ClusterRoleServicePrincipal != "" { input.Parameters = append(input.Parameters, cloudformationtypes.Parameter{ ParameterKey: aws.String("AdditionalClusterRoleServicePrincipal"), ParameterValue: aws.String(opts.ClusterRoleServicePrincipal), }) } if opts.EKSEndpointURL != "" { input.Tags = []cloudformationtypes.Tag{ { Key: aws.String(eksEndpointURLTag), Value: aws.String(opts.EKSEndpointURL), }, } } slog.Info("creating infrastructure stack...") out, err := m.clients.CFN().CreateStack(context.TODO(), &input) if err != nil { return nil, err } slog.Info("waiting for infrastructure stack to be created", "stackId", *out.StackId) err = cloudformation.NewStackCreateCompleteWaiter(m.clients.CFN()). 
Wait(context.TODO(), &cloudformation.DescribeStacksInput{ StackName: out.StackId, }, infraStackCreationTimeout) if err != nil { return nil, fmt.Errorf("failed to wait for infrastructure stack creation: %w", err) } slog.Info("getting infrastructure stack resources", "stackId", *out.StackId) infra, err := m.getInfrastructureStackResources() infra.availabilityZones = subnetAzs if err != nil { return nil, fmt.Errorf("failed to get infrastructure stack resources: %w", err) } slog.Info("created infrastructure", "infra", infra) return infra, nil } func (m *InfrastructureManager) getInfrastructureStackResources() (*Infrastructure, error) { stack, err := m.clients.CFN().DescribeStacks(context.TODO(), &cloudformation.DescribeStacksInput{ StackName: aws.String(m.resourceID), }) if err != nil { return nil, err } infra := Infrastructure{} for _, output := range stack.Stacks[0].Outputs { value := *output.OutputValue switch *output.OutputKey { case "VPC": infra.vpc = value case "SubnetsPublic": infra.subnetsPublic = strings.Split(value, ",") case "SubnetsPrivate": infra.subnetsPrivate = strings.Split(value, ",") case "ClusterRole": arn, err := arn.Parse(value) if err != nil { return nil, fmt.Errorf("infrastructure stack ClusterRole output is not a valid ARN: '%s': %v", value, err) } infra.clusterRoleARN = arn.String() case "NodeRole": arn, err := arn.Parse(value) if err != nil { return nil, fmt.Errorf("infrastructure stack NodeRole output is not a valid ARN: '%s': %v", value, err) } infra.nodeRoleARN = arn.String() // Resource looks like 'role:/MyRole' resourceParts := strings.Split(arn.Resource, "/") infra.nodeRoleName = resourceParts[len(resourceParts)-1] } } return &infra, nil } func (m *InfrastructureManager) deleteInfrastructureStack() error { infra, err := m.getInfrastructureStackResources() if err != nil { var notFound *cloudformationtypes.StackNotFoundException if errors.As(err, ¬Found) { slog.Info("infrastructure stack does not exist", "resourceID", m.resourceID) return 
nil } return err } if err := m.deleteLeakedInstanceProfiles(infra); err != nil { return err } input := cloudformation.DeleteStackInput{ StackName: aws.String(m.resourceID), } slog.Info("deleting infrastructure stack", "resourceID", m.resourceID) _, err = m.clients.CFN().DeleteStack(context.TODO(), &input) if err != nil { var notFound *cloudformationtypes.StackNotFoundException if errors.As(err, ¬Found) { slog.Info("infrastructure stack does not exist", "resourceID", m.resourceID) return nil } return fmt.Errorf("failed to delete infrastructure stack: %w", err) } slog.Info("waiting for infrastructure stack to be deleted", "resourceID", m.resourceID) err = cloudformation.NewStackDeleteCompleteWaiter(m.clients.CFN()). Wait(context.TODO(), &cloudformation.DescribeStacksInput{ StackName: aws.String(m.resourceID), }, infraStackDeletionTimeout) if err != nil { // don't fail the overall test, the janitor can clean this up slog.Warn("failed to wait for infrastructure stack deletion", "error", err) m.metrics.Record(infraStackDeletionFailed, 1, nil) return nil } slog.Info("deleted infrastructure stack", "resourceID", m.resourceID) return nil } // deleteLeakedInstanceProfiles deletes any instance profiles to which the node role is attached, // because this will block node role deletion (and deletion of the infrastructure stack). // For example, when --auto-mode is used, an instance profile will be created for us and won't be deleted automatically with the cluster. 
func (m *InfrastructureManager) deleteLeakedInstanceProfiles(infra *Infrastructure) error { if infra.nodeRoleName == "" { // if the infra stack failed to create, it could end up in a weird state with no node role // we know there aren't any instance profiles in that case, so all good return nil } out, err := m.clients.IAM().ListInstanceProfilesForRole(context.TODO(), &iam.ListInstanceProfilesForRoleInput{ RoleName: aws.String(infra.nodeRoleName), }) if err != nil { var notFound *iamtypes.NoSuchEntityException if errors.As(err, ¬Found) { return nil } return fmt.Errorf("failed to list instance profiles for role name: '%s': %v", infra.nodeRoleName, err) } else if len(out.InstanceProfiles) > 0 { var deletedInstanceProfiles []string for _, instanceProfile := range out.InstanceProfiles { _, err := m.clients.IAM().RemoveRoleFromInstanceProfile(context.TODO(), &iam.RemoveRoleFromInstanceProfileInput{ RoleName: aws.String(infra.nodeRoleName), InstanceProfileName: instanceProfile.InstanceProfileName, }) if err != nil { var notFound *iamtypes.NoSuchEntityException if errors.As(err, ¬Found) { slog.Info("instance profile does not exist", "name", aws.ToString(instanceProfile.InstanceProfileName)) continue } return fmt.Errorf("failed to remove node role %s from instance profile: %s: %v", infra.nodeRoleName, aws.ToString(instanceProfile.InstanceProfileName), err) } _, err = m.clients.IAM().DeleteInstanceProfile(context.TODO(), &iam.DeleteInstanceProfileInput{ InstanceProfileName: instanceProfile.InstanceProfileName, }) if err != nil { var notFound *iamtypes.NoSuchEntityException if errors.As(err, ¬Found) { slog.Info("instance profile does not exist", "name", aws.ToString(instanceProfile.InstanceProfileName)) continue } return fmt.Errorf("failed to delete instance profile: %s: %v", aws.ToString(instanceProfile.InstanceProfileName), err) } deletedInstanceProfiles = append(deletedInstanceProfiles, aws.ToString(instanceProfile.InstanceProfileName)) } slog.Info("deleted leaked instance 
profiles", "count", len(deletedInstanceProfiles), "profiles", deletedInstanceProfiles) } return nil } // deleteLeakedENIs deletes Elastic Network Interfaces that may have been allocated (and left behind) by the VPC CNI. // These leaked ENIs will prevent deletion of their associated subnets and security groups. func (m *InfrastructureManager) deleteLeakedENIs() error { infra, err := m.getInfrastructureStackResources() if err != nil { var notFound *cloudformationtypes.StackNotFoundException if errors.As(err, ¬Found) { return nil } return fmt.Errorf("failed to get infrastructure stack resources: %w", err) } enis, err := m.getVPCCNINetworkInterfaceIds(infra.vpc) if err != nil { return err } if len(enis) == 0 { return nil } slog.Info("waiting for leaked ENIs to become available", "count", len(enis), "enis", enis) if err := ec2.NewNetworkInterfaceAvailableWaiter(m.clients.EC2()).Wait(context.TODO(), &ec2.DescribeNetworkInterfacesInput{ NetworkInterfaceIds: enis, }, networkInterfaceDetachmentTimeout); err != nil { refreshedENIs, err2 := m.getVPCCNINetworkInterfaceIds(infra.vpc) if err2 != nil { return fmt.Errorf("waiter failed, and re-checking ENIs also failed: %w", err2) } if len(refreshedENIs) == 0 { slog.Info("ENIs were deleted during waiter timeout, skipping delete") return nil } return fmt.Errorf("failed to wait for ENI(s) to become available: %v", err) } for _, eni := range enis { slog.Info("deleting leaked ENI", "eni", eni) _, err := m.clients.EC2().DeleteNetworkInterface(context.TODO(), &ec2.DeleteNetworkInterfaceInput{ NetworkInterfaceId: aws.String(eni), }) if err != nil { return fmt.Errorf("failed to delete leaked ENI: %w", err) } } slog.Info("deleted leaked ENIs", "count", len(enis)) m.metrics.Record(infraLeakedENIs, float64(len(enis)), nil) return nil } // getVPCCNINetworkInterfaceIds returns the IDs of ENIs in the specified VPC that were created by the VPC CNI func (m *InfrastructureManager) getVPCCNINetworkInterfaceIds(vpcId string) ([]string, error) { 
paginator := ec2.NewDescribeNetworkInterfacesPaginator(m.clients.EC2(), &ec2.DescribeNetworkInterfacesInput{ Filters: []ec2types.Filter{ { Name: aws.String("vpc-id"), Values: []string{vpcId}, }, { Name: aws.String("interface-type"), Values: []string{"interface"}, }, { Name: aws.String("tag-key"), Values: []string{vpcCNIENITagKey, ipamControllerENITagKey}, }, }, }) var enis []string for paginator.HasMorePages() { page, err := paginator.NextPage(context.TODO()) if err != nil { return nil, fmt.Errorf("failed to describe ENIs: %w", err) } for _, eni := range page.NetworkInterfaces { enis = append(enis, *eni.NetworkInterfaceId) } } return enis, nil } // normalizeAZs removes availability zones that don't meet launch requirements // for instances and ensures that the resulting list containers enough AZs to // satisfy the deployment. func (m *InfrastructureManager) normalizeAZs(opts *deployerOptions, subnetAZs []string, expectedCount int) ([]string, error) { azs, err := m.clients.EC2().DescribeAvailabilityZones(context.TODO(), &ec2.DescribeAvailabilityZonesInput{ Filters: []ec2types.Filter{ { Name: aws.String("zone-type"), Values: []string{opts.ZoneType}, }, }, }) if err != nil { return nil, err } var supporttedAZs []string for _, az := range azs.AvailabilityZones { supporttedAZs = append(supporttedAZs, aws.ToString(az.ZoneName)) } var filteredAZs []string for _, az := range subnetAZs { if slices.Contains(supporttedAZs, az) { filteredAZs = append(filteredAZs, az) } } // enforce users' preferred ordering over AZs filteredAZs = availabilityZoneHintedOrder(filteredAZs) // truncate the list if we went over the max filteredAZs = filteredAZs[:min(len(filteredAZs), expectedCount)] // pad the availability zones with supported entries if we end up not having // enough after filtering. 
if len(filteredAZs) < expectedCount { for _, az := range supporttedAZs { if len(filteredAZs) == expectedCount { break } if !slices.Contains(filteredAZs, az) { slog.Info("padding infra stack with AZ", "az", az) filteredAZs = append(filteredAZs, az) } } } if len(filteredAZs) != expectedCount { return nil, fmt.Errorf("failed to provide AZs with expected count %d: %v", expectedCount, filteredAZs) } return filteredAZs, nil } // getAZsWithInstanceTypes returns the availability zones ordered decreasingly by the number of // requested instance types they support func (m *InfrastructureManager) getRankedAZsForInstanceTypes(opts *deployerOptions) ([]string, error) { offerings, err := m.clients.EC2().DescribeInstanceTypeOfferings(context.TODO(), &ec2.DescribeInstanceTypeOfferingsInput{ LocationType: ec2types.LocationTypeAvailabilityZone, Filters: []ec2types.Filter{ { Name: aws.String("instance-type"), Values: opts.InstanceTypes, }, }, }) if err != nil { return nil, fmt.Errorf("failed to describe instance type offerings: %v", err) } counts := make(map[string]int) for _, offering := range offerings.InstanceTypeOfferings { counts[aws.ToString(offering.Location)]++ } var azs []string for az := range counts { azs = append(azs, az) } sort.Slice(azs, func(i, j int) bool { return counts[azs[i]] > counts[azs[j]] }) return azs, nil } func (m *InfrastructureManager) getAZsWithCapacity(opts *deployerOptions) ([]string, error) { // TODO: consolidate this with the CapacityReservation logic in node.go var subnetAzs []string describeReservationsInput := ec2.DescribeCapacityReservationsInput{ Filters: []ec2types.Filter{ { Name: aws.String("instance-type"), Values: opts.InstanceTypes, }, { Name: aws.String("state"), Values: []string{"active"}, }, }, } if opts.TargetCapacityReservationId != "" { describeReservationsInput.CapacityReservationIds = []string{opts.TargetCapacityReservationId} } capacityReservations, err := m.clients.EC2().DescribeCapacityReservations(context.TODO(), 
&describeReservationsInput) if err != nil { return nil, err } for _, cr := range capacityReservations.CapacityReservations { if *cr.AvailableInstanceCount >= int32(opts.Nodes) { subnetAzs = append(subnetAzs, *cr.AvailabilityZone) break } } return subnetAzs, nil } func getCloudWatchStackName(resourceID string) (string, string) { clusterUUID := strings.TrimPrefix(resourceID, ResourcePrefix+"-") return fmt.Sprintf("%s-cw", resourceID), clusterUUID } func (m *InfrastructureManager) createCloudWatchInfrastructureStack(clusterName string) (string, error) { stackName, clusterUUID := getCloudWatchStackName(clusterName) slog.Info("creating CloudWatch infrastructure stack", "stackName", stackName) out, err := m.clients.CFN().CreateStack(context.TODO(), &cloudformation.CreateStackInput{ StackName: aws.String(stackName), TemplateBody: aws.String(templates.CloudWatchInfra), Capabilities: []cloudformationtypes.Capability{cloudformationtypes.CapabilityCapabilityNamedIam}, Parameters: []cloudformationtypes.Parameter{ { ParameterKey: aws.String("ClusterUUID"), ParameterValue: aws.String(clusterUUID), }, }, }) if err != nil { return "", fmt.Errorf("failed to create CloudWatch infrastructure stack: %w", err) } slog.Info("waiting for CloudWatch infrastructure stack to be created", "stackId", *out.StackId) if err := cloudformation.NewStackCreateCompleteWaiter(m.clients.CFN()). 
Wait(context.TODO(), &cloudformation.DescribeStacksInput{ StackName: out.StackId, }, infraStackCreationTimeout); err != nil { return "", util.WrapCFNStackFailure(context.TODO(), m.clients.CFN(), fmt.Errorf("failed to wait for CloudWatch infrastructure stack creation: %w", err), stackName) } stack, err := m.clients.CFN().DescribeStacks(context.TODO(), &cloudformation.DescribeStacksInput{ StackName: out.StackId, }) if err != nil { return "", fmt.Errorf("failed to describe CloudWatch infrastructure stack: %w", err) } // Get the CloudWatch role ARN from stack outputs var roleArn string for _, output := range stack.Stacks[0].Outputs { if aws.ToString(output.OutputKey) == "CloudWatchRoleArn" { roleArn = aws.ToString(output.OutputValue) break } } if roleArn == "" { return "", fmt.Errorf("CloudWatch role ARN not found in stack outputs") } slog.Info("CloudWatch infrastructure stack created successfully", "roleArn", roleArn) return roleArn, nil } // createCloudWatchPodIdentityAssociation creates a PodIdentityAssociation // via the EKS API directly, rather than through CloudFormation, to ensure // the correct EKS endpoint is used when a custom endpoint URL is configured. // The association is automatically cleaned up when the cluster is deleted. 
func (m *InfrastructureManager) createCloudWatchPodIdentityAssociation(clusterName string, roleArn string) error { slog.Info("creating PodIdentityAssociation", "clusterName", clusterName) _, err := m.clients.EKS().CreatePodIdentityAssociation(context.TODO(), &eks.CreatePodIdentityAssociationInput{ ClusterName: aws.String(clusterName), Namespace: aws.String("amazon-cloudwatch"), ServiceAccount: aws.String("cwagent"), RoleArn: aws.String(roleArn), }) if err != nil { return fmt.Errorf("failed to create PodIdentityAssociation: %w", err) } slog.Info("PodIdentityAssociation created successfully", "clusterName", clusterName) return nil } func (m *InfrastructureManager) deleteCloudWatchInfrastructureStack() error { stackName, _ := getCloudWatchStackName(m.resourceID) // The PodIdentityAssociation created via the EKS API is automatically // cleaned up when the cluster is deleted, so no explicit deletion is needed. slog.Info("deleting CloudWatch infrastructure stack", "stackName", stackName) if _, err := m.clients.CFN().DeleteStack(context.TODO(), &cloudformation.DeleteStackInput{ StackName: aws.String(stackName), }); err != nil { var notFound *cloudformationtypes.StackNotFoundException if errors.As(err, ¬Found) { slog.Info("CloudWatch infrastructure stack does not exist", "stackName", stackName) return nil } return fmt.Errorf("failed to delete CloudWatch infrastructure stack: %w", err) } slog.Info("initiated deletion of CloudWatch infrastructure stack", "stackName", stackName) return nil } ================================================ FILE: internal/deployers/eksapi/janitor.go ================================================ package eksapi import ( "context" "errors" "fmt" "log/slog" "strings" "sync" "time" "github.com/aws/aws-k8s-tester/internal/awssdk" "github.com/aws/aws-k8s-tester/internal/metrics" "github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/service/cloudformation" cloudformationtypes "github.com/aws/aws-sdk-go-v2/service/cloudformation/types" 
"github.com/aws/aws-sdk-go-v2/service/cloudwatch" ) func NewJanitor(maxResourceAge time.Duration, emitMetrics bool, workers int, stackStatus string) *janitor { awsConfig := awssdk.NewConfig() var metricRegistry metrics.MetricRegistry if emitMetrics { metricRegistry = metrics.NewCloudWatchRegistry(cloudwatch.NewFromConfig(awsConfig)) } else { metricRegistry = metrics.NewNoopMetricRegistry() } if workers <= 0 { workers = 1 } return &janitor{ maxResourceAge: maxResourceAge, workers: workers, stackStatus: stackStatus, awsConfig: awsConfig, cfnClient: cloudformation.NewFromConfig(awsConfig), metrics: metricRegistry, } } type janitor struct { maxResourceAge time.Duration workers int stackStatus string awsConfig aws.Config cfnClient *cloudformation.Client metrics metrics.MetricRegistry } func (j *janitor) Sweep(ctx context.Context) error { awsConfig := awssdk.NewConfig() cfnClient := cloudformation.NewFromConfig(awsConfig) stacks, err := j.getStacks(ctx, cfnClient) if err != nil { return fmt.Errorf("failed to get stacks: %v", err) } var wg sync.WaitGroup stackQueue := make(chan cloudformationtypes.Stack, len(stacks)) errChan := make(chan error, len(stacks)) for i := 1; i <= j.workers; i++ { wg.Add(1) go j.sweepWorker(&wg, stackQueue, errChan) } for _, stack := range stacks { stackQueue <- stack } close(stackQueue) wg.Wait() close(errChan) var errs []error for err := range errChan { errs = append(errs, err) } return errors.Join(errs...) } func (j *janitor) getStacks(ctx context.Context, cfnClient *cloudformation.Client) ([]cloudformationtypes.Stack, error) { var stacks []cloudformationtypes.Stack stackPaginator := cloudformation.NewDescribeStacksPaginator(cfnClient, &cloudformation.DescribeStacksInput{}) for stackPaginator.HasMorePages() { page, err := stackPaginator.NextPage(ctx) if err != nil { return nil, err } stacks = append(stacks, page.Stacks...) 
} return stacks, nil } func (j *janitor) sweepWorker(wg *sync.WaitGroup, stackQueue <-chan cloudformationtypes.Stack, errChan chan<- error) { defer wg.Done() for stack := range stackQueue { resourceID := *stack.StackName if !strings.HasPrefix(resourceID, ResourcePrefix) { continue } if stack.StackStatus == "DELETE_COMPLETE" { continue } if j.stackStatus != "" && j.stackStatus != string(stack.StackStatus) { slog.Info("skipping resources", "status", stack.StackStatus, "resourceID", resourceID) continue } resourceAge := time.Since(*stack.CreationTime) if resourceAge < j.maxResourceAge { slog.Info("skipping resources", "age", resourceAge, "resourceID", resourceID) continue } clients := j.awsClientsForStack(stack) infraManager := NewInfrastructureManager(clients, resourceID, j.metrics) clusterManager := NewClusterManager(clients, resourceID) nodeManager := NewNodeManager(clients, resourceID) slog.Info("deleting resources", "age", resourceAge, "resourceID", resourceID) if err := deleteResources(infraManager, clusterManager, nodeManager, nil /* k8sClient */, nil /* deployerOptions */); err != nil { errChan <- fmt.Errorf("failed to delete resources: %s: %v", resourceID, err) } } } func (j *janitor) awsClientsForStack(stack cloudformationtypes.Stack) *awsClients { var eksEndpointURL string for _, tag := range stack.Tags { if *tag.Key == eksEndpointURLTag { eksEndpointURL = *tag.Value } } return newAWSClients(j.awsConfig, eksEndpointURL) } ================================================ FILE: internal/deployers/eksapi/k8s.go ================================================ package eksapi import ( "context" "errors" "fmt" "log/slog" "net" "net/url" "strings" "time" "github.com/aws/aws-k8s-tester/internal/metrics" "github.com/aws/aws-k8s-tester/internal/util" "github.com/aws/aws-sdk-go-v2/service/ec2" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/wait" 
"k8s.io/apimachinery/pkg/watch" "k8s.io/client-go/dynamic" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" "k8s.io/client-go/tools/clientcmd" "sigs.k8s.io/controller-runtime/pkg/client" crlog "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/log/zap" corev1 "k8s.io/api/core/v1" ) func init() { // controller-runtime will complain loudly if this isn't set, even though we don't use this logger crlog.SetLogger(zap.New()) } const ( requestRetryInterval = 10 * time.Second requestRetryTimeout = 10 * time.Minute ) type k8sClient struct { config *rest.Config clientset kubernetes.Interface client client.Client dclient *dynamic.DynamicClient } func newK8sClient(kubeconfigPath string) (*k8sClient, error) { config, err := clientcmd.BuildConfigFromFlags("", kubeconfigPath) if err != nil { return nil, err } return &k8sClient{ config: config, clientset: kubernetes.NewForConfigOrDie(config), client: util.Must(client.New(config, client.Options{})), dclient: util.Must(dynamic.NewForConfig(config)), }, nil } func (k *k8sClient) waitForReadyNodes(nodeCount int, timeout time.Duration) error { slog.Info("waiting for nodes to be ready", "nodeCount", nodeCount, "timeout", timeout) readyNodes := sets.NewString() watcher, err := k.clientset.CoreV1().Nodes().Watch(context.TODO(), metav1.ListOptions{}) if err != nil { return fmt.Errorf("failed to create node watcher: %v", err) } defer watcher.Stop() initialReadyNodes, err := k.getReadyNodes() if err != nil { return fmt.Errorf("failed to get ready nodes: %v", err) } counter := len(initialReadyNodes) ctx, cancel := context.WithTimeout(context.Background(), timeout) defer cancel() for { select { case event, ok := <-watcher.ResultChan(): if !ok { return fmt.Errorf("the watcher channel for the nodes was closed by Kubernetes due to an unknown error") } if event.Type == watch.Error { msg := "unexpected error event type from node watcher" if statusErr, ok := event.Object.(*metav1.Status); ok { return fmt.Errorf("%s: 
%s", msg, statusErr.String()) } return fmt.Errorf("%s: %+v", msg, event.Object) } if event.Object != nil && event.Type != watch.Deleted { if node, ok := event.Object.(*corev1.Node); ok { if isNodeReady(node) { readyNodes.Insert(node.Name) counter = readyNodes.Len() } } } case <-ctx.Done(): return fmt.Errorf("timed out waiting for %d nodes to be ready: %w", nodeCount, ctx.Err()) } if counter >= nodeCount { break } } slog.Info("nodes are ready", "count", readyNodes.Len(), "nodes", readyNodes) return nil } func (k *k8sClient) waitForNodeDeletion(timeout time.Duration) error { slog.Info("waiting for nodes to be deleted", "timeout", timeout) nodes := sets.NewString() watcher, err := k.clientset.CoreV1().Nodes().Watch(context.TODO(), metav1.ListOptions{}) if err != nil { return fmt.Errorf("failed to create node watcher: %v", err) } defer watcher.Stop() initialNodes, err := k.clientset.CoreV1().Nodes().List(context.TODO(), v1.ListOptions{}) if err != nil { return fmt.Errorf("failed to list nodes: %v", err) } for _, node := range initialNodes.Items { nodes.Insert(node.Name) } ctx, cancelFunc := context.WithTimeout(context.Background(), timeout) defer cancelFunc() for { select { case event, ok := <-watcher.ResultChan(): if !ok { return fmt.Errorf("the watcher channel for the nodes was closed by Kubernetes due to an unknown error") } if event.Type == watch.Error { msg := "unexpected error event type from node watcher" if statusErr, ok := event.Object.(*metav1.Status); ok { return fmt.Errorf("%s: %s", msg, statusErr.String()) } return fmt.Errorf("%s: %+v", msg, event.Object) } if event.Object != nil { if node, ok := event.Object.(*corev1.Node); !ok { return fmt.Errorf("node watcher received an object that isn't a Node: %+v", event.Object) } else { switch event.Type { case watch.Added: nodes.Insert(node.Name) case watch.Deleted: nodes.Delete(node.Name) } } } case <-ctx.Done(): return fmt.Errorf("timed out waiting for nodes to be deleted: %w", ctx.Err()) } if len(nodes) == 0 { 
break } } slog.Info("all nodes deleted!") return nil } func (k *k8sClient) getReadyNodes() ([]corev1.Node, error) { nodes, err := k.clientset.CoreV1().Nodes().List(context.TODO(), v1.ListOptions{}) if err != nil { return nil, err } var readyNodes []corev1.Node for _, node := range nodes.Items { if isNodeReady(&node) { readyNodes = append(readyNodes, node) } } return readyNodes, nil } func isNodeReady(node *corev1.Node) bool { c := getNodeReadyCondition(node) if c == nil { return false } return c.Status == corev1.ConditionTrue } func getNodeReadyCondition(node *corev1.Node) *corev1.NodeCondition { for _, c := range node.Status.Conditions { if c.Type == corev1.NodeReady { return &c } } return nil } func (k *k8sClient) createAWSAuthConfigMap(nodeNameStrategy string, nodeRoleARN string) error { mapRoles, err := generateAuthMapRole(nodeNameStrategy, nodeRoleARN) if err != nil { return err } slog.Info("generated AuthMapRole", "mapRoles", mapRoles) err = wait.PollUntilContextTimeout(context.TODO(), requestRetryInterval, requestRetryTimeout, true, func(ctx context.Context) (bool, error) { _, err := k.clientset.CoreV1().ConfigMaps("kube-system").Create(ctx, &corev1.ConfigMap{ ObjectMeta: metav1.ObjectMeta{ Name: "aws-auth", Namespace: "kube-system", }, Data: map[string]string{ "mapRoles": mapRoles, }, }, metav1.CreateOptions{}) if err != nil { var dnsErr *net.DNSError if errors.As(err, &dnsErr) { slog.Warn("failed to create aws-auth configmap due to DNS error, retrying", "error", err) return false, nil } return false, err } return true, nil }) if err != nil { return fmt.Errorf("retry loop failed: %w", err) } return nil } func getNodeInstanceIDs(nodes []corev1.Node) ([]string, error) { var instanceIds []string var errs []error for _, node := range nodes { providerId, err := parseKubernetesProviderID(node.Spec.ProviderID) if err != nil { errs = append(errs, err) continue } instanceIds = append(instanceIds, providerId.InstanceID) } if len(errs) > 0 { return nil, 
errors.Join(errs...) } return instanceIds, nil } func (k *k8sClient) emitNodeMetrics(metricRegistry metrics.MetricRegistry, ec2Client *ec2.Client) error { nodes, err := k.getReadyNodes() if err != nil { return err } var errs []error for _, node := range nodes { providerId, err := parseKubernetesProviderID(node.Spec.ProviderID) if err != nil { errs = append(errs, err) continue } instanceInfo, err := ec2Client.DescribeInstances(context.TODO(), &ec2.DescribeInstancesInput{ InstanceIds: []string{providerId.InstanceID}, }) if err != nil { errs = append(errs, err) continue } instance := instanceInfo.Reservations[0].Instances[0] launchTime := *instance.LaunchTime timeToRegistration := node.ObjectMeta.CreationTimestamp.Time.Sub(launchTime) timeToReady := getNodeReadyCondition(&node).LastTransitionTime.Time.Sub(launchTime) nodeDimensions := map[string]string{ "instanceType": string(instance.InstanceType), "os": node.Status.NodeInfo.OperatingSystem, "osImage": node.Status.NodeInfo.OSImage, "arch": node.Status.NodeInfo.Architecture, } // we'll emit the metrics with different subset(s) of dimensions, to make aggregation simpler var nodeDimensionSets []map[string]string nodeDimensionSets = append(nodeDimensionSets, nodeDimensions) var osDistro string if strings.HasPrefix(node.Status.NodeInfo.OSImage, "Amazon Linux") { // on al2: "Amazon Linux 2" // on al2023: "Amazon Linux 2023.6.20241010" parts := strings.Split(node.Status.NodeInfo.OSImage, ".") amazonLinuxMajorVersion := parts[0] osDistro = amazonLinuxMajorVersion } if osDistro != "" { nodeDimensions["osDistro"] = osDistro // if we have an osDistro, add a pared-down dimension set that includes it nodeDimensionSets = append(nodeDimensionSets, map[string]string{ "osDistro": nodeDimensions["osDistro"], "instanceType": nodeDimensions["instanceType"], "arch": nodeDimensions["arch"], }) } for _, nodeDimensionSet := range nodeDimensionSets { metricRegistry.Record(nodeTimeToRegistrationSeconds, timeToRegistration.Seconds(), 
// KubernetesProviderID holds the components of an AWS node provider ID of the
// form "aws:///<availability-zone>/<instance-id>".
type KubernetesProviderID struct {
	AvailabilityZone string
	InstanceID       string
}

// parseKubernetesProviderID splits a node's provider ID into its availability
// zone and instance ID.
//
// The ID must parse as a URL with scheme "aws" and a path of exactly two
// non-empty segments, for example "aws:///us-west-2a/i-12345abcdefg". Any other
// input yields a nil result and an error describing what is wrong.
func parseKubernetesProviderID(rawProviderId string) (*KubernetesProviderID, error) {
	// named "parsed" rather than "url" so the net/url package is not shadowed
	parsed, err := url.Parse(rawProviderId)
	if err != nil {
		return nil, fmt.Errorf("malformed provider ID: %s: %w", rawProviderId, err)
	}
	if parsed.Scheme != "aws" {
		return nil, fmt.Errorf("unsupported provider ID scheme: %s", parsed.Scheme)
	}
	if parsed.Path == "" {
		return nil, fmt.Errorf("provider ID path is empty: %s", rawProviderId)
	}
	// example: /us-west-2a/i-12345abcdefg
	// the leading slash makes the first element of the split empty
	parts := strings.Split(parsed.Path, "/")
	if len(parts) != 3 {
		return nil, fmt.Errorf("provider ID path does not have 3 parts: %s", parsed.Path)
	}
	availabilityZone, instanceID := parts[1], parts[2]
	// "aws:///us-west-2a/" and "aws:////i-123" also split into 3 parts; reject
	// them here so callers never receive an empty zone or instance ID
	if availabilityZone == "" {
		return nil, fmt.Errorf("provider ID availability zone is empty: %s", rawProviderId)
	}
	if instanceID == "" {
		return nil, fmt.Errorf("provider ID instance ID is empty: %s", rawProviderId)
	}
	return &KubernetesProviderID{
		AvailabilityZone: availabilityZone,
		InstanceID:       instanceID,
	}, nil
}
slog.Info("writing kubeconfig", "path", kubeconfigPath, "clusterArn", cluster.arn) templateParams := kubeconfigTemplateParameters{ ClusterCertificateAuthority: cluster.certificateAuthorityData, ClusterARN: cluster.arn, ClusterEndpoint: cluster.endpoint, ClusterName: cluster.name, } kubeconfig := bytes.Buffer{} t, err := template.New("kubeconfig").Parse(kubeconfigTemplate) if err != nil { return err } err = t.Execute(&kubeconfig, templateParams) if err != nil { return err } err = os.WriteFile(kubeconfigPath, kubeconfig.Bytes(), kubeconfigPerm) if err != nil { return err } slog.Info("wrote kubeconfig", "path", kubeconfigPath, "content", kubeconfig.String()) return nil } ================================================ FILE: internal/deployers/eksapi/logs.go ================================================ package eksapi import ( "context" _ "embed" "errors" "fmt" "log/slog" "slices" "time" "github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/service/autoscaling" "github.com/aws/aws-sdk-go-v2/service/s3" "github.com/aws/aws-sdk-go-v2/service/ssm" ssmtypes "github.com/aws/aws-sdk-go-v2/service/ssm/types" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/util/wait" "sigs.k8s.io/controller-runtime/pkg/client" ) type logManager struct { clients *awsClients resourceID string } type deployerPhase string const ( deployerPhaseUp = "up" deployerPhaseDown = "down" ) func NewLogManager(clients *awsClients, resourceID string) *logManager { return &logManager{ clients: clients, resourceID: resourceID, } } func (m *logManager) gatherLogsFromNodes(k8sClient *k8sClient, opts *deployerOptions, phase deployerPhase) error { if opts.LogBucket == "" { slog.Info("--log-bucket is empty, no logs will be gathered!") return nil } if k8sClient == nil { slog.Info("no k8s client available, no logs will be gathered!") return nil } if opts.AutoMode { return m.gatherLogsUsingNodeDiagnostic(k8sClient, opts, phase) } 
switch opts.UserDataFormat { case "bootstrap.sh", "nodeadm", "": // if no --user-data-format was passed, we must be using managed nodes, which default to AL-based AMIs return m.gatherLogsUsingScript(k8sClient, opts, phase) default: slog.Warn("unable to gather logs for userDataFormat", "format", opts.UserDataFormat) return nil } } //go:embed logs_ssm_doc.json var logCollectorScriptSsmDocumentContent string const logCollectorSsmDocumentTimeout = 5 * time.Minute func (m *logManager) gatherLogsUsingScript(k8sClient *k8sClient, opts *deployerOptions, phase deployerPhase) error { slog.Info("gathering logs using script...") nodes, err := k8sClient.clientset.CoreV1().Nodes().List(context.TODO(), v1.ListOptions{}) if err != nil { return err } var instanceIds []string if len(nodes.Items) > 0 { instanceIds, err = getNodeInstanceIDs(nodes.Items) if err != nil { return err } } else { slog.Warn("no nodes found in cluster!") // if we're using unmanaged nodes, we can track down the instances in the ASG even if they didn't join the cluster if opts.UnmanagedNodes { slog.Info("fetching instances from unmanaged nodegroup...") out, err := m.clients.ASG().DescribeAutoScalingGroups(context.TODO(), &autoscaling.DescribeAutoScalingGroupsInput{ AutoScalingGroupNames: []string{m.resourceID}, }) if err != nil { slog.Warn("failed to describe unmanaged nodegroup ASG", "error", err) return nil } if len(out.AutoScalingGroups) != 1 { slog.Warn("autoscaling group not found", "resourceID", m.resourceID) } else { for _, asg := range out.AutoScalingGroups { for _, instance := range asg.Instances { instanceIds = append(instanceIds, aws.ToString(instance.InstanceId)) } } } } } if len(instanceIds) == 0 { slog.Warn("no nodes to gather logs from!") return nil } doc, err := m.clients.SSM().CreateDocument(context.TODO(), &ssm.CreateDocumentInput{ Content: aws.String(logCollectorScriptSsmDocumentContent), Name: aws.String(fmt.Sprintf("%s-log-collector", m.resourceID)), DocumentType: 
ssmtypes.DocumentTypeCommand, DocumentFormat: ssmtypes.DocumentFormatJson, }) if err != nil { return err } defer func() { m.clients.SSM().DeleteDocument(context.TODO(), &ssm.DeleteDocumentInput{ Name: doc.DocumentDescription.Name, }) }() command, err := m.clients.SSM().SendCommand(context.TODO(), &ssm.SendCommandInput{ DocumentName: doc.DocumentDescription.Name, InstanceIds: instanceIds, Parameters: map[string][]string{ "s3Destination": {fmt.Sprintf("s3://%s/node-logs/%s/%s/", opts.LogBucket, m.resourceID, phase)}, }, }) if err != nil { return err } var errs []error for _, instanceId := range instanceIds { out, err := ssm.NewCommandExecutedWaiter(m.clients.SSM()).WaitForOutput(context.TODO(), &ssm.GetCommandInvocationInput{ CommandId: command.Command.CommandId, InstanceId: aws.String(instanceId), }, logCollectorSsmDocumentTimeout) if err != nil { errs = append(errs, err) } else { slog.Info("log collection command completed", "instanceId", instanceId, "status", out.Status) } } if len(errs) > 0 { return errors.Join(errs...) 
} slog.Info("gathered logs from nodes", "instanceIds", instanceIds) return nil } const logCollectorNodeDiagnosticTimeout = 5 * time.Minute func (m *logManager) gatherLogsUsingNodeDiagnostic(k8sClient *k8sClient, opts *deployerOptions, phase deployerPhase) error { slog.Info("gathering logs using NodeDiagnostic...") nodes, err := k8sClient.clientset.CoreV1().Nodes().List(context.TODO(), v1.ListOptions{}) if err != nil { return err } if len(nodes.Items) == 0 { slog.Warn("no nodes to gather logs from!") return nil } instanceIds, err := getNodeInstanceIDs(nodes.Items) if err != nil { return err } var errs []error var nodeDiagnostics []unstructured.Unstructured for _, instanceId := range instanceIds { presignedPut, err := m.clients.S3Presign().PresignPutObject(context.TODO(), &s3.PutObjectInput{ Bucket: aws.String(opts.LogBucket), Key: aws.String(fmt.Sprintf("node-logs/%s/%s/%s.tar.gz", m.resourceID, phase, instanceId)), }) if err != nil { errs = append(errs, fmt.Errorf("failed to create presigned PUT for %s: %v", instanceId, err)) continue } nodeDiagnostic := unstructured.Unstructured{ Object: map[string]interface{}{ "apiVersion": "eks.amazonaws.com/v1alpha1", "kind": "NodeDiagnostic", "metadata": v1.ObjectMeta{ Name: instanceId, }, "spec": map[string]interface{}{ "logCapture": map[string]interface{}{ "destination": presignedPut.URL, }, }, }, } if err := k8sClient.client.Create(context.TODO(), &nodeDiagnostic); err != nil { errs = append(errs, err) } else { nodeDiagnostics = append(nodeDiagnostics, nodeDiagnostic) } } outcomes, err := m.waitForNodeDiagnostics(k8sClient, nodeDiagnostics) if err != nil { errs = append(errs, fmt.Errorf("failed to wait for node diagnostics: %v", err)) } for instanceId, reasons := range outcomes { for _, reason := range reasons { // consider SuccessWithErrors a success, this isn't high stakes if !slices.Contains([]string{"Success", "SuccessWithErrors"}, reason) { errs = append(errs, fmt.Errorf("node diagnostic outcome reason for %s: %s", 
instanceId, reason)) } } } for _, nodeDiagnostic := range nodeDiagnostics { if err := k8sClient.client.Delete(context.TODO(), &nodeDiagnostic); err != nil { errs = append(errs, err) } } if len(errs) > 0 { return errors.Join(errs...) } slog.Info("gathered logs from nodes", "instanceIds", instanceIds) return nil } // waitForNodeDiagnostics polls each node diagnostic until it reaches a terminal state, or the timeout is reached // a map of node diagnostic names to their outcome reason(s) is returned if no error occurred func (m *logManager) waitForNodeDiagnostics(k8sClient *k8sClient, nodeDiagnostics []unstructured.Unstructured) (map[string][]string, error) { outcomes := make(map[string][]string) err := wait.PollUntilContextTimeout(context.Background(), 5*time.Second, logCollectorNodeDiagnosticTimeout, false, func(ctx context.Context) (done bool, err error) { for _, nodeDiagnostic := range nodeDiagnostics { objectKey := client.ObjectKeyFromObject(&nodeDiagnostic) if _, ok := outcomes[objectKey.Name]; ok { // we already have an outcome for this node diagnostic continue } if err := k8sClient.client.Get(ctx, objectKey, &nodeDiagnostic); err != nil { return false, fmt.Errorf("failed to get node diagnostic: %+v: %v", objectKey, err) } complete, reasons := m.isNodeDiagnosticComplete(&nodeDiagnostic) if !complete { continue } outcomes[objectKey.Name] = reasons } if len(outcomes) == len(nodeDiagnostics) { // we're done! 
return true, nil } return false, nil }) if err != nil { return nil, err } return outcomes, nil } func (m *logManager) isNodeDiagnosticComplete(nodeDiagnostic *unstructured.Unstructured) (bool, []string) { captureStatuses, found, err := unstructured.NestedSlice(nodeDiagnostic.Object, "status", "captureStatuses") if err != nil { slog.Error("NodeDiagnostic captureStatuses does not match expected type", "nodeDiagnostic", nodeDiagnostic) return false, nil } if !found { return false, nil } var reasons []string for _, captureStatus := range captureStatuses { captureStatusMap, ok := captureStatus.(map[string]interface{}) if !ok { slog.Error("NodeDiagnostic captureStatus does not match expected type", "nodeDiagnostic", nodeDiagnostic) return false, nil } reason, found, err := unstructured.NestedString(captureStatusMap, "state", "completed", "reason") if err != nil { slog.Error("NodeDiagnostic captureStatus.reason does not match expected type", "nodeDiagnostic", nodeDiagnostic) return false, nil } if !found { return false, nil } reasons = append(reasons, reason) } return true, reasons } ================================================ FILE: internal/deployers/eksapi/logs_ssm_doc.json ================================================ { "schemaVersion": "2.2", "description": "Collect logs from an Amazon Linux EKS node", "parameters": { "s3Destination": { "type": "String" } }, "mainSteps": [ { "action": "aws:runShellScript", "name": "collectAndUploadLogs", "precondition": { "StringEquals": [ "platformType", "Linux" ] }, "inputs": { "runCommand": [ "bash /etc/eks/log-collector-script/eks-log-collector.sh >/dev/null 2>&1", "aws s3 cp /var/log/eks_i* {{s3Destination}}" ] } } ] } ================================================ FILE: internal/deployers/eksapi/metrics.go ================================================ package eksapi import ( "path" "github.com/aws/aws-k8s-tester/internal/metrics" cloudwatchtypes "github.com/aws/aws-sdk-go-v2/service/cloudwatch/types" ) var 
// Metric specifications emitted by the deployer. All of them live in
// DeployerMetricNamespace and are measured in seconds.
var (
	// totalRuntimeSeconds is presumably the wall-clock duration of a whole
	// deployer run — NOTE(review): it is recorded outside this file; confirm.
	totalRuntimeSeconds = &metrics.MetricSpec{
		Namespace: DeployerMetricNamespace,
		Metric:    "TotalRuntimeSeconds",
		Unit:      cloudwatchtypes.StandardUnitSeconds,
	}

	// nodeTimeToRegistrationSeconds is recorded by emitNodeMetrics as the time
	// between an instance's launch and the creation timestamp of its Node object.
	nodeTimeToRegistrationSeconds = &metrics.MetricSpec{
		Namespace: DeployerMetricNamespace,
		Metric:    "NodeTimeToRegistrationSeconds",
		Unit:      cloudwatchtypes.StandardUnitSeconds,
	}

	// nodeTimeToReadySeconds is recorded by emitNodeMetrics as the time between
	// an instance's launch and the last transition of its node's Ready condition.
	nodeTimeToReadySeconds = &metrics.MetricSpec{
		Namespace: DeployerMetricNamespace,
		Metric:    "NodeTimeToReadySeconds",
		Unit:      cloudwatchtypes.StandardUnitSeconds,
	}
)
ec2types.ArchitectureValuesX8664: defaultInstanceTypes_x86_64, ec2types.ArchitectureValuesArm64: defaultInstanceTypes_arm64, } defaultInstanceTypesByEKSAMITypes = map[ekstypes.AMITypes][]string{ ekstypes.AMITypesAl2X8664: defaultInstanceTypes_x86_64, ekstypes.AMITypesAl2Arm64: defaultInstanceTypes_arm64, ekstypes.AMITypesAl2023X8664Standard: defaultInstanceTypes_x86_64, ekstypes.AMITypesAl2023Arm64Standard: defaultInstanceTypes_arm64, } nodeClassResource = schema.GroupVersionResource{Group: "eks.amazonaws.com", Version: "v1", Resource: "nodeclasses"} ) type nodeManager struct { clients *awsClients resourceID string } func NewNodeManager(clients *awsClients, resourceID string) *nodeManager { return &nodeManager{ clients: clients, resourceID: resourceID, } } func (m *nodeManager) createNodes(infra *Infrastructure, cluster *Cluster, opts *deployerOptions, k8sClient *k8sClient) error { if err := m.resolveInstanceTypes(opts); err != nil { return fmt.Errorf("failed to resolve instance types: %v", err) } if opts.AutoMode { if err := m.createNodeClass(opts, k8sClient); err != nil { return err } if err := m.createNodePool(opts, k8sClient); err != nil { return err } _, err := m.createPlaceholderDeployment(opts, k8sClient) return err } else if opts.UnmanagedNodes { return m.createUnmanagedNodegroup(infra, cluster, opts) } else { return m.createManagedNodegroup(infra, cluster, opts) } } func (m *nodeManager) resolveInstanceTypes(opts *deployerOptions) (err error) { instanceTypes := opts.InstanceTypes if len(instanceTypes) == 0 { if len(opts.InstanceTypeArchs) > 0 { slog.Info("choosing instance types based on architectures", "archs", opts.InstanceTypeArchs) for _, arch := range opts.InstanceTypeArchs { var ec2Arch ec2types.ArchitectureValues switch arch { case "x86_64", "amd64": ec2Arch = ec2types.ArchitectureValuesX8664 case "aarch64", "arm64": ec2Arch = ec2types.ArchitectureValuesArm64 default: return fmt.Errorf("unknown architecture: '%s'", arch) } instanceTypesForArch, ok 
:= defaultInstanceTypesByEC2ArchitectureValues[ec2Arch] if !ok { return fmt.Errorf("no default instance types known for architecture: '%s'", arch) } instanceTypes = append(instanceTypes, instanceTypesForArch...) } } else if opts.UnmanagedNodes { slog.Info("choosing instance types based on AMI architecture...") if out, err := m.clients.EC2().DescribeImages(context.TODO(), &ec2.DescribeImagesInput{ ImageIds: []string{opts.AMI}, }); err != nil { return fmt.Errorf("failed to describe AMI: %s: %v", opts.AMI, err) } else { amiArch := out.Images[0].Architecture instanceTypesForAMIArchitecture, ok := defaultInstanceTypesByEC2ArchitectureValues[amiArch] if !ok { return fmt.Errorf("no default instance types known for AMI architecture: %v", amiArch) } instanceTypes = instanceTypesForAMIArchitecture } } else { // we don't rely on the service's default instance types, because they're a bit too small for the k8s e2e suite slog.Info("choosing instance types based on managed nodegroup's AMI type...") instanceTypesForAMIType, ok := defaultInstanceTypesByEKSAMITypes[ekstypes.AMITypes(opts.AMIType)] if !ok { return fmt.Errorf("no default instance types known for AMI type: %v", opts.AMIType) } instanceTypes = instanceTypesForAMIType } } validInstanceTypes, err := m.getValidInstanceTypes(instanceTypes) if err != nil { return err } if len(validInstanceTypes) == 0 { return fmt.Errorf("none of the instance types %v were valid", instanceTypes) } opts.InstanceTypes = validInstanceTypes slog.Info("using instance types", "instanceTypes", opts.InstanceTypes) return nil } func (m *nodeManager) createNodeClass(opts *deployerOptions, k8sClient *k8sClient) error { nodeclass, err := k8sClient.dclient.Resource(nodeClassResource).Get(context.Background(), "default", metav1.GetOptions{}) if err != nil { return fmt.Errorf("getting default nodeclass, %w", err) } slog.Info("got existing default nodeclass for template..") // clear out the metadata and set the name only nodeclass.Object["metadata"] = 
map[string]interface{}{} nodeclass.SetName(m.resourceID) // clear out the status delete(nodeclass.Object, "status") // update the ephemeral storage spec to be 500Gi if spec, ok := nodeclass.Object["spec"].(map[string]interface{}); ok { if ephemeralStorage, ok := spec["ephemeralStorage"].(map[string]interface{}); ok { ephemeralStorage["size"] = "500Gi" } // configure capacity reservation selector terms if capacity reservation is enabled if opts.CapacityReservation { capacityReservation, err := m.getCapacityReservation(opts) if err != nil { return fmt.Errorf("failed to get capacity reservation: %w", err) } spec["capacityReservationSelectorTerms"] = []map[string]interface{}{ { "id": aws.ToString(capacityReservation.CapacityReservationId), }, } } } slog.Info("creating new node class...") _, err = k8sClient.dclient.Resource(nodeClassResource).Create(context.Background(), nodeclass, metav1.CreateOptions{}) if err != nil { return fmt.Errorf("creating new nodeclass, %w", err) } slog.Info("node class created!") return nil } func (m *nodeManager) createNodePool(opts *deployerOptions, k8sClient *k8sClient) error { nodePool := karpv1.NodePool{ ObjectMeta: metav1.ObjectMeta{ Name: m.resourceID, }, Spec: karpv1.NodePoolSpec{ Weight: pointer.Int32(100), // max Disruption: karpv1.Disruption{ Budgets: []karpv1.Budget{ { Nodes: "10%", }, }, ConsolidateAfter: karpv1.MustParseNillableDuration("Never"), }, Template: karpv1.NodeClaimTemplate{ Spec: karpv1.NodeClaimTemplateSpec{ ExpireAfter: karpv1.MustParseNillableDuration("24h"), NodeClassRef: &karpv1.NodeClassReference{ Group: "eks.amazonaws.com", Kind: "NodeClass", Name: m.resourceID, }, Requirements: []karpv1.NodeSelectorRequirementWithMinValues{ { NodeSelectorRequirement: corev1.NodeSelectorRequirement{ Key: "kubernetes.io/os", Operator: corev1.NodeSelectorOpIn, Values: []string{"linux"}, }, }, { NodeSelectorRequirement: corev1.NodeSelectorRequirement{ Key: "karpenter.sh/capacity-type", Operator: corev1.NodeSelectorOpIn, Values: 
[]string{"reserved", "on-demand"}, }, }, { NodeSelectorRequirement: corev1.NodeSelectorRequirement{ Key: "node.kubernetes.io/instance-type", Operator: corev1.NodeSelectorOpIn, Values: opts.InstanceTypes, }, }, }, }, }, }, } slog.Info("creating node pool...") if err := k8sClient.client.Create(context.TODO(), &nodePool); err != nil { return fmt.Errorf("failed to create node pool: %v", err) } slog.Info("created node pool", "nodePool", nodePool) return nil } func (m *nodeManager) deleteNodeClass(k8sClient *k8sClient) error { slog.Info("deleting node class...") if err := k8sClient.dclient.Resource(nodeClassResource).Delete(context.Background(), m.resourceID, metav1.DeleteOptions{}); err != nil { if apierrors.IsNotFound(err) { slog.Info("node class does not exist", "resourceID", m.resourceID) return nil } return fmt.Errorf("failed to delete node class, %w", err) } slog.Info("deleted node class!") return nil } func (m *nodeManager) deleteNodePool(k8sClient *k8sClient) error { nodePool := karpv1.NodePool{ ObjectMeta: metav1.ObjectMeta{ Name: m.resourceID, }, } slog.Info("deleting node pool...") if err := k8sClient.client.Delete(context.TODO(), &nodePool); err != nil { if apierrors.IsNotFound(err) { slog.Info("node pool does not exist", "resourceID", m.resourceID) return nil } return fmt.Errorf("failed to delete node pool: %w", err) } slog.Info("deleted node pool!") return nil } // createPlaceholderDeployment creates a Deployment with the specified number of replicas that requires // each replica to be scheduled on different nodes. 
// This ensures that (at least) the specified number of nodes exist in an EKS Auto cluster func (m *nodeManager) createPlaceholderDeployment(opts *deployerOptions, k8sClient *k8sClient) (*appsv1.Deployment, error) { if opts.Nodes == 0 { slog.Info("not creating placeholder deployment!") return nil, nil } labels := map[string]string{ "app": m.resourceID, } d := &appsv1.Deployment{ ObjectMeta: metav1.ObjectMeta{Name: m.resourceID, Namespace: "default"}, Spec: appsv1.DeploymentSpec{ Replicas: pointer.Int32(int32(opts.Nodes)), Selector: &metav1.LabelSelector{ MatchLabels: labels, }, Template: corev1.PodTemplateSpec{ ObjectMeta: metav1.ObjectMeta{ Labels: labels, }, Spec: corev1.PodSpec{ Affinity: &corev1.Affinity{ PodAntiAffinity: &corev1.PodAntiAffinity{ RequiredDuringSchedulingIgnoredDuringExecution: []corev1.PodAffinityTerm{ { LabelSelector: &metav1.LabelSelector{ MatchLabels: labels, }, TopologyKey: "kubernetes.io/hostname", }, }, }, }, Containers: []corev1.Container{ { Name: "main", Image: "public.ecr.aws/amazonlinux/amazonlinux:2023", Command: []string{"sleep", "infinity"}, }, }, }, }, }, } slog.Info("creating placeholder deployment...") d, err := k8sClient.clientset.AppsV1().Deployments("default").Create(context.TODO(), d, metav1.CreateOptions{}) if err != nil { return nil, fmt.Errorf("failed to create placeholder deployment: %v", err) } slog.Info("created placeholder deployment", "deployment", d) return d, nil } func (m *nodeManager) deletePlaceholderDeployment(k8sClient *k8sClient) error { slog.Info("deleting placeholder deployment...") if err := k8sClient.clientset.AppsV1().Deployments("default").Delete(context.TODO(), m.resourceID, *metav1.NewDeleteOptions( /* no grace period */ 0)); err != nil { if apierrors.IsNotFound(err) { slog.Info("placeholder deployment does not exist", "resourceID", m.resourceID) return nil } return fmt.Errorf("failed to delete placeholder deployment: %v", err) } slog.Info("deleted placeholder deployment!") return nil } func (m 
*nodeManager) createManagedNodegroup(infra *Infrastructure, cluster *Cluster, opts *deployerOptions) error { slog.Info("creating nodegroup...") input := eks.CreateNodegroupInput{ ClusterName: aws.String(m.resourceID), NodegroupName: aws.String(m.resourceID), NodeRole: aws.String(infra.nodeRoleARN), Subnets: infra.subnets(), DiskSize: aws.Int32(100), CapacityType: ekstypes.CapacityTypesOnDemand, ScalingConfig: &ekstypes.NodegroupScalingConfig{ MinSize: aws.Int32(int32(opts.Nodes)), MaxSize: aws.Int32(int32(opts.Nodes)), DesiredSize: aws.Int32(int32(opts.Nodes)), }, AmiType: ekstypes.AMITypes(opts.AMIType), InstanceTypes: opts.InstanceTypes, } out, err := m.clients.EKS().CreateNodegroup(context.TODO(), &input) if err != nil { return err } slog.Info("waiting for nodegroup to be active", "arn", *out.Nodegroup.NodegroupArn) err = eks.NewNodegroupActiveWaiter(m.clients.EKS()). Wait(context.TODO(), &eks.DescribeNodegroupInput{ ClusterName: input.ClusterName, NodegroupName: input.NodegroupName, }, opts.NodeCreationTimeout) if err != nil { return err } slog.Info("nodegroup is active", "arn", *out.Nodegroup.NodegroupArn) if opts.ExpectedAMI != "" { out, err := m.clients.EKS().DescribeNodegroup(context.TODO(), &eks.DescribeNodegroupInput{ ClusterName: input.ClusterName, NodegroupName: input.NodegroupName, }) if err != nil { return err } asgName := out.Nodegroup.Resources.AutoScalingGroups[0].Name if ok, err := m.verifyASGAMI(*asgName, opts.ExpectedAMI); err != nil { return err } else if !ok { return fmt.Errorf("ASG %s is not using expected AMI: %s", *asgName, opts.ExpectedAMI) } } return nil } func (m *nodeManager) createUnmanagedNodegroup(infra *Infrastructure, cluster *Cluster, opts *deployerOptions) error { var availabilityZoneFilter []string var capacityReservationId string stackName := m.getUnmanagedNodegroupStackName() slog.Info("creating unmanaged nodegroup stack", "stackName", stackName) userData, userDataIsMimePart, err := generateUserData(cluster, opts) if err != 
nil { return err } if opts.CapacityReservation { capacityReservation, err := m.getCapacityReservation(opts) if err != nil { return err } capacityReservationId = aws.ToString(capacityReservation.CapacityReservationId) availabilityZoneFilter = []string{aws.ToString(capacityReservation.AvailabilityZone)} } else { availabilityZoneFilter, err = m.getValidAvailabilityZonesFilter(opts, infra) if err != nil { return err } } targetSubnets, err := m.getValidSubnets(opts, infra, availabilityZoneFilter) if err != nil { return err } networkInterfaces, err := m.getNetworkInterfaces(opts, []string{cluster.securityGroupId}, targetSubnets) if err != nil { return err } volumeMountPath := "/dev/xvda" if opts.UserDataFormat == "bottlerocket" { volumeMountPath = "/dev/xvdb" } templateBuf := bytes.Buffer{} err = templates.UnmanagedNodegroup.Execute(&templateBuf, struct { NetworkInterfaces []templates.NetworkInterface InstanceTypes []string }{ NetworkInterfaces: networkInterfaces, InstanceTypes: opts.InstanceTypes, }) if err != nil { return err } input := cloudformation.CreateStackInput{ StackName: aws.String(stackName), TemplateBody: aws.String(templateBuf.String()), Capabilities: []cloudformationtypes.Capability{cloudformationtypes.CapabilityCapabilityIam}, Parameters: []cloudformationtypes.Parameter{ { ParameterKey: aws.String("ResourceId"), ParameterValue: aws.String(m.resourceID), }, { ParameterKey: aws.String("VpcId"), ParameterValue: aws.String(infra.vpc), }, { ParameterKey: aws.String("SubnetIds"), ParameterValue: aws.String(strings.Join(targetSubnets, ",")), }, { ParameterKey: aws.String("UserData"), ParameterValue: aws.String(userData), }, { ParameterKey: aws.String("UserDataIsMIMEPart"), ParameterValue: aws.String(strconv.FormatBool(userDataIsMimePart)), }, { ParameterKey: aws.String("VolumeMountPath"), ParameterValue: aws.String(volumeMountPath), }, { ParameterKey: aws.String("ClusterName"), ParameterValue: aws.String(cluster.name), }, { ParameterKey: 
aws.String("NodeRoleName"), ParameterValue: aws.String(infra.nodeRoleName), }, { ParameterKey: aws.String("NodeCount"), ParameterValue: aws.String(strconv.Itoa(opts.Nodes)), }, { ParameterKey: aws.String("SecurityGroup"), ParameterValue: aws.String(cluster.securityGroupId), }, { ParameterKey: aws.String("AMIId"), ParameterValue: aws.String(opts.AMI), }, { ParameterKey: aws.String("CapacityReservationId"), ParameterValue: aws.String(capacityReservationId), }, }, } out, err := m.clients.CFN().CreateStack(context.TODO(), &input) if err != nil { return err } slog.Info("waiting for unmanaged nodegroup stack to be created", "stackId", aws.ToString(out.StackId)) err = cloudformation.NewStackCreateCompleteWaiter(m.clients.CFN()). Wait(context.TODO(), &cloudformation.DescribeStacksInput{ StackName: out.StackId, }, opts.NodeCreationTimeout) if err != nil { return util.WrapCFNStackFailure(context.TODO(), m.clients.CFN(), fmt.Errorf("failed to wait for unmanaged nodegroup stack creation: %w", err), stackName) } slog.Info("created unmanaged nodegroup stack", "stackId", *out.StackId) if opts.ExpectedAMI != "" { if ok, err := m.verifyASGAMI(m.resourceID, opts.ExpectedAMI); err != nil { return err } else if !ok { return fmt.Errorf("ASG %s is not using expected AMI: %s", m.resourceID, opts.ExpectedAMI) } } return nil } // deleteNodes cleans up any nodes in the cluster // it will be called outside the context of a deployer run (by the janitor, for example) // so will try to delete nodes of any type func (m *nodeManager) deleteNodes(k8sClient *k8sClient, opts *deployerOptions) error { if err := m.deleteUnmanagedNodegroup(); err != nil { return err } if err := m.deleteManagedNodegroup(); err != nil { return err } // we only have a k8sClient when this is called by the deployer, not by the janitor // TODO implement cleanup of Auto nodes in the janitor if k8sClient != nil && opts != nil && opts.AutoMode { if err := m.deletePlaceholderDeployment(k8sClient); err != nil { return err } if 
err := m.deleteNodeClass(k8sClient); err != nil {
			return err
		}
		if err := m.deleteNodePool(k8sClient); err != nil {
			return err
		}
		if err := k8sClient.waitForNodeDeletion(nodeDeletionTimeout); err != nil {
			return err
		}
	}
	return nil
}

// deleteManagedNodegroup deletes the EKS managed nodegroup named m.resourceID
// and waits up to nodeDeletionTimeout for the deletion to finish.
// A nodegroup that does not exist is logged and treated as success.
func (m *nodeManager) deleteManagedNodegroup() error {
	input := eks.DeleteNodegroupInput{
		ClusterName:   aws.String(m.resourceID),
		NodegroupName: aws.String(m.resourceID),
	}
	slog.Info("deleting nodegroup...")
	out, err := m.clients.EKS().DeleteNodegroup(context.TODO(), &input)
	if err != nil {
		var notFound *ekstypes.ResourceNotFoundException
		// errors.As needs a pointer to the target variable
		if errors.As(err, &notFound) {
			slog.Info("nodegroup does not exist", "resourceID", m.resourceID)
			return nil
		}
		return fmt.Errorf("failed to delete nodegroup: %w", err)
	}
	slog.Info("waiting for nodegroup deletion", "arn", *out.Nodegroup.NodegroupArn)
	err = eks.NewNodegroupDeletedWaiter(m.clients.EKS()).
		Wait(context.TODO(), &eks.DescribeNodegroupInput{
			ClusterName:   input.ClusterName,
			NodegroupName: input.NodegroupName,
		}, nodeDeletionTimeout)
	if err != nil {
		return fmt.Errorf("failed to wait for nodegroup deletion: %w", err)
	}
	slog.Info("nodegroup deleted", "arn", *out.Nodegroup.NodegroupArn)
	return nil
}

// deleteUnmanagedNodegroup deletes the unmanaged nodegroup CloudFormation
// stack and waits up to infraStackDeletionTimeout for the deletion to finish.
// A StackNotFoundException from DeleteStack is logged and treated as success.
func (m *nodeManager) deleteUnmanagedNodegroup() error {
	stackName := m.getUnmanagedNodegroupStackName()
	input := cloudformation.DeleteStackInput{
		StackName: aws.String(stackName),
	}
	slog.Info("deleting unmanaged nodegroup stack", "stackName", stackName)
	_, err := m.clients.CFN().DeleteStack(context.TODO(), &input)
	if err != nil {
		var notFound *cloudformationtypes.StackNotFoundException
		// errors.As needs a pointer to the target variable
		if errors.As(err, &notFound) {
			slog.Info("unmanaged nodegroup stack does not exist", "stackName", stackName)
			return nil
		}
		return fmt.Errorf("failed to delete unmanaged nodegroup stack: %w", err)
	}
	slog.Info("waiting for unmanaged nodegroup stack to be deleted", "stackName", stackName)
	err = cloudformation.NewStackDeleteCompleteWaiter(m.clients.CFN()).
		Wait(context.TODO(), &cloudformation.DescribeStacksInput{
			StackName: aws.String(stackName),
		}, infraStackDeletionTimeout)
	if err != nil {
		return fmt.Errorf("failed to wait for unmanaged nodegroup stack deletion: %w", err)
	}
	slog.Info("deleted unmanaged nodegroup stack", "stackName", stackName)
	return nil
}

// getUnmanagedNodegroupStackName returns the CloudFormation stack name used
// for the unmanaged nodegroup: "<resourceID>-unmanaged-nodegroup".
func (m *nodeManager) getUnmanagedNodegroupStackName() string {
	return fmt.Sprintf("%s-unmanaged-nodegroup", m.resourceID)
}

// verifyASGAMI reports whether every instance currently in the autoscaling
// group asgName was launched from amiId. It returns (false, err) when the
// group or its instances cannot be described, when the group has no
// instances, or when at least one instance uses a different AMI (one joined
// error per offending instance).
func (m *nodeManager) verifyASGAMI(asgName string, amiId string) (bool, error) {
	slog.Info("verifying AMI for ASG", "amiId", amiId, "asgName", asgName)
	asgOut, err := m.clients.ASG().DescribeAutoScalingGroups(context.TODO(), &autoscaling.DescribeAutoScalingGroupsInput{
		AutoScalingGroupNames: []string{asgName},
	})
	if err != nil {
		// surface the failure; returning a nil error here would hide the API
		// error from callers that check err before the boolean
		return false, fmt.Errorf("failed to describe autoscaling group %s: %w", asgName, err)
	}
	if len(asgOut.AutoScalingGroups) != 1 {
		return false, fmt.Errorf("autoscaling group not found: %s", asgName)
	}
	var instanceIds []string
	for _, instance := range asgOut.AutoScalingGroups[0].Instances {
		instanceIds = append(instanceIds, aws.ToString(instance.InstanceId))
	}
	if len(instanceIds) == 0 {
		// DescribeInstances with no instance IDs and no filters describes every
		// instance visible to the caller, which would check unrelated instances.
		return false, fmt.Errorf("autoscaling group %s has no instances to verify", asgName)
	}
	slog.Info("verifying AMI for instances", "instanceIds", instanceIds)
	ec2Out, err := m.clients.EC2().DescribeInstances(context.TODO(), &ec2.DescribeInstancesInput{
		InstanceIds: instanceIds,
	})
	if err != nil {
		return false, err
	}
	var errs []error
	for _, reservation := range ec2Out.Reservations {
		for _, instance := range reservation.Instances {
			if aws.ToString(instance.ImageId) != amiId {
				errs = append(errs, fmt.Errorf("instance %s using wrong AMI: %s", aws.ToString(instance.InstanceId), aws.ToString(instance.ImageId)))
			}
		}
	}
	if len(errs) > 0 {
		return false, errors.Join(errs...)
}
	slog.Info("ASG instances are using expected AMI", "amiId", amiId)
	return true, nil
}

// getCapacityReservation returns the first active capacity reservation for
// opts.InstanceTypes with at least opts.Nodes available instances. When
// opts.TargetCapacityReservationId is set, the lookup is limited to that
// reservation ID. An error is returned when no reservation qualifies.
func (m *nodeManager) getCapacityReservation(opts *deployerOptions) (*ec2types.CapacityReservation, error) {
	describeReservationsInput := ec2.DescribeCapacityReservationsInput{
		Filters: []ec2types.Filter{
			{
				Name:   aws.String("instance-type"),
				Values: opts.InstanceTypes,
			},
			{
				Name:   aws.String("state"),
				Values: []string{"active"},
			},
		},
	}
	if opts.TargetCapacityReservationId != "" {
		describeReservationsInput.CapacityReservationIds = []string{opts.TargetCapacityReservationId}
	}
	capacityReservations, err := m.clients.EC2().DescribeCapacityReservations(context.TODO(), &describeReservationsInput)
	if err != nil {
		return nil, fmt.Errorf("failed to describe capacity reservation: %w", err)
	}
	var capacityReservation *ec2types.CapacityReservation
	for _, cr := range capacityReservations.CapacityReservations {
		if aws.ToInt32(cr.AvailableInstanceCount) >= int32(opts.Nodes) {
			capacityReservation = &cr
			break
		}
	}
	if capacityReservation == nil {
		// report the whole slice: indexing [0] would panic when no instance
		// types were configured
		return nil, fmt.Errorf("no capacity reservation found for instance types %v with %d nodes count", opts.InstanceTypes, opts.Nodes)
	}
	slog.Info("using capacity reservation", "id", aws.ToString(capacityReservation.CapacityReservationId))
	return capacityReservation, nil
}

// getValidAvailabilityZonesFilter returns the availability zones that subnets
// should be restricted to. Without EFA it returns an empty filter. With EFA
// it returns a single zone, chosen by availabilityZoneHintedOrder from the
// zones in infra.availabilityZones that offer opts.InstanceTypes.
func (m *nodeManager) getValidAvailabilityZonesFilter(opts *deployerOptions, infra *Infrastructure) ([]string, error) {
	if !opts.EFA {
		// no filter needed, leaves scheduling to EC2 provisioner
		return []string{}, nil
	}
	describeFilters := []ec2types.Filter{
		{
			Name:   aws.String("instance-type"),
			Values: opts.InstanceTypes,
		},
		{
			Name:   aws.String("location"),
			Values: infra.availabilityZones,
		},
	}
	describeResponse, err := m.clients.EC2().DescribeInstanceTypeOfferings(context.TODO(), &ec2.DescribeInstanceTypeOfferingsInput{
		Filters:      describeFilters,
		LocationType: ec2types.LocationTypeAvailabilityZone,
	})
	if err != nil {
		return nil, fmt.Errorf("failed to describe instance type offerings: %w", err)
	}
	if describeResponse == nil || len(describeResponse.InstanceTypeOfferings) == 0 {
		// formatFilters prints filter names; %v on the raw slice would print
		// the *string Name fields as pointer addresses
		return nil, fmt.Errorf("no instance type offerings in current region with filters %s", formatFilters(describeFilters))
	}
	var candidateAZs []string
	for _, offering := range describeResponse.InstanceTypeOfferings {
		candidateAZs = append(candidateAZs, aws.ToString(offering.Location))
	}
	// EFA traffic cannot cross an AZ https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa.html#efa-limits
	targetAZ := availabilityZoneHintedOrder(candidateAZs)[0]
	slog.Info("found availability zone with offering", "az", targetAZ, "instanceTypes", opts.InstanceTypes)
	return []string{targetAZ}, nil
}

// formatFilters renders EC2 filters as "[{Name:<name> Values:<values>},...]"
// so that the *string Name field is printed by value in log and error text.
func formatFilters(filters []ec2types.Filter) string {
	var parts []string
	for _, f := range filters {
		parts = append(parts, fmt.Sprintf("{Name:%s Values:%v}", aws.ToString(f.Name), f.Values))
	}
	return "[" + strings.Join(parts, ",") + "]"
}

// getValidSubnets returns the IDs of the infrastructure subnets usable for
// nodes: the private subnets when EFA is enabled, otherwise all subnets,
// narrowed to availabilityZoneFilter when it is non-empty. An error is
// returned when no subnet matches.
func (m *nodeManager) getValidSubnets(opts *deployerOptions, infra *Infrastructure, availabilityZoneFilter []string) ([]string, error) {
	var describeFilters []ec2types.Filter
	var targetSubnets []string
	if opts.EFA {
		// EFA requires private subnets
		targetSubnets = infra.subnetsPrivate
	} else {
		targetSubnets = infra.subnets()
	}
	if len(availabilityZoneFilter) > 0 {
		describeFilters = append(describeFilters, ec2types.Filter{
			Name:   aws.String("availability-zone"),
			Values: availabilityZoneFilter,
		})
	}
	describeResponse, err := m.clients.EC2().DescribeSubnets(context.TODO(), &ec2.DescribeSubnetsInput{
		Filters:   describeFilters,
		SubnetIds: targetSubnets,
	})
	if err != nil {
		return nil, fmt.Errorf("failed to describe subnets %v: %v", targetSubnets, err)
	}
	if describeResponse == nil || len(describeResponse.Subnets) == 0 {
		return nil, fmt.Errorf("no subnet in %v satisfied filters: %s", targetSubnets, formatFilters(describeFilters))
	}
	var subnetIds []string
	for _, subnet := range describeResponse.Subnets {
		subnetIds = append(subnetIds, *subnet.SubnetId)
	}
	slog.Info("using subnets", "subnetIds", subnetIds)
return subnetIds, nil } func (m *nodeManager) getValidInstanceTypes(desiredInstanceTypes []string) ([]string, error) { var validInstanceTypes []string for _, instanceType := range desiredInstanceTypes { ec2InstanceType := ec2types.InstanceType(instanceType) _, err := m.clients.EC2().DescribeInstanceTypes(context.TODO(), &ec2.DescribeInstanceTypesInput{ InstanceTypes: []ec2types.InstanceType{ec2InstanceType}, }) if err != nil { var apierr smithy.APIError if errors.As(err, &apierr) && apierr.ErrorCode() == "InvalidInstanceType" { slog.Info("eliminating instance type as an option", "instanceType", instanceType) } else { return nil, fmt.Errorf("failed to describe instance type: %s: %v", instanceType, err) } } else { validInstanceTypes = append(validInstanceTypes, instanceType) } } return validInstanceTypes, nil } func (m *nodeManager) getNetworkInterfaces(opts *deployerOptions, securityGroups []string, subnetIDs []string) ([]templates.NetworkInterface, error) { if !opts.EFA { // create only the default primary network interface if not using EFA netiface, err := getNetworkInterface(opts, 0, subnetIDs, securityGroups) if err != nil { return nil, err } return []templates.NetworkInterface{netiface}, nil } // EFA option assumes a single instance type instanceType := opts.InstanceTypes[0] ec2InstanceType := ec2types.InstanceType(instanceType) describeInstanceTypeOutput, err := m.clients.EC2().DescribeInstanceTypes(context.TODO(), &ec2.DescribeInstanceTypesInput{ InstanceTypes: []ec2types.InstanceType{ec2InstanceType}, }) if err != nil { return nil, fmt.Errorf("failed to describe instance type %s to get network interface support: %v", instanceType, err) } networkInfo := describeInstanceTypeOutput.InstanceTypes[0].NetworkInfo if !aws.ToBool(networkInfo.EfaSupported) { // fail early for better transparency return nil, fmt.Errorf("cannot generate efa interfaces for instance type %s because it does not support efa", instanceType) } // 1 EFA interface is supported per network card 
// https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/efa.html#efa-limits numEfaInterfaces := int(aws.ToInt32(networkInfo.MaximumNetworkCards)) var networkInterfaces []templates.NetworkInterface for cardIndex := range numEfaInterfaces { efaInterface, err := getNetworkInterface(opts, cardIndex, subnetIDs, securityGroups) if err != nil { return nil, err } networkInterfaces = append(networkInterfaces, efaInterface) } return networkInterfaces, nil } func getNetworkInterface(opts *deployerOptions, networkCardIndex int, subnetIds []string, securityGroups []string) (templates.NetworkInterface, error) { // simplification that works with currently supported network interfaces based on // https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/using-eni.html#network-cards // and // https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/create-efa.html#efa-launch deviceIndex := 0 if networkCardIndex > 0 { deviceIndex = 1 } var description, interfaceType, subnetID *string if opts.EFA { if len(subnetIds) == 0 { return templates.NetworkInterface{}, fmt.Errorf("EFA interfaces require a subnet but none were provided") } subnetID = &subnetIds[0] interfaceType = aws.String("efa") description = aws.String("EFA-enabled network interface") } else { // no need to assign a subnet here, more restrictive than it is helpful interfaceType = aws.String("interface") description = aws.String("Standard network interface") } return templates.NetworkInterface{ Description: description, DeviceIndex: &deviceIndex, NetworkCardIndex: &networkCardIndex, InterfaceType: interfaceType, SubnetId: subnetID, Groups: securityGroups, DeleteOnTermination: aws.Bool(true), }, nil } ================================================ FILE: internal/deployers/eksapi/static_cluster.go ================================================ package eksapi import ( "bytes" "context" "fmt" "log/slog" "strings" "time" "github.com/aws/aws-k8s-tester/internal/deployers/eksapi/templates" v1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" 
"k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/clientcmd" "sigs.k8s.io/controller-runtime/pkg/client" karpv1 "sigs.k8s.io/karpenter/pkg/apis/v1" "sigs.k8s.io/yaml" ) type StaticClusterManager struct { k8sClient *kubernetes.Clientset karpenterClient client.Client options *deployerOptions } type NodeCondition func(nodes []corev1.Node) bool func NewStaticClusterManager(options *deployerOptions) *StaticClusterManager { return &StaticClusterManager{ options: options, } } func (s *StaticClusterManager) SetK8sClient(kubeconfig string) { cfg, err := clientcmd.BuildConfigFromFlags("", kubeconfig) if err != nil { slog.Error("failed to build kubeconfig", "error", err) panic(err) } s.k8sClient, err = kubernetes.NewForConfig(cfg) if err != nil { slog.Error("failed to create Kubernetes client", "error", err) panic(err) } s.karpenterClient, err = client.New(cfg, client.Options{}) if err != nil { slog.Error("failed to create Karpenter client", "error", err) panic(err) } } func (s *StaticClusterManager) EnsureNodeForStaticCluster() error { if err := s.CreateNodePool(); err != nil { return err } return s.DeployBusyboxAndWaitForNodes() } func (s *StaticClusterManager) TearDownNodeForStaticCluster() error { if err := s.TearDownBusyboxAndNodes(); err != nil { return err } return s.TearDownNodePool() } func (s *StaticClusterManager) CreateNodePool() error { if !strings.Contains(strings.ToLower(s.options.StaticClusterName), "nvidia") { slog.Info("NVIDIA not in cluster name, skipping node pool creation") return nil } var arch string if strings.Contains(s.options.StaticClusterName, "x86_64") { arch = "amd64" } else if strings.Contains(s.options.StaticClusterName, "aarch64") { arch = "arm64" } else { return fmt.Errorf("unable to determine architecture from cluster name") } t := templates.NvidiaStaticClusterNodepool var buf bytes.Buffer if err := t.Execute(&buf, 
templates.NvidiaStaticClusterNodepoolTemplateData{
		Arch:          arch,
		InstanceTypes: s.options.InstanceTypes,
	}); err != nil {
		return err
	}
	nodePool := &karpv1.NodePool{}
	if err := yaml.Unmarshal(buf.Bytes(), nodePool); err != nil {
		return fmt.Errorf("failed to unmarshal nodepool YAML: %v", err)
	}
	ctx := context.TODO()
	existing := &karpv1.NodePool{}
	err := s.karpenterClient.Get(ctx, client.ObjectKey{Name: nodePool.Name}, existing)
	if client.IgnoreNotFound(err) != nil {
		return err
	}
	// only create when the lookup reported NotFound; an existing pool is left as is
	if errors.IsNotFound(err) {
		return s.karpenterClient.Create(ctx, nodePool)
	}
	return nil
}

// TearDownNodePool deletes the NodePool named "nvidia". It is a no-op when
// the static cluster name does not contain "nvidia", and a missing NodePool
// is logged and treated as success.
func (s *StaticClusterManager) TearDownNodePool() error {
	if !strings.Contains(strings.ToLower(s.options.StaticClusterName), "nvidia") {
		slog.Info("NVIDIA not in cluster name, skipping node pool deletion")
		return nil
	}
	nodePool := &karpv1.NodePool{
		ObjectMeta: metav1.ObjectMeta{
			Name: "nvidia",
		},
	}
	if err := s.karpenterClient.Delete(context.TODO(), nodePool); err != nil {
		if errors.IsNotFound(err) {
			slog.Info("NodePool 'nvidia' not found, skipping deletion")
			return nil
		}
		return fmt.Errorf("failed to delete nodepool: %v", err)
	}
	slog.Info("NodePool deleted successfully")
	return nil
}

// DeployBusyboxAndWaitForNodes renders the busybox deployment template with
// s.options.Nodes, creates it in the "default" namespace, and polls for up to
// 15 minutes until at least that many nodes are ready.
func (s *StaticClusterManager) DeployBusyboxAndWaitForNodes() error {
	slog.Info("deploying busybox pods")
	t := templates.BusyboxDeployment
	var buf bytes.Buffer
	if err := t.Execute(&buf, templates.BusyboxDeploymentTemplateData{
		Nodes: s.options.Nodes,
	}); err != nil {
		return err
	}
	deployment := &v1.Deployment{}
	err := yaml.Unmarshal(buf.Bytes(), deployment)
	if err != nil {
		return fmt.Errorf("failed to unmarshal deployment: %v", err)
	}
	result, err := s.k8sClient.AppsV1().Deployments("default").Create(context.TODO(), deployment, metav1.CreateOptions{})
	if err != nil {
		return err
	}
	slog.Info("created deployment", "name", result.GetObjectMeta().GetName())
	return waitForNodeCondition(s.k8sClient, func(nodes []corev1.Node) bool {
		readyNodes := 0
		for _, node := range nodes {
			if isNodeReady(&node) {
				readyNodes++
			}
		}
		slog.Info("waiting for nodes", "readyNodes", readyNodes, "expectedNodes", s.options.Nodes)
		return readyNodes >= s.options.Nodes
	}, 15*time.Minute, "Waiting for nodes to be ready")
}

// TearDownBusyboxAndNodes deletes the "busybox-deployment" deployment from the
// "default" namespace and polls for up to 30 minutes until the cluster has no
// nodes left. A deployment that is already gone is logged and does not fail
// the teardown, matching how TearDownNodePool treats a missing NodePool.
func (s *StaticClusterManager) TearDownBusyboxAndNodes() error {
	slog.Info("cleaning up busybox pods")
	err := s.k8sClient.AppsV1().Deployments("default").Delete(context.TODO(), "busybox-deployment", metav1.DeleteOptions{})
	switch {
	case err == nil:
		slog.Info("busybox deployment deleted successfully")
	case errors.IsNotFound(err):
		// nothing to delete; still wait below so leftover nodes are drained
		slog.Info("busybox deployment not found, skipping deletion")
	default:
		return fmt.Errorf("failed to delete deployment: %w", err)
	}
	return waitForNodeCondition(s.k8sClient, func(nodes []corev1.Node) bool {
		return len(nodes) == 0
	}, 30*time.Minute, "Waiting for nodes to be removed")
}

// waitForNodeCondition lists all nodes every 15 seconds (starting
// immediately) until condition returns true, the list call fails, or timeout
// elapses. description is logged with the node count after every evaluation.
func waitForNodeCondition(clientset *kubernetes.Clientset, condition NodeCondition, timeout time.Duration, description string) error {
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()
	return wait.PollUntilContextTimeout(ctx, 15*time.Second, timeout, true, func(ctx context.Context) (bool, error) {
		nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
		if err != nil {
			return false, err
		}
		conditionMet := condition(nodes.Items)
		slog.Info(description, "nodeCount", len(nodes.Items))
		return conditionMet, nil
	})
}

================================================ FILE: internal/deployers/eksapi/templates/auth_map_role.yaml.template ================================================ - username: system:node:{{"{{"}}{{.NodeNameStrategy}}{{"}}"}} groups: - system:bootstrappers - system:nodes rolearn: {{.Rolearn}} ================================================ FILE: internal/deployers/eksapi/templates/busybox_deployment.yaml.template ================================================ apiVersion: apps/v1 kind: Deployment metadata: name: busybox-deployment spec: replicas: {{.Nodes}} selector: matchLabels: app: busybox template: metadata: labels: app: busybox spec: affinity: podAntiAffinity: requiredDuringSchedulingIgnoredDuringExecution: - labelSelector: matchExpressions: - key:
app operator: In values: - busybox topologyKey: "kubernetes.io/hostname" containers: - name: busybox image: busybox command: ["sleep", "infinity"] ================================================ FILE: internal/deployers/eksapi/templates/cloudwatch-infra.yaml.template ================================================ AWSTemplateFormatVersion: '2010-09-09' Description: kubetest2-eksapi CloudWatch using Pod Identity Parameters: ClusterUUID: Description: UUID portion of the cluster name Type: String Resources: CloudWatchRole: Type: AWS::IAM::Role Properties: RoleName: !Sub "cloudwatch-role-${ClusterUUID}" AssumeRolePolicyDocument: Version: '2012-10-17' Statement: - Sid: AllowEksAuthToAssumeRoleForPodIdentity Effect: Allow Principal: Service: - pods.eks.amazonaws.com - beta.pods.eks.aws.internal Action: - sts:AssumeRole - sts:TagSession ManagedPolicyArns: - arn:aws:iam::aws:policy/CloudWatchAgentServerPolicy Description: Role for CloudWatch Agent in EKS cluster Outputs: CloudWatchRoleArn: Description: ARN of the CloudWatch IAM role Value: !GetAtt CloudWatchRole.Arn Export: Name: !Sub "${AWS::StackName}::CloudWatchRoleArn" ================================================ FILE: internal/deployers/eksapi/templates/cloudwatch_agent_infra.yaml ================================================ apiVersion: v1 kind: Namespace metadata: name: amazon-cloudwatch labels: name: amazon-cloudwatch --- apiVersion: v1 kind: ServiceAccount metadata: name: cwagent namespace: amazon-cloudwatch --- # ClusterRole for cwagent apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: name: cwagent-role rules: - apiGroups: [""] resources: - nodes - nodes/proxy - services - endpoints - pods verbs: ["get", "list", "watch"] - apiGroups: ["extensions"] resources: - ingresses verbs: ["get", "list", "watch"] - nonResourceURLs: ["/metrics"] verbs: ["get"] --- # ClusterRoleBinding apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: name: cwagent-role-binding subjects: 
- kind: ServiceAccount name: cwagent namespace: amazon-cloudwatch roleRef: kind: ClusterRole name: cwagent-role apiGroup: rbac.authorization.k8s.io ================================================ FILE: internal/deployers/eksapi/templates/infra.yaml ================================================ --- AWSTemplateFormatVersion: "2010-09-09" Description: "kubetest2-eksapi infrastructure" Parameters: VpcBlock: Type: String Default: 192.168.0.0/16 Description: The CIDR range for the VPC. This should be a valid private (RFC 1918) CIDR range. PublicSubnet01Block: Type: String Default: 192.168.0.0/18 Description: CidrBlock for public subnet 01 within the VPC PublicSubnet02Block: Type: String Default: 192.168.64.0/18 Description: CidrBlock for public subnet 02 within the VPC PrivateSubnet01Block: Type: String Default: 192.168.128.0/18 Description: CidrBlock for private subnet 01 within the VPC PrivateSubnet02Block: Type: String Default: 192.168.192.0/18 Description: CidrBlock for private subnet 02 within the VPC AdditionalClusterRoleServicePrincipal: Type: String Default: "" Description: Additional service principal with sts:AssumeRole permissions on the ClusterRole ResourceId: Type: String Subnet01AZ: Type: String Subnet02AZ: Type: String AutoMode: Type: String AllowedValues: - "true" - "false" Default: "false" Metadata: AWS::CloudFormation::Interface: ParameterGroups: - Label: default: "Worker Network Configuration" Parameters: - VpcBlock - PublicSubnet01Block - PublicSubnet02Block - PrivateSubnet01Block - PrivateSubnet02Block Conditions: HasAdditionalClusterRoleServicePrincipal: Fn::Not: - Fn::Equals: - "" - !Ref AdditionalClusterRoleServicePrincipal IsAutoMode: !Equals [!Ref AutoMode, "true"] Resources: # # Public VPC # VPC: Type: AWS::EC2::VPC Properties: CidrBlock: !Ref VpcBlock EnableDnsHostnames: true EnableDnsSupport: true Tags: - Key: Name Value: Fn::Sub: "${AWS::StackName}/VPC" IPv6CidrBlock: Type: AWS::EC2::VPCCidrBlock Properties: AmazonProvidedIpv6CidrBlock: 
true VpcId: Ref: VPC # # Internet gateways (ipv4, and egress for ipv6) # InternetGateway: Type: AWS::EC2::InternetGateway Properties: Tags: - Key: Name Value: Fn::Sub: "${AWS::StackName}/InternetGateway" VPCGatewayAttachment: Type: AWS::EC2::VPCGatewayAttachment Properties: InternetGatewayId: Ref: InternetGateway VpcId: Ref: VPC EgressOnlyInternetGateway: Type: AWS::EC2::EgressOnlyInternetGateway Properties: VpcId: Ref: VPC # # Nat gateways # NATGateway01: Type: AWS::EC2::NatGateway DependsOn: - NatGatewayEIP1 - SubnetPublic01 - VPCGatewayAttachment Properties: AllocationId: Fn::GetAtt: - NatGatewayEIP1 - AllocationId SubnetId: Ref: SubnetPublic01 Tags: - Key: Name Value: Fn::Sub: "${AWS::StackName}/NATGateway01" NATGateway02: Type: AWS::EC2::NatGateway DependsOn: - NatGatewayEIP2 - SubnetPublic02 - VPCGatewayAttachment Properties: AllocationId: Fn::GetAtt: - NatGatewayEIP2 - AllocationId SubnetId: Ref: SubnetPublic02 Tags: - Key: Name Value: Fn::Sub: "${AWS::StackName}/NATGateway02" # # Nat Gateway IPs # NatGatewayEIP1: Type: AWS::EC2::EIP DependsOn: - VPCGatewayAttachment Properties: Domain: vpc Tags: - Key: Name Value: Fn::Sub: "${AWS::StackName}/NatGatewayEIP1" NatGatewayEIP2: Type: AWS::EC2::EIP DependsOn: - VPCGatewayAttachment Properties: Domain: vpc Tags: - Key: Name Value: Fn::Sub: "${AWS::StackName}/NatGatewayEIP2" # # Routing - public subnets # PublicRouteTable: Type: AWS::EC2::RouteTable Properties: VpcId: Ref: VPC Tags: - Key: Name Value: Fn::Sub: "${AWS::StackName}/PublicRouteTable" PublicSubnetDefaultRoute: Type: AWS::EC2::Route DependsOn: - InternetGateway - VPCGatewayAttachment Properties: DestinationCidrBlock: 0.0.0.0/0 GatewayId: Ref: InternetGateway RouteTableId: Ref: PublicRouteTable PublicSubnetDefaultIpv6Route: Type: AWS::EC2::Route DependsOn: - InternetGateway - VPCGatewayAttachment Properties: DestinationIpv6CidrBlock: ::/0 GatewayId: Ref: InternetGateway RouteTableId: Ref: PublicRouteTable # # Routing - private subnets # Route tables # 
PrivateRouteTable01: Type: AWS::EC2::RouteTable Properties: VpcId: Ref: VPC Tags: - Key: Name Value: Fn::Sub: "${AWS::StackName}/PrivateRouteTable01" PrivateRouteTable02: Type: AWS::EC2::RouteTable Properties: VpcId: Ref: VPC Tags: - Key: Name Value: Fn::Sub: "${AWS::StackName}/PrivateRouteTable02" # # Nat IPv4 Private Routes # PrivateSubnetDefaultRoute01: Type: AWS::EC2::Route DependsOn: - VPCGatewayAttachment - NATGateway01 Properties: DestinationCidrBlock: 0.0.0.0/0 NatGatewayId: Ref: NATGateway01 RouteTableId: Ref: PrivateRouteTable01 PrivateSubnetDefaultRoute02: Type: AWS::EC2::Route DependsOn: - VPCGatewayAttachment - NATGateway02 Properties: DestinationCidrBlock: 0.0.0.0/0 NatGatewayId: Ref: NATGateway02 RouteTableId: Ref: PrivateRouteTable02 # # EOIG IPv6 Private Routes # PrivateSubnetDefaultIpv6Route01: Type: AWS::EC2::Route Properties: DestinationIpv6CidrBlock: ::/0 EgressOnlyInternetGatewayId: Ref: EgressOnlyInternetGateway RouteTableId: Ref: PrivateRouteTable01 PrivateSubnetDefaultIpv6Route02: Type: AWS::EC2::Route Properties: DestinationIpv6CidrBlock: ::/0 EgressOnlyInternetGatewayId: Ref: EgressOnlyInternetGateway RouteTableId: Ref: PrivateRouteTable02 # # Public subnets SubnetPublic01: Type: AWS::EC2::Subnet Metadata: Comment: Subnet 01 DependsOn: IPv6CidrBlock Properties: AvailabilityZone: Ref: Subnet01AZ CidrBlock: Ref: PublicSubnet01Block Ipv6CidrBlock: !Select [0, !Cidr [!Select [0, !GetAtt VPC.Ipv6CidrBlocks], 8, 64]] AssignIpv6AddressOnCreation: true MapPublicIpOnLaunch: true Tags: - Key: kubernetes.io/role/elb Value: "1" - Key: Name Value: Fn::Sub: "${AWS::StackName}/SubnetPublic01" VpcId: Ref: VPC SubnetPublic02: Type: AWS::EC2::Subnet DependsOn: IPv6CidrBlock Properties: AvailabilityZone: Ref: Subnet02AZ CidrBlock: Ref: PublicSubnet02Block Ipv6CidrBlock: !Select [1, !Cidr [!Select [0, !GetAtt VPC.Ipv6CidrBlocks], 8, 64]] AssignIpv6AddressOnCreation: true MapPublicIpOnLaunch: true Tags: - Key: kubernetes.io/role/elb Value: "1" - Key: Name 
Value: Fn::Sub: "${AWS::StackName}/SubnetPublic02" VpcId: Ref: VPC # # Public route table associations # RouteTableAssociationPublic01: Type: AWS::EC2::SubnetRouteTableAssociation Properties: RouteTableId: Ref: PublicRouteTable SubnetId: Ref: SubnetPublic01 RouteTableAssociationPublic02: Type: AWS::EC2::SubnetRouteTableAssociation Properties: RouteTableId: Ref: PublicRouteTable SubnetId: Ref: SubnetPublic02 # # Private subnets # SubnetPrivate01: Type: AWS::EC2::Subnet DependsOn: IPv6CidrBlock Properties: AvailabilityZone: Ref: Subnet01AZ CidrBlock: Ref: PrivateSubnet01Block Ipv6CidrBlock: !Select [2, !Cidr [!Select [0, !GetAtt VPC.Ipv6CidrBlocks], 8, 64]] AssignIpv6AddressOnCreation: true Tags: - Key: kubernetes.io/role/internal-elb Value: "1" - Key: Name Value: Fn::Sub: "${AWS::StackName}/SubnetPrivate01" VpcId: Ref: VPC SubnetPrivate02: Type: AWS::EC2::Subnet DependsOn: IPv6CidrBlock Properties: AvailabilityZone: Ref: Subnet02AZ CidrBlock: Ref: PrivateSubnet02Block Ipv6CidrBlock: !Select [3, !Cidr [!Select [0, !GetAtt VPC.Ipv6CidrBlocks], 8, 64]] AssignIpv6AddressOnCreation: true Tags: - Key: kubernetes.io/role/internal-elb Value: "1" - Key: Name Value: Fn::Sub: "${AWS::StackName}/SubnetPrivate02" VpcId: Ref: VPC # # Private route table associations # RouteTableAssociationPrivate01: Type: AWS::EC2::SubnetRouteTableAssociation Properties: RouteTableId: Ref: PrivateRouteTable01 SubnetId: Ref: SubnetPrivate01 RouteTableAssociationPrivate02: Type: AWS::EC2::SubnetRouteTableAssociation Properties: RouteTableId: Ref: PrivateRouteTable02 SubnetId: Ref: SubnetPrivate02 ClusterRole: Type: AWS::IAM::Role Properties: AssumeRolePolicyDocument: Version: 2012-10-17 Statement: - Action: - "sts:AssumeRole" - "sts:TagSession" Effect: Allow Principal: Service: Fn::If: - HasAdditionalClusterRoleServicePrincipal - - "eks.amazonaws.com" - !Ref AdditionalClusterRoleServicePrincipal - - "eks.amazonaws.com" ManagedPolicyArns: - !Join - "" - - "arn:" - !Ref "AWS::Partition" - 
":iam::aws:policy/AmazonEKSClusterPolicy" - !If - IsAutoMode - !Join - "" - - "arn:" - !Ref "AWS::Partition" - ":iam::aws:policy/AmazonEKSBlockStoragePolicy" - !Ref "AWS::NoValue" - !If - IsAutoMode - !Join - "" - - "arn:" - !Ref "AWS::Partition" - ":iam::aws:policy/AmazonEKSComputePolicy" - !Ref "AWS::NoValue" - !If - IsAutoMode - !Join - "" - - "arn:" - !Ref "AWS::Partition" - ":iam::aws:policy/AmazonEKSLoadBalancingPolicy" - !Ref "AWS::NoValue" - !If - IsAutoMode - !Join - "" - - "arn:" - !Ref "AWS::Partition" - ":iam::aws:policy/AmazonEKSNetworkingPolicy" - !Ref "AWS::NoValue" NodeRole: Type: AWS::IAM::Role Properties: AssumeRolePolicyDocument: Version: 2012-10-17 Statement: - Action: "sts:AssumeRole" Effect: Allow Principal: Service: ec2.amazonaws.com ManagedPolicyArns: - !Join - "" - - "arn:" - !Ref "AWS::Partition" - ":iam::aws:policy/AmazonEKSWorkerNodePolicy" - !Join - "" - - "arn:" - !Ref "AWS::Partition" - ":iam::aws:policy/AmazonEC2ContainerRegistryReadOnly" - !Join - "" - - "arn:" - !Ref "AWS::Partition" - ":iam::aws:policy/AmazonEKS_CNI_Policy" - !Join - "" - - "arn:" - !Ref "AWS::Partition" - ":iam::aws:policy/AmazonSSMManagedInstanceCore" - !Join - "" - - "arn:" - !Ref "AWS::Partition" - ":iam::aws:policy/AmazonS3FullAccess" VPCCNIIPv6Policy: Type: AWS::IAM::Policy Properties: PolicyDocument: | { "Version": "2012-10-17", "Statement": [ { "Effect": "Allow", "Action": [ "ec2:AssignIpv6Addresses", "ec2:DescribeInstances", "ec2:DescribeTags", "ec2:DescribeNetworkInterfaces", "ec2:DescribeInstanceTypes" ], "Resource": "*" }, { "Effect": "Allow", "Action": [ "ec2:CreateTags" ], "Resource": [ "arn:*:ec2:*:*:network-interface/*" ] } ] } PolicyName: AmazonEKS_CNI_IPv6_Policy Roles: - !Ref NodeRole Outputs: SubnetsPrivate: Value: Fn::Join: - "," - - Ref: SubnetPrivate01 - Ref: SubnetPrivate02 Export: Name: Fn::Sub: "${AWS::StackName}::SubnetsPrivate" SubnetsPublic: Value: Fn::Join: - "," - - Ref: SubnetPublic01 - Ref: SubnetPublic02 Export: Name: Fn::Sub: 
// Infrastructure holds the embedded contents of infra.yaml as a string. It is
// embedded verbatim and is not parsed as a Go template here.
//
//go:embed infra.yaml
var Infrastructure string

// CloudWatchAgentRbac holds the embedded contents of
// cloudwatch_agent_infra.yaml as raw bytes.
//
//go:embed cloudwatch_agent_infra.yaml
var CloudWatchAgentRbac []byte

// The unmanaged nodegroup template: the raw embedded text, and its parsed
// form. template.Must panics during package initialization if the embedded
// text is not a valid template.
var (
	//go:embed unmanaged-nodegroup.yaml.template
	unmanagedNodegroupTemplate string
	UnmanagedNodegroup         = template.Must(template.New("unmanagedNodegroup").Parse(unmanagedNodegroupTemplate))
)

// CloudWatchInfra holds the embedded contents of
// cloudwatch-infra.yaml.template. It is exposed as raw text; no template is
// parsed from it in this file.
//
//go:embed cloudwatch-infra.yaml.template
var CloudWatchInfra string
// UnmanagedNodegroupTemplateData is the data passed to the UnmanagedNodegroup
// template. The template uses element 0 of InstanceTypes as the launch
// template's instance type (so the slice must not be empty) and lists every
// entry as an Auto Scaling override. NetworkInterfaces is optional: the
// NetworkInterfaces section is only rendered when it is non-empty.
type UnmanagedNodegroupTemplateData struct {
	NetworkInterfaces []NetworkInterface
	KubernetesVersion string
	InstanceTypes     []string
}

// BusyboxDeploymentTemplateData is the data type that accompanies the
// BusyboxDeployment template.
// NOTE(review): the template body is not visible from this file; presumably
// Nodes sizes the deployment — confirm against busybox_deployment.yaml.template.
type BusyboxDeploymentTemplateData struct {
	Nodes int
}

// NvidiaStaticClusterNodepoolTemplateData carries the values referenced by
// nvidia_static_cluster_nodepool.yaml.template: Arch is rendered into the
// kubernetes.io/arch requirement and InstanceTypes into the
// node.kubernetes.io/instance-type requirement.
type NvidiaStaticClusterNodepoolTemplateData struct {
	Arch          string
	InstanceTypes []string
}

// Embedded template sources (unexported) paired with their parsed templates
// (exported). Every template.Must call panics during package initialization
// if the corresponding embedded file fails to parse.
var (
	//go:embed userdata_bootstrap.sh.mimepart.template
	userDataBootstrapShTemplate string
	UserDataBootstrapSh         = template.Must(template.New("userDataBootstrapSh").Parse(userDataBootstrapShTemplate))

	//go:embed userdata_nodeadm.yaml.mimepart.template
	userDataNodeadmTemplate string
	UserDataNodeadm         = template.Must(template.New("userDataNodeadm").Parse(userDataNodeadmTemplate))

	//go:embed userdata_bottlerocket.toml.template
	userDataBottlerocketTemplate string
	UserDataBottlerocket         = template.Must(template.New("userDataBottlerocket").Parse(userDataBottlerocketTemplate))

	//go:embed busybox_deployment.yaml.template
	busyboxDeploymentTemplate string
	BusyboxDeployment         = template.Must(template.New("busyboxDeployment").Parse(busyboxDeploymentTemplate))

	//go:embed nvidia_static_cluster_nodepool.yaml.template
	nvidiaStaticClusterNodepoolTemplate string
	NvidiaStaticClusterNodepool         = template.Must(template.New("nvidiaStaticClusterNodepool").Parse(nvidiaStaticClusterNodepoolTemplate))
)

// UserDataTemplateData is the data shared by the three user data templates
// (bootstrap.sh, nodeadm and Bottlerocket). Not every template reads every
// field: CIDR and the two feature gate maps are only referenced by the
// nodeadm template, and ClusterDNSIP only by the Bottlerocket template, which
// omits the setting when the value is empty.
type UserDataTemplateData struct {
	Name                 string
	CertificateAuthority string
	CIDR                 string
	ClusterDNSIP         string
	APIServerEndpoint    string
	KubeletFeatureGates  map[string]bool
	NodeadmFeatureGates  map[string]bool
}

// The auth map role template: raw embedded text and its parsed form.
// template.Must panics during package initialization on a parse failure.
var (
	//go:embed auth_map_role.yaml.template
	authMapRoleTemplate string
	AuthMapRole         = template.Must(template.New("authMapRole").Parse(authMapRoleTemplate))
)

// AuthMapRoleTemplateData is the data type that accompanies the AuthMapRole
// template.
// NOTE(review): the template body is not visible from this file; confirm how
// NodeNameStrategy and Rolearn are rendered against auth_map_role.yaml.template.
type AuthMapRoleTemplateData struct {
	NodeNameStrategy string
	Rolearn          string
}
"bytes" "testing" ) func Test_UnmanagedNodegroup(t *testing.T) { buf := bytes.Buffer{} err := UnmanagedNodegroup.Execute(&buf, UnmanagedNodegroupTemplateData{ KubernetesVersion: "1.28", InstanceTypes: []string{ "t2.medium", "t2.large", "t2.xlarge", }, }) if err != nil { t.Error(err) } } ================================================ FILE: internal/deployers/eksapi/templates/unmanaged-nodegroup.yaml.template ================================================ --- AWSTemplateFormatVersion: '2010-09-09' Description: 'kubetest2-eksapi unmanaged nodegroup' Parameters: ResourceId: Description: Unique identifier for this kubetest2-eksapi execution. Type: String VpcId: Type: AWS::EC2::VPC::Id SubnetIds: Type: List SecurityGroup: Type: AWS::EC2::SecurityGroup::Id AMIId: Type: String Description: Specify AMI id for the node instances. NodeDiskSize: Type: Number Description: Node disk size in gigabytes. Default: 100 NodeCount: Type: Number ClusterName: Type: String NodeRoleName: Description: The IAM role name of worker nodes. 
Type: String UserData: Type: String VolumeMountPath: Type: String CapacityReservationId: Type: String Description: Capacity reservation id for the unmanaged nodegroup UserDataIsMIMEPart: Description: "User data should be embedded as a part of a multi-part MIME document" Default: true Type: String AllowedValues: [true, false] Conditions: IsCapacityReservationIdSet: !Not [!Equals [!Ref CapacityReservationId, ""]] IsUserDataMIMEPart: !Equals [true, !Ref UserDataIsMIMEPart] Resources: EFASecurityGroupIngress: Type: "AWS::EC2::SecurityGroupIngress" Properties: Description: Allow node to communicate with each other FromPort: 0 ToPort: 65535 GroupId: !Ref SecurityGroup IpProtocol: "-1" SourceSecurityGroupId: !Ref SecurityGroup EFASecurityGroupEgress: Type: "AWS::EC2::SecurityGroupEgress" Properties: Description: Allow the efa worker nodes outbound communication DestinationSecurityGroupId: !Ref SecurityGroup FromPort: 0 ToPort: 65536 GroupId: !Ref SecurityGroup IpProtocol: "-1" EFASecurityGroupEgressAllIpv4: Type: "AWS::EC2::SecurityGroupEgress" Properties: Description: Allow the efa worker nodes outbound communication FromPort: 0 ToPort: 65536 CidrIp: "0.0.0.0/0" GroupId: !Ref SecurityGroup IpProtocol: "-1" EFASecurityGroupEgressAllIpv6: Type: "AWS::EC2::SecurityGroupEgress" Properties: Description: Allow the efa worker nodes outbound communication FromPort: 0 ToPort: 65536 CidrIpv6: "::/0" GroupId: !Ref SecurityGroup IpProtocol: "-1" NodeInstanceProfile: Type: AWS::IAM::InstanceProfile Properties: Path: "/" Roles: - !Ref NodeRoleName NodeLaunchTemplate: Type: AWS::EC2::LaunchTemplate Properties: LaunchTemplateName: !Ref ResourceId LaunchTemplateData: BlockDeviceMappings: - DeviceName: !Ref VolumeMountPath Ebs: DeleteOnTermination: true VolumeSize: !Ref NodeDiskSize VolumeType: gp2 CapacityReservationSpecification: Fn::If: - IsCapacityReservationIdSet - CapacityReservationTarget: CapacityReservationId: !Ref CapacityReservationId - !Ref AWS::NoValue IamInstanceProfile: 
  # Auto Scaling group that runs the unmanaged nodes. It is named after
  # ResourceId and launches instances from NodeLaunchTemplate.
  NodeGroup:
    Type: AWS::AutoScaling::AutoScalingGroup
    # Rolling updates wait on resource signals, pausing up to 15 minutes.
    # When the user data is a MIME part, the launch template's user data
    # sends that signal via cfn-signal with --resource NodeGroup.
    UpdatePolicy:
      AutoScalingRollingUpdate:
        WaitOnResourceSignals: true
        PauseTime: PT15M
    Properties:
      AutoScalingGroupName: !Ref ResourceId
      MixedInstancesPolicy:
        LaunchTemplate:
          LaunchTemplateSpecification:
            LaunchTemplateId: !Ref NodeLaunchTemplate
            Version: !GetAtt NodeLaunchTemplate.LatestVersionNumber
          # One override per entry of the InstanceTypes template data.
          Overrides:
          {{- range .InstanceTypes}}
            - InstanceType: "{{.}}"
          {{- end}}
      # Desired, minimum and maximum size are all NodeCount, so the group
      # has a fixed size.
      DesiredCapacity: !Ref NodeCount
      MinSize: !Ref NodeCount
      MaxSize: !Ref NodeCount
      VPCZoneIdentifier: !Ref SubnetIds
      Tags:
        - Key: Name
          Value: !Sub "${ClusterName}-Node"
          PropagateAtLaunch: true
        # necessary for kubelet's legacy, in-tree cloud provider
        - Key: !Sub "kubernetes.io/cluster/${ClusterName}"
          Value: owned
          PropagateAtLaunch: true
// User data format identifiers. generateUserData compares the deployer's
// UserDataFormat option against these values to choose a template, and
// rejects any other value with an error.
const (
	// UserDataBootstrapSh selects the bootstrap.sh user data template.
	UserDataBootstrapSh = "bootstrap.sh"
	// UserDataNodeadm selects the nodeadm user data template.
	UserDataNodeadm = "nodeadm"
	// UserDataBottlerocket selects the Bottlerocket user data template; it is
	// the only format that generateUserData reports as not being a MIME part.
	UserDataBottlerocket = "bottlerocket"
)
// deriveClusterDNSIP returns the cluster DNS address for the given service
// CIDR: the network's base address plus 10. For example, 10.100.0.0/16
// yields 10.100.0.10 and 2001:db8:1234::/108 yields 2001:db8:1234::a.
//
// An error is returned when cidr cannot be parsed, or when the network is too
// small for the offset address to fall inside it (for example a /30).
func deriveClusterDNSIP(cidr string) (string, error) {
	// dnsHostOffset is the position of the DNS address within the network.
	const dnsHostOffset = 10

	_, ipNet, err := net.ParseCIDR(cidr)
	if err != nil {
		return "", fmt.Errorf("invalid CIDR: %w", err)
	}

	// Work on a copy so the parsed network keeps its base address for the
	// containment check below.
	ip := make(net.IP, len(ipNet.IP))
	copy(ip, ipNet.IP)

	// Add the offset starting at the last byte and propagate the carry, so a
	// base address whose last byte is above 245 does not silently wrap.
	carry := uint(dnsHostOffset)
	for i := len(ip) - 1; i >= 0 && carry > 0; i-- {
		sum := uint(ip[i]) + carry
		ip[i] = byte(sum)
		carry = sum >> 8
	}

	// A leftover carry means the address space overflowed; otherwise make
	// sure the derived address still belongs to the requested network.
	if carry > 0 || !ipNet.Contains(ip) {
		return "", fmt.Errorf("CIDR %s is too small to contain a cluster DNS IP at offset %d", cidr, dnsHostOffset)
	}
	return ip.String(), nil
}
keyValuePair, err) } featureGateMap[components[0]] = boolValue } return featureGateMap, nil } ================================================ FILE: internal/deployers/eksapi/userdata_test.go ================================================ package eksapi import ( "testing" "github.com/stretchr/testify/assert" ) var cluster = Cluster{ name: "cluster", endpoint: "https://example.com", certificateAuthorityData: "certificateAuthority", cidr: "10.100.0.0/16", } const bootstrapShUserData = `Content-Type: text/x-shellscript; charset="us-ascii" MIME-Version: 1.0 #!/usr/bin/env bash /etc/eks/bootstrap.sh cluster \ --b64-cluster-ca certificateAuthority \ --apiserver-endpoint https://example.com ` const nodeadmUserData = `Content-Type: application/node.eks.aws MIME-Version: 1.0 --- apiVersion: node.eks.aws/v1alpha1 kind: NodeConfig spec: cluster: name: cluster apiServerEndpoint: https://example.com certificateAuthority: certificateAuthority cidr: 10.100.0.0/16 ` const nodeadmUserDataKubeletDRA = `Content-Type: application/node.eks.aws MIME-Version: 1.0 --- apiVersion: node.eks.aws/v1alpha1 kind: NodeConfig spec: cluster: name: cluster apiServerEndpoint: https://example.com certificateAuthority: certificateAuthority cidr: 10.100.0.0/16 kubelet: config: featureGates: DynamicResourceAllocation: true ` const nodeadmUserDataFeatureGate = `Content-Type: application/node.eks.aws MIME-Version: 1.0 --- apiVersion: node.eks.aws/v1alpha1 kind: NodeConfig spec: featureGates: foo: true cluster: name: cluster apiServerEndpoint: https://example.com certificateAuthority: certificateAuthority cidr: 10.100.0.0/16 ` const bottlerocketUserData = `[settings.kubernetes] "cluster-name" = "cluster" "api-server" = "https://example.com" "cluster-certificate" = "certificateAuthority" device-ownership-from-security-context = true [settings.host-containers.admin] "enabled" = true ` const bottlerocketUserDataWithDNS = `[settings.kubernetes] "cluster-name" = "cluster" "api-server" = "https://example.com" 
"cluster-certificate" = "certificateAuthority" "cluster-dns-ip" = "10.100.0.10" device-ownership-from-security-context = true [settings.host-containers.admin] "enabled" = true ` func Test_generateUserData(t *testing.T) { cases := []struct { format string expected string expectedIsMimePart bool kubernetesVersion string NodeadmFeatureGates []string setClusterDNSIP bool wantErr bool }{ { format: "bootstrap.sh", expected: bootstrapShUserData, expectedIsMimePart: true, }, { format: "nodeadm", expected: nodeadmUserData, expectedIsMimePart: true, }, { format: "bottlerocket", expected: bottlerocketUserData, expectedIsMimePart: false, }, { format: "bottlerocket", expected: bottlerocketUserDataWithDNS, expectedIsMimePart: false, setClusterDNSIP: true, }, { format: "nodeadm", expected: nodeadmUserDataKubeletDRA, kubernetesVersion: "1.33", expectedIsMimePart: true, }, { format: "nodeadm", expected: nodeadmUserDataFeatureGate, kubernetesVersion: "1.30", NodeadmFeatureGates: []string{"foo=true"}, expectedIsMimePart: true, }, } for _, c := range cases { t.Run(c.format, func(t *testing.T) { deployerOpts := &deployerOptions{ KubernetesVersion: c.kubernetesVersion, NodeadmFeatureGates: c.NodeadmFeatureGates, SetClusterDNSIP: c.setClusterDNSIP, UserDataFormat: c.format, } actual, isMimePart, err := generateUserData(&cluster, deployerOpts) if err != nil { t.Log(err) t.Error(err) } assert.Equal(t, c.expected, actual) assert.Equal(t, c.expectedIsMimePart, isMimePart) }) } } func Test_extractFeatureGates(t *testing.T) { testCases := []struct { input []string expected map[string]bool expectErr bool }{ { input: []string{"foo=true", "bar=false"}, expected: map[string]bool{ "foo": true, "bar": false, }, }, { input: []string{"foo:true"}, expectErr: true, }, { input: []string{"foo=bar"}, expectErr: true, }, } for _, testCase := range testCases { output, err := extractFeatureGates(testCase.input) if testCase.expectErr { assert.Error(t, err) } else { assert.NoError(t, err) assert.Equal(t, 
testCase.expected, output) } } } func Test_deriveClusterDNSIP(t *testing.T) { testCases := []struct { cidr string expected string expectErr bool }{ {cidr: "192.0.2.0/24", expected: "192.0.2.10"}, {cidr: "198.51.100.0/24", expected: "198.51.100.10"}, {cidr: "2001:db8:1234::/108", expected: "2001:db8:1234::a"}, {cidr: "invalid", expectErr: true}, } for _, tc := range testCases { result, err := deriveClusterDNSIP(tc.cidr) if tc.expectErr { assert.Error(t, err) } else { assert.NoError(t, err) assert.Equal(t, tc.expected, result) } } } ================================================ FILE: internal/deployers/eksapi/vpccni.go ================================================ package eksapi import ( "bytes" "context" "encoding/json" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" ) const vpcCNIDaemonSetPatch = `{ "spec": { "template": { "spec": { "containers": [ { "name": "aws-node", "env": [ { "name": "ENABLE_PREFIX_DELEGATION", "value": "true" }, { "name": "MINIMUM_IP_TARGET", "value": "80" }, { "name": "WARM_IP_TARGET", "value": "10" } ] } ] } } } }` // tuneVPCCNI applies configuration to the VPC CNI DaemonSet that helps prevent test flakiness func (k *k8sClient) tuneVPCCNI() error { var patch bytes.Buffer if err := json.Compact(&patch, []byte(vpcCNIDaemonSetPatch)); err != nil { return err } _, err := k.clientset.AppsV1().DaemonSets("kube-system").Patch(context.TODO(), "aws-node", types.StrategicMergePatchType, patch.Bytes(), metav1.PatchOptions{}) return err } ================================================ FILE: internal/deployers/eksapi/vpccni_test.go ================================================ package eksapi import ( "encoding/json" "testing" ) func Test_validVPCCNIDaemonSetPatch(t *testing.T) { var j json.RawMessage if err := json.Unmarshal([]byte(vpcCNIDaemonSetPatch), &j); err != nil { t.Error(err) } } ================================================ FILE: internal/deployers/eksctl/build.go 
// Build is a no-op: it performs no work and always returns a nil error.
func (d *deployer) Build() error {
	return nil
}
ng.SSH = nil ng.AMIFamily = amiFamily ng.Name = nodeGroupName if len(d.InstanceTypes) > 0 { ng.InstanceType = d.InstanceTypes[0] } if d.Nodes >= 0 { ng.MinSize = &d.Nodes ng.MaxSize = &d.Nodes ng.DesiredCapacity = &d.Nodes } if d.VolumeSize >= 0 { ng.VolumeSize = &d.VolumeSize } ng.PrivateNetworking = d.PrivateNetworking ng.EFAEnabled = &d.EFAEnabled if len(d.AvailabilityZones) > 0 { ng.AvailabilityZones = d.AvailabilityZones } if d.AMI != "" && amiFamily == eksctl_api.NodeImageFamilyAmazonLinux2 { bootstrapCommand := fmt.Sprintf(`#!/bin/bash source /var/lib/cloud/scripts/eksctl/bootstrap.helper.sh /etc/eks/bootstrap.sh %s --kubelet-extra-args "--node-labels=${NODE_LABELS}"`, d.clusterName) ng.OverrideBootstrapCommand = &bootstrapCommand } } else { // Create managed node group mng := eksctl_api.NewManagedNodeGroup() cfg.ManagedNodeGroups = append(cfg.ManagedNodeGroups, mng) // TODO: update this when we add support for SSH. mng.SSH = nil mng.AMIFamily = amiFamily mng.Name = nodeGroupName mng.InstanceTypes = d.InstanceTypes if d.Nodes >= 0 { mng.MinSize = &d.Nodes mng.MaxSize = &d.Nodes mng.DesiredCapacity = &d.Nodes } if d.VolumeSize >= 0 { mng.VolumeSize = &d.VolumeSize } mng.PrivateNetworking = d.PrivateNetworking mng.EFAEnabled = &d.EFAEnabled if len(d.AvailabilityZones) > 0 { mng.AvailabilityZones = d.AvailabilityZones } if d.AMI != "" && amiFamily == eksctl_api.NodeImageFamilyAmazonLinux2 { bootstrapCommand := fmt.Sprintf(`#!/bin/bash source /var/lib/cloud/scripts/eksctl/bootstrap.helper.sh /etc/eks/bootstrap.sh %s --kubelet-extra-args "--node-labels=${NODE_LABELS}"`, d.clusterName) mng.OverrideBootstrapCommand = &bootstrapCommand } else if d.AMI != "" && amiFamily == eksctl_api.NodeImageFamilyBottlerocket { mng.AMI = d.AMI } } return cfg, nil } type clusterConfigTemplateParams struct { UpOptions ClusterName string Region string } func (d *deployer) RenderClusterConfig() ([]byte, error) { cfg, err := d.CreateClusterConfig() if err != nil { slog.Error("failed to 
create ClusterConfig", "error", err) } slog.Info("rendering cluster config yaml", "config", cfg) return yaml.Marshal(cfg) } ================================================ FILE: internal/deployers/eksctl/deployer.go ================================================ package eksctl import ( "flag" "fmt" "log/slog" "os" "path/filepath" "github.com/aws/aws-k8s-tester/internal" "github.com/aws/aws-k8s-tester/internal/awssdk" "github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/service/eks" "github.com/spf13/pflag" "github.com/urfave/sflags/gen/gpflag" "sigs.k8s.io/kubetest2/pkg/types" "sigs.k8s.io/yaml" ) // DeployerName is the name of the deployer const DeployerName = "eksctl" type deployer struct { // generic parts commonOptions types.Options *UpOptions awsConfig aws.Config eksClient *eks.Client KubeconfigPath string `flag:"kubeconfig" desc:"Path to kubeconfig"` // ClusterName is the effective cluster name (from flag or RunID) clusterName string } // NewDeployer implements deployer.New for EKS using eksctl func NewDeployer(opts types.Options) (types.Deployer, *pflag.FlagSet) { // create a deployer object and set fields that are not flag controlled awsConfig := awssdk.NewConfig() d := &deployer{ commonOptions: opts, awsConfig: awsConfig, eksClient: eks.NewFromConfig(awsConfig), } // register flags and return return d, bindFlags(d) } func (d *deployer) DumpClusterLogs() error { return nil } func (d *deployer) Kubeconfig() (string, error) { if d.KubeconfigPath != "" { return d.KubeconfigPath, nil } return filepath.Join(d.commonOptions.RunDir(), "kubeconfig"), nil } func (d *deployer) Version() string { return internal.Version } // bindFlags is a helper used to create & bind a flagset to the deployer func bindFlags(d *deployer) *pflag.FlagSet { flags, err := gpflag.Parse(d) if err != nil { slog.Error("unable to bind flags for deployer") os.Exit(1) } flags.AddGoFlagSet(flag.CommandLine) return flags } // initClusterName sets the effective cluster name with this 
precedence: // 1. config file // 2. --cluster-name flag // 3. RunID of the kubetest func (d *deployer) initClusterName() { // First priority: config file if provided if d.UpOptions.ConfigFile != "" { clusterName, err := d.parseClusterNameFromConfig(d.UpOptions.ConfigFile) if err == nil { d.clusterName = clusterName slog.Debug("using cluster name from config file", "clusterName", d.clusterName) return } slog.Warn("failed to extract cluster name from config file", "error", err) // Continue with other methods if parsing fails } if d.UpOptions.ClusterName != "" { d.clusterName = d.UpOptions.ClusterName slog.Debug("using cluster name from flag", "clusterName", d.clusterName) } else { d.clusterName = d.commonOptions.RunID() slog.Debug("using RunID for cluster name", "clusterName", d.clusterName) } } // parseClusterNameFromConfig extracts the cluster name from an eksctl config file func (d *deployer) parseClusterNameFromConfig(configFilePath string) (string, error) { configData, err := os.ReadFile(configFilePath) if err != nil { return "", fmt.Errorf("failed to read config file: %v", err) } // Simple YAML parsing to extract the cluster name var configMap map[string]interface{} if err := yaml.Unmarshal(configData, &configMap); err != nil { return "", fmt.Errorf("failed to parse config file YAML: %v", err) } // Extract metadata section metadata, ok := configMap["metadata"].(map[string]interface{}) if !ok { return "", fmt.Errorf("metadata section missing in config file") } // Extract name field name, ok := metadata["name"].(string) if !ok || name == "" { return "", fmt.Errorf("cluster name not found in config file metadata") } return name, nil } // assert that deployer implements types.DeployerWithKubeconfig var _ types.DeployerWithKubeconfig = &deployer{} ================================================ FILE: internal/deployers/eksctl/down.go ================================================ package eksctl import ( "fmt" "log/slog" 
"github.com/aws/aws-k8s-tester/internal/util" ) func (d *deployer) Down() error { d.initClusterName() var err error if d.DeployTarget == "nodegroup" { slog.Info("deleting nodegroup", "nodegroupName", d.NodegroupName, "clusterName", d.clusterName) err = util.ExecuteCommand("eksctl", "delete", "nodegroup", "--cluster", d.clusterName, "--name", d.NodegroupName, "--drain=false", "--wait") if err != nil { return fmt.Errorf("failed to delete nodegroup: %v", err) } slog.Info("successfully deleted nodegroup", "nodegroupName", d.NodegroupName, "clusterName", d.clusterName) } else if d.DeployTarget == "cluster" { slog.Info("deleting cluster", "clusterName", d.clusterName) err = util.ExecuteCommand("eksctl", "delete", "cluster", "--name", d.clusterName, "--wait", "--disable-nodegroup-eviction") if err != nil { return fmt.Errorf("failed to delete cluster: %v", err) } slog.Info("successfully deleted cluster", "clusterName", d.clusterName) } else { return fmt.Errorf("Unsupported deploy target: %s, supported options: `cluster`, `nodegroup`.", d.DeployTarget) } return nil } ================================================ FILE: internal/deployers/eksctl/up.go ================================================ package eksctl import ( "context" "fmt" "log/slog" "os" "path/filepath" "slices" "github.com/aws/aws-k8s-tester/internal/util" "github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/service/eks" ekstypes "github.com/aws/aws-sdk-go-v2/service/eks/types" ) type UpOptions struct { Region string `flag:"region" desc:"AWS region for EKS cluster"` KubernetesVersion string `flag:"kubernetes-version" desc:"cluster Kubernetes version"` Nodes int `flag:"nodes" desc:"number of nodes to launch in cluster"` AMI string `flag:"ami" desc:"Node AMI"` InstanceTypes []string `flag:"instance-types" desc:"Node instance types"` ConfigFile string `flag:"config-file" desc:"Path to eksctl config file (if provided, other flags are ignored)"` AvailabilityZones []string `flag:"availability-zones" 
desc:"Node availability zones"` AMIFamily string `flag:"ami-family" desc:"AMI family to use (AmazonLinux2023, Bottlerocket)"` EFAEnabled bool `flag:"efa-enabled" desc:"Enable Elastic Fabric Adapter for the nodegroup"` VolumeSize int `flag:"volume-size" desc:"Size of the node root volume in GB"` PrivateNetworking bool `flag:"private-networking" desc:"Use private networking for nodes"` WithOIDC bool `flag:"with-oidc" desc:"Enable OIDC provider for IAM roles for service accounts"` DeployTarget string `flag:"deploy-target" desc:"The target to deploy, supported values: cluster | nodegroup (defaults to 'cluster'). It is a thin wrapper to eksctl create subcommand with limited supported values."` ClusterName string `flag:"cluster-name" desc:"Name of the EKS cluster (defaults to RunID if not specified)"` UseUnmanagedNodegroup bool `flag:"unmanaged-nodegroup" desc:"Use unmanaged nodegroup instead of managed nodegroup"` NodegroupName string `flag:"nodegroup-name" desc:"Name of the nodegroup (defaults to 'ng-1')"` } func (d *deployer) verifyUpFlags() error { supportedDeployTargets := []string{"cluster", "nodegroup"} // Skip validation if using a config file if d.ConfigFile != "" { slog.Info("using config file, skipping command-line flag validation", "configFile", d.ConfigFile) return nil } if d.KubernetesVersion == "" { slog.Info("--kubernetes-version is empty, attempting to detect it...") detectedVersion, err := detectKubernetesVersion() if err != nil { return fmt.Errorf("unable to detect --kubernetes-version, flag cannot be empty") } slog.Info("detected kubernetes version", "version", detectedVersion) d.KubernetesVersion = detectedVersion } if d.Nodes < 0 { return fmt.Errorf("number of nodes must be greater than zero") } if d.Nodes == 0 { d.Nodes = 4 slog.Debug("using default number of nodes", "nodes", d.Nodes) } // Validate instance types for unmanaged nodegroups if d.UseUnmanagedNodegroup { if len(d.InstanceTypes) > 1 { return fmt.Errorf("Unmanaged nodegroups only support 
a single instance type. Using the first one: %s", d.InstanceTypes[0]) } else if len(d.InstanceTypes) == 0 { // If no instance type specified, use a default d.InstanceTypes = []string{"m5.xlarge"} slog.Info("no instance type specified for unmanaged nodegroup, using default", "instanceType", d.InstanceTypes[0]) } } if d.DeployTarget != "" && !slices.Contains(supportedDeployTargets, d.DeployTarget) { return fmt.Errorf("Unsupported deploy target: %s, supported options: `cluster`, `nodegroup`.", d.DeployTarget) } else if d.DeployTarget == "" { // If no deploy target specified, use "cluster" as default d.DeployTarget = "cluster" slog.Info("no deploy target specified, using default", "deployTarget", d.DeployTarget) } return nil } func (d *deployer) Up() error { d.initClusterName() if err := d.verifyUpFlags(); err != nil { return fmt.Errorf("up flags are invalid: %v", err) } if d.UseUnmanagedNodegroup { slog.Info("using unmanaged nodegroup", "clusterName", d.clusterName) } else { slog.Info("using managed nodegroup", "clusterName", d.clusterName) } var args []string if d.ConfigFile != "" { // If config file is provided, use it args = d.renderEksctlArgs(d.ConfigFile) } else { // Use rendered cluster config clusterConfig, err := d.RenderClusterConfig() if err != nil { return err } slog.Info("rendered cluster config", "config", string(clusterConfig)) clusterConfigFile, err := os.CreateTemp("", "kubetest2-eksctl-cluster-config") if err != nil { return err } defer clusterConfigFile.Close() _, err = clusterConfigFile.Write(clusterConfig) if err != nil { return err } args = d.renderEksctlArgs(clusterConfigFile.Name()) } err := util.ExecuteCommand("eksctl", args...) 
if err != nil { return fmt.Errorf("failed to create cluster: %v", err) } // Write kubeconfig to the rundir kubeConfigPath, err := d.Kubeconfig() if err != nil { return fmt.Errorf("error determining kubeconfig path: %v", err) } // Create directory if it doesn't exist err = os.MkdirAll(filepath.Dir(kubeConfigPath), 0755) if err != nil { return fmt.Errorf("error creating directory for kubeconfig: %v", err) } slog.Info("writing kubeconfig", "path", kubeConfigPath) writeKubeconfigArgs := []string{ "utils", "write-kubeconfig", "--cluster", d.clusterName, "--region", d.Region, "--kubeconfig", kubeConfigPath, } err = util.ExecuteCommand("eksctl", writeKubeconfigArgs...) if err != nil { return fmt.Errorf("failed to write kubeconfig: %v", err) } slog.Info("successfully wrote kubeconfig", "path", kubeConfigPath) d.KubeconfigPath = kubeConfigPath return nil } func (d *deployer) renderEksctlArgs(configFilePath string) []string { return []string{ "create", d.DeployTarget, "--config-file", configFilePath, } } func (d *deployer) IsUp() (up bool, err error) { d.initClusterName() result, err := d.eksClient.DescribeCluster(context.TODO(), &eks.DescribeClusterInput{ Name: aws.String(d.clusterName), }) if err != nil { return false, err } switch result.Cluster.Status { case ekstypes.ClusterStatusActive: return true, nil case ekstypes.ClusterStatusCreating: return false, nil default: return false, fmt.Errorf("cluster status is: %v", result.Cluster.Status) } } func detectKubernetesVersion() (string, error) { detectedVersion, err := util.DetectKubernetesVersion() if err != nil { return "", err } minorVersion, err := util.ParseMinorVersion(detectedVersion) if err != nil { return "", err } return minorVersion, nil } ================================================ FILE: internal/e2e/client.go ================================================ package e2e import ( "bytes" "context" "fmt" "html/template" "io" "os" batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" 
"k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/cli-runtime/pkg/resource" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" "k8s.io/client-go/restmapper" "sigs.k8s.io/e2e-framework/klient/decoder" "sigs.k8s.io/e2e-framework/klient/k8s" ) // ApplyFiles creates Kubernetes objects contained in manifest file(s), in a manner similar to `kubectl apply -f` // Multiple objects may be in each manifest file. // The manifest files are processed in order. func ApplyFiles(restConfig *rest.Config, manifestFiles ...string) error { for _, manifestFile := range manifestFiles { if f, err := os.Open(manifestFile); err != nil { return err } else if err := applyManifests(restConfig, f); err != nil { return err } } return nil } // ApplyManifests creates Kubernetes objects contained in manifests, in a manner similar to `kubectl apply -f` // Multiple objects may be in the manifest data. func ApplyManifests(restConfig *rest.Config, manifests ...[]byte) error { return applyManifests(restConfig, bytesSlicesToReaders(manifests...)...) } func applyManifests(restConfig *rest.Config, manifests ...io.Reader) error { for _, manifest := range manifests { if objs, err := decoder.DecodeAll(context.TODO(), manifest); err != nil { return err } else if err := processObjects(restConfig, objs, func(client *resource.Helper, obj k8s.Object) error { namespace, err := meta.NewAccessor().Namespace(obj) if err != nil { return err } if namespace == "" { namespace = "default" } _, err = client.Create(namespace, false, obj) return err }); err != nil { return err } } return nil } // DeleteFiles deletes Kubernetes objects contained in manifest file(s), in a manner similar to `kubectl delete -f` // Multiple objects may be in each manifest file. 
func DeleteFiles(restConfig *rest.Config, manifestFiles ...string) error { for _, manifestFile := range manifestFiles { if f, err := os.Open(manifestFile); err != nil { return err } else if err := deleteManifests(restConfig, f); err != nil { return err } } return nil } // DeleteManifests deletes Kubernetes objects contained in manifest(s), in a manner similar to `kubectl delete -f` // Multiple objects may be in each manifest. func DeleteManifests(restConfig *rest.Config, manifests ...[]byte) error { return deleteManifests(restConfig, bytesSlicesToReaders(manifests...)...) } func deleteManifests(restConfig *rest.Config, manifests ...io.Reader) error { for _, manifest := range manifests { if objs, err := decoder.DecodeAll(context.TODO(), manifest); err != nil { return err } else if err := processObjects(restConfig, objs, func(client *resource.Helper, obj k8s.Object) error { name, err := meta.NewAccessor().Name(obj) if err != nil { return err } namespace, err := meta.NewAccessor().Namespace(obj) if err != nil { return err } if namespace == "" { namespace = "default" } deletePolicy := metav1.DeletePropagationBackground _, err = client.DeleteWithOptions(namespace, name, &metav1.DeleteOptions{ PropagationPolicy: &deletePolicy, }) return err }); err != nil { return err } } return nil } // RenderManifests renders manifests with the supplied data func RenderManifests(file []byte, templateData interface{}) ([]byte, error) { tpl, err := template.New("Manifest").Parse(string(file)) if err != nil { return nil, err } buf := bytes.Buffer{} err = tpl.Execute(&buf, templateData) return buf.Bytes(), err } // GetJobLogs get logs from MPIJob func GetJobLogs(restConfig *rest.Config, job k8s.Object) (string, error) { ctx := context.Background() clientset, err := kubernetes.NewForConfig(restConfig) if err != nil { return "", err } var jobLabel string switch job.(type) { case *unstructured.Unstructured: // assume this is an MPIJob jobLabel = fmt.Sprintf("job-name=%s-launcher", 
job.GetName()) case *batchv1.Job: jobLabel = fmt.Sprintf("job-name=%s", job.GetName()) default: return "", fmt.Errorf("unsupported job type %T", job) } pods, err := clientset.CoreV1().Pods(job.GetNamespace()).List(ctx, metav1.ListOptions{LabelSelector: jobLabel}) if err != nil { return "", err } if len(pods.Items) == 0 { return "", fmt.Errorf("no pods found for job %s", job.GetName()) } log := clientset.CoreV1().Pods(job.GetNamespace()).GetLogs(pods.Items[0].Name, &corev1.PodLogOptions{}) podLogs, err := log.Stream(ctx) if err != nil { return "", err } defer podLogs.Close() buf := new(bytes.Buffer) _, err = io.Copy(buf, podLogs) if err != nil { return "", err } str := buf.String() return str, nil } func bytesSlicesToReaders(byteSlices ...[]byte) []io.Reader { var readers []io.Reader for _, b := range byteSlices { readers = append(readers, bytes.NewReader(b)) } return readers } // processObjects applies a processFunc to each object, supplying it a dynamically-typed client appropriate for the object func processObjects(restConfig *rest.Config, objs []k8s.Object, processFunc func(client *resource.Helper, obj k8s.Object) error) error { clientset, err := kubernetes.NewForConfig(restConfig) if err != nil { return err } groupResources, err := restmapper.GetAPIGroupResources(clientset.Discovery()) if err != nil { return err } rm := restmapper.NewDiscoveryRESTMapper(groupResources) for _, obj := range objs { client, err := newResourceHelper(restConfig, rm, obj) if err != nil { return err } processFunc(client, obj) } return nil } func newResourceHelper(restConfig *rest.Config, rm meta.RESTMapper, obj runtime.Object) (*resource.Helper, error) { gvk := obj.GetObjectKind().GroupVersionKind() gk := schema.GroupKind{Group: gvk.Group, Kind: gvk.Kind} mapping, err := rm.RESTMapping(gk, gvk.Version) if err != nil { return nil, err } gv := mapping.GroupVersionKind.GroupVersion() restConfig.ContentConfig = resource.UnstructuredPlusDefaultContentConfig() restConfig.GroupVersion = &gv 
if len(gv.Group) == 0 { restConfig.APIPath = "/api" } else { restConfig.APIPath = "/apis" } restClient, err := rest.RESTClientFor(restConfig) if err != nil { return nil, err } return resource.NewHelper(restClient, mapping), nil } ================================================ FILE: internal/e2e/conditions.go ================================================ package e2e import ( "context" "fmt" appsv1 "k8s.io/api/apps/v1" batchv1 "k8s.io/api/batch/v1" v1 "k8s.io/api/core/v1" apimachinerywait "k8s.io/apimachinery/pkg/util/wait" "sigs.k8s.io/e2e-framework/klient/k8s" "sigs.k8s.io/e2e-framework/klient/k8s/resources" ) type ConditionExtension struct { resources *resources.Resources } func NewConditionExtension(r *resources.Resources) *ConditionExtension { return &ConditionExtension{resources: r} } // ResourceMatch is a helper function used to check if the resource under question has met a pre-defined state. This can // be leveraged for checking fields on a resource that may not be immediately present upon creation. 
func (c *ConditionExtension) ResourceMatch(obj k8s.Object, matchFetcher func(object k8s.Object) bool) apimachinerywait.ConditionWithContextFunc { return func(ctx context.Context) (done bool, err error) { if err := c.resources.Get(ctx, obj.GetName(), obj.GetNamespace(), obj); err != nil { return false, err } return matchFetcher(obj), nil } } func (c *ConditionExtension) PodRunning(pod k8s.Object) apimachinerywait.ConditionWithContextFunc { return func(ctx context.Context) (done bool, err error) { if err := c.resources.Get(ctx, pod.GetName(), pod.GetNamespace(), pod); err != nil { return false, err } status := pod.(*v1.Pod).Status switch status.Phase { case v1.PodRunning: return true, nil case v1.PodPending: return false, nil default: return false, fmt.Errorf("pod cannot transition to running from current status: %s", status.Phase) } } } func (c *ConditionExtension) PodSucceeded(pod k8s.Object) apimachinerywait.ConditionWithContextFunc { return func(ctx context.Context) (done bool, err error) { if err := c.resources.Get(ctx, pod.GetName(), pod.GetNamespace(), pod); err != nil { return false, err } status := pod.(*v1.Pod).Status if status.Phase == v1.PodSucceeded { return true, nil } else if status.Phase == v1.PodFailed { return false, fmt.Errorf("Pod in Failed status") } return false, nil } } func (c *ConditionExtension) DaemonSetReady(daemonset k8s.Object) apimachinerywait.ConditionWithContextFunc { return func(ctx context.Context) (done bool, err error) { if err := c.resources.Get(ctx, daemonset.GetName(), daemonset.GetNamespace(), daemonset); err != nil { return false, err } status := daemonset.(*appsv1.DaemonSet).Status if status.NumberReady == status.DesiredNumberScheduled && status.NumberUnavailable == 0 { done = true } return } } func (c *ConditionExtension) JobSucceeded(job k8s.Object) apimachinerywait.ConditionWithContextFunc { return func(ctx context.Context) (done bool, err error) { if err := c.resources.Get(ctx, job.GetName(), job.GetNamespace(), job); 
err != nil { return false, err } batchJob := job.(*batchv1.Job) status := batchJob.Status spec := batchJob.Spec for _, condition := range status.Conditions { if condition.Type == batchv1.JobFailed && condition.Status == v1.ConditionTrue { return false, fmt.Errorf("job failed") } } if status.Succeeded != *spec.Completions { return false, nil } return true, nil } } func (c *ConditionExtension) AllNodesHaveNonZeroResourceCapacity(resourceLabel string) apimachinerywait.ConditionWithContextFunc { return func(ctx context.Context) (done bool, err error) { nodeList := &v1.NodeList{} if err := c.resources.List(ctx, nodeList); err != nil { return false, fmt.Errorf("failed to list nodes: %w", err) } if len(nodeList.Items) == 0 { return false, fmt.Errorf("no nodes found in the cluster") } for _, node := range nodeList.Items { resource, ok := node.Status.Capacity[v1.ResourceName(resourceLabel)] if !ok { return false, nil } if resource.Value() <= 0 { return false, nil } } return true, nil } } ================================================ FILE: internal/e2e/doc.go ================================================ // Package frameworkext contains extensions to sigs.k8s.io/e2e-framework package e2e ================================================ FILE: internal/e2e/ec2.go ================================================ package e2e import ( "context" "fmt" "github.com/aws/aws-k8s-tester/internal/awssdk" "github.com/aws/aws-sdk-go-v2/service/ec2" ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types" ) type EC2Client interface { DescribeInstanceType(instanceType string) (ec2types.InstanceTypeInfo, error) } type ec2Client struct { client *ec2.Client } func NewEC2Client() *ec2Client { return &ec2Client{ client: ec2.NewFromConfig(awssdk.NewConfig()), } } func (c *ec2Client) DescribeInstanceTopology(instanceIDs []string) ([]ec2types.InstanceTopology, error) { var instanceTopologies []ec2types.InstanceTopology paginator := ec2.NewDescribeInstanceTopologyPaginator(c.client, 
&ec2.DescribeInstanceTopologyInput{ InstanceIds: instanceIDs, }) for paginator.HasMorePages() { instanceTopologyOuput, err := paginator.NextPage(context.TODO()) if err != nil { return []ec2types.InstanceTopology{}, err } instanceTopologies = append(instanceTopologies, instanceTopologyOuput.Instances...) } return instanceTopologies, nil } func (c *ec2Client) DescribeInstanceType(instanceType string) (ec2types.InstanceTypeInfo, error) { describeResponse, err := c.client.DescribeInstanceTypes(context.TODO(), &ec2.DescribeInstanceTypesInput{ InstanceTypes: []ec2types.InstanceType{ec2types.InstanceType(instanceType)}, }) if err != nil { return ec2types.InstanceTypeInfo{}, fmt.Errorf("failed to describe instance type: %s: %v", instanceType, err) } else { return describeResponse.InstanceTypes[0], nil } } ================================================ FILE: internal/e2e/health.go ================================================ package e2e import ( "context" "fmt" "strings" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" ) // KubeletIsResponsive returns true if the kubelet /healthz endpoint responds with a 200 status code, and propagates // any non-connection specific errors func KubeletIsResponsive(ctx context.Context, cfg *rest.Config, nodeName string) (bool, error) { client, err := kubernetes.NewForConfig(cfg) if err != nil { return false, fmt.Errorf("failed to initialize client set: %v", err) } nodeHealthResponse := client.CoreV1().RESTClient().Get().Resource("nodes"). Name(nodeName).SubResource("proxy").Suffix("/healthz"). Do(ctx) if nodeHealthResponse.Error() != nil { errMsg := nodeHealthResponse.Error().Error() // TODO: match errors against types, e.g. 
syscall.ECONNREFUSED instead, the k8s client doesn't // currently properly wrap the underlying error to allow this though if strings.Contains(errMsg, "connection refused") || strings.Contains(errMsg, "connection reset by peer") || strings.Contains(errMsg, "http2: client connection lost") { // these errors indicate reachability to the node in general but an unstable connection to kubelet return false, nil } // propagate other errors, e.g. i/o timeout, that may result from things unrelated to kubelet health, // e.g. security group rules on the instance restricting traffic from the CP return false, fmt.Errorf("could not reach /healthz endpoint for node %s: %w", nodeName, nodeHealthResponse.Error()) } var statusCode int nodeHealthResponse.StatusCode(&statusCode) return statusCode == 200, nil } ================================================ FILE: internal/e2e/logs.go ================================================ package e2e import ( "context" "fmt" "io" "testing" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" ) // PrintDaemonSetPodLogs retrieves logs from each container in each pod of a DaemonSet. // namespace & labelSelector identify the DaemonSet's pods (e.g. "default", "app=containerd-check"). 
func PrintDaemonSetPodLogs( t *testing.T, ctx context.Context, restConfig *rest.Config, namespace string, labelSelector string, ) { clientset, err := kubernetes.NewForConfig(restConfig) if err != nil { t.Logf("failed to create typed clientset: %v", err) return } pods, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ LabelSelector: labelSelector, }) if err != nil { t.Logf("failed to list pods: %v", err) return } if len(pods.Items) == 0 { t.Logf("No pods found for DaemonSet with label %q in namespace %q.", labelSelector, namespace) return } for _, pod := range pods.Items { t.Logf("Pod %s status: %s", pod.Name, pod.Status.Phase) for _, container := range pod.Spec.Containers { logs, logErr := ReadPodLogs(ctx, restConfig, pod.Namespace, pod.Name, container.Name) if logErr != nil { t.Logf("Failed reading logs from %s/%s: %v", pod.Name, container.Name, logErr) } else { t.Logf("=== Logs from %s/%s ===\n%s", pod.Name, container.Name, logs) } } } } // ReadPodLogs streams logs for a specific container in a pod. 
func ReadPodLogs( ctx context.Context, restConfig *rest.Config, namespace, podName, containerName string, ) (string, error) { clientset, err := kubernetes.NewForConfig(restConfig) if err != nil { return "", fmt.Errorf("failed to create typed clientset: %w", err) } req := clientset.CoreV1().Pods(namespace).GetLogs(podName, &corev1.PodLogOptions{ Container: containerName, }) stream, err := req.Stream(ctx) if err != nil { return "", fmt.Errorf("failed to open log stream for %s/%s: %w", podName, containerName, err) } defer stream.Close() data, err := io.ReadAll(stream) if err != nil { return "", fmt.Errorf("error reading logs: %w", err) } return string(data), nil } ================================================ FILE: internal/e2e/mpijobs/conditions.go ================================================ package mpijobs import ( "fmt" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "sigs.k8s.io/e2e-framework/klient/k8s" ) // MPIJobSucceeded returns true if the specified k8s.Object is an unstructured.Unstructured // with .status.conditions["Succeeded"] = "True" func MPIJobSucceeded(obj k8s.Object) bool { u := obj.(*unstructured.Unstructured) conditions, found, err := unstructured.NestedSlice(u.Object, "status", "conditions") if err != nil { panic(fmt.Errorf("MPIJob does not match expected schema: %v", err)) } if !found { return false } for _, condition := range conditions { c := condition.(map[string]interface{}) cType, found, err := unstructured.NestedString(c, "type") if err != nil { panic(fmt.Errorf("MPIJob does not match expected schema: %v", err)) } if !found { continue } if cType == "Succeeded" { cStatus, found, err := unstructured.NestedString(c, "status") if err != nil { panic(fmt.Errorf("MPIJob does not match expected schema: %v", err)) } if !found { continue } return cStatus == "True" } } return false } ================================================ FILE: internal/e2e/mpijobs/conditions_test.go ================================================ package 
mpijobs import ( "testing" "github.com/stretchr/testify/assert" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" ) func Test_MPIJobSucceeded(t *testing.T) { u := unstructured.Unstructured{ Object: map[string]interface{}{ "status": map[string]interface{}{ "conditions": []interface{}{ map[string]interface{}{ "type": "Succeeded", "status": "True", }, }, }, }, } assert.True(t, MPIJobSucceeded(&u)) u = unstructured.Unstructured{ Object: map[string]interface{}{ "status": map[string]interface{}{ "conditions": []interface{}{ map[string]interface{}{ "type": "Succeeded", "status": "False", }, }, }, }, } assert.False(t, MPIJobSucceeded(&u)) } ================================================ FILE: internal/e2e/mpijobs/types.go ================================================ package mpijobs import ( "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime/schema" ) var MPIJobGVK = schema.GroupVersionKind{ Group: "kubeflow.org", Version: "v2beta1", Kind: "MPIJob", } func NewUnstructured(name, namespace string) *unstructured.Unstructured { u := unstructured.Unstructured{} u.SetGroupVersionKind(MPIJobGVK) u.SetName(name) u.SetNamespace(namespace) return &u } ================================================ FILE: internal/e2e/resources.go ================================================ package e2e import ( "fmt" v1 "k8s.io/api/core/v1" ) func GetNonZeroResourceCapacity(node *v1.Node, resourceName string) (int, error) { capacity, ok := node.Status.Capacity[v1.ResourceName(resourceName)] if !ok { return 0, fmt.Errorf("node %q has no resource %q", node.Name, resourceName) } if capacity.Value() == 0 { return 0, fmt.Errorf("node %q has zero capacity for resource %q", node.Name, resourceName) } return int(capacity.Value()), nil } ================================================ FILE: internal/metrics/cloudwatch.go ================================================ package metrics import ( "context" "log/slog" "sync" "time" 
"github.com/aws/aws-sdk-go-v2/service/cloudwatch" "github.com/aws/aws-sdk-go-v2/service/cloudwatch/types" "github.com/aws/aws-sdk-go/aws" ) // NewCloudWatchRegistry creates a new metric registry that will emit values using the specified cloudwatch client func NewCloudWatchRegistry(cw *cloudwatch.Client) MetricRegistry { return &cloudwatchRegistry{ cw: cw, lock: &sync.Mutex{}, dataByNamespace: make(map[string][]*cloudwatchMetricDatum), } } type cloudwatchRegistry struct { cw *cloudwatch.Client lock *sync.Mutex dataByNamespace map[string][]*cloudwatchMetricDatum } type cloudwatchMetricDatum struct { spec *MetricSpec value float64 dimensions map[string]string timestamp time.Time } func (r *cloudwatchRegistry) Record(spec *MetricSpec, value float64, dimensions map[string]string) { r.lock.Lock() defer r.lock.Unlock() r.dataByNamespace[spec.Namespace] = append(r.dataByNamespace[spec.Namespace], &cloudwatchMetricDatum{ spec: spec, value: value, dimensions: dimensions, timestamp: time.Now(), }) } func (r *cloudwatchRegistry) Emit() error { r.lock.Lock() defer r.lock.Unlock() for namespace, data := range r.dataByNamespace { for i := 0; i < len(data); { var metricData []types.MetricDatum // we can emit up to 1000 values per PutMetricData for j := 0; j < len(data) && j < 1000; j++ { datum := data[i] var dimensions []types.Dimension for key, val := range datum.dimensions { dimensions = append(dimensions, types.Dimension{ Name: aws.String(key), Value: aws.String(val), }) } metricData = append(metricData, types.MetricDatum{ MetricName: aws.String(datum.spec.Metric), Value: aws.Float64(datum.value), Dimensions: dimensions, Timestamp: &datum.timestamp, }) i++ } _, err := r.cw.PutMetricData(context.TODO(), &cloudwatch.PutMetricDataInput{ Namespace: aws.String(namespace), MetricData: metricData, }) if err != nil { return err } } slog.Info("emitted metrics", "count", len(data), "namespace", namespace) } r.dataByNamespace = make(map[string][]*cloudwatchMetricDatum) return nil } func 
(r *cloudwatchRegistry) GetRegistered() int { r.lock.Lock() defer r.lock.Unlock() registered := 0 for _, data := range r.dataByNamespace { registered += len(data) } return registered } ================================================ FILE: internal/metrics/noop.go ================================================ package metrics func NewNoopMetricRegistry() MetricRegistry { return &noopRegistry{} } type noopRegistry struct{} func (r *noopRegistry) Record(spec *MetricSpec, value float64, dimensions map[string]string) {} func (r *noopRegistry) Emit() error { return nil } ================================================ FILE: internal/metrics/registry.go ================================================ package metrics import ( "github.com/aws/aws-sdk-go-v2/service/cloudwatch/types" ) type MetricRegistry interface { // Record adds a new metric value to the registry Record(spec *MetricSpec, value float64, dimensions map[string]string) // Emit sends all registered metric values to cloudwatch, emptying the registry Emit() error } type MetricSpec struct { Namespace string Metric string Unit types.StandardUnit } ================================================ FILE: internal/testers/ginkgov1/LICENSE.original ================================================ THIS IS A COPY OF THE ORIGINAL LICENSE FOR `kubetest2` AT COMMIT `d7fcb799ce84ceda66c8b9b1ec8eefcbe226f293`. Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. 
For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. 
For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. 
If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. 
You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "{}" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
Copyright {yyyy} {name of copyright owner} Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: internal/testers/ginkgov1/README.md ================================================ This tester supports ginkgo 1.x versions, which were used for Kubernetes versions prior to 1.25. --- This is a fork of the `ginkgo` tester: https://github.com/kubernetes-sigs/kubetest2/tree/master/pkg/testers/ginkgo The fork originated at commit `d7fcb799ce84ceda66c8b9b1ec8eefcbe226f293`. A copy of the original license is provided in the file named `LICENSE.original`. ================================================ FILE: internal/testers/ginkgov1/ginkgo.go ================================================ // This file has been modified in the following ways: // 1. The `ginkgo` package has been renamed to `ginkgov1`. // 2. The `--timeout` flag has been removed. // 3. The `--flake-attempts` flag has been implemented for ginkgo 1.x versions. /* Copyright 2019 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ package ginkgov1 import ( "flag" "fmt" "os" stdexec "os/exec" "path/filepath" "strconv" "strings" "github.com/kballard/go-shellquote" "github.com/urfave/sflags/gen/gpflag" "log/slog" "sigs.k8s.io/kubetest2/pkg/artifacts" "sigs.k8s.io/kubetest2/pkg/build" "sigs.k8s.io/kubetest2/pkg/exec" "sigs.k8s.io/kubetest2/pkg/testers" ) var GitTag string type Tester struct { FlakeAttempts int `desc:"Make up to this many attempts to run each spec."` GinkgoArgs string `desc:"Additional arguments supported by the ginkgo binary."` Parallel int `desc:"Run this many tests in parallel at once."` SkipRegex string `desc:"Regular expression of jobs to skip."` FocusRegex string `desc:"Regular expression of jobs to focus on."` TestPackageVersion string `desc:"The ginkgo tester uses a test package made during the kubernetes build. The tester downloads this test package from one of the release tars published to the Release bucket. Defaults to latest. visit https://kubernetes.io/releases/ to find release names. Example: v1.20.0-alpha.0"` TestPackageBucket string `desc:"The bucket which release tars will be downloaded from to acquire the test package. Defaults to the main kubernetes project bucket."` TestPackageDir string `desc:"The directory in the bucket which represents the type of release. Default to the release directory."` TestPackageMarker string `desc:"The version marker in the directory containing the package version to download when unspecified. 
Defaults to latest.txt."` TestArgs string `desc:"Additional arguments supported by the e2e test framework (https://godoc.org/k8s.io/kubernetes/test/e2e/framework#TestContextType)."` UseBuiltBinaries bool `desc:"Look for binaries in _rundir/$KUBETEST2_RUN_DIR instead of extracting from tars downloaded from GCS."` UseBinariesFromPath bool `desc:"Look for binaries in the $PATH instead of extracting from tars downloaded from GCS."` Env []string `desc:"List of env variables to pass to ginkgo libraries"` kubeconfigPath string runDir string // These paths are set up by AcquireTestPackage() e2eTestPath string ginkgoPath string kubectlPath string } // Test runs the test func (t *Tester) Test() error { if err := testers.WriteVersionToMetadata(GitTag, ""); err != nil { return err } if err := t.pretestSetup(); err != nil { return err } e2eTestArgs := []string{ "--kubeconfig=" + t.kubeconfigPath, "--kubectl-path=" + t.kubectlPath, "--ginkgo.skip=" + t.SkipRegex, "--ginkgo.focus=" + t.FocusRegex, "--report-dir=" + artifacts.BaseDir(), } // some ginkgo flags and behaviors are not backwards compatible switch v := t.ginkgoMajorVersion(); v { case "1": e2eTestArgs = append(e2eTestArgs, "--ginkgo.flakeAttempts="+strconv.Itoa(t.FlakeAttempts), ) case "2": e2eTestArgs = append(e2eTestArgs, "--ginkgo.flake-attempts="+strconv.Itoa(t.FlakeAttempts), ) default: return fmt.Errorf("unsupported ginkgo version: %s", v) } extraE2EArgs, err := shellquote.Split(t.TestArgs) if err != nil { return fmt.Errorf("error parsing --test-args: %v", err) } e2eTestArgs = append(e2eTestArgs, extraE2EArgs...) extraGingkoArgs, err := shellquote.Split(t.GinkgoArgs) if err != nil { return fmt.Errorf("error parsing --gingko-args: %v", err) } ginkgoArgs := append(extraGingkoArgs, "--nodes="+strconv.Itoa(t.Parallel), t.e2eTestPath, "--") ginkgoArgs = append(ginkgoArgs, e2eTestArgs...) slog.Info("running ginkgo test", "path", t.ginkgoPath, "args", ginkgoArgs) cmd := exec.Command(t.ginkgoPath, ginkgoArgs...) 
cmd.SetEnv(t.Env...) exec.InheritOutput(cmd) return cmd.Run() } func (t *Tester) pretestSetup() error { if config := os.Getenv("KUBECONFIG"); config != "" { // The ginkgo tester errors out if the kubeconfig provided // is not an absolute path, likely because ginkgo changes its // working directory while executing. To get around this problem // we can manually edit the provided KUBECONFIG to ensure a // successful run. if !filepath.IsAbs(config) { newKubeconfig, err := filepath.Abs(config) if err != nil { return fmt.Errorf("failed to convert kubeconfig to absolute path: %s", err) } slog.Info("ginkgo tester received non-absolute KUBECONFIG path, updating", "path", newKubeconfig) config = newKubeconfig } t.kubeconfigPath = config } else { home, err := os.UserHomeDir() if err != nil { return fmt.Errorf("failed to find home directory: %v", err) } t.kubeconfigPath = filepath.Join(home, ".kube", "config") } slog.Info("using kubeconfig", "path", t.kubeconfigPath) if t.UseBuiltBinaries { return t.validateLocalBinaries() } if t.UseBinariesFromPath { return t.validateBinariesFromPath() } if err := t.AcquireTestPackage(); err != nil { return fmt.Errorf("failed to get ginkgo test package from published releases: %s", err) } return nil } func (t *Tester) validateLocalBinaries() error { slog.Debug("checking existing test binaries...") for _, binary := range build.CommonTestBinaries { path := filepath.Join(t.runDir, binary) if _, err := os.Stat(path); err != nil { logPath := path if abspath, err := filepath.Abs(path); err != nil { slog.Warn("failed to convert path to absolute path", "path", path, "error", err) } else { logPath = abspath } return fmt.Errorf("failed to validate pre-built binary %s (checked at %q): %w", binary, logPath, err) } slog.Debug("found existing binary", "binary", binary, "path", path) } t.e2eTestPath = filepath.Join(t.runDir, "e2e.test") t.ginkgoPath = filepath.Join(t.runDir, "ginkgo") t.kubectlPath = filepath.Join(t.runDir, "kubectl") return nil } func (t 
*Tester) validateBinariesFromPath() error { slog.Debug("checking for test binaries on PATH...") for _, binary := range build.CommonTestBinaries { path, err := stdexec.LookPath(binary) if err != nil { return fmt.Errorf("failed to validate binary %s from PATH: %w", binary, err) } slog.Debug("found existing binary", "binary", binary, "path", path) switch binary { case "e2e.test": t.e2eTestPath = path case "ginkgo": t.ginkgoPath = path case "kubectl": t.kubectlPath = path } } return nil } // ginkgoMajorVersion returns the ginkgo major version // empty if not found func (t *Tester) ginkgoMajorVersion() string { slog.Debug("checking ginkgo version...") cmd := exec.Command(t.ginkgoPath, "version") lines, err := exec.OutputLines(cmd) if err != nil || len(lines) != 1 { return "" } // the output is in the format // Ginkgo Version 1.14.0 // Ginkgo Version 2.1.4 parts := strings.Split(lines[0], " ") if len(parts) != 3 { return "" } vers := strings.Split(parts[2], ".") if len(vers) != 3 { return "" } return vers[0] } func (t *Tester) Execute() error { fs, err := gpflag.Parse(t) if err != nil { return fmt.Errorf("failed to initialize tester: %v", err) } fs.AddGoFlagSet(flag.CommandLine) help := fs.BoolP("help", "h", false, "") if err := fs.Parse(os.Args); err != nil { return fmt.Errorf("failed to parse flags: %v", err) } if *help { fs.SetOutput(os.Stdout) fs.PrintDefaults() return nil } if err := t.initKubetest2Info(); err != nil { return err } return t.Test() } // initializes relevant information from the well defined kubetest2 environment variables. 
func (t *Tester) initKubetest2Info() error { if t.UseBuiltBinaries && t.UseBinariesFromPath { return fmt.Errorf("--use-built-binaries and --use-binaries-from-path are mutually exclusive") } if dir, ok := os.LookupEnv("KUBETEST2_RUN_DIR"); ok { t.runDir = dir return nil } // ginkgo/e2e.test/kubectl can be found in rundir when they are built if t.UseBuiltBinaries { t.runDir = artifacts.RunDir() return nil } // default to current working directory if for some reason the env is not set dir, err := os.Getwd() if err != nil { return fmt.Errorf("failed to set run dir: %v", err) } t.runDir = dir return nil } func (t *Tester) SetRunDir(dir string) { t.runDir = dir } func NewDefaultTester() *Tester { return &Tester{ FlakeAttempts: 1, Parallel: 1, TestPackageBucket: "kubernetes-release", TestPackageDir: "release", TestPackageMarker: "latest.txt", Env: nil, } } func Main() { t := NewDefaultTester() if err := t.Execute(); err != nil { slog.Error("failed to run ginkgo tester", "error", err) os.Exit(1) } } ================================================ FILE: internal/testers/ginkgov1/kubectl/kubectl.go ================================================ /* Copyright 2019 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ package kubectl import ( "fmt" "os" "sigs.k8s.io/kubetest2/pkg/exec" ) const ( kubectl = "kubectl" ) // APIServerURL obtains the URL of the k8s master from kubectl func APIServerURL() (string, error) { kubecontext, err := execAndResult(kubectl, "config", "view", "-o", "jsonpath=\"{.current-context}\"") if err != nil { return "", fmt.Errorf("Could not get kube context: %v", err) } clustername, err := execAndResult(kubectl, "config", "view", "-o", fmt.Sprintf("jsonpath=\"{.contexts[?(@.name == %s)].context.cluster}\"", kubecontext)) if err != nil { return "", fmt.Errorf("Could not get cluster name: %v", err) } apiServerURL, err := execAndResult(kubectl, "config", "view", "-o", fmt.Sprintf("jsonpath={.clusters[?(@.name == %s)].cluster.server}", clustername)) if err != nil { return "", err } return apiServerURL, nil } // execAndResult runs command with args and returns the entire output (or error) func execAndResult(command string, args ...string) (string, error) { cmd := exec.Command(command, args...) cmd.SetStderr(os.Stderr) bytes, err := exec.Output(cmd) return string(bytes), err } ================================================ FILE: internal/testers/ginkgov1/package.go ================================================ // This file has been modified in the following ways: // 1. The `ginkgo` package has been renamed to `ginkgov1`. /* Copyright 2019 The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ package ginkgov1 import ( "archive/tar" "compress/gzip" "crypto/sha256" "encoding/hex" "fmt" "io" "os" "path/filepath" "runtime" "strings" "log/slog" "sigs.k8s.io/kubetest2/pkg/artifacts" "sigs.k8s.io/kubetest2/pkg/exec" ) // AcquireTestPackage obtains three test binaries and places them in $KUBETEST2_RUN_DIR. // The first is "ginkgo", the actual ginkgo executable. // The second is "e2e.test", which contains kubernetes e2e test cases. // The third is "kubectl". func (t *Tester) AcquireTestPackage() error { // first, get the name of the latest release (e.g. v1.20.0-alpha.0) if t.TestPackageVersion == "" { cmd := exec.Command( "gsutil", "cat", fmt.Sprintf("gs://%s/%s/%s", t.TestPackageBucket, t.TestPackageDir, t.TestPackageMarker), ) lines, err := exec.OutputLines(cmd) if err != nil { return fmt.Errorf("failed to get latest release name: %s", err) } if len(lines) == 0 { return fmt.Errorf("getting latest release name had no output") } t.TestPackageVersion = lines[0] slog.Info("test package version not specified, using default", "marker", t.TestPackageMarker, "version", t.TestPackageVersion) } releaseTar := fmt.Sprintf("kubernetes-test-%s-%s.tar.gz", runtime.GOOS, runtime.GOARCH) downloadDir, err := os.UserCacheDir() if err != nil { return fmt.Errorf("failed to get user cache directory: %v", err) } downloadPath := filepath.Join(downloadDir, releaseTar) if err := t.ensureReleaseTar(downloadPath, releaseTar); err != nil { return err } if err := t.extractBinaries(downloadPath); err != nil { return err } t.kubectlPath = filepath.Join(artifacts.RunDir(), "kubectl") return t.ensureKubectl(t.kubectlPath) } func (t *Tester) extractBinaries(downloadPath string) error { // ensure the artifacts dir if err := os.MkdirAll(artifacts.BaseDir(), os.ModePerm); err != nil { return err } // ensure the rundir if err := os.MkdirAll(artifacts.RunDir(), os.ModePerm); err != nil { return err } // Extract files from the test package f, err := os.Open(downloadPath) if err != nil { return 
fmt.Errorf("failed to open downloaded tar at %s: %s", downloadPath, err) } defer f.Close() gzf, err := gzip.NewReader(f) if err != nil { return fmt.Errorf("failed to create gzip reader: %s", err) } defer gzf.Close() tarReader := tar.NewReader(gzf) // Map of paths in archive to destination paths t.e2eTestPath = filepath.Join(artifacts.RunDir(), "e2e.test") t.ginkgoPath = filepath.Join(artifacts.RunDir(), "ginkgo") extract := map[string]string{ "kubernetes/test/bin/e2e.test": t.e2eTestPath, "kubernetes/test/bin/ginkgo": t.ginkgoPath, } extracted := map[string]bool{} for { if len(extracted) == len(extract) { break } header, err := tarReader.Next() if err == io.EOF { break } if err != nil { return fmt.Errorf("error during tar read: %s", err) } if dest := extract[header.Name]; dest != "" { outFile, err := os.Create(dest) if err != nil { return fmt.Errorf("error creating file at %s: %s", dest, err) } defer outFile.Close() if err := outFile.Chmod(0700); err != nil { return fmt.Errorf("failed to make %s executable: %s", dest, err) } if _, err := io.Copy(outFile, tarReader); err != nil { return fmt.Errorf("error reading data from tar with header name %s: %s", header.Name, err) } extracted[header.Name] = true } } for k := range extract { if !extracted[k] { return fmt.Errorf("failed to find %s in %s", k, downloadPath) } } return nil } // ensureKubectl checks if the kubectl exists and verifies the hashes // else downloads it from GCS func (t *Tester) ensureKubectl(downloadPath string) error { kubectlPathInGCS := fmt.Sprintf( "gs://%s/%s/%s/bin/%s/%s/kubectl", t.TestPackageBucket, t.TestPackageDir, t.TestPackageVersion, runtime.GOOS, runtime.GOARCH, ) if _, err := os.Stat(downloadPath); err == nil { slog.Info("found existing kubectl", "path", downloadPath) err := t.compareSHA(downloadPath, kubectlPathInGCS) if err == nil { slog.Info("validated hash for existing kubectl", "path", downloadPath) return nil } slog.Warn("hash validation failed", "error", err) } cmd := 
exec.Command("gsutil", "cp", kubectlPathInGCS, downloadPath) exec.InheritOutput(cmd) if err := cmd.Run(); err != nil { return fmt.Errorf("failed to download kubectl for release %s: %s", t.TestPackageVersion, err) } if err := os.Chmod(downloadPath, 0700); err != nil { return fmt.Errorf("failed to make %s executable: %s", downloadPath, err) } return nil } // ensureReleaseTar checks if the kubernetes test tarball already exists // and verifies the hashes // else downloads it from GCS func (t *Tester) ensureReleaseTar(downloadPath, releaseTar string) error { releaseTarPathInGCS := fmt.Sprintf( "gs://%s/%s/%s/%s", t.TestPackageBucket, t.TestPackageDir, t.TestPackageVersion, releaseTar, ) if _, err := os.Stat(downloadPath); err == nil { slog.Info("found existing tar", "path", downloadPath) err := t.compareSHA(downloadPath, releaseTarPathInGCS) if err == nil { slog.Info("validated hash for existing tar", "path", downloadPath) return nil } slog.Warn("hash validation failed", "error", err) } cmd := exec.Command("gsutil", "cp", releaseTarPathInGCS, downloadPath, ) exec.InheritOutput(cmd) if err := cmd.Run(); err != nil { return fmt.Errorf("failed to download release tar %s for release %s: %s", releaseTar, t.TestPackageVersion, err) } return nil } func (t *Tester) compareSHA(downloadPath string, gcsFilePath string) error { cmd := exec.Command("gsutil", "cat", fmt.Sprintf("%s.sha256", gcsFilePath), ) expectedSHABytes, err := exec.Output(cmd) if err != nil { return fmt.Errorf("failed to get sha256 for file %s for release %s: %s", gcsFilePath, t.TestPackageVersion, err) } expectedSHA := strings.TrimSuffix(string(expectedSHABytes), "\n") actualSHA, err := sha256sum(downloadPath) if err != nil { return fmt.Errorf("failed to compute sha256 for %q: %v", downloadPath, err) } if actualSHA != expectedSHA { return fmt.Errorf("sha256 does not match") } return nil } func sha256sum(path string) (string, error) { f, err := os.Open(path) if err != nil { return "", err } defer f.Close() h := 
sha256.New() if _, err := io.Copy(h, f); err != nil { return "", err } return hex.EncodeToString(h.Sum(nil)), nil } ================================================ FILE: internal/testers/multi/cmd.go ================================================ package multi import ( "errors" "fmt" "log/slog" "os" "path/filepath" "strings" "github.com/aws/aws-k8s-tester/internal" "github.com/urfave/sflags/gen/gpflag" "sigs.k8s.io/kubetest2/pkg/app/shim" "sigs.k8s.io/kubetest2/pkg/artifacts" "sigs.k8s.io/kubetest2/pkg/process" "sigs.k8s.io/kubetest2/pkg/testers" ) const TesterName = "multi" const usage = `kubetest2 --test=multi -- [MultiTesterDriverArgs] -- [TesterName] [TesterArgs] -- ... MultiTesterDriverArgs: arguments passed to the multi-tester driver TesterName: the name of the tester to run TesterArgs: arguments passed to tester Each tester clause is separated by "--". ` func Main() { if err := execute(); err != nil { slog.Error("failed to run multi tester", "error", err) os.Exit(1) } } type multiTesterDriver struct { argv []string } type tester struct { name string path string args []string } func execute() error { driverArgs, testerClauses := splitArguments(os.Args) driver := multiTesterDriver{ argv: driverArgs, } fs, err := gpflag.Parse(&driver) if err != nil { return fmt.Errorf("failed to initialize tester: %v", err) } fs.Usage = func() { fmt.Print(usage) } if len(testerClauses) == 0 { fs.Usage() return nil } // gracefully handle -h or --help if it is the only argument help := fs.BoolP("help", "h", false, "") failFast := fs.Bool("fail-fast", false, "Exit immediately if any tester fails") // we don't care about errors, only if -h / --help was set err = fs.Parse(driver.argv) if err != nil { fs.Usage() return err } if *help { fs.Usage() return nil } if err := testers.WriteVersionToMetadata(internal.Version, ""); err != nil { return err } if testers, err := prepareTesters(testerClauses); err != nil { return err } else { return test(testers, *failFast) } } func 
test(testers []tester, failFast bool) error { metadataPath := filepath.Join(artifacts.BaseDir(), "metadata.json") backupMetdataPath := metadataPath + ".bak" if err := os.Rename(metadataPath, backupMetdataPath); err != nil { slog.Error("failed to backup driver metadata", "error", err) } var testerErrs []error for _, tester := range testers { if err := tester.run(); err != nil { slog.Error("tester failed", "tester", tester, "error", err) testerErrs = append(testerErrs, fmt.Errorf("%+v: %v", tester, err)) if failFast { break } } // reset the metadata.json file // testers will try to set the tester-version key and cause conflicts if err := os.Remove(metadataPath); err != nil { return fmt.Errorf("failed to delete tester metadata: %v", err) } } if err := os.Rename(backupMetdataPath, metadataPath); err != nil { return fmt.Errorf("failed to restore driver metadata: %v", err) } if len(testerErrs) > 0 { return errors.Join(testerErrs...) } return nil } // splitArguments splits arguments into driver arguments and tester clauses, separated by "--". 
func splitArguments(argv []string) ([]string, [][]string) { var clauses [][]string var last int for i, arg := range argv { if arg == "--" { clauses = append(clauses, argv[last:i]) last = i + 1 } } clauses = append(clauses, argv[last:]) return clauses[0], clauses[1:] } func prepareTesters(testerClauses [][]string) ([]tester, error) { var testers []tester for _, clause := range testerClauses { testerName := clause[0] if testerName == TesterName { return nil, fmt.Errorf("nesting isn't possible with the %s tester", TesterName) } path, err := shim.FindTester(testerName) if err != nil { return nil, err } tester := tester{ name: testerName, path: path, args: expandEnv(clause[1:]), } testers = append(testers, tester) } return testers, nil } func expandEnv(args []string) []string { expandedArgs := make([]string, len(args)) for i, arg := range args { // best effort handle literal dollar for backward compatibility // this is not an all-purpose shell special character handler if strings.Contains(arg, `\$`) { expandedArgs[i] = strings.ReplaceAll(arg, `\$`, `$`) } else { expandedArgs[i] = os.ExpandEnv(arg) } } return expandedArgs } func (t *tester) run() error { slog.Info("running tester", "tester", t) return process.ExecJUnit(t.path, t.args, os.Environ()) } ================================================ FILE: internal/util/cloudformation.go ================================================ package util import ( "context" "fmt" "strings" "github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/service/cloudformation" types "github.com/aws/aws-sdk-go-v2/service/cloudformation/types" ) // TODO: implement AWS client wrappers, and incorporate this into the cfn:CreateStack call func WrapCFNStackFailure(ctx context.Context, cfnClient *cloudformation.Client, createStackErr error, stackName string) error { if createStackErr == nil { return nil } resourceByFailureMode := make(map[string][]string) eventsPaginator := cloudformation.NewDescribeStackEventsPaginator(cfnClient, 
&cloudformation.DescribeStackEventsInput{ StackName: &stackName, }) for eventsPaginator.HasMorePages() { page, err := eventsPaginator.NextPage(ctx) if err != nil { return createStackErr } for _, event := range page.StackEvents { if event.ResourceStatus == types.ResourceStatusCreateFailed { if _, ok := resourceByFailureMode[aws.ToString(event.ResourceStatusReason)]; !ok { resourceByFailureMode[aws.ToString(event.ResourceStatusReason)] = []string{} } resourceByFailureMode[aws.ToString(event.ResourceStatusReason)] = append(resourceByFailureMode[aws.ToString(event.ResourceStatusReason)], aws.ToString(event.LogicalResourceId)) } } } nonCancellationFailure := len(resourceByFailureMode) > 1 var enhancedDetails []string for reason, resources := range resourceByFailureMode { if nonCancellationFailure && reason == "Resource creation cancelled" { // Ignore resource cancellation errors if there's another failure reported, those failures // would just be a consequence of that failure. If all the failures are resource cancellation, // then there was likely a user initiated delete of the whole stack based on a timeout // waiting for one of the resources to create continue } enhancedDetails = append(enhancedDetails, fmt.Sprintf("%s: %s", strings.Join(resources, ","), reason)) } return fmt.Errorf("%w: %s", createStackErr, strings.Join(enhancedDetails, "--")) } ================================================ FILE: internal/util/exec.go ================================================ package util import ( "os" "os/exec" ) func ExecuteCommand(name string, args ...string) error { command := exec.Command(name, args...) 
command.Stdout = os.Stdout command.Stderr = os.Stderr return command.Run() } ================================================ FILE: internal/util/http.go ================================================ package util import ( "fmt" "strings" "github.com/aws/smithy-go/middleware" smithyhttp "github.com/aws/smithy-go/transport/http" ) const httpHeaderBoundary = ": " // NewHTTPHeaderAPIOptions returns a slice of middleware options that adds the // specified HTTP headers to an API request. // Each header should be of the format `Header-Key: Header-Value`, in the same manner // as headers are passed with `curl`-s `-H` flag. func NewHTTPHeaderAPIOptions(headers []string) ([]func(*middleware.Stack) error, error) { var opts []func(*middleware.Stack) error for _, header := range headers { boundary := strings.Index(header, httpHeaderBoundary) if boundary == -1 { return nil, fmt.Errorf("malformed HTTP header: '%s'", header) } key := header[:boundary] val := header[boundary+len(httpHeaderBoundary):] opts = append(opts, smithyhttp.AddHeaderValue(key, val)) } return opts, nil } ================================================ FILE: internal/util/http_test.go ================================================ package util import ( "testing" ) func Test_NewHTTPHeaderAPIOptions(t *testing.T) { testCases := []struct { name string headers []string expectError bool }{ { name: "empty", headers: []string{}, }, { name: "single valid header", headers: []string{"Content-Type: application/json"}, }, { name: "multiple valid headers", headers: []string{"Content-Type: application/json", "Accept: application/json"}, }, { name: "invalid header", headers: []string{"Invalid header"}, expectError: true, }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { _, err := NewHTTPHeaderAPIOptions(tc.headers) if err != nil && !tc.expectError { t.Errorf("unexpected error: %v", err) } if err == nil && tc.expectError { t.Error("expected error but got none") } }) } } 
================================================ FILE: internal/util/lang.go ================================================ package util func Must[T any](t T, err error) T { if err != nil { panic(err) } return t } ================================================ FILE: internal/util/path.go ================================================ package util import ( "errors" "os" "path/filepath" "syscall" ) var ErrFileNotFoundInPath = errors.New("file not found in $PATH") // LookPath finds a file on the PATH. // It uses a similar process to exec.LookPath, but can find regular files. func LookPath(file string) (string, error) { path := os.Getenv("PATH") for _, dir := range filepath.SplitList(path) { if dir == "" { // Unix shell semantics: path element "" means "." dir = "." } path := filepath.Join(dir, file) if err := checkFile(path); err == nil { return path, nil } } return "", ErrFileNotFoundInPath } func checkFile(file string) error { d, err := os.Stat(file) if err != nil { return err } m := d.Mode() if m.IsDir() { return syscall.EISDIR } return nil } ================================================ FILE: internal/util/version.go ================================================ package util import ( "fmt" "os" "strings" ) const KubernetesVersionFile = "kubernetes-version.txt" func DetectKubernetesVersion() (string, error) { versionFile, err := LookPath(KubernetesVersionFile) if err != nil { return "", err } bytes, err := os.ReadFile(versionFile) if err != nil { return "", err } // "v1.2.3" versionTag := string(bytes) return strings.ReplaceAll(versionTag, "v", ""), nil } func ParseMinorVersion(semanticVersion string) (string, error) { parts := strings.Split(semanticVersion, ".") if len(parts) < 2 { return "", fmt.Errorf("malformed semantic version: '%s'", semanticVersion) } return strings.Join(parts[:2], "."), nil } ================================================ FILE: internal/version.go ================================================ package internal var Version 
string ================================================ FILE: test/cases/disruptive/graceful_reboot_test.go ================================================ //go:build e2e package disruptive import ( "context" "fmt" "strings" "testing" "time" "github.com/aws/aws-k8s-tester/internal/awssdk" fwext "github.com/aws/aws-k8s-tester/internal/e2e" "github.com/aws/aws-sdk-go-v2/service/ec2" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/e2e-framework/klient/wait" "sigs.k8s.io/e2e-framework/pkg/envconf" "sigs.k8s.io/e2e-framework/pkg/features" ) func getSleepPodTemplate(name string) corev1.Pod { return corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ Name: name, Namespace: "default", }, Spec: corev1.PodSpec{ Containers: []corev1.Container{ { Name: name, Image: "public.ecr.aws/amazonlinux/amazonlinux:2023", Command: []string{"sleep", "infinity"}, }, }, RestartPolicy: corev1.RestartPolicyNever, }, } } func TestGracefulReboot(t *testing.T) { terminationCanaryPodName := fmt.Sprintf("termination-canary-%d", time.Now().Unix()) canaryPod := getSleepPodTemplate(terminationCanaryPodName) bootIndicatorPodName := fmt.Sprintf("boot-detection-%d", time.Now().Unix()) bootIndicatorPod := getSleepPodTemplate(bootIndicatorPodName) feat := features.New("graceful-reboot"). WithLabel("suite", "disruptive"). Assess("Node gracefully reboots", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { // Create an initial pod to allow the default scheduler to do the work of identifying a healthy node. // Starting with a healthy node is essential to the test, as the only expectation is for the node to // return to its same initial state after the reboot. 
if err := cfg.Client().Resources().Create(ctx, &canaryPod); err != nil { t.Fatalf("Failed to create heartbeat pod: %v", err) } if err := wait.For(fwext.NewConditionExtension(cfg.Client().Resources()).PodRunning(&canaryPod), wait.WithContext(ctx), wait.WithTimeout(5*time.Minute), ); err != nil { t.Fatalf("Failed to wait for pod %s to go into running status: %v", terminationCanaryPodName, err) } var targetNode corev1.Node if err := cfg.Client().Resources().Get(ctx, canaryPod.Spec.NodeName, "", &targetNode); err != nil { t.Fatalf("Failed to get node %s: %v", canaryPod.Spec.NodeName, err) } t.Logf("Pod %s is running on node %s", terminationCanaryPodName, targetNode.Name) // Do an initial check of the /healthz endpoint reachability to ensure we can rely on it later. // This might fail even if the node is healthy if, for example, the node's security group rules // do not allow ingress traffic from the control plane. // Retry for up to 1 minute to handle transient TLS errors during cert rotation. 
var kubeletResponsive bool var err error healthCheckCtx, healthCheckCancel := context.WithTimeout(ctx, 5*time.Minute) defer healthCheckCancel() for { kubeletResponsive, err = fwext.KubeletIsResponsive(healthCheckCtx, cfg.Client().RESTConfig(), targetNode.Name) if err == nil && kubeletResponsive { break } select { case <-healthCheckCtx.Done(): t.Fatalf("Node %s is not responding to initial /healthz checks: %v", targetNode.Name, err) case <-time.After(5 * time.Second): t.Logf("Retrying /healthz check for node %s (last error: %v, responsive: %v)", targetNode.Name, err, kubeletResponsive) } } providerIDParts := strings.Split(targetNode.Spec.ProviderID, "/") instanceID := providerIDParts[len(providerIDParts)-1] t.Logf("Rebooting underlying instance %s for node %s...", instanceID, targetNode.Name) ec2Client := ec2.NewFromConfig(awssdk.NewConfig()) if _, err := ec2Client.RebootInstances(ctx, &ec2.RebootInstancesInput{ InstanceIds: []string{instanceID}, }); err != nil { t.Fatalf("Failed to reboot instance %s: %v", instanceID, err) } t.Logf("Successfully triggered reboot of instance %s, waiting for kubelet to become unresponsive...", instanceID) kubeletShutdownCtx, cancel := context.WithTimeout(ctx, 5*time.Minute) defer cancel() // Use kubelet health probes as the signal for instance shutdown. Since the health endpoint // could previously be reached, a refused connection implies kubelet was killed. 
for kubeletResponsive { select { case <-kubeletShutdownCtx.Done(): t.Fatalf("Failed to wait for kubelet to become unresponsive: %v", ctx.Err()) case <-time.Tick(1 * time.Second): if kubeletResponsive, err = fwext.KubeletIsResponsive(ctx, cfg.Client().RESTConfig(), targetNode.Name); err != nil { t.Fatalf("Unpexected error while monitoring kubelet on node %s: %v", targetNode.Name, err) } } } t.Logf("Node %s has become unresponsive, waiting for the node to become schedulable again...", targetNode.Name) // Create a second pod, we will rely on this pod starting to run as an indication of a healthy state. // Since kubelet was killed at this point, we know the reboot must complete and kubelet must start // again for this pod to start running. bootIndicatorPod.Spec.NodeSelector = map[string]string{ "kubernetes.io/hostname": targetNode.Name, } if err := cfg.Client().Resources().Create(ctx, &bootIndicatorPod); err != nil { t.Fatalf("Failed to create boot indicator pod: %v", err) } if err := wait.For(fwext.NewConditionExtension(cfg.Client().Resources()).PodRunning(&bootIndicatorPod), wait.WithContext(ctx), wait.WithTimeout(10*time.Minute), // TODO: bring down this value after collecting some more data ); err != nil { t.Fatalf("Failed to wait for pod to go into running status %s: %v", bootIndicatorPodName, err) } t.Logf("Node %s became ready and schedulable within %v!", targetNode.Name, time.Since(bootIndicatorPod.CreationTimestamp.Time)) return ctx }). Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { if err := cfg.Client().Resources().Delete(ctx, &canaryPod); err != nil { t.Logf("Failed to delete pod %s: %v", terminationCanaryPodName, err) } if err := cfg.Client().Resources().Delete(ctx, &bootIndicatorPod); err != nil { t.Logf("Failed to delete pod %s: %v", bootIndicatorPodName, err) } return ctx }). 
Feature() testenv.Test(t, feat) } ================================================ FILE: test/cases/disruptive/graceful_shutdown_test.go ================================================ //go:build e2e package disruptive import ( "context" "fmt" "io" "log" "regexp" "strings" "testing" "time" "github.com/aws/aws-k8s-tester/internal/awssdk" "github.com/aws/aws-k8s-tester/internal/e2e" "github.com/aws/aws-sdk-go-v2/service/ec2" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" "k8s.io/utils/pointer" "sigs.k8s.io/e2e-framework/klient/k8s" "sigs.k8s.io/e2e-framework/klient/wait" "sigs.k8s.io/e2e-framework/pkg/envconf" "sigs.k8s.io/e2e-framework/pkg/features" ) // getPodLogs retrieves logs from a pod using kubernetes clientset func getPodLogs(ctx context.Context, cfg *envconf.Config, podName, namespace string) (string, error) { client, err := kubernetes.NewForConfig(cfg.Client().RESTConfig()) if err != nil { return "", err } req := client.CoreV1().Pods(namespace).GetLogs(podName, &corev1.PodLogOptions{}) logs, err := req.Stream(ctx) if err != nil { return "", err } defer logs.Close() var result strings.Builder _, err = io.Copy(&result, logs) if err != nil { return "", err } return result.String(), nil } // checkLogPattern checks if a log pattern exists in the pod logs func checkLogPattern(ctx context.Context, cfg *envconf.Config, podName, namespace, pattern string) (bool, error) { logs, err := getPodLogs(ctx, cfg, podName, namespace) if err != nil { return false, err } matched, err := regexp.MatchString(pattern, logs) if err != nil { return false, err } return matched, nil } // countLogMatches counts how many times a pattern appears in the logs func countLogMatches(ctx context.Context, cfg *envconf.Config, podName, namespace, pattern string) (int, error) { logs, err := getPodLogs(ctx, cfg, podName, namespace) if err != nil { return 0, err } re, err := regexp.Compile(pattern) if err != nil { return 0, err } matches := 
re.FindAllString(logs, -1) return len(matches), nil } func TestKubeletGracefulShutdown(t *testing.T) { feat := features.New("kubelet-graceful-shutdown"). WithLabel("suite", "disruptive"). Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { log.Println("[Setup] Setting up Kubelet Graceful Shutdown test...") return ctx }). Assess("Kubelet gracefully shuts down pods during node termination", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { // Create heartbeat pod that will log its status pod := &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ Name: fmt.Sprintf("graceful-shutdown-test-%d", time.Now().Unix()), Namespace: "default", Labels: map[string]string{ "app": "graceful-shutdown-test", }, }, Spec: corev1.PodSpec{ Containers: []corev1.Container{ { Name: "heartbeat-container", Image: "public.ecr.aws/amazonlinux/amazonlinux:2023", Command: []string{"/usr/bin/bash", "-c"}, Args: []string{` set -x echo "[GRACEFUL-TEST] Starting graceful shutdown test pod..." 
function handle_sigterm() { echo "[GRACEFUL-TEST] $(date): SIGTERM-RECEIVED - starting graceful shutdown period" # Continue heartbeating until we are SIGKILL-d start_time=$(date +%s) while true; do current_time=$(date +%s) elapsed=$((current_time - start_time)) echo "[GRACEFUL-TEST] $(date): HEARTBEAT-AFTER-SIGTERM elapsed=${elapsed}s" sleep 1 done } trap handle_sigterm TERM # Initial heartbeat to show pod is running echo "[GRACEFUL-TEST] $(date): POD-STARTED - pod started successfully" # Keep running and heartbeating until terminated counter=0 while true; do echo "[GRACEFUL-TEST] $(date): NORMAL-HEARTBEAT counter=$counter" counter=$((counter + 1)) sleep 10 done `}, }, }, RestartPolicy: corev1.RestartPolicyNever, TerminationGracePeriodSeconds: pointer.Int64(150), // 2.5 minutes to allow for graceful shutdown testing }, } if err := cfg.Client().Resources().Create(ctx, pod); err != nil { t.Fatalf("[Assess] Failed to create heartbeat pod: %v", err) } log.Printf("[Assess] Created heartbeat pod: %s", pod.Name) // Store pod name in context for cleanup ctx = context.WithValue(ctx, "podName", pod.Name) log.Printf("[Assess] Waiting for pod %s to start running...", pod.Name) err := wait.For( e2e.NewConditionExtension(cfg.Client().Resources()).ResourceMatch(pod, func(object k8s.Object) bool { pod := object.(*corev1.Pod) return pod.Status.Phase == corev1.PodRunning }), wait.WithTimeout(2*time.Minute), ) if err != nil { t.Fatalf("[Assess] Pod did not start running: %v", err) } // Wait a bit for initial heartbeats log.Printf("[Assess] Waiting for initial heartbeats...") time.Sleep(30 * time.Second) // Verify pod started successfully by checking logs podStarted, err := checkLogPattern(ctx, cfg, pod.Name, pod.Namespace, `POD-STARTED`) if err != nil { t.Fatalf("[Assess] Failed to check pod logs: %v", err) } if !podStarted { t.Fatalf("[Assess] Pod did not log successful startup") } log.Printf("[Assess] ✓ Pod startup confirmed via logs") // Get the node the pod is running on if err 
:= cfg.Client().Resources().Get(ctx, pod.Name, pod.Namespace, pod); err != nil { t.Fatalf("[Assess] Failed to get pod details: %v", err) } nodeName := pod.Spec.NodeName if nodeName == "" { t.Fatalf("[Assess] Pod is not scheduled to any node") } log.Printf("[Assess] Pod is running on node: %s", nodeName) // Get the EC2 instance ID for this node var node corev1.Node if err := cfg.Client().Resources().Get(ctx, nodeName, "", &node); err != nil { t.Fatalf("[Assess] Failed to get node %s: %v", nodeName, err) } providerID := node.Spec.ProviderID if providerID == "" { t.Fatalf("[Assess] Node %s has no providerID", nodeName) } parts := strings.Split(providerID, "/") if len(parts) < 2 { t.Fatalf("[Assess] Invalid providerID format: %s", providerID) } instanceID := parts[len(parts)-1] log.Printf("[Assess] Node %s corresponds to EC2 instance: %s", nodeName, instanceID) // Terminate the EC2 instance log.Printf("[Assess] Terminating EC2 instance %s to test graceful shutdown...", instanceID) ec2Client := ec2.NewFromConfig(awssdk.NewConfig()) _, err = ec2Client.TerminateInstances(ctx, &ec2.TerminateInstancesInput{ InstanceIds: []string{instanceID}, }) if err != nil { t.Fatalf("[Assess] Failed to terminate EC2 instance %s: %v", instanceID, err) } log.Printf("[Assess] Successfully initiated termination of instance %s", instanceID) // Wait and monitor the graceful shutdown process via logs log.Printf("[Assess] Monitoring graceful shutdown process for 3 minutes...") // Wait for SIGTERM to be received (should happen within 60 seconds) sigtermReceived := false for i := 0; i < 30; i++ { received, err := checkLogPattern(ctx, cfg, pod.Name, pod.Namespace, `SIGTERM-RECEIVED`) if err != nil { log.Printf("[Assess] Warning: Failed to check logs: %v", err) } else if received { sigtermReceived = true log.Printf("[Assess] ✓ SIGTERM received by pod (detected after %d seconds)", i*2) break } time.Sleep(2 * time.Second) } if !sigtermReceived { t.Fatalf("[Assess] Pod did not receive SIGTERM within 60 
seconds of instance termination") } // Monitor heartbeats for the next 2+ minutes to verify graceful shutdown behavior log.Printf("[Assess] Verifying pod continues running during graceful shutdown period...") gracefulShutdownStart := time.Now() var heartbeatsAfterSigterm int for time.Since(gracefulShutdownStart) < 2*time.Minute { // Monitor for 2 minutes // Count heartbeats after SIGTERM matches, err := countLogMatches(ctx, cfg, pod.Name, pod.Namespace, `HEARTBEAT-AFTER-SIGTERM`) if err != nil { log.Printf("[Assess] Warning: Failed to count heartbeats: %v", err) } else if matches > 0 { log.Printf("[Assess] ✓ Pod still running after SIGTERM (%d heartbeats logged)", matches) heartbeatsAfterSigterm = matches } time.Sleep(1 * time.Second) } // Verify we got heartbeats during the graceful shutdown period // These happen once a second, so we should observe at least 110 of them for a 2 minute grace period if heartbeatsAfterSigterm < 110 { t.Fatalf("[Assess] Expected at least 110 heartbeats during graceful shutdown, got %d", heartbeatsAfterSigterm) } log.Printf("[Assess] ✓ Pod continued running and heartbeating for graceful shutdown period") log.Printf("[Assess] ✓ Total heartbeats after SIGTERM: %d", heartbeatsAfterSigterm) // Check for graceful exit gracefulExit, err := checkLogPattern(ctx, cfg, pod.Name, pod.Namespace, `GRACEFUL-EXIT`) if err != nil { log.Printf("[Assess] Warning: Failed to check for graceful exit: %v", err) } else if gracefulExit { log.Printf("[Assess] ✓ Pod logged graceful exit") } return ctx }). 
Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { podName, ok := ctx.Value("podName").(string) if !ok { log.Printf("[Teardown] No pod name in context, nothing to clean up") return ctx } log.Printf("[Teardown] Cleaning up test pod %s...", podName) // Get final logs for debugging if needed logs, err := getPodLogs(ctx, cfg, podName, "default") if err != nil { log.Printf("[Teardown] Warning: Failed to get final logs: %v", err) } else { log.Printf("[Teardown] Final pod logs:\n%s", logs) } // Delete the pod (it may already be terminated) pod := &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ Name: podName, Namespace: "default", }, } if err := cfg.Client().Resources().Delete(ctx, pod); err != nil { log.Printf("[Teardown] Warning: Failed to delete pod %s: %v", podName, err) } else { log.Printf("[Teardown] Successfully cleaned up pod %s", podName) } return ctx }). Feature() testenv.Test(t, feat) } ================================================ FILE: test/cases/disruptive/main_test.go ================================================ //go:build e2e package disruptive import ( "context" _ "embed" "log" "os" "os/signal" "testing" "sigs.k8s.io/e2e-framework/pkg/env" "sigs.k8s.io/e2e-framework/pkg/envconf" ) var ( testenv env.Environment ) func TestMain(m *testing.M) { cfg, err := envconf.NewFromFlags() if err != nil { log.Fatalf("failed to initialize test environment: %v", err) } testenv = env.NewWithConfig(cfg) ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt) defer cancel() testenv = testenv.WithContext(ctx) testenv.Setup(func(ctx context.Context, config *envconf.Config) (context.Context, error) { log.Println("Starting quick test suite...") return ctx, nil }) os.Exit(testenv.Run(m)) } ================================================ FILE: test/cases/dra/dra_example_driver_test.go ================================================ //go:build e2e package dra import ( "context" _ "embed" "fmt" "testing" "time" 
"github.com/aws/aws-k8s-tester/internal/e2e" "github.com/stretchr/testify/assert" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" rbacv1 "k8s.io/api/rbac/v1" "k8s.io/api/resource/v1beta1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/e2e-framework/klient/k8s" "sigs.k8s.io/e2e-framework/klient/wait" "sigs.k8s.io/e2e-framework/klient/wait/conditions" "sigs.k8s.io/e2e-framework/pkg/envconf" "sigs.k8s.io/e2e-framework/pkg/features" ) // see: https://github.com/kubernetes-sigs/dra-example-driver func TestDraExampleDriver(t *testing.T) { draDriverResources := draDriverResources() deviceClass, resourceClaimTemplate, pod := testResources() exampleDraDriver := features.New("dra-example-driver"). WithLabel("feature", "dra"). Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { for _, obj := range draDriverResources { assert.NoError(t, cfg.Client().Resources().Create(ctx, obj)) } assert.NoError(t, cfg.Client().Resources().Create(ctx, &deviceClass)) assert.NoError(t, cfg.Client().Resources().Create(ctx, &resourceClaimTemplate)) return ctx }). 
Assess("device driver present", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { assert.NoError(t, cfg.Client().Resources().Create(ctx, &pod)) defer func() { assert.NoError(t, cfg.Client().Resources().Delete(ctx, &pod)) assert.NoError(t, wait.For(conditions.New(cfg.Client().Resources()).ResourceDeleted(&pod), wait.WithTimeout(time.Minute), wait.WithContext(ctx), )) }() assert.NoError(t, wait.For(conditions.New(cfg.Client().Resources()).PodRunning(&pod), wait.WithTimeout(time.Minute), wait.WithContext(ctx), )) podLogs, err := e2e.ReadPodLogs(ctx, cfg.Client().RESTConfig(), pod.Namespace, pod.Name, pod.Spec.Containers[0].Name) if assert.NoErrorf(t, err, "skipping error getting pod logs %q: %v", pod.Name, err) { t.Logf("Logs for %q\n%s", pod.Name, podLogs) assert.Contains(t, podLogs, fmt.Sprintf(`DRA_RESOURCE_DRIVER_NAME="%s"`, deviceClass.Name)) } return ctx }). Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { assert.NoError(t, cfg.Client().Resources().Delete(ctx, &deviceClass)) assert.NoError(t, cfg.Client().Resources().Delete(ctx, &resourceClaimTemplate)) for _, obj := range draDriverResources { assert.NoError(t, cfg.Client().Resources().Delete(ctx, obj)) } return ctx }). 
Feature() testenv.Test(t, exampleDraDriver) } func testResources() (v1beta1.DeviceClass, v1beta1.ResourceClaimTemplate, corev1.Pod) { deviceClass := v1beta1.DeviceClass{ TypeMeta: metav1.TypeMeta{ APIVersion: "resource.k8s.io/v1beta1", Kind: "DeviceClass", }, ObjectMeta: metav1.ObjectMeta{ Name: "gpu.example.com", }, Spec: v1beta1.DeviceClassSpec{ Selectors: []v1beta1.DeviceSelector{ { CEL: &v1beta1.CELDeviceSelector{ Expression: "device.driver == 'gpu.example.com'", }, }, }, }, } deviceRequest := v1beta1.DeviceRequest{ Name: "gpu", DeviceClassName: deviceClass.Name, } resourceClaimTemplate := v1beta1.ResourceClaimTemplate{ TypeMeta: metav1.TypeMeta{ APIVersion: "resource.k8s.io/v1beta1", Kind: "ResourceClaimTemplate", }, ObjectMeta: metav1.ObjectMeta{ Name: "single-gpu", Namespace: corev1.NamespaceDefault, }, Spec: v1beta1.ResourceClaimTemplateSpec{ Spec: v1beta1.ResourceClaimSpec{ Devices: v1beta1.DeviceClaim{ Requests: []v1beta1.DeviceRequest{deviceRequest}, }, }, }, } pod := corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ Name: "pod0", Namespace: corev1.NamespaceDefault, }, Spec: corev1.PodSpec{ Containers: []corev1.Container{ { Name: "ctr0", Image: "public.ecr.aws/amazonlinux/amazonlinux:latest", Command: []string{"bash", "-c"}, Args: []string{"export; trap 'exit 0' TERM; sleep infinity & wait"}, Resources: corev1.ResourceRequirements{ Claims: []corev1.ResourceClaim{ { Name: deviceRequest.Name, }, }, }, }, }, ResourceClaims: []corev1.PodResourceClaim{ { Name: deviceRequest.Name, ResourceClaimTemplateName: &resourceClaimTemplate.Name, }, }, }, } return deviceClass, resourceClaimTemplate, pod } func draDriverResources() []k8s.Object { serviceAccount := corev1.ServiceAccount{ TypeMeta: metav1.TypeMeta{ APIVersion: "rbac.authorization.k8s.io/v1", Kind: "ServiceAccount", }, ObjectMeta: metav1.ObjectMeta{ Name: "dra-service-account", Namespace: corev1.NamespaceDefault, }, } clusterRole := rbacv1.ClusterRole{ TypeMeta: metav1.TypeMeta{ APIVersion: 
"rbac.authorization.k8s.io/v1", Kind: "ClusterRole", }, ObjectMeta: metav1.ObjectMeta{ Name: "dra-example-driver-role", Namespace: corev1.NamespaceDefault, }, Rules: []rbacv1.PolicyRule{ { APIGroups: []string{"resource.k8s.io"}, Resources: []string{"resourceclaims"}, Verbs: []string{"get"}, }, { APIGroups: []string{""}, Resources: []string{"nodes"}, Verbs: []string{"get"}, }, { APIGroups: []string{"resource.k8s.io"}, Resources: []string{"resourceslices"}, Verbs: []string{"get", "list", "watch", "create", "update", "patch", "delete"}, }, }, } clusterRoleBinding := rbacv1.ClusterRoleBinding{ TypeMeta: metav1.TypeMeta{ APIVersion: "rbac.authorization.k8s.io/v1", Kind: "ClusterRoleBinding", }, ObjectMeta: metav1.ObjectMeta{ Name: "dra-example-driver-role-binding", Namespace: corev1.NamespaceDefault, }, Subjects: []rbacv1.Subject{ { Kind: serviceAccount.Kind, Name: serviceAccount.Name, Namespace: serviceAccount.Namespace, }, }, RoleRef: rbacv1.RoleRef{ Name: clusterRole.Name, Kind: clusterRole.Kind, APIGroup: "rbac.authorization.k8s.io", }, } driverDaemonset := appsv1.DaemonSet{ ObjectMeta: metav1.ObjectMeta{ Name: "dra-example-driver-kubeletplugin", Namespace: corev1.NamespaceDefault, Labels: map[string]string{ "app.kubernetes.io/name": "dra-example-driver", "app.kubernetes.io/instance": "dra-example-driver", "app.kubernetes.io/component": "kubeletplugin", }, }, Spec: appsv1.DaemonSetSpec{ Selector: &metav1.LabelSelector{ MatchLabels: map[string]string{ "app.kubernetes.io/name": "dra-example-driver", "app.kubernetes.io/instance": "dra-example-driver", "app.kubernetes.io/component": "kubeletplugin", }, }, UpdateStrategy: appsv1.DaemonSetUpdateStrategy{ Type: appsv1.RollingUpdateDaemonSetStrategyType, }, Template: corev1.PodTemplateSpec{ ObjectMeta: metav1.ObjectMeta{ Labels: map[string]string{ "app.kubernetes.io/name": "dra-example-driver", "app.kubernetes.io/instance": "dra-example-driver", "app.kubernetes.io/component": "kubeletplugin", }, }, Spec: corev1.PodSpec{ 
ServiceAccountName: serviceAccount.Name, PriorityClassName: "system-node-critical", Containers: []corev1.Container{ { Name: "plugin", SecurityContext: &corev1.SecurityContext{Privileged: &[]bool{true}[0]}, Image: "registry.k8s.io/dra-example-driver/dra-example-driver:v0.1.0", ImagePullPolicy: corev1.PullIfNotPresent, Command: []string{"dra-example-kubeletplugin"}, Env: []corev1.EnvVar{ {Name: "CDI_ROOT", Value: "/var/run/cdi"}, {Name: "NODE_NAME", ValueFrom: &corev1.EnvVarSource{FieldRef: &corev1.ObjectFieldSelector{FieldPath: "spec.nodeName"}}}, {Name: "NAMESPACE", ValueFrom: &corev1.EnvVarSource{FieldRef: &corev1.ObjectFieldSelector{FieldPath: "metadata.namespace"}}}, // NOTE: this is what arbitrarily decides the // number of GPUs being mocked on the node. {Name: "NUM_DEVICES", Value: "8"}, }, VolumeMounts: []corev1.VolumeMount{ {Name: "plugins-registry", MountPath: "/var/lib/kubelet/plugins_registry"}, {Name: "plugins", MountPath: "/var/lib/kubelet/plugins"}, {Name: "cdi", MountPath: "/var/run/cdi"}, }, }, }, Volumes: []corev1.Volume{ {Name: "plugins-registry", VolumeSource: corev1.VolumeSource{HostPath: &corev1.HostPathVolumeSource{Path: "/var/lib/kubelet/plugins_registry"}}}, {Name: "plugins", VolumeSource: corev1.VolumeSource{HostPath: &corev1.HostPathVolumeSource{Path: "/var/lib/kubelet/plugins"}}}, {Name: "cdi", VolumeSource: corev1.VolumeSource{HostPath: &corev1.HostPathVolumeSource{Path: "/var/run/cdi"}}}, }, }, }, }, } return []k8s.Object{ &clusterRoleBinding, &clusterRole, &serviceAccount, &driverDaemonset, } } ================================================ FILE: test/cases/dra/main_test.go ================================================ //go:build e2e package dra import ( "context" _ "embed" "log" "os" "os/signal" "testing" "sigs.k8s.io/e2e-framework/pkg/env" "sigs.k8s.io/e2e-framework/pkg/envconf" ) var ( testenv env.Environment ) func TestMain(m *testing.M) { ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt) defer cancel() 
cfg, err := envconf.NewFromFlags() if err != nil { log.Fatalf("failed to initialize test environment: %v", err) } testenv = env.NewWithConfig(cfg).WithContext(ctx) os.Exit(testenv.Run(m)) } ================================================ FILE: test/cases/efa/commons.go ================================================ //go:build e2e package efa import ( "context" _ "embed" "fmt" "log" "github.com/aws/aws-k8s-tester/internal/e2e" "github.com/aws/aws-sdk-go-v2/aws" corev1 "k8s.io/api/core/v1" v1 "k8s.io/api/core/v1" "k8s.io/client-go/kubernetes" "sigs.k8s.io/e2e-framework/pkg/env" "sigs.k8s.io/e2e-framework/pkg/envconf" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) var ( testenv env.Environment ec2Client e2e.EC2Client testImage *string pingPongSize *string pingPongIters *int pingPongDeadlineSeconds *int nodeType *string expectedEFADeviceCount *int verbose *bool ) const ( EFA_RESOURCE_NAME = "vpc.amazonaws.com/efa" TEST_NAMESPACE_NAME = "efa-tests" ) func getEfaCapacity(node corev1.Node) int { capacity, ok := node.Status.Capacity[v1.ResourceName(EFA_RESOURCE_NAME)] if !ok { return 0 } return int(capacity.Value()) } func getEfaNodes(ctx context.Context, config *envconf.Config) ([]corev1.Node, error) { var efaNodes []corev1.Node clientset, err := kubernetes.NewForConfig(config.Client().RESTConfig()) if err != nil { return []corev1.Node{}, fmt.Errorf("failed to create Kubernetes client: %w", err) } nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) if err != nil { return []corev1.Node{}, fmt.Errorf("failed to list nodes: %w", err) } if len(nodes.Items) == 0 { return []corev1.Node{}, fmt.Errorf("no nodes found in the cluster") } for _, node := range nodes.Items { instanceType := node.Labels["node.kubernetes.io/instance-type"] if aws.ToString(nodeType) != "" && instanceType != aws.ToString(nodeType) { log.Printf("[INFO] Skipping node %s (type: %s), node is not of target type %s", node.Name, instanceType, aws.ToString(nodeType)) continue } 
numEfaDevices, err := e2e.GetNonZeroResourceCapacity(&node, EFA_RESOURCE_NAME) if err != nil { log.Printf("[INFO] Skipping node %s (type: %s): %v", node.Name, instanceType, err) continue } expectedDeviceCount := aws.ToInt(expectedEFADeviceCount) if expectedDeviceCount < 0 { instanceInfo, err := ec2Client.DescribeInstanceType(instanceType) if err != nil { return []corev1.Node{}, err } expectedDeviceCount = int(aws.ToInt32(instanceInfo.NetworkInfo.EfaInfo.MaximumEfaInterfaces)) } if expectedDeviceCount != numEfaDevices { return []corev1.Node{}, fmt.Errorf("unexpected EFA device capacity on node %s: expected %d, got %d", node.Name, expectedDeviceCount, numEfaDevices) } efaNodes = append(efaNodes, node) } if len(efaNodes) == 0 { return []corev1.Node{}, fmt.Errorf("no nodes with EFA capacity found in the cluster") } return efaNodes, nil } ================================================ FILE: test/cases/efa/main_test.go ================================================ //go:build e2e package efa import ( "context" _ "embed" "flag" "log" "os" "os/signal" "testing" "time" "github.com/aws/aws-k8s-tester/internal/e2e" "github.com/aws/aws-k8s-tester/test/manifests" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/e2e-framework/klient/wait" "sigs.k8s.io/e2e-framework/pkg/env" "sigs.k8s.io/e2e-framework/pkg/envconf" ) func getTestNamespace() *corev1.Namespace { return &corev1.Namespace{ ObjectMeta: metav1.ObjectMeta{ Name: TEST_NAMESPACE_NAME, }, } } func deployEFAPlugin(ctx context.Context, config *envconf.Config) (context.Context, error) { err := e2e.ApplyManifests(config.Client().RESTConfig(), manifests.EfaDevicePluginManifest) if err != nil { return ctx, err } efaDS := appsv1.DaemonSet{ ObjectMeta: metav1.ObjectMeta{Name: "aws-efa-k8s-device-plugin-daemonset", Namespace: "kube-system"}, } err = wait.For(e2e.NewConditionExtension(config.Client().Resources()).DaemonSetReady(&efaDS), wait.WithContext(ctx), 
wait.WithTimeout(5*time.Minute), ) if err != nil { return ctx, err } return ctx, nil } func TestMain(m *testing.M) { testImage = flag.String("testImage", "", "container image to use for tests") pingPongSize = flag.String("pingPongSize", "all", "sizes to use for ping pong") pingPongIters = flag.Int("pingPongIters", 10000, "number of iterations to use for ping pong") pingPongDeadlineSeconds = flag.Int("pingPongDeadlineSeconds", 120, "maximum run time for a ping pong attempt") nodeType = flag.String("nodeType", "", "instance type to target for tests") expectedEFADeviceCount = flag.Int("expectedEFADeviceCount", -1, "expected number of efa devices for the target nodes") verbose = flag.Bool("verbose", true, "use verbose mode for tests") cfg, err := envconf.NewFromFlags() if err != nil { log.Fatalf("failed to initialize test environment: %v", err) } if *testImage == "" { log.Fatal("--testImage must be set, use https://github.com/aws/aws-k8s-tester/blob/main/test/efa/Dockerfile to build the image") } ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt) defer cancel() timedCtx, cancel := context.WithTimeout(ctx, 55*time.Minute) defer cancel() testenv = env.NewWithConfig(cfg) testenv = testenv.WithContext(timedCtx) ec2Client = e2e.NewEC2Client() testenv.Setup( deployEFAPlugin, func(ctx context.Context, config *envconf.Config) (context.Context, error) { select { case <-ctx.Done(): // Cooldown to let device plugin update node object with resources case <-time.After(15 * time.Second): } return ctx, cfg.Client().Resources().Create(ctx, getTestNamespace()) }, ) testenv.Finish( func(ctx context.Context, config *envconf.Config) (context.Context, error) { cfg.Client().Resources().Delete(context.TODO(), getTestNamespace()) err := e2e.DeleteManifests(cfg.Client().RESTConfig(), manifests.EfaDevicePluginManifest) if err != nil { return ctx, err } return ctx, nil }, ) os.Exit(testenv.Run(m)) } ================================================ FILE: 
// getPingPongResourceLabels builds the label set for a ping-pong resource.
// Every resource carries "test-suite": "pingpong"; the "pingpong-server" label
// holds the textual form of the server argument ("true" or "false").
func getPingPongResourceLabels(server bool) map[string]string {
	labels := make(map[string]string, 2)
	labels["test-suite"] = "pingpong"
	labels["pingpong-server"] = fmt.Sprint(server)
	return labels
}
PING_PONG_SERVICE_NAME, RestartPolicy: v1.RestartPolicyOnFailure, // TODO: centralize re-usable logic for pod spec formatting Affinity: &corev1.Affinity{ NodeAffinity: &corev1.NodeAffinity{ RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{ NodeSelectorTerms: []corev1.NodeSelectorTerm{ { MatchExpressions: []corev1.NodeSelectorRequirement{ { Key: "kubernetes.io/hostname", Operator: "In", Values: []string{ node.Name, }, }, }, }, }, }, }, }, Containers: []corev1.Container{ { Name: "pingpong", Image: aws.ToString(testImage), Command: []string{"timeout", fmt.Sprintf("%ds", aws.ToInt(pingPongDeadlineSeconds)), PINGPONG_COMMAND}, Args: getPingPongArgs(server), Resources: corev1.ResourceRequirements{ Requests: corev1.ResourceList{ EFA_RESOURCE_NAME: efaResourceQuantity, }, Limits: corev1.ResourceList{ EFA_RESOURCE_NAME: efaResourceQuantity, }, }, }, }, }, } } func getPingPongPods(ctx context.Context, config *envconf.Config) (corev1.Pod, corev1.Pod, error) { efaNodes, err := getEfaNodes(ctx, config) if err != nil { return corev1.Pod{}, corev1.Pod{}, err } if len(efaNodes) < 2 { return corev1.Pod{}, corev1.Pod{}, fmt.Errorf("need at least 2 nodes with EFA capacity, got %d", len(efaNodes)) } serverNode := efaNodes[0] log.Printf("[INFO] Using node %s (type: %s), as server", serverNode.Name, serverNode.Labels["node.kubernetes.io/instance-type"]) clientNode := efaNodes[1] log.Printf("[INFO] Using node %s (type: %s), as client", clientNode.Name, clientNode.Labels["node.kubernetes.io/instance-type"]) return generatePingPongPodManifest(true, serverNode), generatePingPongPodManifest(false, clientNode), nil } func TestPingPong(t *testing.T) { var err error var pingPongService corev1.Service var client, server corev1.Pod pingpong := features.New("pingpong"). WithLabel("suite", "efa"). 
Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { pingPongService = generatePingPongServiceManifest() client, server, err = getPingPongPods(ctx, cfg) if err != nil { t.Fatal(err) } assert.NoError(t, cfg.Client().Resources().Create(ctx, &pingPongService)) assert.NoError(t, cfg.Client().Resources().Create(ctx, &server)) assert.NoError(t, cfg.Client().Resources().Create(ctx, &client)) return ctx }). Assess("Pingpong between nodes succeeds", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { assert.NoError(t, wait.For(conditions.New(cfg.Client().Resources()).PodPhaseMatch(&server, v1.PodSucceeded), wait.WithTimeout(15*time.Minute), wait.WithContext(ctx), )) return ctx }). Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { serverPodLogs, err := e2e.ReadPodLogs(ctx, cfg.Client().RESTConfig(), server.Namespace, server.Name, server.Spec.Containers[0].Name) if err != nil { t.Logf("Could not get pods for server") } t.Logf("Logs for server\n%s", serverPodLogs) assert.NoError(t, cfg.Client().Resources().Delete(ctx, &pingPongService)) assert.NoError(t, cfg.Client().Resources().Delete(ctx, &server)) assert.NoError(t, cfg.Client().Resources().Delete(ctx, &client)) return ctx }). 
Feature() testenv.Test(t, pingpong) } ================================================ FILE: test/cases/efa/unit_test.go ================================================ //go:build e2e package efa import ( "context" _ "embed" "fmt" "testing" "time" "github.com/aws/aws-k8s-tester/internal/e2e" "github.com/aws/aws-sdk-go-v2/aws" "github.com/stretchr/testify/assert" corev1 "k8s.io/api/core/v1" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/e2e-framework/klient/wait" "sigs.k8s.io/e2e-framework/klient/wait/conditions" "sigs.k8s.io/e2e-framework/pkg/envconf" "sigs.k8s.io/e2e-framework/pkg/features" ) func generateUnitTestManifest(node corev1.Node, testIndex int) corev1.Pod { efaAllocatable := fmt.Sprint(getEfaCapacity(node)) efaResourceQuantity := resource.MustParse(efaAllocatable) return corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ Name: fmt.Sprintf("efa-unit-%d", testIndex), Namespace: TEST_NAMESPACE_NAME, }, Spec: corev1.PodSpec{ RestartPolicy: v1.RestartPolicyOnFailure, // TODO: centralize re-usable logic for pod spec fkormatting Affinity: &corev1.Affinity{ NodeAffinity: &corev1.NodeAffinity{ RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{ NodeSelectorTerms: []corev1.NodeSelectorTerm{ { MatchExpressions: []corev1.NodeSelectorRequirement{ { Key: "kubernetes.io/hostname", Operator: "In", Values: []string{ node.Name, }, }, }, }, }, }, }, }, Containers: []corev1.Container{ { Name: "unit-test", Image: aws.ToString(testImage), Command: []string{"./scripts/unit-test.sh"}, Env: []v1.EnvVar{ { Name: "EXPECTED_EFA_DEVICE_COUNT", Value: efaAllocatable, }, { Name: "EC2_INSTANCE_TYPE", Value: node.Labels["node.kubernetes.io/instance-type"], }, }, Resources: corev1.ResourceRequirements{ Requests: corev1.ResourceList{ EFA_RESOURCE_NAME: efaResourceQuantity, }, Limits: corev1.ResourceList{ EFA_RESOURCE_NAME: efaResourceQuantity, }, }, }, }, }, } } func getUnitTestPodManifests(ctx 
context.Context, config *envconf.Config) ([]corev1.Pod, error) { var podManifests []corev1.Pod efaNodes, err := getEfaNodes(ctx, config) if err != nil { return []corev1.Pod{}, err } for nodeIndex, node := range efaNodes { podManifests = append(podManifests, generateUnitTestManifest(node, nodeIndex)) } return podManifests, err } func TestUnit(t *testing.T) { var err error var pods []corev1.Pod unit := features.New("unit"). WithLabel("suite", "efa"). Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { pods, err = getUnitTestPodManifests(ctx, cfg) if err != nil { t.Fatalf("Failed to generate unit test manifests: %v", err) } for _, pod := range pods { assert.NoError(t, cfg.Client().Resources().Create(ctx, &pod)) } return ctx }). Assess("Unit test succeeds", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { suiteCtx, cancel := context.WithTimeout(ctx, 20*time.Minute) defer cancel() for _, pod := range pods { assert.NoError(t, wait.For(conditions.New(cfg.Client().Resources()).PodPhaseMatch(&pod, v1.PodSucceeded), wait.WithContext(suiteCtx), )) } return ctx }). Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { for _, pod := range pods { podLogs, err := e2e.ReadPodLogs(ctx, cfg.Client().RESTConfig(), pod.Namespace, pod.Name, pod.Spec.Containers[0].Name) if err != nil { t.Logf("Could not get logs for pod %q", pod.Name) } else { t.Logf("Logs for pod %q\n%s", pod.Name, podLogs) } } for _, pod := range pods { assert.NoError(t, cfg.Client().Resources().Delete(ctx, &pod)) } return ctx }). Feature() testenv.Test(t, unit) } ================================================ FILE: test/cases/fips/README.md ================================================ # FIPS TLS Compliance Test This test validates that FIPS-enabled EKS nodes enforce FIPS-compliant TLS cipher suites when pulling container images. ## What It Does 1. 
Deploys two local container registries as DaemonSets on each node: - `registry-fips` (port 5000) — serves TLS using the node's default (FIPS-compliant) cipher suites - `registry-nonfips` (port 5001) — an nginx reverse proxy configured to only offer `ECDHE-RSA-CHACHA20-POLY1305`, a non-FIPS cipher 2. Seeds both registries with a test image via `skopeo` 3. Runs two test pods: - `test-pull-fips` — pulls from `localhost:5000` and expects success - `test-pull-nonfips` — pulls from `localhost:5001` and expects `ImagePullBackOff` (TLS handshake failure) ## Prerequisites - An EKS cluster with FIPS-enabled nodes - TLS certificates available on each node at `/mnt/server-conf/certs/`: - `server.crt` — server certificate - `server.key` — private key - `kubeconfig` configured for the target cluster - Go 1.21+ ## Host Setup ### Amazon Linux 2023 FIPS mode must be enabled at launch time via the EKS AMI. Use a FIPS-enabled AL2023 AMI when creating the nodegroup: ```bash # Create a FIPS-enabled nodegroup with eksctl kubetest2 eksctl \ --kubernetes-version=X.XX \ --ami-family=AmazonLinux2023 \ --up \ --down \ --test=exec \ -- ``` Verify FIPS is active on a node: ```bash # SSH into a node and check cat /proc/sys/crypto/fips_enabled # Expected output: 1 # Or check via sysctl sysctl crypto.fips_enabled # Expected output: crypto.fips_enabled = 1 ``` Generate the TLS certificates on each node: ```bash sudo mkdir -p /mnt/server-conf/certs sudo openssl req -x509 -nodes -days 365 -newkey rsa:2048 \ -keyout /mnt/server-conf/certs/server.key \ -out /mnt/server-conf/certs/server.crt \ -subj "/CN=localhost" \ -addext "subjectAltName=DNS:localhost,IP:127.0.0.1" ``` Add the certificate to the node's trust store so containerd trusts the local registries: ```bash sudo cp /mnt/server-conf/certs/server.crt /etc/pki/ca-trust/source/anchors/ sudo update-ca-trust sudo systemctl restart containerd ``` Without this, containerd will reject the self-signed cert and both test pods would fail with 
`ImagePullBackOff`. ### Bottlerocket Bottlerocket is an immutable OS — you can't SSH in and run `openssl` directly. Certs must be provisioned via a bootstrap container that runs before kubelet starts. **1. Build the bootstrap container image** The Dockerfile is minimal — it runs a user-data script at boot: ```dockerfile FROM public.ecr.aws/docker/library/alpine:latest RUN apk add --no-cache openssl curl ENTRYPOINT ["/bin/sh", "/.bottlerocket/bootstrap-containers/gen-certs/user-data"] ``` Build and push to ECR: ```bash docker build -t ${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/cert-bootstrap:v1 . docker push ${AWS_ACCOUNT_ID}.dkr.ecr.${AWS_REGION}.amazonaws.com/cert-bootstrap:v1 ``` **2. Prepare the cert generation script** The cert generation script generates a CA + server cert, writes them to the host at `/mnt/server-conf/certs/`, and registers the CA with Bottlerocket's trust store via `apiclient`: ```bash #!/bin/sh set -xe WORK_DIR=$(mktemp -d) CERTS_DIR=/.bottlerocket/rootfs/mnt/server-conf/certs CSR_CONF=${WORK_DIR}/csr.conf CA_CRT=${WORK_DIR}/ca.crt CA_KEY=${WORK_DIR}/ca.key mkdir -p ${CERTS_DIR} # Generate CA openssl genrsa -out ${CA_KEY} 2048 openssl req -x509 -new -nodes -key ${CA_KEY} \ -subj "/CN=Bottlerocket Test CA/C=US/ST=WASHINGTON/L=Seattle/O=Bottlerocket" \ -days 1825 -out ${CA_CRT} # Get instance metadata TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600") DOMAIN=$(curl -H "X-aws-ec2-metadata-token: ${TOKEN}" http://169.254.169.254/latest/meta-data/public-hostname) IP=$(curl -H "X-aws-ec2-metadata-token: ${TOKEN}" http://169.254.169.254/latest/meta-data/public-ipv4) # Generate CSR config with real values cat > ${CSR_CONF} < ``` Verify FIPS on Bottlerocket (via the admin container): ```bash cat /proc/sys/crypto/fips_enabled # Expected output: 1 ``` ## Running the Test ```bash # Run all FIPS test cases go test -tags e2e -v ./test/cases/fips/ --kubeconfig=$HOME/.kube/config # Run a specific test case by label go test -tags e2e -v
./test/cases/fips/ --kubeconfig=$HOME/.kube/config -labels="suite=fips" ``` Or via `kubetest2`: ```bash kubetest2 eksctl \ --kubernetes-version=X.XX \ --ami-family= \ --up \ --down \ --test=exec \ -- fips.test -v ``` ## Test Cases | Test | Description | Expected Result | |------|-------------|-----------------| | `fips-tls-pull` | Pull image from FIPS-cipher registry (port 5000) | Pod succeeds | | `nonfips-tls-pull` | Pull image from non-FIPS-cipher registry (port 5001) | `ImagePullBackOff` — TLS handshake rejected | ================================================ FILE: test/cases/fips/fips_test.go ================================================ //go:build e2e package fips import ( "context" _ "embed" "io" "strings" "testing" "time" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" "sigs.k8s.io/e2e-framework/klient/wait" "sigs.k8s.io/e2e-framework/klient/wait/conditions" "sigs.k8s.io/e2e-framework/pkg/envconf" "sigs.k8s.io/e2e-framework/pkg/features" ) const ( pullTimeout = 5 * time.Minute rejectTimeout = 2 * time.Minute ) var ( //go:embed manifests/registry-fips.yaml registryFIPSManifest []byte //go:embed manifests/registry-nonfips.yaml registryNonFIPSManifest []byte //go:embed manifests/test-pods.yaml testPodsManifest []byte ) func verifyNonfipsCipherRejection(ctx context.Context, t *testing.T, cfg *envconf.Config) { t.Helper() clientset, err := kubernetes.NewForConfig(cfg.Client().RESTConfig()) if err != nil { t.Fatalf("could not create clientset for log verification: %v", err) } logCtx, logCancel := context.WithTimeout(ctx, logFetchTimeout) defer logCancel() pods, err := clientset.CoreV1().Pods("default").List(logCtx, metav1.ListOptions{ LabelSelector: "name=registry-nonfips", }) if err != nil { t.Fatalf("failed to list registry-nonfips pods: %v", err) } if len(pods.Items) == 0 { t.Fatal("no registry-nonfips pods found for log verification") } for _, pod := range pods.Items { req := 
clientset.CoreV1().Pods("default").GetLogs(pod.Name, &v1.PodLogOptions{ Container: "nginx", TailLines: int64Ptr(50), }) stream, err := req.Stream(logCtx) if err != nil { continue } body, _ := io.ReadAll(stream) stream.Close() logs := string(body) t.Logf("registry-nonfips nginx logs:\n%s", logs) if strings.Contains(logs, "no shared cipher") { t.Log("Verified: FIPS node rejected non-FIPS cipher suite (no shared cipher)") return } } t.Fatal("Expected 'no shared cipher' in registry-nonfips nginx logs but not found") } func TestFIPSTLS(t *testing.T) { fipsPull := features.New("fips-tls-pull"). WithLabel("suite", "fips"). Assess("Pull from FIPS-cipher registry succeeds", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { pod := &v1.Pod{ ObjectMeta: metav1.ObjectMeta{Name: "test-pull-fips", Namespace: "default"}, } err := wait.For( conditions.New(cfg.Client().Resources()).PodPhaseMatch(pod, v1.PodSucceeded), wait.WithContext(ctx), wait.WithTimeout(pullTimeout), ) if err != nil { t.Fatalf("test-pull-fips pod did not succeed: %v", err) } t.Log("FIPS TLS pull succeeded as expected") return ctx }). Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { cfg.Client().Resources().Delete(ctx, &v1.Pod{ ObjectMeta: metav1.ObjectMeta{Name: "test-pull-fips", Namespace: "default"}, }) return ctx }). Feature() nonfipsPull := features.New("nonfips-tls-pull"). WithLabel("suite", "fips"). 
Assess("Pull from non-FIPS-cipher registry fails on FIPS node", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { pod := &v1.Pod{ ObjectMeta: metav1.ObjectMeta{Name: "test-pull-nonfips", Namespace: "default"}, } // Poll for ImagePullBackOff/ErrImagePull — pod won't reach PodFailed phase deadline := time.Now().Add(rejectTimeout) for time.Now().Before(deadline) { select { case <-ctx.Done(): t.Fatalf("context cancelled while waiting for ImagePullBackOff: %v", ctx.Err()) default: } err := cfg.Client().Resources().Get(ctx, "test-pull-nonfips", "default", pod) if err != nil { t.Fatalf("failed to get test-pull-nonfips pod: %v", err) } // #1: Log pod status during polling t.Logf("Polling test-pull-nonfips: Phase=%s", pod.Status.Phase) for _, cs := range pod.Status.ContainerStatuses { if cs.State.Waiting != nil { t.Logf(" Container %s: Waiting (Reason=%s)", cs.Name, cs.State.Waiting.Reason) } else if cs.State.Running != nil { t.Logf(" Container %s: Running", cs.Name) } else if cs.State.Terminated != nil { t.Logf(" Container %s: Terminated (Reason=%s)", cs.Name, cs.State.Terminated.Reason) } } // #2: Detect unexpected success if pod.Status.Phase == v1.PodSucceeded { t.Fatal("test-pull-nonfips pod succeeded — expected ImagePullBackOff. Is this a FIPS node?") } for _, cs := range pod.Status.ContainerStatuses { if cs.State.Running != nil && cs.Ready { t.Fatal("test-pull-nonfips container is running — image pull succeeded. Is this a FIPS node?") } if cs.State.Waiting != nil && (cs.State.Waiting.Reason == "ImagePullBackOff" || cs.State.Waiting.Reason == "ErrImagePull") { verifyNonfipsCipherRejection(ctx, t, cfg) t.Log("Non-FIPS TLS pull correctly rejected (ImagePullBackOff)") return ctx } } time.Sleep(pollInterval) } t.Fatal("test-pull-nonfips did not reach ImagePullBackOff within timeout") return ctx }). 
// Timing knobs shared by the FIPS suite's setup and assertions.
const (
	pollInterval     = 5 * time.Second  // polling interval for waitForSeed and status checks
	seedTimeout      = 5 * time.Minute  // apk install + skopeo copy can be slow on first pull
	daemonSetTimeout = 2 * time.Minute  // per DaemonSet; image pulls vary by network
	logFetchTimeout  = 30 * time.Second // timeout for fetching pod logs
	// Worst-case Setup: 2x daemonSetTimeout (4m) + 2x seedTimeout (10m) = ~14m
)
if cs.State.Waiting != nil { log.Printf(" Waiting: %s - %s", cs.State.Waiting.Reason, cs.State.Waiting.Message) } if cs.State.Terminated != nil { log.Printf(" Terminated: %s - %s", cs.State.Terminated.Reason, cs.State.Terminated.Message) } if (cs.State.Waiting != nil && cs.State.Waiting.Reason == "CrashLoopBackOff") || cs.RestartCount > 0 { req := clientset.CoreV1().Pods("default").GetLogs(pod.Name, &v1.PodLogOptions{ Container: cs.Name, TailLines: int64Ptr(20), }) stream, err := req.Stream(ctx) if err == nil { body, _ := io.ReadAll(stream) stream.Close() log.Printf(" Last logs:\n%s", string(body)) } } } } } func logNodeInfo(ctx context.Context, clientset *kubernetes.Clientset) { nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) if err != nil { log.Printf("Warning: could not list nodes: %v", err) return } for _, node := range nodes.Items { osImage := node.Status.NodeInfo.OSImage isFIPS := strings.Contains(strings.ToLower(osImage), "fips") log.Printf("Node %s: OS=%s, FIPS=%v", node.Name, osImage, isFIPS) } } // normally this will only take couple seconds. 
func waitForSeed(ctx context.Context, clientset *kubernetes.Clientset, dsName string) error { log.Printf("Waiting for %s seed container to complete...", dsName) deadline := time.Now().Add(seedTimeout) var lastLogs string for time.Now().Before(deadline) { select { case <-ctx.Done(): return ctx.Err() default: } pods, err := clientset.CoreV1().Pods("default").List(ctx, metav1.ListOptions{ LabelSelector: "name=" + dsName, }) if err != nil { return err } if len(pods.Items) == 0 { log.Printf("%s: no pods found yet, waiting...", dsName) time.Sleep(pollInterval) continue } allSeeded := true for _, pod := range pods.Items { req := clientset.CoreV1().Pods("default").GetLogs(pod.Name, &v1.PodLogOptions{ Container: "seed-image", }) logCtx, logCancel := context.WithTimeout(ctx, logFetchTimeout) stream, err := req.Stream(logCtx) if err != nil { logCancel() log.Printf("Failed to get logs for %s/%s: %v", dsName, pod.Name, err) allSeeded = false continue } body, _ := io.ReadAll(stream) stream.Close() logCancel() logs := string(body) if strings.Contains(logs, "level=fatal") { return fmt.Errorf("%s seed failed: %s", dsName, logs) } if !strings.Contains(logs, "Image seeded successfully") { allSeeded = false lastLogs = logs } } if allSeeded { log.Printf("%s seed completed successfully on all %d pods", dsName, len(pods.Items)) return nil } log.Printf("%s seed still waiting... 
(got %d bytes of logs)", dsName, len(lastLogs)) time.Sleep(pollInterval) } // Dump last logs on timeout if lastLogs != "" { log.Printf("%s seed timeout - last logs:\n%s", dsName, lastLogs) } return fmt.Errorf("%s seed did not complete within timeout", dsName) } func TestMain(m *testing.M) { cfg, err := envconf.NewFromFlags() if err != nil { log.Fatalf("failed to initialize test environment: %v", err) } testenv = env.NewWithConfig(cfg) ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt) defer cancel() testenv = testenv.WithContext(ctx) testenv.Setup( func(ctx context.Context, config *envconf.Config) (context.Context, error) { clientset, err := kubernetes.NewForConfig(config.Client().RESTConfig()) if err != nil { return ctx, fmt.Errorf("failed to create Kubernetes client: %w", err) } logNodeInfo(ctx, clientset) if err := fwext.ApplyManifests(config.Client().RESTConfig(), registryFIPSManifest); err != nil { return ctx, fmt.Errorf("failed to apply registry-fips manifest: %w", err) } log.Println("registry-fips DaemonSet deployed") if err := fwext.ApplyManifests(config.Client().RESTConfig(), registryNonFIPSManifest); err != nil { return ctx, fmt.Errorf("failed to apply registry-nonfips manifest: %w", err) } log.Println("registry-nonfips DaemonSet deployed") for _, name := range []string{"registry-fips", "registry-nonfips"} { ds := appsv1.DaemonSet{ ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: "default"}, } log.Printf("Waiting for %s DaemonSet to be ready...", name) err := wait.For( fwext.NewConditionExtension(config.Client().Resources()).DaemonSetReady(&ds), wait.WithContext(ctx), wait.WithTimeout(daemonSetTimeout), ) if err != nil { logDaemonSetDiagnostics(ctx, clientset, name) return ctx, fmt.Errorf("%s DaemonSet not ready: %w", name, err) } log.Printf("%s DaemonSet is ready", name) } for _, dsName := range []string{"registry-fips", "registry-nonfips"} { if err := waitForSeed(ctx, clientset, dsName); err != nil { return ctx, fmt.Errorf("seed 
verification failed for %s: %w", dsName, err) } } if err := fwext.ApplyManifests(config.Client().RESTConfig(), testPodsManifest); err != nil { return ctx, fmt.Errorf("failed to apply test-pods manifest: %w", err) } log.Println("test pods deployed") return ctx, nil }, ) testenv.Finish( func(ctx context.Context, config *envconf.Config) (context.Context, error) { fwext.DeleteManifests(config.Client().RESTConfig(), registryFIPSManifest) fwext.DeleteManifests(config.Client().RESTConfig(), registryNonFIPSManifest) fwext.DeleteManifests(config.Client().RESTConfig(), testPodsManifest) return ctx, nil }, ) os.Exit(testenv.Run(m)) } ================================================ FILE: test/cases/fips/manifests/registry-fips.yaml ================================================ apiVersion: apps/v1 kind: DaemonSet metadata: name: registry-fips spec: selector: matchLabels: name: registry-fips template: metadata: labels: name: registry-fips spec: hostNetwork: true containers: - name: registry image: registry:2 env: - name: REGISTRY_HTTP_ADDR value: "0.0.0.0:5000" - name: REGISTRY_HTTP_TLS_CERTIFICATE value: "/certs/server.crt" - name: REGISTRY_HTTP_TLS_KEY value: "/certs/server.key" volumeMounts: - name: certs mountPath: /certs - name: seed-image image: public.ecr.aws/docker/library/alpine:latest command: - /bin/sh - -c - | apk add --no-cache --repository=https://dl-cdn.alpinelinux.org/alpine/edge/community skopeo sleep 5 skopeo copy --dest-tls-verify=false docker://public.ecr.aws/docker/library/alpine:latest docker://127.0.0.1:5000/test:latest echo "Image seeded successfully" sleep infinity volumes: - name: certs hostPath: path: /mnt/server-conf/certs ================================================ FILE: test/cases/fips/manifests/registry-nonfips.yaml ================================================ apiVersion: v1 kind: ConfigMap metadata: name: nginx-nonfips-config data: nginx.conf: | events {} http { error_log /dev/stderr debug; server { listen 5001 ssl; ssl_certificate 
/certs/server.crt; ssl_certificate_key /certs/server.key; ssl_protocols TLSv1.2; ssl_ciphers ECDHE-RSA-CHACHA20-POLY1305; location / { proxy_pass http://127.0.0.1:5002; proxy_set_header Host $host; proxy_set_header X-Real-IP $remote_addr; } } } --- apiVersion: apps/v1 kind: DaemonSet metadata: name: registry-nonfips spec: selector: matchLabels: name: registry-nonfips template: metadata: labels: name: registry-nonfips spec: hostNetwork: true containers: - name: nginx image: public.ecr.aws/nginx/nginx:stable-alpine volumeMounts: - name: certs mountPath: /certs - name: nginx-config mountPath: /etc/nginx/nginx.conf subPath: nginx.conf - name: registry image: registry:2 env: - name: REGISTRY_HTTP_ADDR value: "127.0.0.1:5002" - name: seed-image image: public.ecr.aws/docker/library/alpine:latest command: - /bin/sh - -c - | apk add --no-cache --repository=https://dl-cdn.alpinelinux.org/alpine/edge/community skopeo sleep 5 skopeo copy --dest-tls-verify=false docker://public.ecr.aws/docker/library/alpine:latest docker://127.0.0.1:5002/test:latest echo "Image seeded successfully" sleep infinity volumes: - name: certs hostPath: path: /mnt/server-conf/certs - name: nginx-config configMap: name: nginx-nonfips-config ================================================ FILE: test/cases/fips/manifests/test-pods.yaml ================================================ apiVersion: v1 kind: Pod metadata: name: test-pull-fips spec: containers: - name: test image: localhost:5000/test:latest command: ["echo", "FIPS cipher works"] restartPolicy: Never --- apiVersion: v1 kind: Pod metadata: name: test-pull-nonfips spec: containers: - name: test image: localhost:5001/test:latest command: ["echo", "should not reach here"] restartPolicy: Never ================================================ FILE: test/cases/netpol/main_test.go ================================================ //go:build e2e package netpol import ( "context" "flag" "log" "os" "testing" "time" "github.com/aws/aws-sdk-go-v2/aws" 
"github.com/aws/aws-sdk-go-v2/config"
	"github.com/aws/aws-sdk-go-v2/service/eks"
	"github.com/aws/aws-sdk-go-v2/service/eks/types"
	fwext "github.com/aws/aws-k8s-tester/internal/e2e"
	"github.com/pkg/errors"
	appsv1 "k8s.io/api/apps/v1"
	corev1 "k8s.io/api/core/v1"
	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"sigs.k8s.io/e2e-framework/klient"
	"sigs.k8s.io/e2e-framework/klient/wait"
	"sigs.k8s.io/e2e-framework/klient/wait/conditions"
	"sigs.k8s.io/e2e-framework/pkg/env"
	"sigs.k8s.io/e2e-framework/pkg/envconf"
)

var (
	testenv           env.Environment
	clusterName       string
	endPointUrl       string
	kubernetesVersion string
	addonName         string = "vpc-cni"
)

// TestMain wires up the network-policy suite: it installs the latest VPC CNI
// addon with network policy enabled, creates namespaces a, b and c with one
// nginx server each, runs the tests, then deletes the namespaces and restores
// the default VPC CNI addon version.
func TestMain(m *testing.M) {
	// Register the suite's own flags before envconf.NewFromFlags.
	// NOTE(review): envconf.NewFromFlags is expected to parse the command line
	// itself; registering first keeps -cluster-name / -endpoint-url from being
	// rejected as unknown flags — confirm against the e2e-framework version in use.
	flag.StringVar(&clusterName, "cluster-name", "", "Name of the cluster")
	flag.StringVar(&endPointUrl, "endpoint-url", "", "Endpoint url to use")

	cfg, err := envconf.NewFromFlags()
	if err != nil {
		log.Fatalf("failed to initialize test environment: %v", err)
	}
	flag.Parse()

	// The AWS config error was previously dropped, which would hand a
	// zero-value configuration to the EKS client.
	awsCfg, err := config.LoadDefaultConfig(context.TODO())
	if err != nil {
		log.Fatalf("failed to load AWS configuration: %v", err)
	}
	eksclient := eks.NewFromConfig(awsCfg)
	testenv = env.NewWithConfig(cfg)

	namespaces := []string{"a", "b", "c"}

	testenv.Setup(
		func(ctx context.Context, config *envconf.Config) (context.Context, error) {
			client, err := config.NewClient()
			if err != nil {
				return ctx, err
			}
			servers := map[string]string{
				"a": "a-server",
				"b": "b-server",
				"c": "c-server",
			}

			// 1. Install Latest CNI version
			log.Print("Install the latest VPC-CNI on the cluster")
			kubernetesVersion, err = getClusterVersion(ctx, eksclient)
			if err != nil {
				return ctx, err
			}
			err = installLatestCNIVersion(ctx, config, eksclient)
			if err != nil {
				return ctx, err
			}

			// 2. Create three namespaces
			log.Print("Creating the test namespaces")
			for _, ns := range namespaces {
				err = createNamespace(ns, client, ctx)
				if err != nil {
					return ctx, errors.Wrapf(err, "Failed to create namespace %s", ns)
				}
			}

			// 3. Create deployment and service
			log.Print("Creating the test deployment and service")
			for ns, server := range servers {
				err = createServerAndService(ns, server, 1, client, ctx)
				if err != nil {
					return ctx, errors.Wrapf(err, "Failed to create deployment and service for %s", server)
				}
			}
			return ctx, nil
		},
	)

	testenv.Finish(
		func(ctx context.Context, config *envconf.Config) (context.Context, error) {
			client, err := config.NewClient()
			if err != nil {
				return ctx, err
			}
			log.Print("Deleting the test namespaces")
			for _, ns := range namespaces {
				// Best-effort cleanup: a failed delete must not block restoring the CNI.
				client.Resources().Delete(ctx, &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: ns, Namespace: ns}})
			}
			log.Print("Installing the Default version of VPC-CNI on the cluster")
			err = installDefaultCNIVersion(ctx, config, eksclient)
			if err != nil {
				return ctx, err
			}
			return ctx, nil
		},
	)

	os.Exit(testenv.Run(m))
}

// installDefaultCNIVersion removes the currently installed vpc-cni addon and
// installs it again without an explicit version or configuration values.
func installDefaultCNIVersion(ctx context.Context, config *envconf.Config, eksclient *eks.Client) error {
	// Uninstall the currently installed addon. This stays best-effort (the
	// install below deletes the old DaemonSet itself), but the failure is now
	// logged instead of being silently discarded.
	if err := uninstallCNIAddon(ctx, config, eksclient); err != nil {
		log.Printf("Warning: could not uninstall the current addon: %v", err)
	}
	// Passing addonVersion empty installs the default version of addon
	err := installCNIAddon(ctx, config, eksclient, "", "")
	if err != nil {
		return errors.Wrap(err, "Could not install the default addon version")
	}
	return nil
}

// installLatestCNIVersion installs the first addon version reported for the
// cluster's Kubernetes version, with network policy support enabled.
func installLatestCNIVersion(ctx context.Context, config *envconf.Config, eksclient *eks.Client) error {
	version, err := getLatestCNIAddon(ctx, eksclient)
	if err != nil {
		return err
	}
	configurationValues := "{\"enableNetworkPolicy\": \"true\"}"
	return installCNIAddon(ctx, config, eksclient, version, configurationValues)
}

// uninstallCNIAddon deletes the vpc-cni addon from the cluster and waits up to
// five minutes for the kube-system/aws-node DaemonSet to disappear.
func uninstallCNIAddon(ctx context.Context, config *envconf.Config, eksclient *eks.Client) error {
	cniDS := &appsv1.DaemonSet{ObjectMeta: metav1.ObjectMeta{Name: "aws-node", Namespace: "kube-system"}}
	// The DeleteAddon error used to be overwritten by the wait below; report it
	// rather than waiting five minutes for a deletion that was never requested.
	if _, err := eksclient.DeleteAddon(ctx, &eks.DeleteAddonInput{
		AddonName:   aws.String(addonName),
		ClusterName: aws.String(clusterName),
	}); err != nil {
		return errors.Wrap(err, "Failed to delete addon")
	}
	err := wait.For(conditions.New(config.Client().Resources()).ResourceDeleted(cniDS), wait.WithTimeout(time.Minute*5))
	if err != nil {
		return errors.Wrap(err, "Daemonset could not be deleted")
	}
	return nil
}

// getLatestCNIAddon returns the first addon version that DescribeAddonVersions
// reports for addonName and the package-level kubernetesVersion. It returns an
// error when the API call fails or no version is listed.
func getLatestCNIAddon(ctx context.Context, eksclient *eks.Client) (string, error) {
	addonVersions, err := eksclient.DescribeAddonVersions(ctx, &eks.DescribeAddonVersionsInput{
		AddonName:         aws.String(addonName),
		KubernetesVersion: aws.String(kubernetesVersion),
	})
	if err != nil {
		return "", err
	}
	// Guard every level: an addon entry with an empty AddonVersions slice (or a
	// nil version string) previously caused an index-out-of-range / nil panic.
	for _, addon := range addonVersions.Addons {
		for _, version := range addon.AddonVersions {
			if version.AddonVersion != nil {
				return *version.AddonVersion, nil
			}
		}
	}
	return "", errors.Errorf("Addon versions not available")
}

// installCNIAddon deletes any existing kube-system/aws-node DaemonSet, creates
// the vpc-cni addon with the given version and configuration values
// (overwriting conflicts), and waits up to five minutes for the DaemonSet to
// become ready.
func installCNIAddon(ctx context.Context, config *envconf.Config, eksclient *eks.Client, addonVersion string, configurationValues string) error {
	// Delete old Daemonset if exists
	cniDS := &appsv1.DaemonSet{ObjectMeta: metav1.ObjectMeta{Name: "aws-node", Namespace: "kube-system"}}
	config.Client().Resources().Delete(ctx, cniDS)

	_, err := eksclient.CreateAddon(ctx, &eks.CreateAddonInput{
		AddonName:           aws.String(addonName),
		ClusterName:         aws.String(clusterName),
		AddonVersion:        aws.String(addonVersion),
		ConfigurationValues: aws.String(configurationValues),
		ResolveConflicts:    types.ResolveConflictsOverwrite,
	})
	if err != nil {
		return errors.Wrap(err, "Failed to create addon")
	}
	err = wait.For(fwext.NewConditionExtension(config.Client().Resources()).DaemonSetReady(cniDS), wait.WithTimeout(time.Minute*5))
	if err != nil {
		return errors.Wrap(err, "Daemonset failed to reach running state")
	}
	return nil
}

// getClusterVersion returns the Kubernetes version reported by DescribeCluster
// for clusterName.
func getClusterVersion(ctx context.Context, eksclient *eks.Client) (string, error) {
	cluster, err := eksclient.DescribeCluster(ctx, &eks.DescribeClusterInput{
		Name: aws.String(clusterName),
	})
	if err != nil {
		return "", err
	}
	// Both fields are pointers in the SDK response; avoid a nil dereference.
	if cluster.Cluster == nil || cluster.Cluster.Version == nil {
		return "", errors.Errorf("Cluster %s did not report a Kubernetes version", clusterName)
	}
	return *cluster.Cluster.Version, nil
}

func createNamespace(name string, client klient.Client, ctx context.Context) error {
	ns := &v1.Namespace{
		ObjectMeta: metav1.ObjectMeta{
			Name: name,
			Namespace: 
name, Labels: map[string]string{"ns": name}, }, } if err := client.Resources().Create(ctx, ns); err != nil { return err } return nil } func createServerAndService(namespace string, name string, replicas int32, client klient.Client, ctx context.Context) error { labels := map[string]string{"app": name} service := &v1.Service{ ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: namespace}, Spec: v1.ServiceSpec{ Ports: []v1.ServicePort{{Name: name, Protocol: "TCP", Port: 80}}, Selector: labels, }, } if err := client.Resources().Create(ctx, service); err != nil { return err } deploy := &appsv1.Deployment{ ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: namespace}, Spec: appsv1.DeploymentSpec{ Replicas: &replicas, Selector: &metav1.LabelSelector{ MatchLabels: labels, }, Template: corev1.PodTemplateSpec{ ObjectMeta: metav1.ObjectMeta{Labels: labels}, Spec: corev1.PodSpec{Containers: []corev1.Container{{Name: name, Image: "nginx"}}}, }, }, } if err := client.Resources().Create(ctx, deploy); err != nil { return err } err := wait.For(conditions.New(client.Resources()).DeploymentConditionMatch(deploy, appsv1.DeploymentAvailable, v1.ConditionTrue), wait.WithTimeout(time.Minute*5)) if err != nil { return err } return nil } ================================================ FILE: test/cases/netpol/np_test.go ================================================ //go:build e2e package netpol import ( "bytes" "context" "log" "strings" "testing" "time" corev1 "k8s.io/api/core/v1" networking "k8s.io/api/networking/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/intstr" "sigs.k8s.io/e2e-framework/pkg/envconf" "sigs.k8s.io/e2e-framework/pkg/features" ) func TestNetworkPolicyCases(t *testing.T) { protocolTCP := corev1.ProtocolTCP protocolUDP := corev1.ProtocolUDP networkPolicy := networking.NetworkPolicy{ ObjectMeta: metav1.ObjectMeta{Name: "block-c-to-a", Namespace: "a"}, Spec: networking.NetworkPolicySpec{ PodSelector: metav1.LabelSelector{MatchLabels: 
map[string]string{"app": "a-server"}}, PolicyTypes: []networking.PolicyType{networking.PolicyTypeIngress, networking.PolicyTypeEgress}, Ingress: []networking.NetworkPolicyIngressRule{ { From: []networking.NetworkPolicyPeer{ { PodSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "b-server"}}, NamespaceSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"ns": "b"}}, }, }, Ports: []networking.NetworkPolicyPort{ { Protocol: &protocolTCP, Port: &intstr.IntOrString{IntVal: 80}, }, }, }, }, Egress: []networking.NetworkPolicyEgressRule{ { To: []networking.NetworkPolicyPeer{ { PodSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "b-server"}}, NamespaceSelector: &metav1.LabelSelector{MatchLabels: map[string]string{"ns": "b"}}, }, }, Ports: []networking.NetworkPolicyPort{ { Protocol: &protocolTCP, Port: &intstr.IntOrString{IntVal: 80}, }, }, }, { Ports: []networking.NetworkPolicyPort{ { Protocol: &protocolUDP, Port: &intstr.IntOrString{IntVal: 53}, }, }, }, }, }, } allowAll := features.New("allowAll"). WithLabel("suite", "netpol"). WithLabel("policy", "none"). Assess("curl from A to B succeeds", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { client, err := cfg.NewClient() if err != nil { return ctx } pods := &corev1.PodList{} namespace := "a" containerName := "a-server" err = client.Resources("a").List(context.TODO(), pods) if err != nil || pods.Items == nil { t.Error("error while getting pods", err) } podName := pods.Items[0].Name var stdout, stderr bytes.Buffer command := []string{"curl", "-m", "2", "-I", "http://b-server.b:80"} client.Resources().ExecInPod(context.TODO(), namespace, podName, containerName, command, &stdout, &stderr) httpStatus := strings.Split(stdout.String(), "\n")[0] if !strings.Contains(httpStatus, "200") { t.Fatal("Couldn't connect to server B") } return ctx }). 
Assess("curl from C to A succeeds", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { client, err := cfg.NewClient() if err != nil { return ctx } namespace := "c" containerName := "c-server" pods := &corev1.PodList{} err = client.Resources("c").List(context.TODO(), pods) if err != nil || pods.Items == nil { t.Error("error while getting pods", err) } podName := pods.Items[0].Name var stdout, stderr bytes.Buffer command := []string{"curl", "-m", "2", "-I", "http://a-server.a:80"} client.Resources().ExecInPod(context.TODO(), namespace, podName, containerName, command, &stdout, &stderr) httpStatus := strings.Split(stdout.String(), "\n")[0] if !strings.Contains(httpStatus, "200") { t.Fatal("Couldn't connect to server A") } return ctx }). Feature() blockCToA := features.New("blockCToA"). WithLabel("suite", "netpol"). Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { client, err := cfg.NewClient() if err != nil { return ctx } log.Print("Applying Network Policy") if err := client.Resources().Create(ctx, &networkPolicy); err != nil { t.Error("error while applying Network Policy", err) return ctx } // This time-wait is to account for Network Policy Controller to start up, run leader election in the control plane // and to apply the network policy time.Sleep(1 * time.Minute) return ctx }). 
Assess("curl from A to B succeeds", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { client, err := cfg.NewClient() if err != nil { return ctx } pods := &corev1.PodList{} namespace := "a" containerName := "a-server" err = client.Resources("a").List(context.TODO(), pods) if err != nil || pods.Items == nil { t.Error("error while getting pods", err) } podName := pods.Items[0].Name var stdout, stderr bytes.Buffer command := []string{"curl", "-m", "2", "-I", "http://b-server.b:80"} client.Resources().ExecInPod(context.TODO(), namespace, podName, containerName, command, &stdout, &stderr) httpStatus := strings.Split(stdout.String(), "\n")[0] if !strings.Contains(httpStatus, "200") { t.Fatal("Couldn't connect to server B") } return ctx }). Assess("curl from C to A fails", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { client, err := cfg.NewClient() if err != nil { return ctx } namespace := "c" containerName := "c-server" pods := &corev1.PodList{} err = client.Resources("c").List(context.TODO(), pods) if err != nil || pods.Items == nil { t.Error("error while getting pods", err) } podName := pods.Items[0].Name var stdout, stderr bytes.Buffer command := []string{"curl", "-m", "2", "-I", "http://a-server.a:80"} client.Resources().ExecInPod(context.TODO(), namespace, podName, containerName, command, &stdout, &stderr) httpStatus := strings.Split(stdout.String(), "\n")[0] if strings.Contains(httpStatus, "200") { t.Fatal("Network Policy didn't block connection to server A") } return ctx }). Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { client, err := cfg.NewClient() if err != nil { return ctx } if err := client.Resources().Delete(ctx, &networkPolicy); err != nil { t.Error("error while deleting Network Policy", err) return ctx } return ctx }). 
Feature() testenv.Test(t, allowAll, blockCToA) } ================================================ FILE: test/cases/neuron/main_test.go ================================================ //go:build e2e package neuron import ( "context" _ "embed" "flag" "fmt" "log" "os" "os/signal" "slices" "testing" "time" "github.com/aws/aws-k8s-tester/internal/e2e" fwext "github.com/aws/aws-k8s-tester/internal/e2e" "github.com/aws/aws-k8s-tester/test/manifests" "github.com/aws/aws-sdk-go-v2/aws" appsv1 "k8s.io/api/apps/v1" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" "sigs.k8s.io/e2e-framework/klient/wait" "sigs.k8s.io/e2e-framework/klient/wait/conditions" "sigs.k8s.io/e2e-framework/pkg/env" "sigs.k8s.io/e2e-framework/pkg/envconf" ) var ( testenv env.Environment nodeType *string efaEnabled *bool nodeCount int neuronPerNode int neuronCorePerNode int efaPerNode int neuronTestImage *string installDevicePlugin *bool ) func deployNeuronDevicePlugin(ctx context.Context, config *envconf.Config) (context.Context, error) { ds := appsv1.DaemonSet{ ObjectMeta: metav1.ObjectMeta{Name: "neuron-device-plugin-daemonset", Namespace: "kube-system"}, } err := wait.For(fwext.NewConditionExtension(config.Client().Resources()).DaemonSetReady(&ds), wait.WithContext(ctx)) if err != nil { return ctx, err } return ctx, nil } func deployMPIOperator(ctx context.Context, config *envconf.Config) (context.Context, error) { dep := appsv1.Deployment{ ObjectMeta: metav1.ObjectMeta{Name: "mpi-operator", Namespace: "mpi-operator"}, } err := wait.For(conditions.New(config.Client().Resources()).DeploymentConditionMatch(&dep, appsv1.DeploymentAvailable, v1.ConditionTrue), wait.WithContext(ctx)) if err != nil { return ctx, fmt.Errorf("failed to deploy mpi-operator: %v", err) } return ctx, nil } func deployEFAPlugin(ctx context.Context, config *envconf.Config) (context.Context, error) { err := fwext.ApplyManifests(config.Client().RESTConfig(), 
manifests.EfaDevicePluginManifest)
	if err != nil {
		return ctx, err
	}
	ds := appsv1.DaemonSet{
		ObjectMeta: metav1.ObjectMeta{Name: "aws-efa-k8s-device-plugin-daemonset", Namespace: "kube-system"},
	}
	err = wait.For(fwext.NewConditionExtension(config.Client().Resources()).DaemonSetReady(&ds),
		wait.WithContext(ctx))
	if err != nil {
		return ctx, fmt.Errorf("failed to deploy efa-device-plugin: %v", err)
	}
	return ctx, nil
}

// checkNodeTypes inspects the cluster's nodes and fills in the package-level
// nodeCount, neuronPerNode, neuronCorePerNode and efaPerNode values.
//
// Only nodes whose "node.kubernetes.io/instance-type" label equals *nodeType
// are counted; when *nodeType is empty it defaults to the instance type of the
// first listed node. EFA capacity is only read when *efaEnabled is set. Each
// per-node value is the integer average over the matching nodes.
//
// It returns the incoming ctx in every case, together with an error when the
// client cannot be built, nodes cannot be listed, a required resource capacity
// cannot be read, no matching node exists, or ctx is cancelled while waiting.
func checkNodeTypes(ctx context.Context, config *envconf.Config) (context.Context, error) {
	// Give node info time to populate, but stop waiting as soon as ctx is
	// cancelled (e.g. on interrupt) instead of sleeping out the full minute.
	settle := time.NewTimer(time.Minute)
	defer settle.Stop()
	select {
	case <-ctx.Done():
		return ctx, ctx.Err()
	case <-settle.C:
	}

	clientset, err := kubernetes.NewForConfig(config.Client().RESTConfig())
	if err != nil {
		return ctx, fmt.Errorf("failed to create Kubernetes client: %w", err)
	}
	nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
	if err != nil {
		return ctx, fmt.Errorf("failed to list nodes: %w", err)
	}
	if len(nodes.Items) == 0 {
		return ctx, fmt.Errorf("no nodes found in the cluster")
	}

	var totalEfaCount, totalNeuronCoreCount, totalNeuronCount int
	if *nodeType == "" {
		nodeType = aws.String(nodes.Items[0].Labels["node.kubernetes.io/instance-type"])
		log.Printf("No node type specified. Using the node type %s in the node groups.", *nodeType)
	}

	// Error paths below return ctx (not nil) like every other env.Func in this
	// file, so a caller never receives a nil context.Context.
	for i := range nodes.Items {
		node := &nodes.Items[i]
		if node.Labels["node.kubernetes.io/instance-type"] != *nodeType {
			continue
		}

		neuron, err := e2e.GetNonZeroResourceCapacity(node, "aws.amazon.com/neuron")
		if err != nil {
			return ctx, err
		}
		totalNeuronCount += neuron

		// Check for NeuronCore capacity
		neuronCore, err := e2e.GetNonZeroResourceCapacity(node, "aws.amazon.com/neuroncore")
		if err != nil {
			return ctx, err
		}
		totalNeuronCoreCount += neuronCore

		// Check for EFA capacity
		if *efaEnabled {
			efa, err := e2e.GetNonZeroResourceCapacity(node, "vpc.amazonaws.com/efa")
			if err != nil {
				return ctx, err
			}
			totalEfaCount += efa
		}
		nodeCount += 1
	}

	if nodeCount == 0 {
		return ctx, fmt.Errorf("no nodes of type %q found", *nodeType)
	}

	// Update global capacities
	neuronPerNode = totalNeuronCount / nodeCount
	neuronCorePerNode = totalNeuronCoreCount / nodeCount
	efaPerNode = totalEfaCount / nodeCount

	log.Printf("[INFO] Total Nodes: %d", nodeCount)
	log.Printf("[INFO] Total Neuron Count: %d, Neuron Per Node: %d", totalNeuronCount, neuronPerNode)
	log.Printf("[INFO] Total Neuron Core Count: %d, Neuron Core Per Node: %d", totalNeuronCoreCount, neuronCorePerNode)
	log.Printf("[INFO] Total EFA Count: %d, EFA Per Node: %d", totalEfaCount, efaPerNode)
	return ctx, nil
}

func TestMain(m *testing.M) {
	nodeType = flag.String("nodeType", "", "node type for the tests")
	efaEnabled = flag.Bool("efaEnabled", false, "enable efa tests")
	neuronTestImage = flag.String("neuronTestImage", "", "image for neuron single node test")
	installDevicePlugin = flag.Bool("installDevicePlugin", true, "install neuron device plugin")
	cfg, err := envconf.NewFromFlags()
	if err != nil {
		log.Fatalf("failed to initialize test environment: %v", err)
	}
	testenv = env.NewWithConfig(cfg)
	ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt)
	defer cancel()
	testenv = testenv.WithContext(ctx)

	deploymentManifests := [][]byte{
		manifests.MpiOperatorManifest,
	}
setUpFunctions := []env.Func{ func(ctx context.Context, config *envconf.Config) (context.Context, error) { err := fwext.ApplyManifests(config.Client().RESTConfig(), deploymentManifests...) if err != nil { return ctx, err } return ctx, nil }, deployMPIOperator, } if *installDevicePlugin { deploymentManifests = append(deploymentManifests, manifests.NeuronDevicePluginManifest, manifests.NeuronDevicePluginRbacManifest) setUpFunctions = append(setUpFunctions, deployNeuronDevicePlugin) } if *efaEnabled { setUpFunctions = append(setUpFunctions, deployEFAPlugin) } setUpFunctions = append(setUpFunctions, checkNodeTypes) testenv.Setup(setUpFunctions...) testenv.Finish( func(ctx context.Context, config *envconf.Config) (context.Context, error) { err := fwext.DeleteManifests(cfg.Client().RESTConfig(), manifests.EfaDevicePluginManifest) if err != nil { return ctx, err } slices.Reverse(deploymentManifests) err = fwext.DeleteManifests(config.Client().RESTConfig(), deploymentManifests...) if err != nil { return ctx, err } return ctx, nil }, ) os.Exit(testenv.Run(m)) } ================================================ FILE: test/cases/neuron/manifests/multi-node-test-neuron.yaml ================================================ apiVersion: kubeflow.org/v2beta1 kind: MPIJob metadata: name: multi-node-nccom-test spec: slotsPerWorker: {{.NeuronPerNode}} runPolicy: backoffLimit: 20 cleanPodPolicy: Running mpiReplicaSpecs: Launcher: replicas: 1 template: spec: restartPolicy: OnFailure containers: - image: {{.NeuronTestImage}} imagePullPolicy: Always name: nccom-test-launcher env: - name: POD_IP valueFrom: fieldRef: fieldPath: status.podIP command: - /bin/bash args: - -c - | WORKER_IPS=() for i in $(seq 0 $(({{.WorkerNodeCount}} - 1))); do WORKER_IP=$(getent hosts multi-node-nccom-test-worker-$i.multi-node-nccom-test | awk '{print $1}') WORKER_IPS+=("$WORKER_IP") done export CCOM_SOCKET_IFNAME=eth0 export NEURON_RT_ROOT_COMM_ID=${WORKER_IPS[0]}:63182 nccom-test -r 
$(({{.NeuronCorePerNode}}*{{.WorkerNodeCount}})) -N {{.WorkerNodeCount}} -b "8" -e "2G" -f "2" -n "5" -w "5" -d "fp32" allr --hosts ${WORKER_IPS[*]} --data-collector-host $POD_IP --data-collector-port 60006 --debug Worker: replicas: {{.WorkerNodeCount}} template: spec: securityContext: runAsUser: 1000 runAsGroup: 2000 fsGroup: 3000 containers: - image: {{.NeuronTestImage}} name: nccom-test-worker command: ["/bin/bash"] args: ["-c", "echo password | sudo -S /usr/sbin/sshd -D"] imagePullPolicy: Always resources: limits: aws.amazon.com/neuron: {{.NeuronPerNode}} aws.amazon.com/neuroncore: {{.NeuronCorePerNode}} vpc.amazonaws.com/efa: {{.EfaInterfacePerNode}} requests: aws.amazon.com/neuron: {{.NeuronPerNode}} aws.amazon.com/neuroncore: {{.NeuronCorePerNode}} vpc.amazonaws.com/efa: {{.EfaInterfacePerNode}} ================================================ FILE: test/cases/neuron/manifests/single-node-test-neuronx.yaml ================================================ kind: Job apiVersion: batch/v1 metadata: name: neuronx-single-node labels: app: neuronx-single-node spec: template: metadata: labels: app: neuronx-single-node spec: containers: - name: neuronx-single-node-test image: {{.NeuronTestImage}} command: - /bin/bash - ./tests/singleNodeTest.sh imagePullPolicy: Always resources: limits: cpu: "4" memory: 4Gi aws.amazon.com/neuron: "1" requests: cpu: "1" memory: 1Gi aws.amazon.com/neuron: "1" restartPolicy: Never securityContext: runAsUser: 1000 runAsGroup: 2000 fsGroup: 3000 backoffLimit: 4 ================================================ FILE: test/cases/neuron/neuron_test.go ================================================ //go:build e2e package neuron import ( "context" _ "embed" "fmt" "testing" "time" fwext "github.com/aws/aws-k8s-tester/internal/e2e" "github.com/aws/aws-k8s-tester/internal/e2e/mpijobs" "sigs.k8s.io/e2e-framework/klient/wait" "sigs.k8s.io/e2e-framework/klient/wait/conditions" "sigs.k8s.io/e2e-framework/pkg/envconf" 
"sigs.k8s.io/e2e-framework/pkg/features" batchv1 "k8s.io/api/batch/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) var ( //go:embed manifests/single-node-test-neuronx.yaml neuronSingleNodeManifest []byte //go:embed manifests/multi-node-test-neuron.yaml neuronMultiNodeManifest []byte renderedNeuronSingleNodeManifest []byte renderedNeuronMultiNodeManifest []byte ) type neuronSingleNodeManifestTplVars struct { NeuronTestImage string } type neuronMultiNodeTestManifestTplVars struct { WorkerNodeCount int WorkerNodeNeuronCount int NeuronPerNode int NeuronCorePerNode int NeuronTestImage string EfaInterfacePerNode int } func TestNeuronNodes(t *testing.T) { singleNode := features.New("single-node"). WithLabel("suite", "neuron"). WithLabel("hardware", "neuron"). Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { if *neuronTestImage == "" { t.Fatal(fmt.Errorf("neuronTestImage must be set to run neuron single node test, use https://github.com/aws/aws-k8s-tester/blob/main/test/images/neuron/Dockerfile to build the image and -neuronTestImage to set the image url")) } var err error renderedNeuronSingleNodeManifest, err = fwext.RenderManifests(neuronSingleNodeManifest, neuronSingleNodeManifestTplVars{ NeuronTestImage: *neuronTestImage, }) if err != nil { t.Fatal(err) } t.Log("Applying single node manifest") err = fwext.ApplyManifests(cfg.Client().RESTConfig(), renderedNeuronSingleNodeManifest) if err != nil { t.Fatal(err) } t.Log("Manifest applied successfully") return ctx }). Assess("Single node test Job succeeds", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { job := &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{Name: "neuronx-single-node", Namespace: "default"}, } t.Log("Waiting for single node job to complete") err := wait.For(fwext.NewConditionExtension(cfg.Client().Resources()).JobSucceeded(job), wait.WithContext(ctx), wait.WithTimeout(time.Minute*20), ) if err != nil { t.Fatal(err) } return ctx }). 
Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { log, err := fwext.GetJobLogs(cfg.Client().RESTConfig(), &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{Name: "neuronx-single-node", Namespace: "default"}, }) if err != nil { t.Error(err) } else { t.Log("Test log for neuronx-single-node:") t.Log(log) } err = fwext.DeleteManifests(cfg.Client().RESTConfig(), renderedNeuronSingleNodeManifest) if err != nil { t.Error(err) } return ctx }). Feature() multiNode := features.New("multi-node"). WithLabel("suite", "neuron"). WithLabel("hardware", "neuron"). WithLabel("hardware", "efa"). Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { if *neuronTestImage == "" { t.Fatal(fmt.Errorf("neuronTestImage must be set to run unit test, use https://github.com/aws/aws-k8s-tester/blob/main/test/images/neuron/Dockerfile to build the image and -neuronTestImage to set the image url")) } renderedNeuronMultiNodeManifest, err := fwext.RenderManifests(neuronMultiNodeManifest, neuronMultiNodeTestManifestTplVars{ // one of the nodes will be used for the master pod WorkerNodeCount: nodeCount, WorkerNodeNeuronCount: nodeCount * neuronPerNode, NeuronPerNode: neuronPerNode, NeuronCorePerNode: neuronCorePerNode, NeuronTestImage: *neuronTestImage, EfaInterfacePerNode: efaPerNode, }) if err != nil { t.Fatal(err) } t.Log("Applying multi node manifest") err = fwext.ApplyManifests(cfg.Client().RESTConfig(), renderedNeuronMultiNodeManifest) if err != nil { t.Fatal(err) } t.Log("Applied manifest successfully") return ctx }). 
Assess("NCCOM test succeeds", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { mpiJob := mpijobs.NewUnstructured("multi-node-nccom-test", "default") ctx = context.WithValue(ctx, "mpiJob", mpiJob) t.Log("Waiting for MPIJob to complete") err := wait.For(conditions.New(cfg.Client().Resources()).ResourceMatch(mpiJob, mpijobs.MPIJobSucceeded), wait.WithContext(ctx), wait.WithTimeout(time.Minute*30), ) if err != nil { t.Fatal(err) } log, err := fwext.GetJobLogs(cfg.Client().RESTConfig(), mpiJob) if err != nil { t.Fatal(err) } t.Log("Test log for multi-node-nccom-test:") t.Log(log) return ctx }). Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { err := fwext.DeleteManifests(cfg.Client().RESTConfig(), renderedNeuronMultiNodeManifest) if err != nil { t.Fatal(err) } return ctx }). Feature() testenv.Test(t, singleNode, multiNode) } ================================================ FILE: test/cases/neuron-dra/main_test.go ================================================ //go:build e2e package neuron_dra import ( "context" "embed" "flag" "fmt" "log" "os" "os/exec" "os/signal" "path/filepath" "slices" "strings" "sync" "testing" "time" fwext "github.com/aws/aws-k8s-tester/internal/e2e" "github.com/aws/aws-k8s-tester/test/common" "github.com/aws/aws-k8s-tester/test/manifests" "golang.org/x/sync/errgroup" appsv1 "k8s.io/api/apps/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" "sigs.k8s.io/e2e-framework/klient/wait" "sigs.k8s.io/e2e-framework/pkg/env" "sigs.k8s.io/e2e-framework/pkg/envconf" ) //go:embed rcts var rctsFS embed.FS var ( testenv env.Environment clientset kubernetes.Interface nodeType *string rdmaDeviceDraDriverImage *string acceleratorDraDriverImage *string containerTestImage *string nodeCount int ) // supportedRdmaTypes lists the recognized RDMA device types. 
var supportedRdmaTypes = []string{"efa"} func validateConfig() error { if err := common.ValidateRequiredFlags(map[string]string{ "rdmaDeviceDraDriverImage": *rdmaDeviceDraDriverImage, "containerTestImage": *containerTestImage, "nodeType": *nodeType, }); err != nil { return err } // Validate that nodeType maps to a known topology (and thus a known RDMA type) topo, err := GetTopologyForNodeType(*nodeType) if err != nil { return fmt.Errorf("invalid -nodeType: %w", err) } if !slices.Contains(supportedRdmaTypes, topo.RdmaType) { return fmt.Errorf("instance family %q has unsupported RDMA type %q; supported: %v", topo.Family, topo.RdmaType, supportedRdmaTypes) } // Verify helm is available on the PATH. if _, err := exec.LookPath("helm"); err != nil { return fmt.Errorf("helm is required but not found on PATH: %w", err) } return nil } const ( neuronHelmReleaseName = "neuron-helm-chart" neuronHelmChartOCI = "oci://public.ecr.aws/neuron/neuron-helm-chart" neuronDRANamespace = "neuron-dra-driver" ) // installNeuronDRADriverHelm installs the Neuron DRA driver via the public Helm chart. // If acceleratorDraDriverImage is non-empty, it splits on the last ":" to extract // repository and tag and passes them as --set overrides. func installNeuronDRADriverHelm(ctx context.Context, config *envconf.Config) (context.Context, error) { args := []string{ "upgrade", "--install", neuronHelmReleaseName, neuronHelmChartOCI, "--namespace", neuronDRANamespace, "--create-namespace", "--set", "devicePlugin.enabled=false", "--set", "npd.enabled=false", "--set", "draDriver.enabled=true", "--wait", "--timeout", "5m", } if *acceleratorDraDriverImage != "" { repo, tag := common.SplitImageRepoTag(*acceleratorDraDriverImage) args = append(args, "--set", fmt.Sprintf("draDriver.image.repository=%s", repo), "--set", fmt.Sprintf("draDriver.image.tag=%s", tag), ) } log.Printf("[INFO] Installing Neuron DRA driver via Helm: helm %s", strings.Join(args, " ")) cmd := exec.CommandContext(ctx, "helm", args...) 
cmd.Stdout = os.Stdout cmd.Stderr = os.Stderr if err := cmd.Run(); err != nil { return ctx, fmt.Errorf("helm install neuron-dra-driver failed: %w", err) } log.Println("Neuron DRA driver Helm release installed successfully.") return ctx, nil } // uninstallNeuronDRADriverHelm uninstalls the Neuron DRA driver Helm release. func uninstallNeuronDRADriverHelm(ctx context.Context, config *envconf.Config) (context.Context, error) { args := []string{ "uninstall", neuronHelmReleaseName, "--namespace", neuronDRANamespace, } log.Printf("[INFO] Uninstalling Neuron DRA driver Helm release: helm %s", strings.Join(args, " ")) cmd := exec.CommandContext(ctx, "helm", args...) cmd.Stdout = os.Stdout cmd.Stderr = os.Stderr if err := cmd.Run(); err != nil { log.Printf("[WARN] helm uninstall neuron-dra-driver failed (may already be removed): %v", err) } return ctx, nil } func deployNeuronDRADriver(ctx context.Context, config *envconf.Config) (context.Context, error) { ds := appsv1.DaemonSet{ ObjectMeta: metav1.ObjectMeta{Name: "neuron-dra-driver-kubelet-plugin", Namespace: neuronDRANamespace}, } err := wait.For( fwext.NewConditionExtension(config.Client().Resources()).DaemonSetReady(&ds), wait.WithTimeout(5*time.Minute), wait.WithContext(ctx), ) if err != nil { return ctx, fmt.Errorf("neuron-dra-driver daemonset is not ready: %w", err) } log.Println("neuron-dra-driver daemonset is ready.") return ctx, nil } func TestMain(m *testing.M) { nodeType = flag.String("nodeType", "", "instance type for the cluster (e.g. 
trn1.32xlarge)") rdmaDeviceDraDriverImage = flag.String("rdmaDeviceDraDriverImage", "", "container image for the dranet DRA driver") acceleratorDraDriverImage = flag.String("acceleratorDraDriverImage", "", "container image for the Neuron DRA driver") containerTestImage = flag.String("containerTestImage", "", "container image for the nccom test workload") cfg, err := envconf.NewFromFlags() if err != nil { log.Fatalf("failed to initialize test environment: %v", err) } if err := validateConfig(); err != nil { log.Fatalf("invalid configuration: %v", err) } ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt) defer cancel() testenv = env.NewWithConfig(cfg).WithContext(ctx) // Build the manifest list and setup functions dynamically. // Resolve topology to determine RDMA type from nodeType. topo, err := GetTopologyForNodeType(*nodeType) if err != nil { log.Fatalf("failed to resolve topology: %v", err) } manifestsList := [][]byte{ manifests.MpiOperatorManifest, } setUpFunctions := []env.Func{ // Run independent setup steps concurrently. func(ctx context.Context, config *envconf.Config) (context.Context, error) { var mu sync.Mutex g, gctx := errgroup.WithContext(ctx) // Deploy MPI operator. g.Go(func() error { return common.DeployMPIOperator(gctx, config) }) // Deploy dranet and RCTs based on topology's RDMA type. if topo.RdmaType == "efa" { rctManifests, err := common.LoadRCTManifests(rctsFS, filepath.Join("rcts", topo.RCTSubDir)) if err != nil { return ctx, fmt.Errorf("failed to load RCT manifests: %w", err) } mu.Lock() manifestsList = append(manifestsList, rctManifests...) mu.Unlock() g.Go(func() error { renderedDranet, err := common.DeployDranet(gctx, config, *rdmaDeviceDraDriverImage) if err != nil { return err } mu.Lock() manifestsList = append(manifestsList, renderedDranet) mu.Unlock() return nil }) g.Go(func() error { return fwext.ApplyManifests(config.Client().RESTConfig(), rctManifests...) }) } // Install Neuron DRA driver via Helm chart. 
g.Go(func() error { _, err := installNeuronDRADriverHelm(gctx, config) return err }) if err := g.Wait(); err != nil { return ctx, err } return ctx, nil }, deployNeuronDRADriver, func(ctx context.Context, config *envconf.Config) (context.Context, error) { var err error clientset, err = kubernetes.NewForConfig(config.Client().RESTConfig()) if err != nil { return ctx, err } nodeCount, err = common.CountNodesByType(ctx, clientset, *nodeType) return ctx, err }, } testenv.Setup(setUpFunctions...) testenv.Finish( func(ctx context.Context, config *envconf.Config) (context.Context, error) { // Uninstall Neuron DRA driver Helm release first. ctx, _ = uninstallNeuronDRADriverHelm(ctx, config) // Delete remaining manifests in reverse order. slices.Reverse(manifestsList) if err := fwext.DeleteManifests(config.Client().RESTConfig(), manifestsList...); err != nil { return ctx, fmt.Errorf("failed to delete manifests: %w", err) } return ctx, nil }, ) os.Exit(testenv.Run(m)) } ================================================ FILE: test/cases/neuron-dra/neuron_dra_test.go ================================================ //go:build e2e package neuron_dra import ( "embed" "path/filepath" "testing" "github.com/aws/aws-k8s-tester/test/common" ) //go:embed testcases var embeddedTestCases embed.FS func TestNeuronDRAMultiNode(t *testing.T) { topo, err := GetTopologyForNodeType(*nodeType) if err != nil { t.Fatalf("resolving topology for %s: %v", *nodeType, err) } rctDir := filepath.Join("rcts", topo.RCTSubDir) rctIndex, err := common.LoadRCTIndex(rctsFS, rctDir) if err != nil { t.Fatalf("loading RCT index from %s: %v", rctDir, err) } tcDir := filepath.Join("testcases", topo.TestCaseSubDir) featureList, err := common.DiscoverAndBuildFeatures( embeddedTestCases, tcDir, rctIndex, "neuron-dra", "multi-node-nccom-test", nodeCount, func(tc *common.TestCaseSpec, rctIndex map[string]*common.ResourceClaimTemplateSpec) ([]byte, error) { params, err := ComputeMPIJobParamsFromTestCase(tc, rctIndex, 
topo, nodeCount, *containerTestImage) if err != nil { return nil, err } return RenderMPIJobYAML(*params) }, clientset, ) if err != nil { t.Fatalf("discovering and building features: %v", err) } if len(featureList) == 0 { t.Logf("no test cases found under %s, skipping", tcDir) return } testenv.Test(t, featureList...) } ================================================ FILE: test/cases/neuron-dra/rcts/trn1/rct-2-efas-4-neurons-wrong-match.yaml ================================================ apiVersion: resource.k8s.io/v1beta1 kind: ResourceClaimTemplate metadata: namespace: default name: rct-2-efas-4-neurons-wrong-match spec: spec: devices: requests: - name: 4-neurons deviceClassName: neuron.aws.com allocationMode: ExactCount count: 4 - name: 2-efas deviceClassName: efa.networking.k8s.aws allocationMode: ExactCount count: 2 constraints: - requests: ["4-neurons", "2-efas"] matchAttribute: "resource.aws.com/devicegroup1_id" ================================================ FILE: test/cases/neuron-dra/rcts/trn1/rct-all-efas-all-neurons.yaml ================================================ apiVersion: resource.k8s.io/v1beta1 kind: ResourceClaimTemplate metadata: namespace: default name: rct-all-efas-all-neurons spec: spec: devices: requests: - name: all-neurons deviceClassName: neuron.aws.com allocationMode: All - name: all-efas deviceClassName: efa.networking.k8s.aws allocationMode: All ================================================ FILE: test/cases/neuron-dra/templates/nccom-test-mpijob.yaml.tmpl ================================================ apiVersion: kubeflow.org/v2beta1 kind: MPIJob metadata: name: multi-node-nccom-test spec: slotsPerWorker: {{.SlotsPerWorker}} runPolicy: backoffLimit: 20 cleanPodPolicy: Running mpiReplicaSpecs: Launcher: replicas: 1 template: spec: restartPolicy: OnFailure containers: - name: nccom-test-launcher image: {{.ContainerTestImage}} imagePullPolicy: Always env: - name: POD_IP valueFrom: fieldRef: fieldPath: status.podIP command: - 
/bin/bash - -lc args: - | set -euo pipefail WORKER_IPS=() for i in $(seq 0 $(({{.WorkerReplicas}} - 1))); do WORKER_IP=$(getent hosts multi-node-nccom-test-worker-$i.multi-node-nccom-test | awk '{print $1}') WORKER_IPS+=("$WORKER_IP") done export NCCOM_SOCKET_IFNAME=eth0 export NEURON_RT_ROOT_COMM_ID=${WORKER_IPS[0]}:63182 nccom-test \ -r {{.TotalRanks}} \ -N {{.WorkerReplicas}} \ -b 8 \ -e 2G \ -f 2 \ -n 5 \ -w 5 \ -d fp32 \ allr \ --hosts ${WORKER_IPS[*]} \ --data-collector-host "${POD_IP}" \ --data-collector-port 60006 \ --debug Worker: replicas: {{.WorkerReplicas}} template: spec: restartPolicy: OnFailure securityContext: runAsUser: 0 containers: - name: nccom-test-worker image: {{.ContainerTestImage}} imagePullPolicy: Always securityContext: capabilities: add: ["NET_ADMIN"] env: - name: FI_EFA_USE_DEVICE_RDMA value: "1" command: - /bin/bash - -lc args: - | set -euo pipefail MY_IP=$(hostname -i) ip addr add ${MY_IP}/16 dev eth0 label eth0:ccom ip route del 192.168.0.0/16 dev eth0 2>/dev/null || true /usr/sbin/sshd -D resources: claims: {{- range .ResourceClaims}} - name: {{.Name}} {{- end}} resourceClaims: {{- range .ResourceClaims}} - name: {{.Name}} resourceClaimTemplateName: {{.TemplateName}} {{- end}} ================================================ FILE: test/cases/neuron-dra/testcases/trn1/2-efas-4-neurons-wrong-match.yaml ================================================ expectFailure: true resourceClaims: - name: 2-efas-4-neurons-wrong-match resourceClaimTemplateName: rct-2-efas-4-neurons-wrong-match ================================================ FILE: test/cases/neuron-dra/testcases/trn1/all-efas-all-neurons.yaml ================================================ resourceClaims: - name: all-efas-all-neurons resourceClaimTemplateName: rct-all-efas-all-neurons ================================================ FILE: test/cases/neuron-dra/topology.go ================================================ package neuron_dra import ( "bytes" _ "embed" "fmt" 
"strings" "text/template" "github.com/aws/aws-k8s-tester/test/common" ) //go:embed templates/nccom-test-mpijob.yaml.tmpl var mpijobTemplate string // --------------------------------------------------------------------------- // Instance topology // --------------------------------------------------------------------------- // InstanceTopology describes the Neuron/EFA hardware topology for an instance family. type InstanceTopology struct { Family string NeuronCoresPerDevice int AllNeuronCount int RdmaType string // RDMA device type (e.g. "efa") RCTSubDir string // subdirectory under rcts/ TestCaseSubDir string // subdirectory under testcases/ } var instanceTopologies = map[string]InstanceTopology{ "trn1": { Family: "trn1", NeuronCoresPerDevice: 2, AllNeuronCount: 16, RdmaType: "efa", RCTSubDir: "trn1", TestCaseSubDir: "trn1", }, } // GetTopologyForNodeType returns the InstanceTopology for a given node type // (e.g. "trn1.32xlarge"). It extracts the family prefix before the first "." // and looks it up in the registry. func GetTopologyForNodeType(nodeType string) (*InstanceTopology, error) { family := common.ExtractFamily(nodeType) topo, ok := instanceTopologies[family] if !ok { return nil, fmt.Errorf("unsupported instance family %q (from %q); supported: %s", family, nodeType, supportedFamilies()) } return &topo, nil } func supportedFamilies() string { families := make([]string, 0, len(instanceTopologies)) for k := range instanceTopologies { families = append(families, k) } return strings.Join(families, ", ") } // --------------------------------------------------------------------------- // MPIJob rendering // --------------------------------------------------------------------------- // MPIJobParams holds all template parameters for rendering the MPIJob YAML. 
type MPIJobParams struct { SlotsPerWorker int TotalRanks int WorkerReplicas int ContainerTestImage string ResourceClaims []common.ResourceClaimRef } // RenderMPIJobYAML renders the embedded MPIJob Go template with the given params // and returns the resulting YAML bytes. func RenderMPIJobYAML(params MPIJobParams) ([]byte, error) { tmpl, err := template.New("mpijob").Parse(mpijobTemplate) if err != nil { return nil, fmt.Errorf("parsing MPIJob template: %w", err) } var buf bytes.Buffer if err := tmpl.Execute(&buf, params); err != nil { return nil, fmt.Errorf("rendering MPIJob template: %w", err) } return buf.Bytes(), nil } // --------------------------------------------------------------------------- // Neuron-specific helpers // --------------------------------------------------------------------------- // getNeuronCount returns the neuron device count from an RCT. // For AllocationMode "All" it returns the topology's AllNeuronCount; // otherwise it returns the explicit Count from the neuron request. func getNeuronCount(rct *common.ResourceClaimTemplateSpec, topo *InstanceTopology) int { for _, req := range rct.Spec.Spec.Devices.Requests { if req.DeviceClassName != "neuron.aws.com" { continue } if req.AllocationMode == "All" { return topo.AllNeuronCount } return req.Count } return 0 } // ComputeMPIJobParamsFromTestCase computes MPIJob parameters from a test case spec. // It resolves each claim's resourceClaimTemplateName against the RCT index to // get the neuron count, then calculates SlotsPerWorker and TotalRanks. 
func ComputeMPIJobParamsFromTestCase(tc *common.TestCaseSpec, rctIndex map[string]*common.ResourceClaimTemplateSpec, topo *InstanceTopology, workerReplicas int, containerTestImage string) (*MPIJobParams, error) { if topo == nil { return nil, fmt.Errorf("instance topology is required") } if workerReplicas <= 0 { return nil, fmt.Errorf("workerReplicas must be positive, got %d", workerReplicas) } if containerTestImage == "" { return nil, fmt.Errorf("containerTestImage is required") } totalNeurons := 0 var claims []common.ResourceClaimRef for _, tcClaim := range tc.ResourceClaims { rct, ok := rctIndex[tcClaim.ResourceClaimTemplateName] if !ok { return nil, fmt.Errorf("resource claim template %q not found in RCT index", tcClaim.ResourceClaimTemplateName) } totalNeurons += getNeuronCount(rct, topo) claims = append(claims, common.ResourceClaimRef{ Name: tcClaim.Name, TemplateName: tcClaim.ResourceClaimTemplateName, }) } slotsPerWorker := totalNeurons * topo.NeuronCoresPerDevice totalRanks := slotsPerWorker * workerReplicas return &MPIJobParams{ SlotsPerWorker: slotsPerWorker, TotalRanks: totalRanks, WorkerReplicas: workerReplicas, ContainerTestImage: containerTestImage, ResourceClaims: claims, }, nil } ================================================ FILE: test/cases/neuron-inference/bert_inference_test.go ================================================ //go:build e2e package inference import ( "context" _ "embed" "fmt" "io" "log" "testing" "time" fwext "github.com/aws/aws-k8s-tester/internal/e2e" batchv1 "k8s.io/api/batch/v1" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" "sigs.k8s.io/e2e-framework/klient/wait" "sigs.k8s.io/e2e-framework/pkg/envconf" "sigs.k8s.io/e2e-framework/pkg/features" ) //go:embed manifests/neuron-bert-inference.yaml var neuronBertInferenceManifest []byte var renderedManifest []byte func TestNeuronInference(t *testing.T) { if *bertInferenceImage == "" { 
t.Fatal("bertInferenceImage must be set to run the test") } log.Printf("[INFO] Using nodeType=%s, inferenceMode=%s", *nodeType, *inferenceMode) log.Printf("[INFO] Discovered neuronPerNode=%d, neuronCorePerNode=%d", neuronPerNode, neuronCorePerNode) renderVars := map[string]string{ "BertInferenceImage": *bertInferenceImage, "NodeType": *nodeType, // e.g. "inf2.xlarge" "InferenceMode": *inferenceMode, // "throughput" or "latency" "NeuronPerNode": fmt.Sprintf("%d", neuronPerNode), "NeuronCorePerNode": fmt.Sprintf("%d", neuronCorePerNode), } // Render the manifest renderedManifest, err := fwext.RenderManifests(neuronBertInferenceManifest, renderVars) if err != nil { t.Fatalf("[ERROR] Failed to render Neuron inference manifest: %v", err) } feature := features.New("neuron-inference"). WithLabel("suite", "neuron"). WithLabel("hardware", "neuron"). Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { log.Println("[INFO] Applying rendered Neuron inference manifest.") err := fwext.ApplyManifests(cfg.Client().RESTConfig(), renderedManifest) if err != nil { t.Fatalf("[ERROR] Failed to apply Neuron inference manifest: %v", err) } log.Println("[INFO] Successfully applied Neuron inference manifest.") return ctx }). Assess("BERT inference Job succeeds", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { log.Println("[INFO] Checking 'neuron-inference' job completion...") job := &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{Name: "neuron-inference", Namespace: "default"}, } if err := wait.For( fwext.NewConditionExtension(cfg.Client().Resources()).JobSucceeded(job), wait.WithTimeout(60*time.Minute), ); err != nil { log.Println("[ERROR] Neuron inference job failed. 
Gathering logs...") if err := printJobLogs(ctx, cfg, "default", "neuron-inference"); err != nil { t.Logf("[WARNING] Failed to retrieve neuron-inference job logs: %v", err) } t.Fatalf("[ERROR] Neuron inference job did not succeed: %v", err) } log.Println("[INFO] Neuron inference job succeeded. Gathering logs...") applyTime := ctx.Value("applyTime") if applyTime != nil { if start, ok := applyTime.(time.Time); ok { duration := time.Since(start) log.Printf("[INFO] Neuron inference job completed in %s", duration) } } if err := printJobLogs(ctx, cfg, "default", "neuron-inference"); err != nil { t.Logf("[WARNING] Failed to retrieve neuron-inference job logs: %v", err) } return ctx }). Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { log.Println("[INFO] Cleaning up neuron-inference job resources...") if err := fwext.DeleteManifests(cfg.Client().RESTConfig(), renderedManifest); err != nil { t.Fatalf("[ERROR] Failed to delete inference job resources: %v", err) } log.Println("[INFO] Inference job cleanup complete.") return ctx }). 
Feature() testenv.Test(t, feature) } func printJobLogs(ctx context.Context, cfg *envconf.Config, namespace, jobName string) error { cs, err := getClientset(cfg.Client().RESTConfig()) if err != nil { return fmt.Errorf("[ERROR] failed to create kubernetes client: %w", err) } pods, err := cs.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ LabelSelector: fmt.Sprintf("job-name=%s", jobName), }) if err != nil { return fmt.Errorf("[ERROR] failed to list pods for job %s: %w", jobName, err) } if len(pods.Items) == 0 { return fmt.Errorf("[ERROR] no pods found for job %s", jobName) } for _, pod := range pods.Items { log.Printf("[INFO] Pod %s is on node %s", pod.Name, pod.Spec.NodeName) stream, err := cs.CoreV1().Pods(pod.Namespace).GetLogs(pod.Name, &v1.PodLogOptions{}).Stream(ctx) if err != nil { return fmt.Errorf("[ERROR] failed to get logs from pod %s: %w", pod.Name, err) } defer stream.Close() buf := make([]byte, 4096) for { n, readErr := stream.Read(buf) if n > 0 { log.Printf("[INFO] Logs from Pod %s:\n%s", pod.Name, string(buf[:n])) } if readErr == io.EOF { log.Printf("[INFO] Completed log stream for pod %s.", pod.Name) break } if readErr != nil { return fmt.Errorf("[ERROR] reading logs from pod %s: %w", pod.Name, readErr) } } } return nil } func getClientset(restConfig *rest.Config) (*kubernetes.Clientset, error) { cs, err := kubernetes.NewForConfig(restConfig) if err != nil { return nil, fmt.Errorf("cannot create kubernetes clientset: %w", err) } return cs, nil } ================================================ FILE: test/cases/neuron-inference/main_test.go ================================================ //go:build e2e package inference import ( "context" _ "embed" "flag" "fmt" "log" "os" "slices" "testing" "time" fwext "github.com/aws/aws-k8s-tester/internal/e2e" "github.com/aws/aws-k8s-tester/test/manifests" appsv1 "k8s.io/api/apps/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" "sigs.k8s.io/e2e-framework/klient/wait" 
"sigs.k8s.io/e2e-framework/pkg/env" "sigs.k8s.io/e2e-framework/pkg/envconf" ) func TestMain(m *testing.M) { flag.Parse() cfg, err := envconf.NewFromFlags() if err != nil { log.Fatalf("[ERROR] Failed to create test environment: %v", err) } testenv = env.NewWithConfig(cfg) deploymentManifests := [][]byte{ manifests.NeuronDevicePluginRbacManifest, manifests.NeuronDevicePluginManifest, } // Setup steps: apply the device plugin, wait for DS readiness, discover capacity testenv.Setup( func(ctx context.Context, config *envconf.Config) (context.Context, error) { log.Println("Applying Neuron device plugin RBAC and Neuron device plugin manifests.") err := fwext.ApplyManifests(config.Client().RESTConfig(), deploymentManifests...) if err != nil { return ctx, fmt.Errorf("failed to apply manifests: %w", err) } log.Println("Successfully applied Neuron device plugin RBAC and Neuron device plugin manifests.") return ctx, nil }, func(ctx context.Context, config *envconf.Config) (context.Context, error) { log.Println("Waiting for Neuron Device Plugin daemonset to be ready.") daemonset := appsv1.DaemonSet{ ObjectMeta: metav1.ObjectMeta{Name: "neuron-device-plugin-daemonset", Namespace: "kube-system"}, } err := wait.For( fwext.NewConditionExtension(config.Client().Resources()).DaemonSetReady(&daemonset), wait.WithTimeout(time.Minute*5), ) if err != nil { return ctx, fmt.Errorf("Neuron Device Plugin daemonset is not ready: %w", err) } log.Println("Neuron Device Plugin daemonset is ready.") return ctx, nil }, discoverNeuronCoreCapacity, getNodeCapacity, ) // Finish steps: remove device plugin if desired testenv.Finish( func(ctx context.Context, config *envconf.Config) (context.Context, error) { log.Println("[INFO] Cleaning up Neuron device plugin.") slices.Reverse(deploymentManifests) if err := fwext.DeleteManifests(config.Client().RESTConfig(), deploymentManifests...); err != nil { return ctx, fmt.Errorf("failed to delete neuron device plugin: %w", err) } log.Println("[INFO] Neuron 
device plugin cleanup complete.") return ctx, nil }, ) exitCode := testenv.Run(m) log.Printf("[INFO] Test environment finished with exit code %d", exitCode) os.Exit(exitCode) } // discoverNeuronCoreCapacity sets neuronPerNode and neuronCorePerNode by scanning the cluster func discoverNeuronCoreCapacity(ctx context.Context, config *envconf.Config) (context.Context, error) { log.Println("[INFO] Discovering cluster's Neuron capacity...") // Check Neuron devices log.Println("[INFO] Checking Neuron device capacity on nodes") err := wait.For( fwext.NewConditionExtension(config.Client().Resources()).AllNodesHaveNonZeroResourceCapacity("aws.amazon.com/neuron"), wait.WithTimeout(time.Second*60), wait.WithInterval(time.Second*5), ) if err != nil { return ctx, fmt.Errorf("failed to verify Neuron device capacity on nodes: %w", err) } log.Println("[INFO] Neuron devices check passed - all nodes have non-zero capacity") // Check Neuron cores log.Println("[INFO] Checking Neuron core capacity on nodes") err = wait.For( fwext.NewConditionExtension(config.Client().Resources()).AllNodesHaveNonZeroResourceCapacity("aws.amazon.com/neuroncore"), wait.WithTimeout(time.Second*60), wait.WithInterval(time.Second*5), ) if err != nil { return ctx, fmt.Errorf("failed to verify Neuron core capacity on nodes: %w", err) } log.Println("[INFO] Neuron cores check passed - all nodes have non-zero capacity") log.Println("[INFO] Neuron capacity discovery complete.") return ctx, nil } func getNodeCapacity(ctx context.Context, config *envconf.Config) (context.Context, error) { cs, err := kubernetes.NewForConfig(config.Client().RESTConfig()) if err != nil { return ctx, fmt.Errorf("failed to create kubernetes client: %w", err) } nodes, err := cs.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) if err != nil { return ctx, fmt.Errorf("failed to list nodes: %w", err) } if len(nodes.Items) == 0 { return ctx, fmt.Errorf("no nodes found in the cluster") } var totalNeuron, totalNeuronCore, nodeCount int // if 
nodeType not set, use the instance type discovered if *nodeType == "" { *nodeType = nodes.Items[0].Labels["node.kubernetes.io/instance-type"] } for _, node := range nodes.Items { instanceType := node.Labels["node.kubernetes.io/instance-type"] neuronCap, hasNeuron := node.Status.Capacity["aws.amazon.com/neuron"] neuronCoreCap, hasNeuronCore := node.Status.Capacity["aws.amazon.com/neuroncore"] if instanceType == *nodeType { nodeCount++ if hasNeuron { totalNeuron += int(neuronCap.Value()) } else { log.Printf("[WARN] Node %s (type=%s) lacks 'aws.amazon.com/neuron'.", node.Name, instanceType) } if hasNeuronCore { totalNeuronCore += int(neuronCoreCap.Value()) } else { log.Printf("[WARN] Node %s (type=%s) lacks 'aws.amazon.com/neuroncore'.", node.Name, instanceType) } } } if nodeCount > 0 { neuronPerNode = totalNeuron / nodeCount neuronCorePerNode = totalNeuronCore / nodeCount } else { return ctx, fmt.Errorf("no nodes with %s node type found in the cluster", *nodeType) } log.Printf("[INFO] Discovered neuronPerNode=%d, neuronCorePerNode=%d (across %d node(s))", neuronPerNode, neuronCorePerNode, nodeCount) return ctx, nil } ================================================ FILE: test/cases/neuron-inference/manifests/neuron-bert-inference.yaml ================================================ apiVersion: batch/v1 kind: Job metadata: name: neuron-inference spec: backoffLimit: 4 template: spec: restartPolicy: OnFailure volumes: - name: dshm emptyDir: medium: Memory containers: - name: neuron-inference image: {{.BertInferenceImage}} imagePullPolicy: Always command: ["python", "/app/infer.py"] env: - name: INFERENCE_MODE value: "{{.InferenceMode}}" volumeMounts: - mountPath: /dev/shm name: dshm resources: requests: aws.amazon.com/neuroncore: "{{.NeuronCorePerNode}}" limits: aws.amazon.com/neuroncore: "{{.NeuronCorePerNode}}" nodeSelector: node.kubernetes.io/instance-type: {{.NodeType}} ================================================ FILE: test/cases/neuron-inference/vars.go 
================================================ //go:build e2e package inference import ( "flag" "sigs.k8s.io/e2e-framework/pkg/env" ) // Shared global variables var ( // The e2e-framework environment testenv env.Environment // Passed in as flags bertInferenceImage *string nodeType *string inferenceMode *string // Discovered in main_test.go neuronPerNode int neuronCorePerNode int ) // init() runs before TestMain and sets up the flags func init() { bertInferenceImage = flag.String("bertInferenceImage", "", "[REQUIRED] Docker image used for Neuron-based BERT inference") nodeType = flag.String("nodeType", "", "Node type label for K8s nodes, e.g., trn1.32xlarge or inf2.xlarge") inferenceMode = flag.String("inferenceMode", "throughput", "Inference mode for BERT (throughput or latency)") } ================================================ FILE: test/cases/neuron-training/bert_training_test.go ================================================ //go:build e2e package training import ( "bytes" "context" _ "embed" "fmt" "log" "regexp" "strconv" "testing" "time" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" fwext "github.com/aws/aws-k8s-tester/internal/e2e" batchv1 "k8s.io/api/batch/v1" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/e2e-framework/klient/k8s" "sigs.k8s.io/e2e-framework/klient/wait" "sigs.k8s.io/e2e-framework/klient/wait/conditions" "sigs.k8s.io/e2e-framework/pkg/envconf" "sigs.k8s.io/e2e-framework/pkg/features" ) var ( //go:embed manifests/bert-training.yaml bertTrainingJobManifest []byte //go:embed manifests/training-comm-service.yaml trainingPodCommServiceManifest []byte // Regex to match lines like: // local_throughput=5.00 samples/s rankThroughputRegex = regexp.MustCompile( `local_throughput\s*=\s*([\d\.]+)\s+samples\/s`, ) // Regex to match lines like: // local_avg_epoch_time=12.50s rankEpochTimeRegex = regexp.MustCompile( `local_avg_epoch_time=([\d\.]+)s`, ) ) // TestBertTraining runs the Neuron-based BERT 
training test func TestBertTraining(t *testing.T) { if *bertTrainingImage == "" { t.Fatal("bertTrainingImage must be set to run the test") } // Render the templated manifest with dynamic variables renderVars := map[string]string{ "BertTrainingImage": *bertTrainingImage, "NodeType": *nodeType, "SlotsPerWorker": fmt.Sprintf("%d", nodeCount), "NodeCount": fmt.Sprintf("%d", nodeCount), "NeuronPerNode": fmt.Sprintf("%d", neuronPerNode), "NeuronCorePerNode": fmt.Sprintf("%d", neuronCorePerNode), "EFAPerNode": fmt.Sprintf("%d", efaPerNode), } // Render the manifest renderedManifest, err := fwext.RenderManifests(bertTrainingJobManifest, renderVars) if err != nil { t.Fatalf("failed to render neuron BERT training manifest: %v", err) } renderedCommServiceManifest, err := fwext.RenderManifests(trainingPodCommServiceManifest, renderVars) if err != nil { t.Fatalf("failed to render pod communication manifest: %v", err) } // Define a feature for the Neuron BERT training neuronTraining := features.New("bert-training"). WithLabel("suite", "neuron"). WithLabel("hardware", "neuron"). 
Assess("Neuron training Job succeeds", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { manifests := [][]byte{renderedCommServiceManifest, renderedManifest} maxAttempts := (*retries) + 1 for attempt := 0; attempt < maxAttempts; attempt++ { log.Printf("Applying manifests for BERT training test (Attempt #%d)", attempt+1) if err := applyManifests(cfg, manifests); err != nil { log.Printf("Failed to apply manifests: %v", err) cleanupManifests(cfg, manifests) continue } job, err := waitForJobCreation(cfg) if err != nil { log.Printf("Failed to detect job creation: %v", err) cleanupManifests(cfg, manifests) continue } if err := waitForJobCompletion(job, cfg); err != nil { log.Printf("Job did not complete successfully: %v", err) logsBuf, err := gatherJobLogs(ctx, cfg, "default", "bert-training") if err != nil { log.Printf("failed to get logs: %v", err) } else { log.Println(logsBuf.String()) } cleanupManifests(cfg, manifests) continue } // Job completed successfully if err := processJobLogs(ctx, cfg); err != nil { log.Printf("Failed to process job logs: %v", err) cleanupManifests(cfg, manifests) continue } // Test succeeded, clean up and return cleanupManifests(cfg, manifests) log.Printf("BERT training test succeeded on attempt #%d", attempt+1) return ctx } // If we've exhausted all attempts t.Fatalf("BERT training test did not succeed after %d attempts", maxAttempts) return ctx }). Feature() // Run the feature testenv.Test(t, neuronTraining) } // gatherJobLogs retrieves logs from all pods of the specified jobName, returning them as a buffer. 
func gatherJobLogs(ctx context.Context, cfg *envconf.Config, namespace, jobName string) (*bytes.Buffer, error) { clientset, err := getClientset(cfg.Client().RESTConfig()) if err != nil { return nil, fmt.Errorf("failed to create kubernetes clientset: %w", err) } podList, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ LabelSelector: fmt.Sprintf("job-name=%s", jobName), }) if err != nil { return nil, fmt.Errorf("failed to list pods for job %s: %w", jobName, err) } if len(podList.Items) == 0 { return nil, fmt.Errorf("no pods found for job %s", jobName) } var out bytes.Buffer for _, pod := range podList.Items { req := clientset.CoreV1().Pods(pod.Namespace).GetLogs(pod.Name, &v1.PodLogOptions{}) logStream, err := req.Stream(ctx) if err != nil { return &out, fmt.Errorf("failed to get logs from pod %s: %w", pod.Name, err) } defer logStream.Close() // Copy logs into our buffer if _, err := out.ReadFrom(logStream); err != nil { return &out, fmt.Errorf("failed to read logs from pod %s: %w", pod.Name, err) } } return &out, nil } // aggregateMetricFromLogs scans the log output for lines based on a provided RegEx. // The RegEx is assumed to take a sufficiently unique form like = to avoid // collisions, but also to simplify parsing. // // returns the average, sum, and count for all occurrences of the metric. func aggregateMetricFromLogs(metricRegex *regexp.Regexp, logs string) (avg float64, sum float64, count int) { matches := metricRegex.FindAllStringSubmatch(logs, -1) for _, match := range matches { val, err := strconv.ParseFloat(match[1], 64) if err == nil { sum += val count++ } } if count > 0 { avg = sum / float64(count) } return avg, sum, count } func applyManifests(cfg *envconf.Config, manifests [][]byte) error { fwext.ApplyManifests(cfg.Client().RESTConfig(), manifests...) 
log.Println("Successfully applied test manifests.") return nil } func waitForJobCreation(cfg *envconf.Config) (*batchv1.Job, error) { job := &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{ Name: "bert-training", Namespace: "default", }, } log.Println("Waiting for the 'bert-training' Job resource to be created...") return job, wait.For( conditions.New(cfg.Client().Resources()).ResourceMatch(job, func(object k8s.Object) bool { return true }), wait.WithTimeout(time.Minute*5), ) } func waitForJobCompletion(job *batchv1.Job, cfg *envconf.Config) error { log.Println("Waiting for 'bert-training' Job to succeed...") return wait.For( fwext.NewConditionExtension(cfg.Client().Resources()).JobSucceeded(job), wait.WithTimeout(30*time.Minute), ) } func processJobLogs(ctx context.Context, cfg *envconf.Config) error { logsBuf, err := gatherJobLogs(ctx, cfg, "default", "bert-training") if err != nil { return fmt.Errorf("failed to retrieve bert-training job logs: %v", err) } log.Println("== Raw Logs from the launcher pods ==") log.Println(logsBuf.String()) processMetrics(logsBuf.String()) return nil } func processMetrics(logs string) { // Process throughput avgThru, sumThru, countThru := aggregateMetricFromLogs(rankThroughputRegex, logs) if countThru == 0 { log.Printf("No throughput lines found. Possibly missing in logs.") } else { log.Printf("Parsed throughput from %d ranks. Total=%.2f samples/s, Average=%.2f samples/s", countThru, sumThru, avgThru) log.Printf("Average Throughput: %.2f samples/second", avgThru) } // Process epoch time avgEp, sumEp, countEp := aggregateMetricFromLogs(rankEpochTimeRegex, logs) if countEp == 0 { log.Printf("No epoch time lines found. Possibly missing in logs.") } else { log.Printf("Parsed average epoch time from %d ranks. 
Sum=%.2fs, Average=%.2fs", countEp, sumEp, avgEp) } } func cleanupManifests(cfg *envconf.Config, manifests [][]byte) { log.Println("Deleting test manifests.") if err := fwext.DeleteManifests(cfg.Client().RESTConfig(), manifests...); err != nil { log.Printf("Failed to delete manifests: %v", err) } } // getClientset creates a Kubernetes clientset from the given REST config func getClientset(restConfig *rest.Config) (*kubernetes.Clientset, error) { clientset, err := kubernetes.NewForConfig(restConfig) if err != nil { return nil, fmt.Errorf("failed to create kubernetes clientset: %w", err) } return clientset, nil } ================================================ FILE: test/cases/neuron-training/main_test.go ================================================ //go:build e2e package training import ( "context" _ "embed" "fmt" "log" "os" "slices" "testing" "time" fwext "github.com/aws/aws-k8s-tester/internal/e2e" "github.com/aws/aws-k8s-tester/test/manifests" appsv1 "k8s.io/api/apps/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" "sigs.k8s.io/e2e-framework/klient/wait" "sigs.k8s.io/e2e-framework/pkg/env" "sigs.k8s.io/e2e-framework/pkg/envconf" ) func TestMain(m *testing.M) { cfg, err := envconf.NewFromFlags() if err != nil { log.Fatalf("failed to initialize test environment: %v", err) } testenv = env.NewWithConfig(cfg) manifests := [][]byte{ manifests.NeuronDevicePluginRbacManifest, manifests.NeuronDevicePluginManifest, manifests.EfaDevicePluginManifest, } testenv.Setup( func(ctx context.Context, config *envconf.Config) (context.Context, error) { log.Println("Applying Neuron device plugin RBAC, Neuron device plugin and EFA device plugin manifests.") err := fwext.ApplyManifests(config.Client().RESTConfig(), manifests...) 
if err != nil { return ctx, fmt.Errorf("failed to apply manifests: %w", err) } log.Println("Successfully applied Neuron device plugin RBAC, Neuron device plugin and EFA device plugin manifests.") return ctx, nil }, func(ctx context.Context, config *envconf.Config) (context.Context, error) { log.Println("Waiting for Neuron Device Plugin daemonset to be ready.") daemonset := appsv1.DaemonSet{ ObjectMeta: metav1.ObjectMeta{Name: "neuron-device-plugin-daemonset", Namespace: "kube-system"}, } err := wait.For( fwext.NewConditionExtension(config.Client().Resources()).DaemonSetReady(&daemonset), wait.WithTimeout(time.Minute*5), ) if err != nil { return ctx, fmt.Errorf("Neuron Device Plugin daemonset is not ready: %w", err) } log.Println("Neuron Device Plugin daemonset is ready.") return ctx, nil }, func(ctx context.Context, config *envconf.Config) (context.Context, error) { log.Println("Waiting for EFA Device Plugin daemonset to be ready.") daemonset := appsv1.DaemonSet{ ObjectMeta: metav1.ObjectMeta{Name: "aws-efa-k8s-device-plugin-daemonset", Namespace: "kube-system"}, } err := wait.For( fwext.NewConditionExtension(config.Client().Resources()).DaemonSetReady(&daemonset), wait.WithTimeout(time.Minute*5), ) if err != nil { return ctx, fmt.Errorf("EFA Device Plugin daemonset is not ready: %w", err) } log.Println("EFA Device Plugin daemonset is ready.") return ctx, nil }, checkNonZeroResourceCapacity, checkNodeTypes, ) testenv.Finish( func(ctx context.Context, config *envconf.Config) (context.Context, error) { log.Println("Deleting Neuron device plugin and EFA device plugin manifests.") slices.Reverse(manifests) err := fwext.DeleteManifests(config.Client().RESTConfig(), manifests...) 
if err != nil { return ctx, fmt.Errorf("failed to delete manifests: %w", err) } log.Println("Successfully deleted Neuron device plugin and EFA device plugin manifests.") return ctx, nil }, ) log.Println("Starting tests...") exitCode := testenv.Run(m) log.Printf("Tests finished with exit code %d", exitCode) os.Exit(exitCode) } func checkNodeTypes(ctx context.Context, config *envconf.Config) (context.Context, error) { clientset, err := kubernetes.NewForConfig(config.Client().RESTConfig()) if err != nil { return ctx, fmt.Errorf("failed to create Kubernetes client: %w", err) } nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) if err != nil { return ctx, fmt.Errorf("failed to list nodes: %w", err) } if len(nodes.Items) == 0 { return ctx, fmt.Errorf("no nodes found in the cluster") } // Check if all nodes have the same instance type for i := 1; i < len(nodes.Items); i++ { currentInstanceType := nodes.Items[i].Labels["node.kubernetes.io/instance-type"] if currentInstanceType != nodes.Items[i-1].Labels["node.kubernetes.io/instance-type"] { return ctx, fmt.Errorf("inconsistent node types detected, all nodes must have the same instance type") } else if *nodeType == "" { log.Printf("[INFO] nodeType was not set, discovered type %s", currentInstanceType) *nodeType = currentInstanceType } } // Calculate capacities for all nodes totalNeuronCount := 0 totalNeuronCoreCount := 0 totalEfaCount := 0 nodeCount = len(nodes.Items) // Store global node count for _, node := range nodes.Items { log.Printf("[INFO] Processing node %s", node.Name) // Check for Neuron capacity neuron, ok := node.Status.Capacity["aws.amazon.com/neuron"] if ok { totalNeuronCount += int(neuron.Value()) } else { log.Printf("[WARN] Node %s does not have 'aws.amazon.com/neuron' capacity", node.Name) } // Check for NeuronCore capacity neuronCore, ok := node.Status.Capacity["aws.amazon.com/neuroncore"] if ok { totalNeuronCoreCount += int(neuronCore.Value()) } else { log.Printf("[WARN] Node %s 
does not have 'aws.amazon.com/neuroncore' capacity", node.Name) } // Check for EFA capacity efa, ok := node.Status.Capacity["vpc.amazonaws.com/efa"] if ok { totalEfaCount += int(efa.Value()) } else { log.Printf("[WARN] Node %s does not have 'vpc.amazonaws.com/efa' capacity", node.Name) } } // Update global capacities if nodeCount > 0 { neuronPerNode = totalNeuronCount / nodeCount neuronCorePerNode = totalNeuronCoreCount / nodeCount efaPerNode = totalEfaCount / nodeCount } else { log.Printf("[WARN] No nodes found, setting capacities to 0") neuronPerNode = 0 neuronCorePerNode = 0 efaPerNode = 0 } log.Printf("[INFO] Total Nodes: %d", nodeCount) log.Printf("[INFO] Total Neuron Count: %d, Neuron Per Node: %d", totalNeuronCount, neuronPerNode) log.Printf("[INFO] Total Neuron Core Count: %d, Neuron Core Per Node: %d", totalNeuronCoreCount, neuronCorePerNode) log.Printf("[INFO] Total EFA Count: %d, EFA Per Node: %d", totalEfaCount, efaPerNode) return ctx, nil } func checkNonZeroResourceCapacity(ctx context.Context, config *envconf.Config) (context.Context, error) { log.Println("[INFO] Starting resource capacity checks") // Check Neuron devices log.Println("Checking Neuron device capacity on nodes") err := wait.For( fwext.NewConditionExtension(config.Client().Resources()).AllNodesHaveNonZeroResourceCapacity("aws.amazon.com/neuron"), wait.WithTimeout(time.Second*60), wait.WithInterval(time.Second*5), ) if err != nil { return ctx, fmt.Errorf("failed to verify Neuron device capacity on nodes: %w", err) } log.Println("Neuron devices check passed - all nodes have non-zero capacity") // Check Neuron cores log.Println("Checking Neuron core capacity on nodes") err = wait.For( fwext.NewConditionExtension(config.Client().Resources()).AllNodesHaveNonZeroResourceCapacity("aws.amazon.com/neuroncore"), wait.WithTimeout(time.Second*60), wait.WithInterval(time.Second*5), ) if err != nil { return ctx, fmt.Errorf("failed to verify Neuron core capacity on nodes: %w", err) } 
log.Println("Neuron cores check passed - all nodes have non-zero capacity") // Check EFA devices log.Println("Checking EFA device capacity on nodes") err = wait.For( fwext.NewConditionExtension(config.Client().Resources()).AllNodesHaveNonZeroResourceCapacity("vpc.amazonaws.com/efa"), wait.WithTimeout(time.Second*60), wait.WithInterval(time.Second*5), ) if err != nil { return ctx, fmt.Errorf("failed to verify EFA device capacity on nodes: %w", err) } log.Println("EFA devices check passed - all nodes have non-zero capacity") log.Println("[INFO] All resource capacity checks completed successfully") return ctx, nil } ================================================ FILE: test/cases/neuron-training/manifests/bert-training.yaml ================================================ apiVersion: batch/v1 kind: Job metadata: labels: app: bert-training name: bert-training spec: completionMode: Indexed completions: {{.NodeCount}} parallelism: {{.NodeCount}} backoffLimit: 0 template: spec: restartPolicy: Never containers: - image: {{.BertTrainingImage}} name: bert-training env: - name: MASTER_ADDR value: bert-training-0.training args: - sh - -c - | # Enable EFA https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-runtime/nrt-troubleshoot.html#fi-efa-fork-safe (AL2 legacy requirement) export FI_EFA_FORK_SAFE=1 export CCOM_SOCKET_IFNAME=eth0 export NCCL_DEBUG=ERROR torchrun --nproc_per_node {{.NeuronCorePerNode}} --nnodes {{.NodeCount}} --node_rank $JOB_COMPLETION_INDEX --master_addr $MASTER_ADDR train.py volumeMounts: - name: dshm mountPath: /dev/shm resources: requests: aws.amazon.com/neuron: {{.NeuronPerNode}} aws.amazon.com/neuroncore: {{.NeuronCorePerNode}} vpc.amazonaws.com/efa: {{.EFAPerNode}} limits: aws.amazon.com/neuron: {{.NeuronPerNode}} aws.amazon.com/neuroncore: {{.NeuronCorePerNode}} vpc.amazonaws.com/efa: {{.EFAPerNode}} nodeSelector: node.kubernetes.io/instance-type: {{.NodeType}} subdomain: training volumes: - name: dshm emptyDir: medium: Memory 
================================================ FILE: test/cases/neuron-training/manifests/training-comm-service.yaml ================================================ apiVersion: v1 kind: Service metadata: name: training labels: app: training spec: clusterIP: None selector: job-name: bert-training ================================================ FILE: test/cases/neuron-training/vars.go ================================================ package training import ( "flag" "sigs.k8s.io/e2e-framework/pkg/env" ) // Shared global variables var ( testenv env.Environment bertTrainingImage *string efaEnabled *bool nodeType *string nodeCount int efaPerNode int neuronPerNode int neuronCorePerNode int retries *int ) func init() { // Define command-line flags bertTrainingImage = flag.String("bertTrainingImage", "", "Docker image used for BERT training workload") efaEnabled = flag.Bool("efaEnabled", false, "Enable Elastic Fabric Adapter (EFA)") nodeType = flag.String("nodeType", "", "Instance type for cluster nodes (e.g., inf1.24xlarge)") retries = flag.Int("retries", 2, "Number of retries to attempt before marking the test as failed.") } ================================================ FILE: test/cases/nvidia/capabilities_test.go ================================================ //go:build e2e package nvidia import ( "context" "testing" "time" "github.com/aws/aws-k8s-tester/internal/e2e" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/wait" e2ewait "sigs.k8s.io/e2e-framework/klient/wait" "sigs.k8s.io/e2e-framework/pkg/envconf" "sigs.k8s.io/e2e-framework/pkg/features" _ "embed" ) //go:embed manifests/nvidia-driver-capabilities-check.yaml var capabilitiesCheckPod []byte const ( PodName = "moderngl-pod" PodNamespace = "default" ) func TestNvidiaDriverCapabilities(t *testing.T) { feat := features.New("nvidia-driver-capabilities-check"). WithLabel("suite", "nvidia"). 
Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { t.Log("Applying nvidia driver capabilities check pod manifest.") // capabilitiesCheckPod only run moderngl.create_standalone_context() with NVIDIA_DRIVER_CAPABILITIES=all to load all capabilities enabled by nvidia driver. // If any lib required by any of nvidia driver capabilities is missing, it will failed with exception. if err := e2e.ApplyManifests(cfg.Client().RESTConfig(), capabilitiesCheckPod); err != nil { t.Fatalf("Failed to apply capabilities check pod manifest: %v", err) } return ctx }). Assess("Check Pod becomes ready", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { t.Log("Waiting up to 5 minute for pod to complete...") pod := &v1.Pod{ ObjectMeta: metav1.ObjectMeta{ Name: PodName, Namespace: PodNamespace, }, } err := e2ewait.For( e2e.NewConditionExtension(cfg.Client().Resources()).PodSucceeded(pod), e2ewait.WithTimeout(5*time.Minute), ) if err != nil { if err == wait.ErrWaitTimeout { t.Fatalf("nvidia capabilities check pod not in compeleted phase (succeeded or failed) within 5 minute and waiter timeout: %v", err) } t.Fatalf("nvidia capabilities pod in Failed status, ModernGL check failed. Could be caused by required library missing") } t.Log("nvidia driver capabilities check succeeded.") return ctx }). Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { t.Log("Removing nvidia driver capabilities check pod.") if err := e2e.DeleteManifests(cfg.Client().RESTConfig(), capabilitiesCheckPod); err != nil { t.Errorf("Failed to delete pod: %v", err) } t.Log("all test resources removed successfully.") return ctx }). 
Feature() testenv.Test(t, feat) } ================================================ FILE: test/cases/nvidia/containerd_test.go ================================================ //go:build e2e package nvidia import ( "context" "log" "testing" "time" "github.com/aws/aws-k8s-tester/internal/e2e" appsv1 "k8s.io/api/apps/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/e2e-framework/klient/wait" "sigs.k8s.io/e2e-framework/pkg/envconf" "sigs.k8s.io/e2e-framework/pkg/features" _ "embed" ) //go:embed manifests/daemonset-containerd-check.yaml var containerdCheckDS []byte func TestContainerdConfig(t *testing.T) { feat := features.New("containerd-config-check"). WithLabel("suite", "nvidia"). Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { log.Println("[Setup] Applying containerd-check DaemonSet manifest.") if err := e2e.ApplyManifests(cfg.Client().RESTConfig(), containerdCheckDS); err != nil { t.Fatalf("Failed to apply containerd-check DS: %v", err) } return ctx }). Assess("DaemonSet becomes ready", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { dsName := "containerd-check" dsNS := "default" log.Println("[Assess] Waiting up to 1 minute for containerd-check DS to become Ready...") ds := &appsv1.DaemonSet{ ObjectMeta: metav1.ObjectMeta{ Name: dsName, Namespace: dsNS, }, } err := wait.For( e2e.NewConditionExtension(cfg.Client().Resources()).DaemonSetReady(ds), wait.WithTimeout(1*time.Minute), ) if err != nil { t.Logf("[Assess] containerd-check DS did not become Ready: %v", err) e2e.PrintDaemonSetPodLogs(t, ctx, cfg.Client().RESTConfig(), dsNS, "app=containerd-check") t.Fatalf("containerd-check DS not Ready within 1 minute") } log.Println("[Assess] containerd-check DS is Ready.") return ctx }). 
Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { t.Log("[Teardown] Removing containerd-check DS (no additional logs).") if err := e2e.DeleteManifests(cfg.Client().RESTConfig(), containerdCheckDS); err != nil { t.Fatalf("Failed to delete containerd-check DS: %v", err) } t.Log("[Teardown] containerd-check DS removed successfully.") return ctx }). Feature() testenv.Test(t, feat) } ================================================ FILE: test/cases/nvidia/main_test.go ================================================ //go:build e2e package nvidia import ( "context" _ "embed" "fmt" "log" "os" "os/signal" "slices" "testing" fwext "github.com/aws/aws-k8s-tester/internal/e2e" "github.com/aws/aws-k8s-tester/test/common" "github.com/aws/aws-k8s-tester/test/manifests" appsv1 "k8s.io/api/apps/v1" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" "sigs.k8s.io/e2e-framework/klient/wait" "sigs.k8s.io/e2e-framework/klient/wait/conditions" "sigs.k8s.io/e2e-framework/pkg/env" "sigs.k8s.io/e2e-framework/pkg/envconf" ) type Config struct { common.MetricOps NodeType string `flag:"nodeType" desc:"node type for the tests"` InstallDevicePlugin bool `flag:"installDevicePlugin" desc:"install nvidia device plugin"` EfaEnabled bool `flag:"efaEnabled" desc:"enable efa tests"` NvidiaTestImage string `flag:"nvidiaTestImage" desc:"nccl test image for nccl tests"` PytorchImage string `flag:"pytorchImage" desc:"pytorch cuda image for single node tests"` SkipUnitTestSubcommand string `flag:"skipUnitTestSubcommand" desc:"optional command to skip specified unit test"` } var ( testenv env.Environment testConfig Config nodeCount int gpuPerNode int efaPerNode int ) func deployMPIOperator(ctx context.Context, config *envconf.Config) (context.Context, error) { dep := appsv1.Deployment{ ObjectMeta: metav1.ObjectMeta{Name: "mpi-operator", Namespace: "mpi-operator"}, } err := 
wait.For(conditions.New(config.Client().Resources()).DeploymentConditionMatch(&dep, appsv1.DeploymentAvailable, v1.ConditionTrue), wait.WithContext(ctx)) if err != nil { return ctx, fmt.Errorf("failed to deploy mpi-operator: %v", err) } return ctx, nil } func checkNodeTypes(ctx context.Context, config *envconf.Config) (context.Context, error) { clientset, err := kubernetes.NewForConfig(config.Client().RESTConfig()) if err != nil { return ctx, err } nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) if err != nil { return ctx, err } for i := 1; i < len(nodes.Items)-1; i++ { if nodes.Items[i].Labels["node.kubernetes.io/instance-type"] != nodes.Items[i-1].Labels["node.kubernetes.io/instance-type"] { return ctx, fmt.Errorf("Node types are not the same, all node types must be the same in the cluster") } } if testConfig.NodeType != "" { for _, v := range nodes.Items { if v.Labels["node.kubernetes.io/instance-type"] == testConfig.NodeType { nodeCount++ gpu := v.Status.Capacity["nvidia.com/gpu"] gpuPerNode = int(gpu.Value()) efa := v.Status.Capacity["vpc.amazonaws.com/efa"] efaPerNode = int(efa.Value()) } } } else { log.Printf("No node type specified. 
Using the node type %s in the node groups.", nodes.Items[0].Labels["node.kubernetes.io/instance-type"]) testConfig.NodeType = nodes.Items[0].Labels["node.kubernetes.io/instance-type"] nodeCount = len(nodes.Items) gpu := nodes.Items[0].Status.Capacity["nvidia.com/gpu"] gpuPerNode = int(gpu.Value()) efa := nodes.Items[0].Status.Capacity["vpc.amazonaws.com/efa"] efaPerNode = int(efa.Value()) } return ctx, nil } func TestMain(m *testing.M) { testConfig = Config{ InstallDevicePlugin: true, PytorchImage: "763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.1.0-gpu-py310-cu121-ubuntu20.04-ec2", } _, err := common.ParseFlags(&testConfig) if err != nil { log.Fatalf("failed to parse flags: %v", err) } cfg, err := envconf.NewFromFlags() if err != nil { log.Fatalf("failed to initialize test environment: %v", err) } ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt) defer cancel() testenv = env.NewWithConfig(cfg).WithContext(ctx) manifestsList := [][]byte{ manifests.MpiOperatorManifest, } setUpFunctions := []env.Func{ func(ctx context.Context, config *envconf.Config) (context.Context, error) { err := fwext.ApplyManifests(config.Client().RESTConfig(), manifestsList...) 
if err != nil { return ctx, err } return ctx, nil }, deployMPIOperator, } if testConfig.InstallDevicePlugin { manifestsList = append(manifestsList, manifests.NvidiaDevicePluginManifest) setUpFunctions = append(setUpFunctions, func(ctx context.Context, config *envconf.Config) (context.Context, error) { return common.DeployDaemonSet("nvidia-device-plugin-daemonset", "kube-system")(ctx, config) }) } if testConfig.EfaEnabled { manifestsList = append(manifestsList, manifests.EfaDevicePluginManifest) setUpFunctions = append(setUpFunctions, func(ctx context.Context, config *envconf.Config) (context.Context, error) { return common.DeployDaemonSet("aws-efa-k8s-device-plugin-daemonset", "kube-system")(ctx, config) }) } if len(testConfig.MetricDimensions) > 0 { renderedCloudWatchAgentManifest, err := manifests.RenderCloudWatchAgentManifest(testConfig.MetricDimensions) if err != nil { log.Printf("Warning: failed to render CloudWatch Agent manifest: %v", err) } manifestsList = append(manifestsList, manifests.DCGMExporterManifest, renderedCloudWatchAgentManifest) setUpFunctions = append(setUpFunctions, func(ctx context.Context, config *envconf.Config) (context.Context, error) { if ctx, err := common.DeployDaemonSet("dcgm-exporter", "kube-system")(ctx, config); err != nil { return ctx, err } if ctx, err := common.DeployDaemonSet("cwagent", "amazon-cloudwatch")(ctx, config); err != nil { return ctx, err } return ctx, nil }) } setUpFunctions = append(setUpFunctions, checkNodeTypes) testenv.Setup(setUpFunctions...) testenv.Finish( func(ctx context.Context, config *envconf.Config) (context.Context, error) { slices.Reverse(manifestsList) err := fwext.DeleteManifests(config.Client().RESTConfig(), manifestsList...) 
if err != nil { return ctx, err } return ctx, nil }, ) os.Exit(testenv.Run(m)) } ================================================ FILE: test/cases/nvidia/manifests/daemonset-containerd-check.yaml ================================================ apiVersion: apps/v1 kind: DaemonSet metadata: name: containerd-check namespace: default labels: app: containerd-check spec: selector: matchLabels: app: containerd-check template: metadata: labels: app: containerd-check spec: containers: - name: containerd-check image: public.ecr.aws/amazonlinux/amazonlinux:latest command: - sh - -c - | # 1. Ensure the script fails on any command or pipeline error set -e set -o pipefail echo "=== content read by the container ===" cat /host-etc/containerd/config.toml # 2. Check containerd config version and look for appropriate sandbox field # In containerd config version = 2 expect to find pattern `sandbox_image = "registry.k8s.io/pause:3.10.1"` # In containerd config version = 3 expect to find pattern `sandbox = "registry.k8s.io/pause:3.10.1"` # For more details: https://github.com/containerd/containerd/blob/main/docs/cri/config.md version_line=$(grep -E '^version\s*=' /host-etc/containerd/config.toml || true) if [ -z "$version_line" ]; then echo "FAIL: no version line found in containerd config" exit 1 fi version=$(echo "$version_line" | cut -d'=' -f2 | tr -d ' ') echo "INFO: containerd config version = $version" if [ "$version" = "2" ]; then sandbox_line=$(grep -E 'sandbox_image\s*=' /host-etc/containerd/config.toml || true) elif [ "$version" = "3" ]; then sandbox_line=$(grep -E 'sandbox\s*=' /host-etc/containerd/config.toml || true) else echo "FAIL: unsupported containerd config version: $version" exit 1 fi # 3. If no sandbox configuration is found, fail explicitly if [ -z "$sandbox_line" ]; then echo "FAIL: no sandbox_image or sandbox line found" echo "=== debug ===" exit 1 fi sandbox_image=$(echo "$sandbox_line" | cut -d'"' -f2) # 4. Check that $sandbox_image references .ecr. 
or is provided on the instance if [[ "$sandbox_image" == "localhost"* ]]; then echo "INFO: skipping .ecr. check for localhost sandbox image" else if [[ "$sandbox_image" != *".ecr."* ]]; then echo "FAIL: no .ecr. reference in $sandbox_image" echo "=== debug ===" exit 1 fi fi # 5. Check for 'nvidia-container-runtime' if ! grep -q "nvidia-container-runtime" /host-etc/containerd/config.toml; then echo "FAIL: no nvidia-container-runtime found" echo "=== debug ===" exit 1 fi # 6. Check for 'systemd_cgroup = true' or 'SystemdCgroup = true' if ! ( grep -q 'systemd_cgroup = true' /host-etc/containerd/config.toml || \ grep -q 'SystemdCgroup = true' /host-etc/containerd/config.toml ); then echo "FAIL: no systemd cgroup setting" echo "=== debug ===" exit 1 fi echo "containerd config check PASSED." # Keep container running so DS can be marked Ready tail -f /dev/null volumeMounts: - name: containerd-config mountPath: /host-etc/containerd readOnly: true volumes: - name: containerd-config hostPath: path: /etc/containerd ================================================ FILE: test/cases/nvidia/manifests/job-hpc-benchmarks.yaml ================================================ kind: Job apiVersion: batch/v1 metadata: name: hpc-benckmarks-job labels: app: hpc-benckmarks-job spec: completions: 1 parallelism: 1 template: metadata: labels: app: hpc-benckmarks-job spec: volumes: - name: dshm emptyDir: medium: Memory containers: - name: hpc-benchmarks image: "nvcr.io/nvidia/hpc-benchmarks:25.04" command: - mpirun - --allow-run-as-root - -np - "{{.GpuPerNode}}" - -bind-to - none - -x - NCCL_DEBUG=INFO - -x - HPL_FCT_COMM_POLICY=1 - -x - HPL_USE_NVSHMEM=0 # TODO: for arm it will be # - hpl-aarch64.sh - hpl.sh - --mem-affinity - 0:0:0:0:1:1:1:1 # --cpu-affinity needs to be tuned depending on the number of CPUs # available on the instance type. 
- --cpu-affinity - 0-13:14-27:28-41:42-55:56-69:70-83:84-97:98-111 - --no-multinode - --dat - hpl-linux-x86_64/sample-dat/HPL-dgx-1N.dat # TODO: the path differs for arm64 # - hpl-linux-aarch64-gpu/sample-dat/HPL-dgx-1N.dat volumeMounts: - mountPath: /dev/shm name: dshm imagePullPolicy: Always resources: limits: nvidia.com/gpu: {{.GpuPerNode}} env: - name: UCX_TLS value: "^sysv" restartPolicy: Never backoffLimit: 4 ================================================ FILE: test/cases/nvidia/manifests/job-unit-test-single-node.yaml ================================================ kind: Job apiVersion: batch/v1 metadata: name: unit-test-job labels: app: unit-test-job spec: template: metadata: labels: app: unit-test-job spec: containers: - name: unit-test-container image: {{.NvidiaTestImage}} command: - /bin/bash - ./gpu_unit_tests/unit_test env: - name: SKIP_TESTS_SUBCOMMAND value: {{.SkipTestSubcommand}} # because we started building these from source, this is just a # regular binary. - name: DEMO_SUITE_DIR value: /usr/bin - name: EC2_INSTANCE_TYPE value: {{.NodeType}} imagePullPolicy: Always resources: limits: nvidia.com/gpu: {{.GpuPerNode}} requests: cpu: "1" memory: 1Gi restartPolicy: Never backoffLimit: 1 ================================================ FILE: test/cases/nvidia/manifests/mpi-job-nccl-test-multi-node.yaml ================================================ apiVersion: kubeflow.org/v2beta1 kind: MPIJob metadata: name: {{.JobName}} spec: slotsPerWorker: {{.GpuPerNode}} runPolicy: # it may take a bit for the workers to get ready (the container image is heavy) # and we don't want the launcher to reach it's CrashLoopBackoff limit in the meantime backoffLimit: 20 cleanPodPolicy: Running mpiReplicaSpecs: Launcher: replicas: 1 template: spec: restartPolicy: OnFailure containers: - image: {{.NvidiaTestImage}} imagePullPolicy: Always name: nccl-test-launcher env: command: - mpirun - --allow-run-as-root - --tag-output - -np - "{{.WorkerNodeGpuCount}}" - -bind-to - 
none - -map-by - slot - -x - PATH - -x - LD_LIBRARY_PATH - -x - NCCL_DEBUG=INFO - -x - NCCL_BUFFSIZE={{.NcclBuffSize}} - -x - NCCL_TUNER_PLUGIN=/opt/aws-ofi-nccl/install/lib/libnccl-ofi-tuner.so - --mca - pml - ^cm,ucx - --mca - btl - tcp,self - --mca - btl_tcp_if_exclude - lo,docker0,veth_def_agent - /opt/nccl-tests/build/{{.TestName}} - -b - "8" - -e - {{.MaxBytes}} - -f - "2" - -c - "1" - -n - "10" Worker: replicas: {{.WorkerNodeCount}} template: spec: volumes: - name: dshm emptyDir: medium: Memory containers: - image: {{.NvidiaTestImage}} imagePullPolicy: Always name: nccl-test-worker volumeMounts: - mountPath: /dev/shm name: dshm resources: requests: nvidia.com/gpu: {{.GpuPerNode}} vpc.amazonaws.com/efa: {{.EfaInterfacePerNode}} limits: nvidia.com/gpu: {{.GpuPerNode}} vpc.amazonaws.com/efa: {{.EfaInterfacePerNode}} ================================================ FILE: test/cases/nvidia/manifests/mpi-job-pytorch-training-single-node.yaml ================================================ --- # container image from: https://github.com/aws/deep-learning-containers/blob/master/available_images.md apiVersion: kubeflow.org/v2beta1 kind: MPIJob metadata: name: pytorch-training-single-node spec: slotsPerWorker: 4 runPolicy: cleanPodPolicy: Running mpiImplementation: OpenMPI mpiReplicaSpecs: Launcher: replicas: 1 template: spec: restartPolicy: OnFailure containers: - image: {{.PytorchTestImage}} name: gpu-test command: - mpirun - --allow-run-as-root - -np - "1" - -mca - btl_tcp_if_exclude - lo - -mca - pml - ob1 - -mca - btl - ^openib - --bind-to - none - -map-by - slot - -x - LD_LIBRARY_PATH - -x - PATH - -x - NCCL_SOCKET_IFNAME=eth0 - -x - NCCL_DEBUG=INFO - -x - MXNET_CUDNN_AUTOTUNE_DEFAULT=0 - python - -c - import os; os.system("git clone https://github.com/pytorch/examples.git pytorch-examples"); os.system("git -C pytorch-examples checkout 0f0c9131ca5c79d1332dce1f4c06fe942fbdc665"); os.system("python pytorch-examples/mnist/main.py --epochs 1") resources: limits: 
nvidia.com/gpu: 1 ================================================ FILE: test/cases/nvidia/manifests/nvidia-driver-capabilities-check.yaml ================================================ apiVersion: v1 kind: Pod metadata: name: moderngl-pod spec: restartPolicy: Never tolerations: - key: "nvidia.com/gpu" operator: "Exists" effect: "NoSchedule" containers: - name: moderngl-container env: - name: NVIDIA_DRIVER_CAPABILITIES value: "all" image: public.ecr.aws/ubuntu/ubuntu:22.04 command: ["/bin/bash"] args: - -c - | set -e apt-get update apt-get install -y \ python3 \ python3-pip \ libgl1-mesa-glx \ libegl1-mesa-dev \ libgles2-mesa-dev \ mesa-utils \ xvfb pip3 install moderngl sleep 60 cat <<'EOF' > moderngl-script.py import moderngl moderngl.create_standalone_context(backend='egl') EOF python3 moderngl-script.py resources: requests: memory: "50Gi" cpu: "15" "nvidia.com/gpu": "1" limits: memory: "50Gi" "nvidia.com/gpu": "1" ================================================ FILE: test/cases/nvidia/mpi_test.go ================================================ //go:build e2e package nvidia import ( "context" _ "embed" "fmt" "regexp" "strings" "testing" "time" fwext "github.com/aws/aws-k8s-tester/internal/e2e" "github.com/aws/aws-k8s-tester/internal/e2e/mpijobs" "sigs.k8s.io/e2e-framework/klient/wait" "sigs.k8s.io/e2e-framework/klient/wait/conditions" "sigs.k8s.io/e2e-framework/pkg/envconf" "sigs.k8s.io/e2e-framework/pkg/features" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/utils/strings/slices" ) var ( instanceSupportsRdmaRead = []string{"p5.48xlarge", "p4d.24xlarge", "p4de.24xlarge", "p5e.48xlarge", "p5en.48xlarge"} ) var ( //go:embed manifests/mpi-job-pytorch-training-single-node.yaml mpiJobPytorchTrainingSingleNodeManifest []byte //go:embed manifests/mpi-job-nccl-test-multi-node.yaml mpiJobNcclTestMultiNodeManifest []byte ) type ncclTestManifestTplVars struct { WorkerNodeCount int WorkerNodeGpuCount int GpuPerNode int NvidiaTestImage string 
EfaInterfacePerNode int MaxBytes string NcclBuffSize string TestName string JobName string } func TestMPIJobPytorchTraining(t *testing.T) { testenv.Test(t, singleNode(), multiNode("all_reduce_perf"), multiNode("all_gather_perf"), multiNode("alltoall_perf"), ) } func multiNode(testName string) features.Feature { var renderedMpiJobNcclTestMultiNodeManifest []byte jobName := strings.ReplaceAll(fmt.Sprintf("multi-node-%s", testName), "_", "-") return features.New(fmt.Sprintf("multi-node:%s", testName)). WithLabel("suite", "nvidia"). WithLabel("hardware", "gpu"). WithLabel("hardware", "efa"). Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { if testConfig.NvidiaTestImage == "" { t.Fatal(fmt.Errorf("nvidiaTestImage must be set to run unit test, use https://github.com/aws/aws-k8s-tester/blob/main/test/images/nvidia/Dockerfile to build the image and -nvidiaTestImage to set the image url")) } maxBytes := "2G" ncclBuffSize := "4194304" if slices.Contains(instanceSupportsRdmaRead, testConfig.NodeType) { t.Log("Instance supports RDMA") // TODO: revisit this with some kind of per-instance optimizer, or maybe use the defaults for all instance types unless specified if testName == "alltoall_perf" && strings.Contains(testConfig.NodeType, "p4") { // Keep default values for P4 running all-to-all } else { maxBytes = "16G" ncclBuffSize = "8388608" } } var err error renderedMpiJobNcclTestMultiNodeManifest, err = fwext.RenderManifests(mpiJobNcclTestMultiNodeManifest, ncclTestManifestTplVars{ // one of the nodes will be used for the master pod WorkerNodeCount: nodeCount, WorkerNodeGpuCount: nodeCount * gpuPerNode, GpuPerNode: gpuPerNode, NvidiaTestImage: testConfig.NvidiaTestImage, EfaInterfacePerNode: efaPerNode, MaxBytes: maxBytes, NcclBuffSize: ncclBuffSize, TestName: testName, JobName: jobName, }) if err != nil { t.Fatal(err) } t.Log("Applying multi node manifest") err = fwext.ApplyManifests(cfg.Client().RESTConfig(), 
renderedMpiJobNcclTestMultiNodeManifest) if err != nil { t.Fatal(err) } t.Log("Manifest applied successfully") return ctx }). Assess("MPIJob succeeds", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { mpiJob := mpijobs.NewUnstructured(jobName, "default") t.Log("Waiting for multi node job to complete") err := wait.For(conditions.New(cfg.Client().Resources()).ResourceMatch(mpiJob, mpijobs.MPIJobSucceeded), wait.WithContext(ctx), wait.WithTimeout(60*time.Minute), ) if err != nil { t.Error(err) } t.Logf("final mpijob resource: %v", mpiJob) log, err := fwext.GetJobLogs(cfg.Client().RESTConfig(), mpiJob) if err != nil { t.Errorf("failed to get job logs: %v", err) } t.Logf("Test log for %s:", jobName) t.Log(log) if !t.Failed() { t.Log("Multi node job completed") // Verify GPU Direct RDMA is used on P4/P5 if testConfig.EfaEnabled && slices.Contains(instanceSupportsRdmaRead, testConfig.NodeType) { pattern := regexp.MustCompile(`\[send\] via NET/.*Libfabric/.*/GDRDMA`) if !pattern.MatchString(log) { t.Errorf("GPU Direct RDMA is not utilized for inter-node communication in NCCL tests on instances that support GDRDMA: %s", testConfig.NodeType) } } } return ctx }). Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { err := fwext.DeleteManifests(cfg.Client().RESTConfig(), renderedMpiJobNcclTestMultiNodeManifest) if err != nil { t.Fatal(err) } return ctx }). Feature() } func singleNode() features.Feature { var renderedSingleNodeManifest []byte return features.New("single-node"). WithLabel("suite", "nvidia"). WithLabel("hardware", "gpu"). 
Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { t.Log("Applying single node manifest") var err error renderedSingleNodeManifest, err = fwext.RenderManifests(mpiJobPytorchTrainingSingleNodeManifest, struct { PytorchTestImage string }{ PytorchTestImage: testConfig.PytorchImage, }) if err != nil { t.Fatal(err) } err = fwext.ApplyManifests(cfg.Client().RESTConfig(), renderedSingleNodeManifest) if err != nil { t.Fatal(err) } t.Log("Manifest applied successfully") return ctx }). Assess("MPIJob succeeds", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { mpiJob := mpijobs.NewUnstructured("pytorch-training-single-node", "default") ctx = context.WithValue(ctx, "mpiJob", mpiJob) t.Log("Waiting for single node job to complete") err := wait.For(fwext.NewConditionExtension(cfg.Client().Resources()).ResourceMatch(mpiJob, mpijobs.MPIJobSucceeded), wait.WithContext(ctx), wait.WithTimeout(30*time.Minute), ) if err != nil { t.Error(err) } else { t.Log("Single node job completed") } t.Logf("final mpijob resource: %v", mpiJob) return ctx }). Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { job := ctx.Value("mpiJob") if job == nil { // nothing to do return ctx } u, ok := job.(*unstructured.Unstructured) if !ok { t.Errorf("mpiJob in context is not unstructured: %v", job) } log, err := fwext.GetJobLogs(cfg.Client().RESTConfig(), u) if err != nil { t.Errorf("failed to get job logs: %v", err) } t.Log("Test log for pytorch-training-single-node:") t.Log(log) err = fwext.DeleteManifests(cfg.Client().RESTConfig(), renderedSingleNodeManifest) if err != nil { t.Error(err) } return ctx }). 
Feature() } ================================================ FILE: test/cases/nvidia/unit_test.go ================================================ //go:build e2e package nvidia import ( "context" _ "embed" "fmt" "testing" "time" fwext "github.com/aws/aws-k8s-tester/internal/e2e" "sigs.k8s.io/e2e-framework/klient/wait" "sigs.k8s.io/e2e-framework/pkg/envconf" "sigs.k8s.io/e2e-framework/pkg/features" batchv1 "k8s.io/api/batch/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) var ( //go:embed manifests/job-unit-test-single-node.yaml jobUnitTestSingleNodeManifest []byte renderedJobUnitTestSingleNodeManifest []byte //go:embed manifests/job-hpc-benchmarks.yaml jobHpcBenchmarksSingleNodeManifest []byte renderedJobHpcBenchmarksSingleNodeManifest []byte ) type unitTestManifestTplVars struct { NvidiaTestImage string SkipTestSubcommand string GpuPerNode int NodeType string } type hpcTestManifestTplVars struct { GpuPerNode int } func TestSingleNodeUnitTest(t *testing.T) { unitTest := features.New("unit-test"). WithLabel("suite", "nvidia"). WithLabel("hardware", "gpu"). Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { if testConfig.NvidiaTestImage == "" { t.Fatal(fmt.Errorf("nvidiaTestImage must be set to run unit test, use https://github.com/aws/aws-k8s-tester/blob/main/test/images/nvidia/Dockerfile to build the image and -nvidiaTestImage to set the image url")) } var err error renderedJobUnitTestSingleNodeManifest, err = fwext.RenderManifests(jobUnitTestSingleNodeManifest, unitTestManifestTplVars{ NvidiaTestImage: testConfig.NvidiaTestImage, SkipTestSubcommand: testConfig.SkipUnitTestSubcommand, GpuPerNode: gpuPerNode, NodeType: testConfig.NodeType, }) if err != nil { t.Fatal(err) } err = fwext.ApplyManifests(cfg.Client().RESTConfig(), renderedJobUnitTestSingleNodeManifest) if err != nil { t.Fatal(err) } return ctx }). 
Assess("Unit test Job succeeds", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { job := &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{Name: "unit-test-job", Namespace: "default"}, } err := wait.For(fwext.NewConditionExtension(cfg.Client().Resources()).JobSucceeded(job), wait.WithContext(ctx), wait.WithTimeout(60*time.Minute)) if err != nil { t.Fatal(err) } return ctx }). Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { log, err := fwext.GetJobLogs(cfg.Client().RESTConfig(), &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{Name: "unit-test-job", Namespace: "default"}, }) if err != nil { t.Error(err) } t.Log("Test log for unit-test-job:") t.Log(log) err = fwext.DeleteManifests(cfg.Client().RESTConfig(), renderedJobUnitTestSingleNodeManifest) if err != nil { t.Error(err) } return ctx }). Feature() hpcTest := features.New("hpc-benckmarks"). WithLabel("suite", "nvidia"). WithLabel("hardware", "gpu"). Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { var err error renderedJobHpcBenchmarksSingleNodeManifest, err = fwext.RenderManifests(jobHpcBenchmarksSingleNodeManifest, hpcTestManifestTplVars{ GpuPerNode: gpuPerNode, }) if err != nil { t.Fatal(err) } err = fwext.ApplyManifests(cfg.Client().RESTConfig(), renderedJobHpcBenchmarksSingleNodeManifest) if err != nil { t.Fatal(err) } return ctx }). Assess("HPC test Job succeeds", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { job := &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{Name: "hpc-benckmarks-job", Namespace: "default"}, } err := wait.For(fwext.NewConditionExtension(cfg.Client().Resources()).JobSucceeded(job), wait.WithContext(ctx)) if err != nil { t.Fatal(err) } return ctx }). 
Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { log, err := fwext.GetJobLogs(cfg.Client().RESTConfig(), &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{Name: "hpc-benckmarks-job", Namespace: "default"}, }) if err != nil { t.Error(err) } t.Log("Test log for hpc-benckmarks-job:") t.Log(log) err = fwext.DeleteManifests(cfg.Client().RESTConfig(), renderedJobHpcBenchmarksSingleNodeManifest) if err != nil { t.Error(err) } return ctx }). Feature() testenv.Test(t, unitTest, hpcTest) } ================================================ FILE: test/cases/nvidia-dra/main_test.go ================================================ //go:build e2e package nvidia_dra import ( "context" "embed" "flag" "fmt" "log" "os" "os/exec" "os/signal" "path/filepath" "slices" "strings" "sync" "testing" "time" fwext "github.com/aws/aws-k8s-tester/internal/e2e" "github.com/aws/aws-k8s-tester/test/common" "github.com/aws/aws-k8s-tester/test/manifests" "golang.org/x/sync/errgroup" appsv1 "k8s.io/api/apps/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" "sigs.k8s.io/e2e-framework/klient/wait" "sigs.k8s.io/e2e-framework/pkg/env" "sigs.k8s.io/e2e-framework/pkg/envconf" ) //go:embed rcts var rctsFS embed.FS var ( testenv env.Environment clientset kubernetes.Interface nodeType *string rdmaDeviceDraDriverImage *string acceleratorDraDriverImage *string containerTestImage *string nodeCount int ) // supportedRdmaTypes lists the recognized RDMA device types. var supportedRdmaTypes = []string{"efa"} func validateConfig() error { if err := common.ValidateRequiredFlags(map[string]string{ "rdmaDeviceDraDriverImage": *rdmaDeviceDraDriverImage, "containerTestImage": *containerTestImage, "nodeType": *nodeType, }); err != nil { return err } // Validate that nodeType maps to a known topology (and thus a known RDMA type). 
topo, err := GetTopologyForNodeType(*nodeType) if err != nil { return fmt.Errorf("invalid -nodeType: %w", err) } if !slices.Contains(supportedRdmaTypes, topo.RdmaType) { return fmt.Errorf("instance family %q has unsupported RDMA type %q; supported: %v", topo.Family, topo.RdmaType, supportedRdmaTypes) } // Verify helm is available on the PATH. if _, err := exec.LookPath("helm"); err != nil { return fmt.Errorf("helm is required but not found on PATH: %w", err) } // Verify kubectl is available on the PATH. if _, err := exec.LookPath("kubectl"); err != nil { return fmt.Errorf("kubectl is required but not found on PATH: %w", err) } return nil } const ( nvidiaDRAHelmReleaseName = "nvidia-dra-driver-gpu" nvidiaDRAHelmRepoName = "nvidia-dra" nvidiaDRAHelmRepoURL = "https://helm.ngc.nvidia.com/nvidia" nvidiaDRANamespace = "nvidia-dra-driver-gpu" nvidiaDRAHelmChartVer = "25.8.1" ) // labelNodesGPUPresent labels all nodes with nvidia.com/gpu.present=true. func labelNodesGPUPresent(ctx context.Context) error { args := []string{ "label", "nodes", "--all", "nvidia.com/gpu.present=true", "--overwrite", } log.Printf("[INFO] Labeling nodes: kubectl %s", strings.Join(args, " ")) cmd := exec.CommandContext(ctx, "kubectl", args...) cmd.Stdout = os.Stdout cmd.Stderr = os.Stderr if err := cmd.Run(); err != nil { return fmt.Errorf("kubectl label nodes failed: %w", err) } log.Println("All nodes labeled with nvidia.com/gpu.present=true.") return nil } // installNvidiaDRADriverHelm adds the NVIDIA Helm repo and installs the NVIDIA DRA driver. // If acceleratorDraDriverImage is non-empty, it splits on the last ":" to extract // repository and tag and passes them as --set overrides. func installNvidiaDRADriverHelm(ctx context.Context, config *envconf.Config) (context.Context, error) { // Add the Helm repo. 
repoArgs := []string{"repo", "add", nvidiaDRAHelmRepoName, nvidiaDRAHelmRepoURL} log.Printf("[INFO] Adding NVIDIA Helm repo: helm %s", strings.Join(repoArgs, " ")) repoCmd := exec.CommandContext(ctx, "helm", repoArgs...) repoCmd.Stdout = os.Stdout repoCmd.Stderr = os.Stderr if err := repoCmd.Run(); err != nil { return ctx, fmt.Errorf("helm repo add nvidia-dra failed: %w", err) } // Install (or upgrade) the chart. args := []string{ "upgrade", "--install", nvidiaDRAHelmReleaseName, fmt.Sprintf("%s/%s", nvidiaDRAHelmRepoName, nvidiaDRAHelmReleaseName), "--version", nvidiaDRAHelmChartVer, "--create-namespace", "--namespace", nvidiaDRANamespace, "--set", "resources.gpus.enabled=true", "--set", "gpuResourcesEnabledOverride=true", "--timeout", "5m", } if *acceleratorDraDriverImage != "" { repo, tag := common.SplitImageRepoTag(*acceleratorDraDriverImage) args = append(args, "--set", fmt.Sprintf("image.repository=%s", repo), "--set", fmt.Sprintf("image.tag=%s", tag), ) } log.Printf("[INFO] Installing NVIDIA DRA driver via Helm: helm %s", strings.Join(args, " ")) cmd := exec.CommandContext(ctx, "helm", args...) cmd.Stdout = os.Stdout cmd.Stderr = os.Stderr if err := cmd.Run(); err != nil { return ctx, fmt.Errorf("helm install nvidia-dra-driver-gpu failed: %w", err) } log.Println("NVIDIA DRA driver Helm release installed successfully.") return ctx, nil } // uninstallNvidiaDRADriverHelm uninstalls the NVIDIA DRA driver Helm release. func uninstallNvidiaDRADriverHelm(ctx context.Context, config *envconf.Config) (context.Context, error) { args := []string{ "uninstall", nvidiaDRAHelmReleaseName, "--namespace", nvidiaDRANamespace, } log.Printf("[INFO] Uninstalling NVIDIA DRA driver Helm release: helm %s", strings.Join(args, " ")) cmd := exec.CommandContext(ctx, "helm", args...) 
cmd.Stdout = os.Stdout cmd.Stderr = os.Stderr if err := cmd.Run(); err != nil { log.Printf("[WARN] helm uninstall nvidia-dra-driver-gpu failed (may already be removed): %v", err) } return ctx, nil } func waitForNvidiaDRADriverReady(ctx context.Context, config *envconf.Config) (context.Context, error) { ds := appsv1.DaemonSet{ ObjectMeta: metav1.ObjectMeta{Name: "nvidia-dra-driver-gpu-kubelet-plugin", Namespace: nvidiaDRANamespace}, } err := wait.For( fwext.NewConditionExtension(config.Client().Resources()).DaemonSetReady(&ds), wait.WithTimeout(5*time.Minute), wait.WithContext(ctx), ) if err != nil { return ctx, fmt.Errorf("nvidia-dra-driver daemonset is not ready: %w", err) } log.Println("nvidia-dra-driver daemonset is ready.") return ctx, nil } func TestMain(m *testing.M) { nodeType = flag.String("nodeType", "", "instance type for the cluster (e.g. p5.48xlarge)") rdmaDeviceDraDriverImage = flag.String("rdmaDeviceDraDriverImage", "", "container image for the dranet DRA driver") acceleratorDraDriverImage = flag.String("acceleratorDraDriverImage", "", "container image for the NVIDIA DRA driver") containerTestImage = flag.String("containerTestImage", "", "container image for the NCCL test workload") cfg, err := envconf.NewFromFlags() if err != nil { log.Fatalf("failed to initialize test environment: %v", err) } if err := validateConfig(); err != nil { log.Fatalf("invalid configuration: %v", err) } ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt) defer cancel() testenv = env.NewWithConfig(cfg).WithContext(ctx) // Resolve topology to determine RDMA type from nodeType. topo, err := GetTopologyForNodeType(*nodeType) if err != nil { log.Fatalf("failed to resolve topology: %v", err) } manifestsList := [][]byte{ manifests.MpiOperatorManifest, } setUpFunctions := []env.Func{ // Run independent setup steps concurrently. 
func(ctx context.Context, config *envconf.Config) (context.Context, error) { var mu sync.Mutex g, gctx := errgroup.WithContext(ctx) // Deploy MPI operator. g.Go(func() error { return common.DeployMPIOperator(gctx, config) }) // Deploy dranet and RCTs based on topology's RDMA type. if topo.RdmaType == "efa" { rctManifests, err := common.LoadRCTManifests(rctsFS, filepath.Join("rcts", topo.RCTSubDir)) if err != nil { return ctx, fmt.Errorf("failed to load RCT manifests: %w", err) } mu.Lock() manifestsList = append(manifestsList, rctManifests...) mu.Unlock() g.Go(func() error { renderedDranet, err := common.DeployDranet(gctx, config, *rdmaDeviceDraDriverImage) if err != nil { return err } mu.Lock() manifestsList = append(manifestsList, renderedDranet) mu.Unlock() return nil }) g.Go(func() error { return fwext.ApplyManifests(config.Client().RESTConfig(), rctManifests...) }) } // Label all nodes with nvidia.com/gpu.present=true. g.Go(func() error { return labelNodesGPUPresent(gctx) }) // Add NVIDIA Helm repo and install NVIDIA DRA driver. g.Go(func() error { _, err := installNvidiaDRADriverHelm(gctx, config) return err }) if err := g.Wait(); err != nil { return ctx, err } return ctx, nil }, waitForNvidiaDRADriverReady, func(ctx context.Context, config *envconf.Config) (context.Context, error) { var err error clientset, err = kubernetes.NewForConfig(config.Client().RESTConfig()) if err != nil { return ctx, err } nodeCount, err = common.CountNodesByType(ctx, clientset, *nodeType) return ctx, err }, } testenv.Setup(setUpFunctions...) testenv.Finish( func(ctx context.Context, config *envconf.Config) (context.Context, error) { // Uninstall NVIDIA DRA driver Helm release first. ctx, _ = uninstallNvidiaDRADriverHelm(ctx, config) // Delete remaining manifests in reverse order. 
slices.Reverse(manifestsList) if err := fwext.DeleteManifests(config.Client().RESTConfig(), manifestsList...); err != nil { return ctx, fmt.Errorf("failed to delete manifests: %w", err) } return ctx, nil }, ) os.Exit(testenv.Run(m)) } ================================================ FILE: test/cases/nvidia-dra/nvidia_dra_test.go ================================================ //go:build e2e package nvidia_dra import ( "embed" "path/filepath" "testing" "github.com/aws/aws-k8s-tester/test/common" ) //go:embed testcases var embeddedTestCases embed.FS func TestNvidiaDRAMultiNode(t *testing.T) { topo, err := GetTopologyForNodeType(*nodeType) if err != nil { t.Fatalf("resolving topology for %s: %v", *nodeType, err) } rctDir := filepath.Join("rcts", topo.RCTSubDir) rctIndex, err := common.LoadRCTIndex(rctsFS, rctDir) if err != nil { t.Fatalf("loading RCT index from %s: %v", rctDir, err) } tcDir := filepath.Join("testcases", topo.TestCaseSubDir) featureList, err := common.DiscoverAndBuildFeatures( embeddedTestCases, tcDir, rctIndex, "nvidia-dra", "multi-node-nccl-test", nodeCount, func(tc *common.TestCaseSpec, rctIndex map[string]*common.ResourceClaimTemplateSpec) ([]byte, error) { params, err := ComputeNvidiaMPIJobParams(tc, rctIndex, topo, nodeCount, *containerTestImage) if err != nil { return nil, err } return RenderNvidiaMPIJobYAML(*params) }, clientset, ) if err != nil { t.Fatalf("discovering and building features: %v", err) } if len(featureList) == 0 { t.Logf("no test cases found under %s, skipping", tcDir) return } testenv.Test(t, featureList...) 
} ================================================ FILE: test/cases/nvidia-dra/rcts/p5/rct-all-efas.yaml ================================================ apiVersion: resource.k8s.io/v1beta1 kind: ResourceClaimTemplate metadata: namespace: default name: rct-all-efas spec: spec: devices: requests: - name: all-efas deviceClassName: efa.networking.k8s.aws allocationMode: All ================================================ FILE: test/cases/nvidia-dra/rcts/p5/rct-all-gpus.yaml ================================================ apiVersion: resource.k8s.io/v1beta1 kind: ResourceClaimTemplate metadata: namespace: default name: rct-all-gpus spec: spec: devices: requests: - name: all-gpus deviceClassName: gpu.nvidia.com allocationMode: All ================================================ FILE: test/cases/nvidia-dra/rcts/p5/rct-five-efas-one-gpu.yaml ================================================ apiVersion: resource.k8s.io/v1beta1 kind: ResourceClaimTemplate metadata: namespace: default name: rct-five-efas-one-gpu spec: spec: devices: requests: - name: five-efas deviceClassName: efa.networking.k8s.aws allocationMode: ExactCount count: 5 - name: one-gpu deviceClassName: gpu.nvidia.com allocationMode: ExactCount count: 1 constraints: - requests: ["five-efas", "one-gpu"] matchAttribute: "resource.kubernetes.io/pcieRoot" ================================================ FILE: test/cases/nvidia-dra/templates/nccl-test-mpijob.yaml.tmpl ================================================ apiVersion: kubeflow.org/v2beta1 kind: MPIJob metadata: name: multi-node-nccl-test spec: slotsPerWorker: {{.SlotsPerWorker}} runPolicy: backoffLimit: 20 cleanPodPolicy: Running mpiReplicaSpecs: Launcher: replicas: 1 template: spec: restartPolicy: OnFailure containers: - name: nccl-test-launcher image: {{.ContainerTestImage}} imagePullPolicy: IfNotPresent env: - name: PATH value: $PATH:/opt/amazon/efa/bin:/usr/bin command: - /opt/amazon/openmpi/bin/mpirun - --allow-run-as-root - --tag-output - -np - 
"{{.TotalProcesses}}" - -N - "{{.SlotsPerWorker}}" - --bind-to - none - -x - PATH - -x - LD_LIBRARY_PATH - -x - NCCL_DEBUG=INFO - -x - NCCL_BUFFSIZE=8388608 - -x - NCCL_P2P_NET_CHUNKSIZE=524288 - -x - NCCL_TUNER_PLUGIN=/opt/amazon/ofi-nccl/lib/x86_64-linux-gnu/libnccl-ofi-tuner.so - --mca - pml - ^cm,ucx - --mca - btl - tcp,self - --mca - btl_tcp_if_exclude - lo,docker0,veth_def_agent - /opt/nccl-tests/build/all_reduce_perf - -b - "8" - -e - "16G" - -f - "2" - -g - "1" - -c - "1" - -n - "100" Worker: replicas: {{.WorkerReplicas}} template: spec: containers: - name: nccl-tests-worker image: {{.ContainerTestImage}} imagePullPolicy: IfNotPresent volumeMounts: - name: shmem mountPath: /dev/shm resources: claims: {{- range .ResourceClaims}} - name: {{.Name}} {{- end}} resourceClaims: {{- range .ResourceClaims}} - name: {{.Name}} resourceClaimTemplateName: {{.TemplateName}} {{- end}} volumes: - name: shmem hostPath: path: /dev/shm ================================================ FILE: test/cases/nvidia-dra/testcases/p5/all-efas-all-gpus.yaml ================================================ resourceClaims: - name: all-efas resourceClaimTemplateName: rct-all-efas - name: all-gpus resourceClaimTemplateName: rct-all-gpus ================================================ FILE: test/cases/nvidia-dra/testcases/p5/five-efas-one-gpu-negative-test.yaml ================================================ expectFailure: true resourceClaims: - name: five-efas-one-gpu resourceClaimTemplateName: rct-five-efas-one-gpu ================================================ FILE: test/cases/nvidia-dra/topology.go ================================================ package nvidia_dra import ( "bytes" _ "embed" "fmt" "log" "strings" "text/template" "github.com/aws/aws-k8s-tester/test/common" ) //go:embed templates/nccl-test-mpijob.yaml.tmpl var mpijobTemplate string // --------------------------------------------------------------------------- // Instance topology // 
--------------------------------------------------------------------------- // NvidiaInstanceTopology describes the GPU/EFA hardware topology for an NVIDIA instance family. type NvidiaInstanceTopology struct { Family string GPUsPerNode int // total GPUs per node (e.g. 8 for p5.48xlarge) AllGPUCount int // same as GPUsPerNode for "All" allocation mode RdmaType string // RDMA device type (e.g. "efa") RCTSubDir string // subdirectory under rcts/ TestCaseSubDir string // subdirectory under testcases/ } var instanceTopologies = map[string]NvidiaInstanceTopology{ "p5": { Family: "p5", GPUsPerNode: 8, AllGPUCount: 8, RdmaType: "efa", RCTSubDir: "p5", TestCaseSubDir: "p5", }, } // GetTopologyForNodeType returns the NvidiaInstanceTopology for a given node type // (e.g. "p5.48xlarge"). It extracts the family prefix before the first "." // and looks it up in the registry. func GetTopologyForNodeType(nodeType string) (*NvidiaInstanceTopology, error) { family := common.ExtractFamily(nodeType) topo, ok := instanceTopologies[family] if !ok { return nil, fmt.Errorf("unsupported instance family %q (from %q); supported: %s", family, nodeType, supportedFamilies()) } return &topo, nil } func supportedFamilies() string { families := make([]string, 0, len(instanceTopologies)) for k := range instanceTopologies { families = append(families, k) } return strings.Join(families, ", ") } // --------------------------------------------------------------------------- // MPIJob rendering // --------------------------------------------------------------------------- // NvidiaMPIJobParams holds all template parameters for rendering the NCCL MPIJob YAML. type NvidiaMPIJobParams struct { SlotsPerWorker int TotalProcesses int WorkerReplicas int ContainerTestImage string ResourceClaims []common.ResourceClaimRef } // RenderNvidiaMPIJobYAML renders the embedded NCCL MPIJob Go template with the given params // and returns the resulting YAML bytes. 
func RenderNvidiaMPIJobYAML(params NvidiaMPIJobParams) ([]byte, error) { tmpl, err := template.New("mpijob").Parse(mpijobTemplate) if err != nil { return nil, fmt.Errorf("parsing MPIJob template: %w", err) } var buf bytes.Buffer if err := tmpl.Execute(&buf, params); err != nil { return nil, fmt.Errorf("rendering MPIJob template: %w", err) } return buf.Bytes(), nil } // --------------------------------------------------------------------------- // NVIDIA-specific helpers // --------------------------------------------------------------------------- // getGPUCount returns the GPU device count from an RCT. // For AllocationMode "All" it returns the topology's AllGPUCount; // otherwise it returns the explicit Count from the gpu.nvidia.com request. func getGPUCount(rct *common.ResourceClaimTemplateSpec, topo *NvidiaInstanceTopology) int { for _, req := range rct.Spec.Spec.Devices.Requests { if req.DeviceClassName != "gpu.nvidia.com" { continue } if req.AllocationMode == "All" { return topo.AllGPUCount } if req.Count <= 0 { log.Printf("[WARN] gpu.nvidia.com request has non-positive count: %d", req.Count) } return req.Count } log.Printf("[WARN] no gpu.nvidia.com device request found in RCT, returning GPU count 0") return 0 } // ComputeNvidiaMPIJobParams computes MPIJob parameters from a test case spec. // It resolves each claim's resourceClaimTemplateName against the RCT index to // get the GPU count, then calculates SlotsPerWorker and TotalProcesses. 
func ComputeNvidiaMPIJobParams(tc *common.TestCaseSpec, rctIndex map[string]*common.ResourceClaimTemplateSpec, topo *NvidiaInstanceTopology, workerReplicas int, containerTestImage string) (*NvidiaMPIJobParams, error) { if topo == nil { return nil, fmt.Errorf("instance topology is required") } if workerReplicas <= 0 { return nil, fmt.Errorf("workerReplicas must be positive, got %d", workerReplicas) } if containerTestImage == "" { return nil, fmt.Errorf("containerTestImage is required") } totalGPUs := 0 var claims []common.ResourceClaimRef for _, tcClaim := range tc.ResourceClaims { rct, ok := rctIndex[tcClaim.ResourceClaimTemplateName] if !ok { return nil, fmt.Errorf("resource claim template %q not found in RCT index", tcClaim.ResourceClaimTemplateName) } totalGPUs += getGPUCount(rct, topo) claims = append(claims, common.ResourceClaimRef{ Name: tcClaim.Name, TemplateName: tcClaim.ResourceClaimTemplateName, }) } slotsPerWorker := totalGPUs totalProcesses := slotsPerWorker * workerReplicas return &NvidiaMPIJobParams{ SlotsPerWorker: slotsPerWorker, TotalProcesses: totalProcesses, WorkerReplicas: workerReplicas, ContainerTestImage: containerTestImage, ResourceClaims: claims, }, nil } ================================================ FILE: test/cases/nvidia-inference/bert_inference_test.go ================================================ //go:build e2e package inference import ( "context" _ "embed" "fmt" "io" "log" "testing" "time" fwext "github.com/aws/aws-k8s-tester/internal/e2e" batchv1 "k8s.io/api/batch/v1" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" "sigs.k8s.io/e2e-framework/klient/wait" "sigs.k8s.io/e2e-framework/pkg/envconf" "sigs.k8s.io/e2e-framework/pkg/features" ) //go:embed manifests/bert-inference.yaml var bertInferenceManifest []byte var renderedBertInferenceManifest []byte type bertInferenceManifestTplVars struct { BertInferenceImage string InferenceMode string GPUPerNode 
string } func TestBertInference(t *testing.T) { feature := features.New("bert-inference"). WithLabel("suite", "nvidia"). WithLabel("hardware", "gpu"). Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { if testConfig.BertInferenceImage == "" { t.Fatalf("[ERROR] bertInferenceImage must be set") } log.Println("[INFO] Rendering BERT inference manifest...") var err error renderedBertInferenceManifest, err = fwext.RenderManifests( bertInferenceManifest, bertInferenceManifestTplVars{ BertInferenceImage: testConfig.BertInferenceImage, InferenceMode: testConfig.InferenceMode, GPUPerNode: fmt.Sprintf("%d", testConfig.GpuRequested), }, ) if err != nil { t.Fatalf("[ERROR] Failed to render BERT inference manifest: %v", err) } log.Println("[INFO] Applying BERT inference manifest...") if applyErr := fwext.ApplyManifests(cfg.Client().RESTConfig(), renderedBertInferenceManifest); applyErr != nil { t.Fatalf("[ERROR] Failed to apply BERT inference manifest: %v", applyErr) } log.Println("[INFO] BERT inference manifest applied successfully.") // Record time after applying the manifest ctx = context.WithValue(ctx, "applyTime", time.Now()) return ctx }). Assess("BERT inference Job succeeds", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { log.Println("[INFO] Checking BERT inference job completion...") job := &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{Name: "bert-inference", Namespace: "default"}, } if err := wait.For( fwext.NewConditionExtension(cfg.Client().Resources()).JobSucceeded(job), wait.WithTimeout(20*time.Minute), ); err != nil { log.Println("[ERROR] BERT inference job failed. Gathering logs...") if err := printJobLogs(ctx, cfg, "default", "bert-inference"); err != nil { t.Logf("[WARNING] Failed to retrieve bert-inference job logs: %v", err) } t.Fatalf("[ERROR] BERT inference job did not succeed: %v", err) } log.Println("[INFO] BERT inference job succeeded. 
Gathering logs...") // Compute duration from manifest apply to job success startVal := ctx.Value("applyTime") if startVal != nil { if applyTime, ok := startVal.(time.Time); ok { duration := time.Since(applyTime) log.Printf("[INFO] BERT inference job completed in %s", duration) } } // Print logs (including node name) for the Pod if err := printJobLogs(ctx, cfg, "default", "bert-inference"); err != nil { t.Logf("[WARNING] Failed to retrieve BERT inference job logs: %v", err) } return ctx }). Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { log.Println("[INFO] Cleaning up BERT inference job resources...") if err := fwext.DeleteManifests(cfg.Client().RESTConfig(), renderedBertInferenceManifest); err != nil { t.Fatalf("[ERROR] Failed to delete BERT inference manifest: %v", err) } log.Println("[INFO] BERT inference job resources cleaned up.") return ctx }). Feature() testenv.Test(t, feature) } func printJobLogs(ctx context.Context, cfg *envconf.Config, namespace, jobName string) error { cs, err := getClientset(cfg.Client().RESTConfig()) if err != nil { return fmt.Errorf("[ERROR] Failed to create kubernetes clientset: %w", err) } pods, err := cs.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ LabelSelector: fmt.Sprintf("job-name=%s", jobName), }) if err != nil { return fmt.Errorf("[ERROR] Failed to list pods for job %s: %w", jobName, err) } if len(pods.Items) == 0 { return fmt.Errorf("[ERROR] No pods found for job %s", jobName) } for _, pod := range pods.Items { log.Printf("[INFO] Pod %s is running on node %s", pod.Name, pod.Spec.NodeName) log.Printf("[INFO] Retrieving logs from pod %s...", pod.Name) stream, err := cs.CoreV1().Pods(pod.Namespace).GetLogs(pod.Name, &v1.PodLogOptions{}).Stream(ctx) if err != nil { return fmt.Errorf("[ERROR] Failed to get logs from pod %s: %w", pod.Name, err) } defer stream.Close() buf := make([]byte, 4096) for { n, readErr := stream.Read(buf) if n > 0 { log.Printf("[INFO] Logs from Pod %s:\n%s", 
pod.Name, string(buf[:n])) } if readErr == io.EOF { log.Printf("[INFO] Completed log stream for pod %s.", pod.Name) break } if readErr != nil { return fmt.Errorf("[ERROR] Failed to read logs from pod %s: %w", pod.Name, readErr) } } } return nil } func getClientset(restConfig *rest.Config) (*kubernetes.Clientset, error) { cs, err := kubernetes.NewForConfig(restConfig) if err != nil { return nil, fmt.Errorf("[ERROR] Cannot create kubernetes clientset: %w", err) } return cs, nil } ================================================ FILE: test/cases/nvidia-inference/main_test.go ================================================ //go:build e2e package inference import ( "context" _ "embed" "fmt" "log" "os" "os/signal" "slices" "testing" "time" fwext "github.com/aws/aws-k8s-tester/internal/e2e" "github.com/aws/aws-k8s-tester/test/common" "github.com/aws/aws-k8s-tester/test/manifests" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" "sigs.k8s.io/e2e-framework/klient/wait" "sigs.k8s.io/e2e-framework/pkg/env" "sigs.k8s.io/e2e-framework/pkg/envconf" ) type TestConfig struct { common.MetricOps BertInferenceImage string `flag:"bertInferenceImage" desc:"BERT inference container image"` InferenceMode string `flag:"inferenceMode" desc:"Inference mode for BERT (throughput or latency)"` GpuRequested int `flag:"gpuRequested" desc:"Number of GPUs required for inference"` } var ( testenv env.Environment testConfig TestConfig ) func TestMain(m *testing.M) { // Initialize testConfig with default values testConfig = TestConfig{ InferenceMode: "throughput", GpuRequested: 1, } _, err := common.ParseFlags(&testConfig) if err != nil { log.Fatalf("[ERROR] Failed to parse flags: %v", err) } cfg, err := envconf.NewFromFlags() if err != nil { log.Fatalf("[ERROR] Failed to initialize test environment: %v", err) } ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt) defer cancel() testenv = env.NewWithConfig(cfg).WithContext(ctx) manifestsList := [][]byte{ 
manifests.NvidiaDevicePluginManifest, } if len(testConfig.MetricDimensions) > 0 { // Render CloudWatch Agent manifest with dynamic dimensions renderedCloudWatchAgentManifest, err := manifests.RenderCloudWatchAgentManifest(testConfig.MetricDimensions) if err != nil { log.Printf("Warning: Failed to render CloudWatch Agent manifest: %v", err) } manifestsList = append(manifestsList, manifests.DCGMExporterManifest, renderedCloudWatchAgentManifest) } testenv.Setup( func(ctx context.Context, config *envconf.Config) (context.Context, error) { log.Println("[INFO] Applying manifests.") err := fwext.ApplyManifests(config.Client().RESTConfig(), manifestsList...) if err != nil { return ctx, fmt.Errorf("[ERROR] Failed to apply manifests: %w", err) } log.Println("[INFO] Successfully applied manifests.") return ctx, nil }, common.DeployDaemonSet("nvidia-device-plugin-daemonset", "kube-system"), func(ctx context.Context, config *envconf.Config) (context.Context, error) { if len(testConfig.MetricDimensions) > 0 { if ctx, err := common.DeployDaemonSet("dcgm-exporter", "kube-system")(ctx, config); err != nil { return ctx, err } if ctx, err := common.DeployDaemonSet("cwagent", "amazon-cloudwatch")(ctx, config); err != nil { return ctx, err } } return ctx, nil }, checkGpuCapacity, ) testenv.Finish( func(ctx context.Context, config *envconf.Config) (context.Context, error) { log.Println("[INFO] Deleting manifests.") slices.Reverse(manifestsList) err := fwext.DeleteManifests(config.Client().RESTConfig(), manifestsList...) if err != nil { return ctx, fmt.Errorf("[ERROR] failed to delete manifests: %w", err) } log.Println("[INFO] Successfully deleted manifests.") return ctx, nil }, ) exitCode := testenv.Run(m) log.Printf("[INFO] Tests finished with exit code %d", exitCode) os.Exit(exitCode) } // checkGpuCapacity ensures at least one node has >= the requested number of GPUs, // and logs each node's instance type. 
func checkGpuCapacity(ctx context.Context, config *envconf.Config) (context.Context, error) { log.Printf("[INFO] Validating cluster has at least %d GPU(s).", testConfig.GpuRequested) cs, err := kubernetes.NewForConfig(config.Client().RESTConfig()) if err != nil { return ctx, fmt.Errorf("failed to create kubernetes client: %w", err) } err = wait.For(func(ctx context.Context) (bool, error) { nodes, err := cs.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) if err != nil { return false, fmt.Errorf("failed to list nodes: %w", err) } else if len(nodes.Items) == 0 { return false, fmt.Errorf("no nodes found in the cluster") } for _, node := range nodes.Items { instanceType := node.Labels["node.kubernetes.io/instance-type"] gpuCap, ok := node.Status.Capacity["nvidia.com/gpu"] if ok && int(gpuCap.Value()) >= testConfig.GpuRequested { log.Printf("[INFO] Node %s (type: %s) meets the request of %d GPU(s).", node.Name, instanceType, testConfig.GpuRequested) return true, nil } log.Printf("[INFO] Node %s (type: %s) has no GPU capacity.", node.Name, instanceType) } log.Printf("[INFO] No node meets the GPU requirement. The GPU info might not be propagated yet. Retrying...") return false, nil }, wait.WithTimeout(5*time.Minute), wait.WithInterval(10*time.Second)) if err != nil { return ctx, fmt.Errorf("no node has >= %d GPU(s)", testConfig.GpuRequested) } log.Println("[INFO] GPU capacity check passed.") return ctx, nil } ================================================ FILE: test/cases/nvidia-inference/manifests/bert-inference.yaml ================================================ # Single-node BERT inference job with GPU. 
Memory-backed volume for /dev/shm apiVersion: batch/v1 kind: Job metadata: name: bert-inference spec: backoffLimit: 4 template: spec: restartPolicy: OnFailure volumes: - name: dshm emptyDir: medium: Memory containers: - name: bert-inference image: {{.BertInferenceImage}} imagePullPolicy: Always command: ["python", "infer.py"] env: - name: INFERENCE_MODE value: "{{.InferenceMode}}" volumeMounts: - mountPath: /dev/shm name: dshm resources: requests: nvidia.com/gpu: {{.GPUPerNode}} limits: nvidia.com/gpu: {{.GPUPerNode}} ================================================ FILE: test/cases/nvidia-training/bert_training_test.go ================================================ //go:build e2e package training import ( "context" _ "embed" "fmt" "testing" "time" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" fwext "github.com/aws/aws-k8s-tester/internal/e2e" batchv1 "k8s.io/api/batch/v1" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/e2e-framework/klient/wait" "sigs.k8s.io/e2e-framework/pkg/envconf" "sigs.k8s.io/e2e-framework/pkg/features" ) // Use the parameterized manifest var ( //go:embed manifests/bert-training.yaml bertTrainingManifest []byte ) func TestBertTraining(t *testing.T) { if testConfig.BertTrainingImage == "" { t.Fatal(fmt.Errorf("bertTrainingImage must be set to run the test")) } slotsPerWorker := gpuPerNode workerReplicas := nodeCount np := slotsPerWorker * workerReplicas efaRequested := 0 if testConfig.EfaEnabled && efaPerNode > 0 { efaRequested = 1 } bertTraining := features.New("bert-training"). WithLabel("suite", "nvidia"). WithLabel("hardware", "gpu"). 
Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { renderVars := map[string]string{ "BertTrainingImage": testConfig.BertTrainingImage, "SlotsPerWorker": fmt.Sprintf("%d", slotsPerWorker), "NP": fmt.Sprintf("%d", np), "WorkerReplicas": fmt.Sprintf("%d", workerReplicas), "GPUPerNode": fmt.Sprintf("%d", gpuPerNode), "EFARequested": fmt.Sprintf("%d", efaRequested), } renderedManifest, err := fwext.RenderManifests(bertTrainingManifest, renderVars) if err != nil { t.Fatal(err) } err = fwext.ApplyManifests(cfg.Client().RESTConfig(), renderedManifest) if err != nil { t.Fatal(err) } return ctx }). Assess("BERT training Job succeeds", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { job := &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{Name: "bert-training-launcher", Namespace: "default"}, } if err := wait.For(fwext.NewConditionExtension(cfg.Client().Resources()).JobSucceeded(job), wait.WithTimeout(time.Minute*20), wait.WithContext(ctx), ); err != nil { t.Logf("[ERROR] BERT training job failed. Gathering logs...") if err = printJobLogs(ctx, cfg, "default", "bert-training-launcher"); err != nil { t.Logf("Warning: failed to retrieve bert-training job logs: %v", err) } t.Fatalf("[ERROR] BERT training job did not succeed: %v", err) } t.Logf("[INFO] BERT training job succeeded. Gathering logs...") err := printJobLogs(ctx, cfg, "default", "bert-training-launcher") if err != nil { t.Logf("Warning: failed to retrieve bert-training job logs: %v", err) } return ctx }). 
Feature() testenv.Test(t, bertTraining) } func printJobLogs(ctx context.Context, cfg *envconf.Config, namespace, jobName string) error { clientset, err := getClientset(cfg.Client().RESTConfig()) if err != nil { return fmt.Errorf("failed to create kubernetes clientset: %w", err) } podList, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ LabelSelector: fmt.Sprintf("job-name=%s", jobName), }) if err != nil { return fmt.Errorf("failed to list pods for job %s: %w", jobName, err) } if len(podList.Items) == 0 { return fmt.Errorf("no pods found for job %s", jobName) } for _, pod := range podList.Items { req := clientset.CoreV1().Pods(pod.Namespace).GetLogs(pod.Name, &v1.PodLogOptions{}) logStream, err := req.Stream(ctx) if err != nil { return fmt.Errorf("failed to get logs from pod %s: %w", pod.Name, err) } defer logStream.Close() buf := make([]byte, 4096) for { n, err := logStream.Read(buf) if n > 0 { fmt.Printf("Logs from Pod %s: \n%s\n", pod.Name, string(buf[:n])) } if err != nil { break } } } return nil } func getClientset(restConfig *rest.Config) (*kubernetes.Clientset, error) { clientset, err := kubernetes.NewForConfig(restConfig) if err != nil { return nil, fmt.Errorf("failed to create kubernetes clientset: %w", err) } return clientset, nil } ================================================ FILE: test/cases/nvidia-training/main_test.go ================================================ //go:build e2e package training import ( "context" _ "embed" "fmt" "github.com/aws/aws-k8s-tester/test/common" "log" "os" "os/signal" "slices" "testing" "time" fwext "github.com/aws/aws-k8s-tester/internal/e2e" "github.com/aws/aws-k8s-tester/test/manifests" appsv1 "k8s.io/api/apps/v1" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" "sigs.k8s.io/e2e-framework/klient/wait" "sigs.k8s.io/e2e-framework/klient/wait/conditions" "sigs.k8s.io/e2e-framework/pkg/env" "sigs.k8s.io/e2e-framework/pkg/envconf" ) func 
TestMain(m *testing.M) { _, err := common.ParseFlags(&testConfig) if err != nil { log.Fatalf("failed to parse flags: %v", err) } cfg, err := envconf.NewFromFlags() if err != nil { log.Fatalf("failed to initialize test environment: %v", err) } ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt) defer cancel() testenv = env.NewWithConfig(cfg).WithContext(ctx) manifestsList := [][]byte{ manifests.NvidiaDevicePluginManifest, manifests.MpiOperatorManifest, manifests.EfaDevicePluginManifest, } if len(testConfig.MetricDimensions) > 0 { // Render CloudWatch Agent manifest with dynamic dimensions renderedCloudWatchAgentManifest, err := manifests.RenderCloudWatchAgentManifest(testConfig.MetricDimensions) if err != nil { log.Printf("Warning: failed to render CloudWatch Agent manifest: %v", err) } manifestsList = append(manifestsList, manifests.DCGMExporterManifest, renderedCloudWatchAgentManifest) } testenv.Setup( // Apply all manifests func(ctx context.Context, config *envconf.Config) (context.Context, error) { log.Println("Applying manifests.") err := fwext.ApplyManifests(config.Client().RESTConfig(), manifestsList...) 
if err != nil { return ctx, fmt.Errorf("failed to apply manifests: %w", err) } log.Println("Successfully applied manifests.") return ctx, nil }, // Wait for MPI Operator deployment func(ctx context.Context, config *envconf.Config) (context.Context, error) { log.Println("Waiting for MPI Operator deployment to be available.") deployment := appsv1.Deployment{ ObjectMeta: metav1.ObjectMeta{Name: "mpi-operator", Namespace: "mpi-operator"}, } err := wait.For( conditions.New(config.Client().Resources()).DeploymentConditionMatch( &deployment, appsv1.DeploymentAvailable, v1.ConditionTrue, ), wait.WithTimeout(time.Minute*5), ) if err != nil { return ctx, fmt.Errorf("MPI Operator deployment is not available: %w", err) } log.Println("MPI Operator deployment is available.") return ctx, nil }, // Wait for required DaemonSets common.DeployDaemonSet("nvidia-device-plugin-daemonset", "kube-system"), common.DeployDaemonSet("aws-efa-k8s-device-plugin-daemonset", "kube-system"), func(ctx context.Context, config *envconf.Config) (context.Context, error) { if len(testConfig.MetricDimensions) > 0 { if ctx, err := common.DeployDaemonSet("dcgm-exporter", "kube-system")(ctx, config); err != nil { return ctx, err } if ctx, err := common.DeployDaemonSet("cwagent", "amazon-cloudwatch")(ctx, config); err != nil { return ctx, err } } return ctx, nil }, // Deploy CloudWatch Agent + DCGM only if MetricDimensions are set checkNodeTypes, // Dynamically check node types and capacities after device plugins are ready ) testenv.Finish( func(ctx context.Context, config *envconf.Config) (context.Context, error) { log.Println("Deleting NVIDIA device plugin, MPI operator, EFA device plugin DCGM Exporter and CloudWatch Agent manifests.") slices.Reverse(manifestsList) err := fwext.DeleteManifests(config.Client().RESTConfig(), manifestsList...) 
if err != nil { return ctx, fmt.Errorf("failed to delete manifests: %w", err) } log.Println("Successfully deleted NVIDIA device plugin, MPI operator, EFA device plugin, DCGM Exporter and CloudWatch Agent manifests.") return ctx, nil }, ) log.Println("Starting tests...") exitCode := testenv.Run(m) log.Printf("Tests finished with exit code %d", exitCode) os.Exit(exitCode) } func checkNodeTypes(ctx context.Context, config *envconf.Config) (context.Context, error) { clientset, err := kubernetes.NewForConfig(config.Client().RESTConfig()) if err != nil { return ctx, fmt.Errorf("failed to create Kubernetes client: %w", err) } nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) if err != nil { return ctx, fmt.Errorf("failed to list nodes: %w", err) } if len(nodes.Items) == 0 { return ctx, fmt.Errorf("no nodes found in the cluster") } for i := 1; i < len(nodes.Items); i++ { if nodes.Items[i].Labels["node.kubernetes.io/instance-type"] != nodes.Items[i-1].Labels["node.kubernetes.io/instance-type"] { return ctx, fmt.Errorf("node types are not the same, all node types must be the same in the cluster") } } if testConfig.NodeType != "" { count := 0 for _, v := range nodes.Items { if v.Labels["node.kubernetes.io/instance-type"] == testConfig.NodeType { count++ if gpuCap, ok := v.Status.Capacity["nvidia.com/gpu"]; ok { gpuPerNode = int(gpuCap.Value()) } if efaCap, ok := v.Status.Capacity["vpc.amazonaws.com/efa"]; ok { efaPerNode = int(efaCap.Value()) } } } if count == 0 { return ctx, fmt.Errorf("no nodes match the specified nodeType: %s", testConfig.NodeType) } nodeCount = count } else { testConfig.NodeType = nodes.Items[0].Labels["node.kubernetes.io/instance-type"] nodeCount = len(nodes.Items) if gpuCap, ok := nodes.Items[0].Status.Capacity["nvidia.com/gpu"]; ok { gpuPerNode = int(gpuCap.Value()) } if efaCap, ok := nodes.Items[0].Status.Capacity["vpc.amazonaws.com/efa"]; ok { efaPerNode = int(efaCap.Value()) } } log.Printf("[INFO] Node Type: %s", 
testConfig.NodeType) log.Printf("[INFO] Node Count: %d", nodeCount) log.Printf("[INFO] GPU Per Node: %d", gpuPerNode) log.Printf("[INFO] EFA Per Node: %d", efaPerNode) return ctx, nil } ================================================ FILE: test/cases/nvidia-training/manifests/bert-training.yaml ================================================ apiVersion: kubeflow.org/v2beta1 kind: MPIJob metadata: name: bert-training spec: slotsPerWorker: {{.SlotsPerWorker}} runPolicy: backoffLimit: 20 cleanPodPolicy: Running mpiReplicaSpecs: Launcher: replicas: 1 template: spec: restartPolicy: OnFailure containers: - image: {{.BertTrainingImage}} imagePullPolicy: Always name: bert-training env: - name: NCCL_DEBUG value: "TRACE" - name: MASTER_ADDR value: "bert-training" - name: MASTER_PORT value: "12355" command: - /opt/amazon/openmpi/bin/mpirun - --allow-run-as-root - --tag-output - -np - "{{.NP}}" # Number of processes derived from node/gpu calculations - -bind-to - none - -map-by - slot - -x - PATH - -x - LD_LIBRARY_PATH - -x - NCCL_DEBUG - -x - MASTER_ADDR - -x - MASTER_PORT - --mca - pml - "^cm" - --mca - routed - direct - --oversubscribe - --mca - orte_base_help_aggregate - "0" - python - train.py Worker: replicas: {{.WorkerReplicas}} template: spec: volumes: - name: dshm emptyDir: medium: Memory containers: - image: {{.BertTrainingImage}} imagePullPolicy: Always name: bert-training-worker volumeMounts: - mountPath: /dev/shm name: dshm resources: requests: nvidia.com/gpu: {{.GPUPerNode}} vpc.amazonaws.com/efa: {{.EFARequested}} limits: nvidia.com/gpu: {{.GPUPerNode}} vpc.amazonaws.com/efa: {{.EFARequested}} ================================================ FILE: test/cases/nvidia-training/vars.go ================================================ //go:build e2e package training import ( "github.com/aws/aws-k8s-tester/test/common" "sigs.k8s.io/e2e-framework/pkg/env" ) type Config struct { common.MetricOps BertTrainingImage string `flag:"bertTrainingImage" desc:"Docker image 
used for BERT training workload"`
	EfaEnabled        bool   `flag:"efaEnabled" desc:"Enable Elastic Fabric Adapter (EFA)"`
	NodeType          string `flag:"nodeType" desc:"Instance type for cluster nodes"`
}

// Shared global variables
var (
	// testenv is the e2e-framework environment created in TestMain.
	testenv env.Environment
	// testConfig holds the parsed command-line options of the suite.
	testConfig Config
	// nodeCount, gpuPerNode and efaPerNode are filled in by checkNodeTypes
	// during suite setup and read by the tests afterwards.
	nodeCount  int
	gpuPerNode int
	efaPerNode int
)

================================================
FILE: test/cases/quick/io_uring_test.go
================================================
//go:build e2e

package quick

import (
	"context"
	"log"
	"testing"
	"time"

	"github.com/aws/aws-k8s-tester/internal/e2e"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"sigs.k8s.io/e2e-framework/klient/k8s"
	"sigs.k8s.io/e2e-framework/klient/wait"
	"sigs.k8s.io/e2e-framework/pkg/envconf"
	"sigs.k8s.io/e2e-framework/pkg/features"
)

// TestNpmInstallWithCPULimits runs `npm install webpack` inside an Ubuntu pod
// and fails if the pod does not reach the Succeeded phase within 10 minutes,
// which the test treats as a possible io_uring hang.
// NOTE(review): despite the name, the pod spec below sets no CPU limits; confirm
// whether limits were meant to be applied.
func TestNpmInstallWithCPULimits(t *testing.T) {
	feat := features.New("npm-install").
		WithLabel("suite", "quick").
		Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
			log.Println("[Setup] Verifying cluster nodes...")
			var nodeList corev1.NodeList
			if err := cfg.Client().Resources().List(ctx, &nodeList); err != nil {
				t.Fatalf("Failed to list nodes: %v", err)
			}

			// Log node information
			for _, node := range nodeList.Items {
				arch := node.Labels["kubernetes.io/arch"]
				kernelVersion := node.Status.NodeInfo.KernelVersion
				t.Logf("Node: %s, Architecture: %s, Kernel: %s", node.Name, arch, kernelVersion)
			}
			return ctx
		}).
		Assess("Pod can successfully run npm install", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
			podName := "npm-install-test"
			podNS := "default"

			// Single-container pod that installs npm/nodejs via apt and then
			// installs webpack; RestartPolicyNever keeps a failure terminal.
			pod := &corev1.Pod{
				ObjectMeta: metav1.ObjectMeta{
					Name:      podName,
					Namespace: podNS,
					Labels: map[string]string{
						"app": "npm-install-test",
					},
				},
				Spec: corev1.PodSpec{
					Containers: []corev1.Container{
						{
							Name:    "test-container",
							Image:   "public.ecr.aws/ubuntu/ubuntu:noble",
							Command: []string{"/bin/sh", "-c"},
							Args: []string{`
								set -x
								echo "[Test] Starting npm installation test..."
								mkdir asd && cd asd && apt-get update && apt-get install -y npm nodejs &&
								echo "[Test] Starting npm install webpack..."
								npm install webpack --loglevel verbose || exit 1
								echo "[Test] npm install completed successfully"
							`},
						},
					},
					RestartPolicy: corev1.RestartPolicyNever,
				},
			}

			if err := cfg.Client().Resources().Create(ctx, pod); err != nil {
				t.Fatalf("[Assess] Failed to create pod: %v", err)
			}

			log.Printf("[Assess] Waiting up to 10 minutes for pod %s to complete...", podName)
			// Poll until the pod phase becomes Succeeded or the timeout expires.
			err := wait.For(
				e2e.NewConditionExtension(cfg.Client().Resources()).ResourceMatch(pod, func(object k8s.Object) bool {
					pod := object.(*corev1.Pod)
					return pod.Status.Phase == corev1.PodSucceeded
				}),
				wait.WithTimeout(10*time.Minute),
			)
			if err != nil {
				t.Logf("[Assess] Pod did not complete successfully: %v", err)
				// Dump the pod's logs, selected by its app label, before failing.
				e2e.PrintDaemonSetPodLogs(t, ctx, cfg.Client().RESTConfig(), podNS, "app=npm-install-test")
				t.Fatal("Pod did not complete within 10 minutes - possible io_uring hang detected")
			}

			log.Printf("[Assess] Pod %s completed successfully", podName)
			return ctx
		}).
		Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
			podName := "npm-install-test"
			podNS := "default"
			t.Logf("[Teardown] Cleaning up pod %s/%s...", podNS, podName)
			pod := &corev1.Pod{
				ObjectMeta: metav1.ObjectMeta{
					Name:      podName,
					Namespace: podNS,
				},
			}
			// Deletion is best-effort: a failure is logged but does not fail the test.
			if err := cfg.Client().Resources().Delete(ctx, pod); err != nil {
				t.Logf("[Teardown] Failed to delete pod: %v", err)
			}
			return ctx
		}).
Feature() testenv.Test(t, feat) } ================================================ FILE: test/cases/quick/limit_test.go ================================================ //go:build e2e package quick import ( "bytes" "context" _ "embed" "io" "strings" "testing" "time" fwext "github.com/aws/aws-k8s-tester/internal/e2e" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" "sigs.k8s.io/e2e-framework/klient/k8s" "sigs.k8s.io/e2e-framework/klient/wait" "sigs.k8s.io/e2e-framework/klient/wait/conditions" "sigs.k8s.io/e2e-framework/pkg/envconf" "sigs.k8s.io/e2e-framework/pkg/features" ) var ( //go:embed manifests/ulimit.yaml ulimitManifest []byte expectedResourceLimit = map[string]string{ "-R": "unlimited", "-c": "unlimited", "-d": "unlimited", "-e": "0", "-f": "unlimited", "-i": "30446", "-l": "unlimited", "-m": "unlimited", "-n": "1048576", "-p": "8", "-q": "819200", "-r": "0", "-s": "10240", "-t": "unlimited", "-u": "unlimited", "-v": "unlimited", "-x": "unlimited", } ) func TestUserLimits(t *testing.T) { f1 := features.New("ulimit pod"). WithLabel("type", "ulimit"). Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { err := fwext.ApplyManifests(cfg.Client().RESTConfig(), ulimitManifest) if err != nil { t.Fatalf("failed to apply manifests: %v", err) } pod := &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{Name: "ulimit", Namespace: "default"}, } err = wait.For(conditions.New(cfg.Client().Resources()).ResourceMatch(pod, containerTerminated), wait.WithTimeout(time.Minute*5)) if err != nil { t.Fatalf("encounter error when waiting for container finished running commands: %v", err) } return ctx }). 
Assess("Use default resources limit", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { client, err := kubernetes.NewForConfig(cfg.Client().RESTConfig()) if err != nil { t.Fatal(err) } tailLine := int64(10000) podLogOptions := corev1.PodLogOptions{ Container: "al2023", TailLines: &tailLine, } req := client.CoreV1().Pods("default").GetLogs("ulimit", &podLogOptions) logs, err := req.Stream(ctx) if err != nil { t.Fatalf("error in opening stream: %v", err) } defer logs.Close() compareResourceLimitsWithExpectedValues(t, logs) return ctx }). Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { err := fwext.DeleteManifests(cfg.Client().RESTConfig(), ulimitManifest) if err != nil { t.Fatalf("failed to delete manifests: %v", err) } return ctx }).Feature() // test feature testenv.Test(t, f1) } func compareResourceLimitsWithExpectedValues(t *testing.T, logs io.ReadCloser) { buf := new(bytes.Buffer) _, err := io.Copy(buf, logs) if err != nil { t.Fatalf("error in copy information from podLogs to buf: %v", err) } str := buf.String() lines := strings.Split(str, "\n") for _, line := range lines[:len(lines)-1] { info := strings.Split(line, " ") marker := getMarker(info[len(info)-2]) value := info[len(info)-1] if expectedResourceLimit[marker] != value { t.Errorf("resource limit doesn't match with the default value, limit we get %v, but default value is %v", line, expectedResourceLimit[marker]) } else { t.Logf("resrouce limit fetched from ulimit: %v. 
Equal to the default value %v", line, expectedResourceLimit[marker]) } } } func containerTerminated(obj k8s.Object) bool { j := obj.(*corev1.Pod) containerTerminatedState := j.Status.ContainerStatuses[0].State.Terminated return containerTerminatedState.Reason == "Completed" } func getMarker(str string) string { startIndex := 0 if str[:1] == "(" { startIndex = 1 } return str[startIndex : len(str)-1] } ================================================ FILE: test/cases/quick/main_test.go ================================================ //go:build e2e package quick import ( "context" _ "embed" "log" "os" "os/signal" "testing" "sigs.k8s.io/e2e-framework/pkg/env" "sigs.k8s.io/e2e-framework/pkg/envconf" ) var ( testenv env.Environment ) func TestMain(m *testing.M) { cfg, err := envconf.NewFromFlags() if err != nil { log.Fatalf("failed to initialize test environment: %v", err) } testenv = env.NewWithConfig(cfg) ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt) defer cancel() testenv = testenv.WithContext(ctx) testenv.Setup(func(ctx context.Context, config *envconf.Config) (context.Context, error) { log.Println("Starting quick test suite...") return ctx, nil }) os.Exit(testenv.Run(m)) } ================================================ FILE: test/cases/quick/manifests/ulimit.yaml ================================================ apiVersion: v1 kind: Pod metadata: name: ulimit spec: restartPolicy: Never containers: - name: al2023 image: public.ecr.aws/amazonlinux/amazonlinux:2023 command: ["ulimit"] args: - -a ================================================ FILE: test/cases/quick/node_topology_test.go ================================================ //go:build e2e package quick import ( "context" _ "embed" "strconv" "strings" "testing" "github.com/aws/aws-k8s-tester/internal/e2e" "github.com/aws/aws-sdk-go-v2/aws" v1 "k8s.io/api/core/v1" cloudprovider "k8s.io/cloud-provider-aws/pkg/providers/v1" "sigs.k8s.io/e2e-framework/pkg/envconf" 
"sigs.k8s.io/e2e-framework/pkg/features" ) func TestNodeTopology(t *testing.T) { topology := features.New("node-topology"). WithLabel("suite", "node-topology"). Assess("Nodes have correct network topology labels", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { var nodes v1.NodeList cfg.Client().Resources().List(ctx, &nodes) if len(nodes.Items) == 0 { t.Fatal("no nodes found in the cluster") } nodeMap := make(map[string]v1.Node) var instanceIDs []string ec2Client := e2e.NewEC2Client() for _, node := range nodes.Items { providerIDParts := strings.Split(node.Spec.ProviderID, "/") instanceID := providerIDParts[len(providerIDParts)-1] instanceIDs = append(instanceIDs, instanceID) nodeMap[instanceID] = node } nodeTopologies, err := ec2Client.DescribeInstanceTopology(instanceIDs) if err != nil { t.Fatalf("could not describe instance topologies: %v", err) } t.Logf("checking instance topologies for %d node(s) (out of %d node(s) in the cluster)", len(nodeTopologies), len(instanceIDs)) for _, nodeTopology := range nodeTopologies { node := nodeMap[aws.ToString(nodeTopology.InstanceId)] instanceType := node.Labels["node.kubernetes.io/instance-type"] t.Logf("verifying instance topology for node %s (type: %s)", node.Name, instanceType) for i, networkNode := range nodeTopology.NetworkNodes { // https://github.com/kubernetes/cloud-provider-aws/blob/b47d2cf2a33ae655cd353ec42ea43362b804c397/pkg/providers/v1/well_known_labels.go#L26 expectedLabel := cloudprovider.LabelNetworkNodePrefix + strconv.Itoa(i+1) if actualValue, ok := node.Labels[expectedLabel]; !ok { t.Errorf("node %s (type: %s) does not have expected network label %s", node.Name, instanceType, expectedLabel) } else if actualValue != networkNode { t.Errorf("node %s (type: %s) has incorrect value for label %s: expected %s, got %s", node.Name, instanceType, expectedLabel, networkNode, actualValue) } } // 
https://github.com/kubernetes/cloud-provider-aws/blob/b47d2cf2a33ae655cd353ec42ea43362b804c397/pkg/providers/v1/well_known_labels.go#L22C2-L22C13 if aws.ToString(nodeTopology.ZoneId) != node.Labels[cloudprovider.LabelZoneID] { t.Logf("node %s (type: %s) has incorrect value for label %s: expected %s, got %s", node.Name, instanceType, cloudprovider.LabelZoneID, aws.ToString(nodeTopology.ZoneId), node.Labels[cloudprovider.LabelZoneID]) t.Fail() } } return ctx }).Feature() testenv.Test(t, topology) } ================================================ FILE: test/cases/workload/main_test.go ================================================ //go:build e2e package workload import ( "context" "flag" "fmt" "log" "os" "os/signal" "testing" "time" "sigs.k8s.io/e2e-framework/pkg/env" "sigs.k8s.io/e2e-framework/pkg/envconf" ) const ( defaultWorkloadTestTimeout = 10 * time.Minute ) var ( testenv env.Environment workloadTestCommand *string workloadTestImage *string workloadTestName *string workloadTestResources *string workloadTestTimeout *time.Duration ) func TestMain(m *testing.M) { workloadTestCommand = flag.String("workloadTestCommand", "", "command for workload test") workloadTestImage = flag.String("workloadTestImage", "", "image for workload test") workloadTestName = flag.String("workloadTestName", "workload-test", "name for workload test") workloadTestResources = flag.String("workloadTestResources", "", "JSON map of resources for workload test (e.g., '{\"nvidia.com/gpu\": \"1\"}')") workloadTestTimeout = flag.Duration("workloadTestTimeout", defaultWorkloadTestTimeout, fmt.Sprintf("timeout for workload test (default: %s)", defaultWorkloadTestTimeout)) cfg, err := envconf.NewFromFlags() if err != nil { log.Fatalf("failed to initialize test environment: %v", err) } testenv = env.NewWithConfig(cfg) ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt) defer cancel() testenv = testenv.WithContext(ctx) testenv.Setup(func(ctx context.Context, config 
*envconf.Config) (context.Context, error) { log.Println("Starting workload test suite...") return ctx, nil }) os.Exit(testenv.Run(m)) } ================================================ FILE: test/cases/workload/workload_test.go ================================================ //go:build e2e package workload import ( "context" "encoding/json" "strings" "testing" "time" fwext "github.com/aws/aws-k8s-tester/internal/e2e" "github.com/aws/smithy-go/ptr" batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/e2e-framework/klient/wait" "sigs.k8s.io/e2e-framework/pkg/envconf" "sigs.k8s.io/e2e-framework/pkg/features" ) func createWorkloadJob(name, image, command string, resources map[string]string, timeout time.Duration) *batchv1.Job { container := corev1.Container{ Name: name, Image: image, ImagePullPolicy: corev1.PullAlways, Resources: buildResourceRequirements(resources), } // Override entrypoint if command is provided if command != "" { container.Command = strings.Fields(command) } podSpec := corev1.PodSpec{ RestartPolicy: corev1.RestartPolicyNever, ActiveDeadlineSeconds: ptr.Int64(int64(timeout.Seconds())), Containers: []corev1.Container{container}, } job := &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{ Name: name, Namespace: corev1.NamespaceDefault, Labels: map[string]string{"app": name}, }, Spec: batchv1.JobSpec{ BackoffLimit: ptr.Int32(4), Template: corev1.PodTemplateSpec{ ObjectMeta: metav1.ObjectMeta{ Labels: map[string]string{"app": name}, }, Spec: podSpec, }, }, } return job } func buildResourceRequirements(resources map[string]string) corev1.ResourceRequirements { if len(resources) == 0 { return corev1.ResourceRequirements{} } rl := make(corev1.ResourceList) for name, qty := range resources { rl[corev1.ResourceName(name)] = resource.MustParse(qty) } return corev1.ResourceRequirements{Limits: rl, Requests: rl} } func parseResources(resourcesJSON string) 
(map[string]string, error) { if resourcesJSON == "" { return nil, nil } var resources map[string]string if err := json.Unmarshal([]byte(resourcesJSON), &resources); err != nil { return nil, err } for name, qty := range resources { if q, err := resource.ParseQuantity(qty); err != nil || q.IsZero() { delete(resources, name) } } return resources, nil } func TestWorkload(t *testing.T) { name := ptr.ToString(workloadTestName) image := ptr.ToString(workloadTestImage) command := ptr.ToString(workloadTestCommand) timeout := ptr.ToDuration(workloadTestTimeout) if name == "" { t.Fatal("workloadTestName must be set to run the test") } if image == "" { t.Fatal("workloadTestImage must be set to run the test") } resources, err := parseResources(ptr.ToString(workloadTestResources)) if err != nil { t.Fatalf("Failed to parse workloadTestResources: %v", err) } feature := features.New(name).WithLabel("suite", "workload") if _, ok := resources["aws.amazon.com/neuron"]; ok { feature = feature.WithLabel("hardware", "neuron") } if _, ok := resources["nvidia.com/gpu"]; ok { feature = feature.WithLabel("hardware", "gpu") } workload := feature.Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { job := createWorkloadJob(name, image, command, resources, timeout) if len(resources) > 0 { t.Logf("Creating %s job with resources: %v", name, resources) } else { t.Logf("Creating %s job", name) } if err := cfg.Client().Resources().Create(ctx, job); err != nil { t.Fatal(err) } t.Logf("%s job created successfully", name) return ctx }). Assess("Job succeeds", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { job := &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: corev1.NamespaceDefault}, } t.Logf("Waiting for %s job to complete", name) err := wait.For(fwext.NewConditionExtension(cfg.Client().Resources()).JobSucceeded(job), wait.WithContext(ctx), wait.WithTimeout(timeout), ) if err != nil { t.Fatal(err) } return ctx }). 
Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { log, err := fwext.GetJobLogs(cfg.Client().RESTConfig(), &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: corev1.NamespaceDefault}, }) if err != nil { t.Error(err) } t.Logf("Test log for %s:", name) t.Log(log) job := &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: corev1.NamespaceDefault}, } if err := cfg.Client().Resources().Delete(ctx, job, func(do *metav1.DeleteOptions) { policy := metav1.DeletePropagationBackground do.PropagationPolicy = &policy }); err != nil { t.Error(err) } return ctx }). Feature() testenv.Test(t, workload) } ================================================ FILE: test/common/dra.go ================================================ //go:build e2e package common import ( "context" "fmt" "log" "time" fwext "github.com/aws/aws-k8s-tester/internal/e2e" "github.com/aws/aws-k8s-tester/test/manifests" appsv1 "k8s.io/api/apps/v1" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" "sigs.k8s.io/e2e-framework/klient/wait" "sigs.k8s.io/e2e-framework/klient/wait/conditions" "sigs.k8s.io/e2e-framework/pkg/envconf" ) // DeployDranet renders the dranet manifest template with the given image, // applies it to the cluster, and waits for the dranet DaemonSet to be ready. // Returns the rendered manifest bytes for later cleanup. 
func DeployDranet(ctx context.Context, config *envconf.Config, rdmaDeviceDraDriverImage string) (renderedManifest []byte, err error) { renderedManifest, err = fwext.RenderManifests(manifests.DranetManifest, struct { RdmaDeviceDraDriverImage string }{ RdmaDeviceDraDriverImage: rdmaDeviceDraDriverImage, }) if err != nil { return nil, fmt.Errorf("failed to render dranet manifest: %w", err) } if err := fwext.ApplyManifests(config.Client().RESTConfig(), renderedManifest); err != nil { return nil, fmt.Errorf("failed to apply dranet manifest: %w", err) } ds := appsv1.DaemonSet{ ObjectMeta: metav1.ObjectMeta{Name: "dranet-aws-dranet", Namespace: "kube-system"}, } err = wait.For( fwext.NewConditionExtension(config.Client().Resources()).DaemonSetReady(&ds), wait.WithTimeout(5*time.Minute), wait.WithContext(ctx), ) if err != nil { return nil, fmt.Errorf("dranet daemonset is not ready: %w", err) } log.Println("dranet daemonset is ready.") return renderedManifest, nil } // CountNodesByType lists cluster nodes and returns the count of nodes matching // the given node.kubernetes.io/instance-type label. Returns an error if the // count is 0. func CountNodesByType(ctx context.Context, clientset kubernetes.Interface, nodeType string) (int, error) { nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{ LabelSelector: "node.kubernetes.io/instance-type=" + nodeType, }) if err != nil { return 0, fmt.Errorf("failed to list nodes: %w", err) } count := len(nodes.Items) if count == 0 { return 0, fmt.Errorf("no nodes of type %q found", nodeType) } log.Printf("[INFO] Found %d node(s) of type %s", count, nodeType) return count, nil } // DeployMPIOperator applies the MPI operator manifest and waits for the // mpi-operator Deployment to become available. 
func DeployMPIOperator(ctx context.Context, config *envconf.Config) error { if err := fwext.ApplyManifests(config.Client().RESTConfig(), manifests.MpiOperatorManifest); err != nil { return fmt.Errorf("failed to apply mpi-operator manifest: %w", err) } dep := appsv1.Deployment{ ObjectMeta: metav1.ObjectMeta{Name: "mpi-operator", Namespace: "mpi-operator"}, } err := wait.For(conditions.New(config.Client().Resources()).DeploymentConditionMatch(&dep, appsv1.DeploymentAvailable, v1.ConditionTrue), wait.WithContext(ctx)) if err != nil { return fmt.Errorf("failed to deploy mpi-operator: %w", err) } log.Println("mpi-operator deployment is available.") return nil } ================================================ FILE: test/common/dra_features.go ================================================ //go:build e2e package common import ( "context" "fmt" "io/fs" "path/filepath" "strings" "testing" "time" fwext "github.com/aws/aws-k8s-tester/internal/e2e" "github.com/aws/aws-k8s-tester/internal/e2e/mpijobs" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" "sigs.k8s.io/e2e-framework/klient/wait" "sigs.k8s.io/e2e-framework/klient/wait/conditions" "sigs.k8s.io/e2e-framework/pkg/envconf" "sigs.k8s.io/e2e-framework/pkg/features" ) const ( // NegativeTestTimeout is the duration to wait before checking that a // negative test case's worker pods are still Pending. NegativeTestTimeout = 1 * time.Minute // NegativeTestStabilizationTimeout is the duration to wait after pods // are first observed as Pending before re-checking they remain Pending. NegativeTestStabilizationTimeout = 2 * time.Minute // PositiveTestTimeout is the duration to wait for an MPIJob to succeed. PositiveTestTimeout = 20 * time.Minute ) // ComputeAndRenderFunc is a callback that computes MPIJob parameters and renders // the MPIJob YAML for a given test case. Each package provides its own implementation // that calls its package-specific ComputeMPIJobParams and RenderMPIJobYAML functions. 
type ComputeAndRenderFunc func(tc *TestCaseSpec, rctIndex map[string]*ResourceClaimTemplateSpec) (renderedYAML []byte, err error)

// BuildPositiveFeature assembles the e2e-framework Feature for a test case
// that is expected to succeed. Setup applies manifest, the assessment waits up
// to PositiveTestTimeout for the MPIJob named mpiJobName in the "default"
// namespace to succeed and then prints its logs, and teardown deletes
// manifest again. The feature is labelled suite=<suiteName>, type=positive.
func BuildPositiveFeature(name, suiteName, mpiJobName string, manifest []byte) features.Feature {
	applyManifest := func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
		t.Logf("Applying MPIJob manifest for %s", name)
		if applyErr := fwext.ApplyManifests(cfg.Client().RESTConfig(), manifest); applyErr != nil {
			t.Fatalf("applying MPIJob manifest: %v", applyErr)
		}
		return ctx
	}

	assertSucceeds := func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
		mpiJob := mpijobs.NewUnstructured(mpiJobName, "default")
		t.Log("Waiting for MPIJob to complete")

		succeeded := conditions.New(cfg.Client().Resources()).ResourceMatch(mpiJob, mpijobs.MPIJobSucceeded)
		// A failed wait is recorded but does not stop the step: the logs
		// below are still fetched.
		if waitErr := wait.For(succeeded, wait.WithContext(ctx), wait.WithTimeout(PositiveTestTimeout)); waitErr != nil {
			t.Errorf("MPIJob did not succeed: %v", waitErr)
		}

		jobLogs, logErr := fwext.GetJobLogs(cfg.Client().RESTConfig(), mpiJob)
		if logErr != nil {
			t.Errorf("failed to get job logs: %v", logErr)
			return ctx
		}
		t.Logf("Test log for %s:", name)
		t.Log(jobLogs)
		return ctx
	}

	deleteManifest := func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
		if deleteErr := fwext.DeleteManifests(cfg.Client().RESTConfig(), manifest); deleteErr != nil {
			t.Errorf("deleting MPIJob manifest: %v", deleteErr)
		}
		return ctx
	}

	return features.New(name).
		WithLabel("suite", suiteName).
		WithLabel("type", "positive").
		Setup(applyManifest).
		Assess("MPIJob succeeds", assertSucceeds).
		Teardown(deleteManifest).
		Feature()
}

// BuildNegativeFeature constructs an e2e-framework Feature for a negative DRA
// test case. It applies the manifest, waits for a timeout, verifies worker pods
// remain Pending, and cleans up.
func BuildNegativeFeature(name, suiteName, mpiJobName string, manifest []byte, expectedPendingCount int, clientset kubernetes.Interface) features.Feature { return features.New(name). WithLabel("suite", suiteName). WithLabel("type", "negative"). Setup(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { t.Logf("Applying MPIJob manifest for negative test %s", name) if err := fwext.ApplyManifests(cfg.Client().RESTConfig(), manifest); err != nil { t.Fatalf("applying MPIJob manifest: %v", err) } return ctx }). Assess("Worker pods remain Pending", func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { t.Log("Waiting for worker pods to be Pending...") selector := fmt.Sprintf("training.kubeflow.org/job-name=%s,training.kubeflow.org/job-role=worker", mpiJobName) listOpts := metav1.ListOptions{ LabelSelector: selector, FieldSelector: "status.phase=Pending", } err := wait.For(func(ctx context.Context) (bool, error) { pods, err := clientset.CoreV1().Pods("default").List(ctx, listOpts) if err != nil { return false, nil } return len(pods.Items) >= expectedPendingCount, nil }, wait.WithContext(ctx), wait.WithTimeout(NegativeTestTimeout)) if err != nil { t.Fatalf("expected %d worker pods in Pending state: %v", expectedPendingCount, err) } t.Logf("Found %d Pending worker pods, waiting %v to confirm they remain unschedulable...", expectedPendingCount, NegativeTestStabilizationTimeout) time.Sleep(NegativeTestStabilizationTimeout) pods, err := clientset.CoreV1().Pods("default").List(ctx, listOpts) if err != nil { t.Fatalf("re-checking Pending pods: %v", err) } if len(pods.Items) < expectedPendingCount { t.Fatalf("expected %d Pending worker pods after stabilization, but found %d", expectedPendingCount, len(pods.Items)) } t.Logf("All %d worker pods are still Pending after stabilization (scheduling failure confirmed)", expectedPendingCount) return ctx }). 
Teardown(func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context { if err := fwext.DeleteManifests(cfg.Client().RESTConfig(), manifest); err != nil { t.Errorf("deleting MPIJob manifest: %v", err) } return ctx }). Feature() } // DiscoverAndBuildFeatures encapsulates the common test discovery loop: // 1. Reads test case YAML files from testCasesFS at testCaseDir // 2. Parses each via ParseTestCaseSpec // 3. Invokes computeAndRender to get the rendered MPIJob YAML // 4. Builds positive or negative features based on ExpectFailure func DiscoverAndBuildFeatures( testCasesFS fs.FS, testCaseDir string, rctIndex map[string]*ResourceClaimTemplateSpec, suiteName string, mpiJobName string, nodeCount int, computeAndRender ComputeAndRenderFunc, clientset kubernetes.Interface, ) ([]features.Feature, error) { entries, err := fs.ReadDir(testCasesFS, testCaseDir) if err != nil { return nil, fmt.Errorf("reading test case directory %s: %w", testCaseDir, err) } var featureList []features.Feature for _, entry := range entries { if entry.IsDir() || !IsYAMLFile(entry.Name()) { continue } tcName := strings.TrimSuffix(entry.Name(), filepath.Ext(entry.Name())) tcPath := filepath.Join(testCaseDir, entry.Name()) tcData, err := fs.ReadFile(testCasesFS, tcPath) if err != nil { return nil, fmt.Errorf("reading test case %s: %w", tcPath, err) } tc, err := ParseTestCaseSpec(tcData) if err != nil { return nil, fmt.Errorf("parsing test case %s: %w", tcPath, err) } renderedYAML, err := computeAndRender(tc, rctIndex) if err != nil { return nil, fmt.Errorf("computing/rendering MPIJob for %s: %w", tcName, err) } if tc.ExpectFailure { featureList = append(featureList, BuildNegativeFeature(tcName, suiteName, mpiJobName, renderedYAML, nodeCount, clientset)) } else { featureList = append(featureList, BuildPositiveFeature(tcName, suiteName, mpiJobName, renderedYAML)) } } return featureList, nil } ================================================ FILE: test/common/dra_types.go 
// ---------------------------------------------------------------------------
// Test case spec — what the user authors per test
// ---------------------------------------------------------------------------

// TestCaseClaimRef is a single entry in a test case YAML file.
type TestCaseClaimRef struct {
	// Name is decoded from the entry's "name" key.
	Name string `yaml:"name"`
	// ResourceClaimTemplateName is decoded from the entry's
	// "resourceClaimTemplateName" key.
	ResourceClaimTemplateName string `yaml:"resourceClaimTemplateName"`
}

// TestCaseSpec is the structure of a test case YAML file.
// Each file defines the resourceClaims that a single MPIJob test should use.
// When ExpectFailure is true, the test runner treats the case as a negative test —
// it expects the MPIJob's worker pods to remain Pending (unschedulable).
type TestCaseSpec struct {
	// ExpectFailure is decoded from "expectFailure"; it is false when the key
	// is absent.
	ExpectFailure bool `yaml:"expectFailure"`
	// ResourceClaims is decoded from the "resourceClaims" list.
	ResourceClaims []TestCaseClaimRef `yaml:"resourceClaims"`
}

// ---------------------------------------------------------------------------
// ResourceClaimTemplate parsing
// ---------------------------------------------------------------------------

// ResourceClaimTemplateSpec mirrors the relevant parts of a ResourceClaimTemplate YAML.
// Only metadata.name and spec.spec.devices.requests[] are decoded; every
// other key of the document has no corresponding field here.
type ResourceClaimTemplateSpec struct {
	Metadata struct {
		Name string `yaml:"name"`
	} `yaml:"metadata"`
	Spec struct {
		Spec struct {
			Devices struct {
				// Requests holds one element per entry under
				// spec.spec.devices.requests.
				Requests []struct {
					Name            string `yaml:"name"`
					DeviceClassName string `yaml:"deviceClassName"`
					AllocationMode  string `yaml:"allocationMode"`
					Count           int    `yaml:"count"`
				} `yaml:"requests"`
			} `yaml:"devices"`
		} `yaml:"spec"`
	} `yaml:"spec"`
}

// ---------------------------------------------------------------------------
// MPIJob rendering helpers
// ---------------------------------------------------------------------------

// ResourceClaimRef holds the name and template name for a single resource claim
// in the rendered MPIJob.
type ResourceClaimRef struct { Name string TemplateName string } // --------------------------------------------------------------------------- // Parsing helpers // --------------------------------------------------------------------------- // ParseTestCaseSpec parses YAML bytes into a TestCaseSpec. // It returns an error if the YAML is invalid or contains no resourceClaims. func ParseTestCaseSpec(data []byte) (*TestCaseSpec, error) { var tc TestCaseSpec if err := yaml.Unmarshal(data, &tc); err != nil { return nil, fmt.Errorf("parsing test case YAML: %w", err) } if len(tc.ResourceClaims) == 0 { return nil, fmt.Errorf("test case has no resourceClaims") } return &tc, nil } // IsYAMLFile reports whether the given filename has a .yaml or .yml extension. func IsYAMLFile(name string) bool { ext := filepath.Ext(name) return ext == ".yaml" || ext == ".yml" } // LoadRCTIndex scans a directory of RCT YAML files from the given fs.FS and // returns a map of metadata.name → parsed spec. func LoadRCTIndex(fsys fs.FS, dir string) (map[string]*ResourceClaimTemplateSpec, error) { entries, err := fs.ReadDir(fsys, dir) if err != nil { return nil, fmt.Errorf("reading RCT directory %s: %w", dir, err) } index := make(map[string]*ResourceClaimTemplateSpec) for _, entry := range entries { if entry.IsDir() || !IsYAMLFile(entry.Name()) { continue } data, err := fs.ReadFile(fsys, filepath.Join(dir, entry.Name())) if err != nil { return nil, fmt.Errorf("reading %s: %w", entry.Name(), err) } var rct ResourceClaimTemplateSpec if err := yaml.Unmarshal(data, &rct); err != nil { return nil, fmt.Errorf("parsing %s: %w", entry.Name(), err) } index[rct.Metadata.Name] = &rct } return index, nil } // ExtractFamily extracts the instance family prefix from a node type string // (before the first "."). For example, "trn1.32xlarge" returns "trn1". 
func ExtractFamily(nodeType string) string {
	// idx > 0 rather than >= 0: a string that starts with "." has no family
	// in front of the dot, so it is returned unchanged.
	if idx := strings.Index(nodeType, "."); idx > 0 {
		return nodeType[:idx]
	}
	return nodeType
}

// ---------------------------------------------------------------------------
// Runtime helpers
// ---------------------------------------------------------------------------

// SplitImageRepoTag splits a container image reference into repository and
// tag at the last ":". If the reference carries no tag, the entire string is
// returned as the repository and the tag defaults to "latest".
//
// A ":" that is followed by a "/" is a registry port, not a tag separator, so
// "localhost:5000/img" yields ("localhost:5000/img", "latest") and
// "localhost:5000/img:v2" yields ("localhost:5000/img", "v2").
//
// Digest references ("repo@sha256:...") are not treated specially: they are
// still split at the last ":".
func SplitImageRepoTag(image string) (repo, tag string) {
	idx := strings.LastIndex(image, ":")
	if idx < 0 || strings.Contains(image[idx+1:], "/") {
		return image, "latest"
	}
	return image[:idx], image[idx+1:]
}

// ValidateRequiredFlags validates that all flag values in the provided map are
// non-empty. It returns nil if every value is set. Otherwise it returns an
// error naming the missing flag that sorts first alphabetically, so the
// message is the same on every run regardless of map iteration order.
func ValidateRequiredFlags(flags map[string]string) error {
	missing, found := "", false
	for name, value := range flags {
		if value != "" {
			continue
		}
		if !found || name < missing {
			missing, found = name, true
		}
	}
	if found {
		return fmt.Errorf("-%s is required and must be non-empty", missing)
	}
	return nil
}

// LoadRCTManifests reads all YAML files from the given RCT subdirectory in an
// embedded filesystem and returns them as raw byte slices suitable for
// fwext.ApplyManifests.
func LoadRCTManifests(fsys fs.FS, rctSubDir string) ([][]byte, error) { entries, err := fs.ReadDir(fsys, rctSubDir) if err != nil { return nil, fmt.Errorf("reading RCT directory %s: %w", rctSubDir, err) } var manifests [][]byte for _, entry := range entries { if entry.IsDir() || !IsYAMLFile(entry.Name()) { continue } data, err := fs.ReadFile(fsys, filepath.Join(rctSubDir, entry.Name())) if err != nil { return nil, fmt.Errorf("reading %s: %w", entry.Name(), err) } manifests = append(manifests, data) } return manifests, nil } ================================================ FILE: test/common/flags.go ================================================ //go:build e2e package common import ( "flag" "fmt" "github.com/urfave/sflags/gen/gpflag" "github.com/spf13/pflag" "reflect" ) // For CloudWatch metric dimension flag type MetricOps struct { // gpflag supports map[string]string but with a different non-standard parsing format (key:val) that doesn't match // what the project wants (comma separated key=value pairs). So, we force it to skip parsing under gpflag.Parse. MetricDimensions map[string]string `flag:"-"` } func ParseFlags(config interface{}) (*pflag.FlagSet, error) { flags, err := gpflag.Parse(config) if err != nil { return nil, fmt.Errorf("failed to parse flags: %w", err) } // gpflag supports map[string]string but with a different non-standard parsing format (key:val) that doesn't // match what the project wants (key=val,key=val). So, we handle MetricDimensions separately here to accept // comma separated key=value pairs. 
if _, hasField := reflect.TypeOf(config).Elem().FieldByName("MetricDimensions"); hasField { field := reflect.ValueOf(config).Elem().FieldByName("MetricDimensions") metricDims := field.Addr().Interface().(*map[string]string) flags.StringToStringVar(metricDims, "metricDimensions", nil, "CloudWatch metric dimensions as comma-separated key=value pairs") } flags.VisitAll(func(pf *pflag.Flag) { flag.CommandLine.Var(pf.Value, pf.Name, pf.Usage) }) return flags, nil } ================================================ FILE: test/common/resources.go ================================================ //go:build e2e package common import ( "context" "fmt" "log" "time" fwext "github.com/aws/aws-k8s-tester/internal/e2e" appsv1 "k8s.io/api/apps/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/e2e-framework/klient/wait" "sigs.k8s.io/e2e-framework/pkg/env" "sigs.k8s.io/e2e-framework/pkg/envconf" ) // DeployDaemonSet returns a function to deploy and wait for a DaemonSet to be ready func DeployDaemonSet(name, namespace string) env.Func { return func(ctx context.Context, config *envconf.Config) (context.Context, error) { log.Printf("Waiting for %s daemonset to be ready.", name) daemonset := appsv1.DaemonSet{ ObjectMeta: metav1.ObjectMeta{Name: name, Namespace: namespace}, } err := wait.For( fwext.NewConditionExtension(config.Client().Resources()).DaemonSetReady(&daemonset), wait.WithTimeout(5*time.Minute), wait.WithContext(ctx), ) if err != nil { return ctx, fmt.Errorf("%s daemonset is not ready: %w", name, err) } log.Printf("%s daemonset is ready.", name) return ctx, nil } } ================================================ FILE: test/images/efa/Dockerfile ================================================ FROM public.ecr.aws/amazonlinux/amazonlinux:2023 ARG EFA_BIN_PATH="/opt/amazon/efa/bin" RUN dnf -y swap gnupg2-minimal gnupg2 && \ dnf install -y \ gcc gcc-c++ make \ ca-certificates \ cmake \ emacs \ git \ jq \ wget \ unzip \ vim \ zlib-devel \ openssl \ openssl-devel \ 
#!/usr/bin/env bash

# Single-node EFA check: verifies that at least the expected number of DGRAM
# and RDM endpoints are reported by `fi_info -p efa`, then runs efa_test.sh.
#
# Optional environment overrides:
#   EC2_INSTANCE_TYPE          - skips the instance metadata lookup
#   EXPECTED_EFA_DEVICE_COUNT  - skips the describe-instance-types lookup

set -eu

# Prints the instance type from the instance metadata endpoint. A session
# token is requested first; if none is obtained (empty output), the metadata
# is requested without a token header.
get_instance_type() {
  local token=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600" 2>/dev/null)
  if [ -n "$token" ]; then
    curl -H "X-aws-ec2-metadata-token: $token" http://169.254.169.254/latest/meta-data/instance-type
  else
    curl http://169.254.169.254/latest/meta-data/instance-type
  fi
}

# Prints NetworkInfo.EfaInfo.MaximumEfaInterfaces for $EC2_INSTANCE_TYPE as
# reported by `aws ec2 describe-instance-types`.
get_expected_efa_device_count() {
  aws ec2 describe-instance-types --instance-type="$EC2_INSTANCE_TYPE" | jq -r '.InstanceTypes[].NetworkInfo.EfaInfo.MaximumEfaInterfaces'
}

# Values already present in the environment win over the lookups.
EC2_INSTANCE_TYPE=${EC2_INSTANCE_TYPE:-$(get_instance_type)}
EXPECTED_EFA_DEVICE_COUNT=${EXPECTED_EFA_DEVICE_COUNT:-$(get_expected_efa_device_count)}

echo "Running test on a $EC2_INSTANCE_TYPE"

# Print the full provider listing so it is available in the test output.
fi_info -p efa

# Count via `grep | wc -l` on purpose: with no match the pipeline still exits
# 0 (wc's status), so `set -e` does not abort before the message below.
DGRAM_ENDPOINT_COUNT=$(fi_info -p efa | grep 'type:\sFI_EP_DGRAM$' | wc -l)
if ! test $EXPECTED_EFA_DEVICE_COUNT -le $DGRAM_ENDPOINT_COUNT; then
  echo "Expected at least $EXPECTED_EFA_DEVICE_COUNT DGRAM endpoint(s) but found $DGRAM_ENDPOINT_COUNT"
  exit 1
else
  echo "Verified at least $EXPECTED_EFA_DEVICE_COUNT DGRAM endpoint(s) are available (found $DGRAM_ENDPOINT_COUNT)"
fi

# Same check for RDM endpoints.
RDM_ENDPOINT_COUNT=$(fi_info -p efa | grep 'type:\sFI_EP_RDM$' | wc -l)
if ! test $EXPECTED_EFA_DEVICE_COUNT -le $RDM_ENDPOINT_COUNT; then
  echo "Expected at least $EXPECTED_EFA_DEVICE_COUNT RDM endpoint(s) but found $RDM_ENDPOINT_COUNT"
  exit 1
else
  echo "Verified at least $EXPECTED_EFA_DEVICE_COUNT RDM endpoint(s) are available (found $RDM_ENDPOINT_COUNT)"
fi

echo "Running single-node efa test"

# Run efa_test.sh, a utility added during the build while installing EFA
efa_test.sh

echo "Success!"
================================================
FILE: test/images/neuron/Dockerfile
================================================
FROM public.ecr.aws/docker/library/ubuntu:22.04

# Neuron SDK components version numbers
# https://github.com/aws-neuron/deep-learning-containers/blob/main/docker/pytorch/training/2.5.1/Dockerfile.neuronx
ARG NEURONX_DISTRIBUTED_VERSION=0.16.25997+f431c02e
ARG NEURONX_CC_VERSION=2.22.12471.0+b4a00d10
ARG NEURONX_FRAMEWORK_VERSION=2.9.0.2.11.19912+e48cd891
ARG NEURONX_COLLECTIVES_LIB_VERSION=2.29.41.0-681fef5f5
ARG NEURONX_RUNTIME_LIB_VERSION=2.29.40.0-f954cd7a5
ARG NEURONX_TOOLS_VERSION=2.27.33.0-5d9c0b901

ARG PYTHON=python3.10
ARG PYTHON_VERSION=3.10.12
ARG PIP=pip3
ARG OMPI_VERSION=4.1.5

# This arg is required to stop docker build waiting for region configuration while installing tz data
ARG DEBIAN_FRONTEND=noninteractive

# Python won’t try to write .pyc or .pyo files on the import of source modules
# Force stdin, stdout and stderr to be totally unbuffered. Good for logging
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
ENV PYTHONIOENCODING=UTF-8
ENV LANG=C.UTF-8
ENV LC_ALL=C.UTF-8
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/aws/neuron/lib"
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib"
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/efa/lib64"
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/amazon/openmpi/lib64"
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib"
# key=value form; the space-separated "ENV key value" form is legacy syntax
ENV PATH="/opt/aws/neuron/bin/:$PATH"
# ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main
ENV DGLBACKEND=pytorch

RUN apt-get update \
 && apt-get upgrade -y \
 && apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    cmake \
    curl \
    emacs \
    git \
    jq \
    libopencv-dev \
    software-properties-common \
    wget \
    unzip \
    vim \
    zlib1g-dev \
    openssl \
    libssl-dev \
    libsqlite3-dev \
    libgdbm-dev \
    libc6-dev \
    libbz2-dev \
    libncurses-dev \
    tk-dev \
    libffi-dev \
    libcap-dev \
    gnupg2 \
    gpg-agent \
    libarchive13 \
 && rm -rf /var/lib/apt/lists/* \
 && apt-get clean

# The package lists were removed by the step above, so refresh and install in
# the same layer; a stand-alone "update" layer can be reused stale from the
# build cache while the install layer is rebuilt.
RUN apt-get update \
 && apt-get install -y openssh-server openssh-client wget gnupg2 sudo

# Install Neuron packages
# NOTE(review): the repository entry pins the "focal" suite although the base
# image is Ubuntu 22.04 -- confirm this is intentional.
RUN echo "deb https://apt.repos.neuron.amazonaws.com focal main" > /etc/apt/sources.list.d/neuron.list
RUN wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add -

RUN apt-get update \
 && apt-get install -y \
    aws-neuronx-tools=$NEURONX_TOOLS_VERSION \
    aws-neuronx-collectives=$NEURONX_COLLECTIVES_LIB_VERSION \
    aws-neuronx-runtime-lib=$NEURONX_RUNTIME_LIB_VERSION \
 && rm -rf /var/lib/apt/lists/* \
 && rm -rf /tmp/tmp* \
 && apt-get clean

# Install Open MPI
RUN mkdir -p /tmp/openmpi \
 && cd /tmp/openmpi \
 && wget --quiet https://download.open-mpi.org/release/open-mpi/v4.1/openmpi-${OMPI_VERSION}.tar.gz \
 && tar zxf openmpi-${OMPI_VERSION}.tar.gz \
 && cd openmpi-${OMPI_VERSION} \
 && ./configure --enable-orterun-prefix-by-default \
 && make -j $(nproc) all \
 && make install \
 && ldconfig \
 && rm -rf /tmp/openmpi

# install Python
# The tarball and source tree are removed relative to the directory they were
# unpacked in, so the cleanup does not depend on the build running from "/".
RUN wget -q https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz \
 && tar -xzf Python-$PYTHON_VERSION.tgz \
 && cd Python-$PYTHON_VERSION \
 && ./configure --enable-shared --prefix=/usr/local \
 && make -j $(nproc) && make install \
 && cd .. && rm -rf Python-$PYTHON_VERSION* \
 && ln -s /usr/local/bin/pip3 /usr/bin/pip \
 && ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \
 && ${PIP} --no-cache-dir install --upgrade \
    pip \
    setuptools

WORKDIR /

# The ENV variables declared below are changed in the previous section
# Grouping these ENV variables in the first section causes
# ompi_info to fail. This is only observed in CPU containers
ENV PATH="$PATH:/home/.openmpi/bin"
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/home/.openmpi/lib/"
RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value

RUN ${PIP} install --no-cache-dir -U \
    "bokeh>=2.3,<3" \
    "awscli<2" \
    scipy \
    click \
    "cryptography" \
    psutil==5.6.7 \
    dataset \
    tenacity \
    transformers==4.36.2 \
    Pillow

RUN mkdir -p /etc/pki/tls/certs && cp /etc/ssl/certs/ca-certificates.crt /etc/pki/tls/certs/ca-bundle.crt

RUN ${PIP} config set global.extra-index-url https://pip.repos.neuron.amazonaws.com \
 && ${PIP} install --force-reinstall torch-neuronx==$NEURONX_FRAMEWORK_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com \
 && ${PIP} install --force-reinstall neuronx-cc==$NEURONX_CC_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com \
 && ${PIP} install --force-reinstall --no-deps neuronx_distributed==$NEURONX_DISTRIBUTED_VERSION --extra-index-url https://pip.repos.neuron.amazonaws.com

# attrs, neuronx-cc required: >=19.2.0, sagemaker <24,>=23.1.0
# protobuf neuronx-cc<4, sagemaker-training >=3.9.2,<=3.20.3
# awscli 1.25.47 has requirement docutils<0.17,>=0.10
# etcd for kubernetes installation
# awscli 1.27.127 has requirement rsa<4.8,>=3.1.2, but you have rsa 4.9.
# awscli 1.27.127 requires urllib3 < 1.27, python-etcd requires urllib3 >= 1.7, latest urllib3 release is 2.0.2
RUN ${PIP} install --no-cache-dir -U \
    "attrs<24,>=23.1.0" \
    "protobuf>=3.18.3,<=3.20.3" \
    "docutils>=0.10,<0.17" \
    "rsa<4.8,>=3.1.2" \
    "urllib3>=1.26.0,<1.27"

# EFA Installer does apt get. Make sure to run apt update before that.
# The update, the install and the apt cleanup share one layer so the index is
# never served stale from the cache and the package lists do not persist in an
# intermediate layer.
RUN apt-get update \
 && cd $HOME \
 && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \
 && wget https://efa-installer.amazonaws.com/aws-efa-installer.key && gpg --import aws-efa-installer.key \
 && cat aws-efa-installer.key | gpg --fingerprint \
 && wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \
 && tar -xf aws-efa-installer-latest.tar.gz \
 && cd aws-efa-installer \
 && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
 && cd $HOME \
 && rm -rf /var/lib/apt/lists/* \
 && rm -rf /tmp/tmp* \
 && apt-get clean

# Install some common packages used by training scripts
# torchvision needed for MLP. since it depends on torch and torch neuron/torch
# is already installed install it with nodeps
RUN pip3 install --no-cache-dir --no-deps -U \
    torchvision==0.16.*

RUN HOME_DIR=/root \
 && curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \
 && unzip ${HOME_DIR}/oss_compliance.zip -d ${HOME_DIR}/ \
 && cp ${HOME_DIR}/oss_compliance/test/testOSSCompliance /usr/local/bin/testOSSCompliance \
 && chmod +x /usr/local/bin/testOSSCompliance \
 && chmod +x ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh \
 && ${HOME_DIR}/oss_compliance/generate_oss_compliance.sh ${HOME_DIR} ${PYTHON} \
 && rm -rf ${HOME_DIR}/oss_compliance* \
 && rm -rf /tmp/tmp*

RUN curl -o /license.txt https://aws-dlc-licenses.s3.amazonaws.com/pytorch-2.1/license.txt

# Configure SSH: disable strict host key checking for the client and enable
# public-key-only logins for the server.
RUN mkdir -p /var/run/sshd && \
    sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
    echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
    sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \
    sed -i 's/#PubkeyAuthentication yes/PubkeyAuthentication yes/' /etc/ssh/sshd_config && \
    sed -i 's/#PasswordAuthentication yes/PasswordAuthentication no/' /etc/ssh/sshd_config
COPY test/images/neuron/hack/install-test-resources.sh ./hack/install-test-resources.sh
# Pre-populate the test resources under /home/ubuntu, then drop the helper script.
RUN chmod +x ./hack/install-test-resources.sh && \
    ./hack/install-test-resources.sh /home/ubuntu && \
    rm -f ./hack/install-test-resources.sh

RUN useradd -ms /bin/bash ubuntu
RUN echo 'ubuntu:password' | chpasswd
# /home/ubuntu was populated as root above, so hand it over to the new user
RUN usermod -aG sudo ubuntu &&\
    chown -R ubuntu /home/ubuntu

WORKDIR /home/ubuntu
USER ubuntu

# Generate a key pair and authorize it for the same user
RUN mkdir -p /home/ubuntu/.ssh && \
    ssh-keygen -t rsa -f /home/ubuntu/.ssh/id_rsa -N '' && \
    cp /home/ubuntu/.ssh/id_rsa.pub /home/ubuntu/.ssh/authorized_keys

COPY test/images/neuron/tests ./tests

================================================
FILE: test/images/neuron/hack/install-test-resources.sh
================================================
#!/bin/bash

# Downloads the MNIST archives into per-rank directories below the directory
# given as the first argument (default: /root).

set -o pipefail
set -o nounset
set -o errexit

USER_DIR=${1:-"/root"}

# attempt to cache dataset to avoid runtime download.
# needs to match https://github.com/pytorch/vision/blob/c0331c5e2933c621db9a44623f4f3981fe2342e0/torchvision/datasets/mnist.py#L42
MNIST_RESOURCES=("train-images-idx3-ubyte.gz" "train-labels-idx1-ubyte.gz" "t10k-images-idx3-ubyte.gz" "t10k-labels-idx1-ubyte.gz")

for i in {0..1}; do # we need to populate data for each rank, and we currently always run with 2
  DOWNLOAD_DIR="${USER_DIR}/MNIST_DATA_train/${i}/MNIST/raw"
  mkdir -p "$DOWNLOAD_DIR"
  # quote the expansion so every array element stays a single word
  for RESOURCE in "${MNIST_RESOURCES[@]}"; do
    DEST_FILE="${DOWNLOAD_DIR}/${RESOURCE}"
    SOURCE_URL="https://ossci-datasets.s3.amazonaws.com/mnist/${RESOURCE}"
    echo "Downloading ${SOURCE_URL} to ${DEST_FILE}"
    # --fail makes curl exit non-zero on an HTTP error so errexit aborts the
    # script instead of saving the error response body as the dataset file
    curl --fail -o "$DEST_FILE" "$SOURCE_URL"
  done
done

================================================
FILE: test/images/neuron/tests/singleNodeTest.sh
================================================
#!/usr/bin/env bash

# Runs the three single-node Neuron tests in sequence with two processes each;
# "set -e" stops at the first failing test.
set -e

torchrun --nproc_per_node=2 --nnodes=1 tests/testNeuronSingleAllReduce.py
torchrun --nproc_per_node=2 --nnodes=1 tests/testNeuronParallelState.py
torchrun --nproc_per_node=2 --nnodes=1 tests/testNeuronMlp.py
================================================ FILE: test/images/neuron/tests/testNeuronMlp.py ================================================ # Source: https://github.com/aws/deep-learning-containers/blob/master/test/dlc_tests/container_tests/bin/pytorch_tests/testNeuronMlp import os import time import torch from torchvision.datasets import mnist from torch.utils.data import DataLoader from torchvision.transforms import ToTensor # XLA imports import torch_xla.core.xla_model as xm import torch_xla.runtime as xr # XLA imports for parallel loader and multi-processing import torch_xla.distributed.parallel_loader as pl from torch.utils.data.distributed import DistributedSampler # Initialize XLA process group for torchrun import torch_xla.distributed.xla_backend import torch.nn as nn import torch.nn.functional as F torch.distributed.init_process_group('xla') # Global constants EPOCHS = 4 WARMUP_STEPS = 2 BATCH_SIZE = 32 # Load MNIST train dataset train_dataset = mnist.MNIST(root=os.path.join(os.path.expanduser("~") + '/MNIST_DATA_train', str(xr.global_ordinal())), train=True, download=True, transform=ToTensor()) # Declare 3-layer MLP for MNIST dataset class MLP(nn.Module): def __init__(self, input_size = 28 * 28, output_size = 10, layers = [120, 84]): super(MLP, self).__init__() self.fc1 = nn.Linear(input_size, layers[0]) self.fc2 = nn.Linear(layers[0], layers[1]) self.fc3 = nn.Linear(layers[1], output_size) def forward(self, x): x = F.relu(self.fc1(x)) x = F.relu(self.fc2(x)) x = self.fc3(x) return F.log_softmax(x, dim=1) def main(): # XLA MP: get world size world_size = xr.world_size() # multi-processing: ensure each worker has same initial weights torch.manual_seed(0) # Move model to device and declare optimizer and loss function device = 'xla' model = MLP().to(device) # For multiprocessing, scale up learning rate optimizer = torch.optim.SGD(model.parameters(), lr=0.01 * world_size) loss_fn = torch.nn.NLLLoss() # Prepare data loader train_sampler = None if 
world_size > 1: train_sampler = DistributedSampler(train_dataset, num_replicas=world_size, rank=xr.global_ordinal(), shuffle=True) train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, shuffle=False if train_sampler else True) # XLA MP: use MpDeviceLoader from torch_xla.distributed train_device_loader = pl.MpDeviceLoader(train_loader, device) # Run the training loop print('----------Training ---------------') model.train() for epoch in range(EPOCHS): start = time.time() for idx, (train_x, train_label) in enumerate(train_device_loader): optimizer.zero_grad() train_x = train_x.view(train_x.size(0), -1) output = model(train_x) loss = loss_fn(output, train_label) loss.backward() xm.optimizer_step(optimizer) # XLA MP: performs grad allreduce and optimizer step if idx < WARMUP_STEPS: # skip warmup iterations start = time.time() # Compute statistics for the last epoch interval = idx - WARMUP_STEPS # skip warmup iterations throughput = interval / (time.time() - start) print("Train throughput (iter/sec): {}".format(throughput)) print("Final loss is {:0.4f}".format(loss.detach().to('cpu'))) # Save checkpoint for evaluation (xm.save ensures only one process save) os.makedirs(os.path.expanduser("~") + "/checkpoints", exist_ok=True) checkpoint = {'state_dict': model.state_dict()} xm.save(checkpoint, os.path.expanduser("~") + '/checkpoints/checkpoint.pt') print('----------End Training ---------------') if __name__ == '__main__': main() ================================================ FILE: test/images/neuron/tests/testNeuronParallelState.py ================================================ # Source: https://github.com/aws/deep-learning-containers/blob/master/test/dlc_tests/container_tests/bin/pytorch_tests/testNeuronParallelState import argparse import atexit import json import os import traceback from datetime import datetime import torch import torch_xla.core.xla_model as xm import torch_xla.debug.metrics as met import torch_xla.runtime as xr 
from neuronx_distributed.parallel_layers import parallel_state from neuronx_distributed.parallel_layers.utils import is_pjrt_device datetime_str = str(datetime.now()) results = {"inference_success": 1} def test_initialize_model_parallel(tensor_model_parallel_size): def _test_initialize_model_parallel(): if torch.distributed.get_rank() == 0: print("testing initialize_model_parallel with size {}".format(tensor_model_parallel_size)) tensor_model_parallel_size_ = min(tensor_model_parallel_size, torch.distributed.get_world_size()) assert not parallel_state.model_parallel_is_initialized() parallel_state.initialize_model_parallel(tensor_model_parallel_size=tensor_model_parallel_size_) assert parallel_state.model_parallel_is_initialized() # Checks. def check(group, world_size, rank): assert world_size == torch.distributed.get_world_size(group=group) assert rank == torch.distributed.get_rank(group=group) # Model parallel. world_size = tensor_model_parallel_size_ rank = torch.distributed.get_rank() % tensor_model_parallel_size_ assert world_size == parallel_state.get_tensor_model_parallel_size() assert rank == parallel_state.get_tensor_model_parallel_rank() check(parallel_state.get_tensor_model_parallel_group(), world_size, rank) # Data parallel. 
world_size = torch.distributed.get_world_size() // tensor_model_parallel_size_ rank = torch.distributed.get_rank() // tensor_model_parallel_size assert world_size == parallel_state.get_data_parallel_size() assert rank == parallel_state.get_data_parallel_rank() check(parallel_state.get_data_parallel_group(), world_size, rank) # Reset groups parallel_state.destroy_model_parallel() torch.distributed.barrier() if torch.distributed.get_rank() == 0: print("test passed") global results try: _test_initialize_model_parallel() except: results["inference_success"] = 0 print(traceback.format_exc()) raise def test_get_tensor_model_parallel_src_rank(tensor_model_parallel_size_): def _test_get_tensor_model_parallel_src_rank(): if torch.distributed.get_rank() == 0: print("testing get_tensor_model_parallel_src_rank with size {}".format(tensor_model_parallel_size_)) tensor_model_parallel_size = min(tensor_model_parallel_size_, torch.distributed.get_world_size()) assert not parallel_state.model_parallel_is_initialized() parallel_state.initialize_model_parallel(tensor_model_parallel_size) assert parallel_state.model_parallel_is_initialized() # Checks src_rank = torch.distributed.get_rank() - parallel_state.get_tensor_model_parallel_rank() assert parallel_state.get_tensor_model_parallel_src_rank() == src_rank # Reset groups parallel_state.destroy_model_parallel() torch.distributed.barrier() if torch.distributed.get_rank() == 0: print("test passed") global results try: _test_get_tensor_model_parallel_src_rank() except: results["inference_success"] = 0 print(traceback.format_exc()) raise if __name__ == "__main__": if is_pjrt_device(): import torch_xla.experimental.pjrt_backend torch.distributed.init_process_group("xla", init_method="pjrt://") else: torch.distributed.init_process_group("xla") world_size = xr.world_size() tensor_model_parallel_size = 1 while tensor_model_parallel_size <= world_size: test_initialize_model_parallel(tensor_model_parallel_size) 
test_get_tensor_model_parallel_src_rank(tensor_model_parallel_size) tensor_model_parallel_size *= 2 ================================================ FILE: test/images/neuron/tests/testNeuronSingleAllReduce.py ================================================ # Source: https://github.com/aws/deep-learning-containers/blob/master/test/dlc_tests/container_tests/bin/pytorch_tests/testNeuronSingleAllReduce import os import torch import torch_xla.core.xla_model as xm import torch_xla.distributed.xla_backend import torch_xla.runtime as xr torch.distributed.init_process_group('xla') import torch_xla.distributed.xla_multiprocessing as xmp os.environ["NEURON_RT_EXEC_TIMEOUT"] = "20" os.environ["NCCL_DEBUG"] = "WARN" os.environ["NCCL_DEBUG_SUBSYS"] = "ALL" def _mp_fn(): world_size = xr.world_size() device = xm.xla_device() rank = xr.global_ordinal() ones = torch.ones((2, 3)) xones = ones.to(device) if world_size > 0: print("running all reduce") for i in range(0, 5): print(f'at iteration {i}, with local rank {rank}', flush=True) result = xm.all_reduce(xm.REDUCE_SUM, xones) result_cpu = result.cpu() #xm.mark_step() print(result_cpu, flush = True) expected = torch.ones((2,3))*world_size assert expected.allclose(result_cpu) print('PASS') if __name__ == '__main__': _mp_fn() #xmp.spawn(_mp_fn, args=(),nprocs=2, join=True) ================================================ FILE: test/images/neuron-inference/Dockerfile ================================================ ############################################################################### # 0) Base image, arguments, and environment ############################################################################### FROM public.ecr.aws/docker/library/ubuntu:22.04 # Disable interactive prompts ENV DEBIAN_FRONTEND=noninteractive # Ensure Python prints are unbuffered so we see logs in real time ENV PYTHONUNBUFFERED=1 # Neuron SDK components version numbers # 
https://github.com/aws-neuron/deep-learning-containers/blob/main/docker/pytorch/inference/2.5.1/Dockerfile.neuronx
# Pinned versions of the Neuron pip packages (compiler, framework) and apt
# packages (collectives, runtime, tools) installed further down.
ARG NEURONX_CC_VERSION=2.22.12471.0+b4a00d10
ARG NEURONX_FRAMEWORK_VERSION=2.9.0.2.11.19912+e48cd891
ARG NEURONX_COLLECTIVES_LIB_VERSION=2.29.41.0-681fef5f5
ARG NEURONX_RUNTIME_LIB_VERSION=2.29.40.0-f954cd7a5
ARG NEURONX_TOOLS_VERSION=2.27.33.0-5d9c0b901

# Python
# PYTHON names the interpreter binary that gets symlinked as "python";
# PYTHON_VERSION selects the source tarball that is downloaded and built.
ARG PYTHON=python3.10
ARG PYTHON_VERSION=3.10.12

ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PYTHONIOENCODING=UTF-8 \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8

# Extend library paths for Neuron
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/opt/aws/neuron/lib"
ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:/usr/local/lib"
ENV PATH="/opt/aws/neuron/bin:${PATH}"

###############################################################################
# 1) Base system packages, user setup
###############################################################################
# Update, upgrade, install and clean up in a single layer so the package lists
# are not kept in the image.
RUN apt-get update \
 && apt-get upgrade -y \
 && apt-get install -y --no-install-recommends \
    build-essential \
    ca-certificates \
    curl \
    git \
    jq \
    wget \
    unzip \
    vim \
    zlib1g-dev \
    openssl \
    libssl-dev \
    libsqlite3-dev \
    libgdbm-dev \
    libc6-dev \
    libbz2-dev \
    libncurses-dev \
    tk-dev \
    libffi-dev \
    gnupg2 \
    gpg-agent \
    libarchive13 \
    openssh-server \
    sudo \
 && rm -rf /var/lib/apt/lists/* \
 && apt-get clean

###############################################################################
# 2) Neuron SDK
###############################################################################
# Registers the Neuron apt repository and its signing key, then installs the
# pinned tools, collectives and runtime packages.
# NOTE(review): /etc/os-release is sourced, but none of the commands below
# reference its variables; the repository entry hard-codes the "focal" suite
# although the base image is Ubuntu 22.04 -- confirm this is intentional.
RUN . /etc/os-release \
 && echo "deb https://apt.repos.neuron.amazonaws.com focal main" > /etc/apt/sources.list.d/neuron.list \
 && wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add - \
 && apt-get update -y \
 && apt-get install -y \
    aws-neuronx-tools=${NEURONX_TOOLS_VERSION} \
    aws-neuronx-collectives=${NEURONX_COLLECTIVES_LIB_VERSION} \
    aws-neuronx-runtime-lib=${NEURONX_RUNTIME_LIB_VERSION} \
 && rm -rf /var/lib/apt/lists/* \
 && apt-get clean

###############################################################################
# 3) Python 3.10 from source
###############################################################################
# Builds CPython under /usr/local as a shared library build, removes the source
# tree and tarball, links "pip" and "python" to the new install, and upgrades
# the packaging tools.
RUN wget -q https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz \
 && tar -xzf Python-${PYTHON_VERSION}.tgz \
 && cd Python-${PYTHON_VERSION} \
 && ./configure --enable-shared --prefix=/usr/local \
 && make -j $(nproc) && make install \
 && cd .. && rm -rf Python-${PYTHON_VERSION}* \
 && ln -s /usr/local/bin/pip3 /usr/bin/pip \
 && ln -s /usr/local/bin/${PYTHON} /usr/local/bin/python \
 && pip --no-cache-dir install --upgrade pip setuptools wheel

###############################################################################
# 4) Install PyTorch Neuron, Transformers Neuron, etc.
via pip ############################################################################### RUN pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com \ && pip install --force-reinstall \ "torch-neuronx==${NEURONX_FRAMEWORK_VERSION}" \ "neuronx-cc==${NEURONX_CC_VERSION}" \ "transformers==4.36.2" ############################################################################### # 5) Application files and Python dependencies ############################################################################### WORKDIR /app COPY infer.py /app/ ================================================ FILE: test/images/neuron-inference/infer.py ================================================ import logging import os import sys import time import json import subprocess import random import concurrent.futures import numpy as np from copy import deepcopy import torch import torch_neuronx from torch.utils.data import DataLoader, TensorDataset from transformers import BertForPreTraining, BertTokenizer logging.basicConfig( level=logging.INFO, format='[%(asctime)s] [%(levelname)s] [%(name)s] %(message)s', handlers=[logging.StreamHandler(sys.stdout)] ) logger = logging.getLogger("BERTNeuronInference") logger.setLevel(logging.INFO) def get_neuron_monitor_stats(): """ Runs neuron-monitor command and returns the first JSON output as a dictionary. Also validates if the environment has Inferentia1/2 device and proper device count. 
Returns: dict: Parsed JSON output containing neuron monitor statistics Raises: RuntimeError: If neuron-monitor command is not found or fails to execute RuntimeError: If environment doesn't have proper Neuron support json.JSONDecodeError: If the output cannot be parsed as valid JSON """ try: # Run neuron-monitor with timeout to get first output process = subprocess.Popen( ['neuron-monitor'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True ) # Wait for first line of output output = process.stdout.readline() # Terminate the process since we only need first output process.terminate() process.wait() if not output: raise RuntimeError("No output received from neuron-monitor") # Parse JSON output stats = json.loads(output) # Check for Neuron hardware support hardware_info = stats.get('neuron_hardware_info', {}) device_type = hardware_info.get('neuron_device_type', '').lower() neuroncore_per_device_count = hardware_info.get('neuroncore_per_device_count', 0) if neuroncore_per_device_count <= 0: raise RuntimeError(f"No Neuron devices found (neuroncore_per_device_count: {neuroncore_per_device_count})") return neuroncore_per_device_count except FileNotFoundError: raise RuntimeError("neuron-monitor command not found") except json.JSONDecodeError as e: raise RuntimeError(f"Failed to parse JSON output: {e}") except Exception as e: raise RuntimeError(f"Error running neuron-monitor: {e}") def print_info(msg: str): """Helper function to prefix all info messages uniformly.""" logger.info(f"[INFO] {msg}") def print_warning(msg: str): """Helper function for warnings.""" logger.warning(f"[WARNING] {msg}") def print_error(msg: str): """Helper function for errors.""" logger.error(f"[ERROR] {msg}") def create_dummy_data(tokenizer, batch_size, num_samples=10000, max_length=128, seed=42): """ Creates a realistic Next Sentence Prediction (NSP) dataset for BERT model testing. 
Args: tokenizer (BertTokenizer): instance used to tokenize the input sentences batch_size (int): specifying the size of each batch num_samples (int): specifying total number of samples to generate (default: 100) max_length (int): specifying maximum sequence length for tokenization (default: 128) seed (int): for random seed to ensure reproducibility (default: 42) Returns: TensorDataset containing: - input_ids (torcTensor): of tokenized input sequences - attention_mask: of attention masks - nsp_labels: Tensor of NSP labels (0 for random next sentence, 1 for actual next sentence) Notes: - Automatically adjusts num_samples to be a multiple of batch_size - Creates balanced dataset with 50% true next sentences and 50% random sentences - Uses a predefined set of sample sentences for generating pairs """ random.seed(seed) if num_samples % batch_size != 0: adjusted = (num_samples // batch_size) * batch_size print_info( f"Adjusting num_samples from {num_samples} to {adjusted} " "to ensure full batches." 
) num_samples = adjusted sample_sentences = [ "The dog loves playing fetch in the park.", "Artificial intelligence is reshaping the future.", "Movies with complex storylines can be very engaging.", "This restaurant serves an amazing brunch on weekends.", "Many researchers are exploring neural network architectures.", "A day at the beach can reduce stress and improve well-being.", "ChatGPT is a popular large language model by OpenAI.", "The annual developer conference showcased innovative technologies.", "Hiking in the mountains offers both challenge and relaxation.", "Robotics and automation are revolutionizing many industries.", ] sentences_a = [] sentences_b = [] nsp_labels = [] for _ in range(num_samples): idx_a = random.randint(0, len(sample_sentences) - 1) if random.random() < 0.5: # “True” next sentence idx_b = (idx_a + 1) % len(sample_sentences) nsp_labels.append(1) else: # Random sentence idx_b = random.randint(0, len(sample_sentences) - 1) nsp_labels.append(0) sentences_a.append(sample_sentences[idx_a]) sentences_b.append(sample_sentences[idx_b]) inputs = tokenizer( sentences_a, sentences_b, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt", ) return TensorDataset( inputs.input_ids, inputs.attention_mask, torch.tensor(nsp_labels, dtype=torch.long) ) def run_inference(model, tokenizer, batch_size, mode, n_models=2, n_threads=2): """ Runs BERT model inference using Neuron runtime with dummy NSP (Next Sentence Prediction) data. 
Args: model (BertForPreTraining): model instance to be used for inference tokenizer (BertTokenizer): instance for processing input text batch_size (int): specifying batch size (8 for throughput mode, 1 for latency mode) mode (str): indicating inference mode ('throughput' or 'latency') n_models (int): number of models to spawn n_threads (int): number of threads for inference Returns: None, but prints performance metrics including: - Duration of the job - Average time per batch - Throughput (samples per second) - P50, P95, P99 latency - Batch Size - Total Batches Processed - Total Inferences Notes: - Performance metrics are logged with prefix [BERT_INFERENCE_NEURON_METRICS] - Uses torch_neuronx for model compilation - Handles both throughput and latency testing modes - Runs inference with no gradient computation (torch.no_grad) """ print_info("About to create dummy data...") try: dataset = create_dummy_data(tokenizer, batch_size=batch_size) except Exception as e: print_error(f"Failed to create dummy data: {e}") raise print_info("Dummy data creation completed.") dataloader = DataLoader( dataset, batch_size=batch_size ) # First compile the model for Neuron: # Since we run inference in batches, we must first # split the dataset into the size of input expected in a # single batch. This input signature would then be used # to call the .trace() method and compile the Bert model to Neuron. _input_ids, _attention_masks, _output_ids = dataset.tensors _split_input_ids = torch.split(_input_ids, batch_size)[0] _split_attention_masks = torch.split(_attention_masks, batch_size)[0] batch_input = (_split_input_ids, _split_attention_masks) try: # Use multicore context for automatic core allocation with torch_neuronx.experimental.multicore_context(): model_neuron = torch_neuronx.trace(model, batch_input) except Exception as e: logger.exception(f"[ERROR] Failed to trace BERT model. 
Failed with error: {e}") raise e latencies = [] rows_processed = 0 print_info(f"Starting Neuron inference ...") begin = time.time() with torch.no_grad(): for batch in dataloader: batch_input_tensor, batch_attention_tensor, _ = batch batch_input = (batch_input_tensor, batch_attention_tensor) start = time.time() _ = model_neuron(*batch_input) finish = time.time() latencies.append((finish - start) * 1000) rows_processed += len(batch_input_tensor) end = time.time() # Compute metrics boundaries = [50, 95, 99] percentiles = {} for boundary in boundaries: name = f'latency_p{boundary}' percentiles[name] = np.percentile(latencies, boundary) duration = end - begin inferences = rows_processed throughput = inferences / duration avg_time_per_batch = np.mean(latencies) # Print metrics print_info("Neuron inference completed.") # Print metrics to support old logging format print_info( "[BERT_INFERENCE_NEURON_METRICS] " f"mode={mode} " f"avg_time_per_batch={avg_time_per_batch:.6f} " f"throughput_samples_per_sec={throughput:.6f}" ) # performance metrics print_info(f"[BERT_INFERENCE_NEURON_METRICS] mode={mode}") print_info(f"[BERT_INFERENCE_NEURON_METRICS] duration={duration:.6f}") print_info(f"[BERT_INFERENCE_NEURON_METRICS] avg_time_per_batch={avg_time_per_batch:.6f}") print_info(f"[BERT_INFERENCE_NEURON_METRICS] throughput_samples_per_sec={throughput:.6f}") # latency metrics for name, value in percentiles.items(): print_info(f"[BERT_INFERENCE_NEURON_METRICS] {name}={value:.6f}") print_info(f"[BERT_INFERENCE_NEURON_METRICS] batch_size={batch_size}") print_info(f"[BERT_INFERENCE_NEURON_METRICS] total_batches_processed={len(latencies)}") print_info(f"[BERT_INFERENCE_NEURON_METRICS] total_inferences={inferences}") def main(): """Main entry""" print_info("Starting main()...") try: neuroncore_per_device_count = get_neuron_monitor_stats() print_info(f"Spawing a total of {neuroncore_per_device_count} models") except RuntimeError as e: print_error(f"Neuron environment not detected. 
Failed with error: {e}") sys.exit(1) mode = os.environ.get("INFERENCE_MODE", "throughput").lower() if mode not in ["throughput", "latency"]: print_warning( f"Unrecognized INFERENCE_MODE '{mode}'. " "Falling back to 'throughput'." ) mode = "throughput" batch_size = 1 if mode == "latency" else 8 print_info(f"Running Neuron inference in {mode} mode with batch size {batch_size}.") print_info("Loading tokenizer and model...") try: model_name = "bert-base-uncased" tokenizer = BertTokenizer.from_pretrained(model_name) model = BertForPreTraining.from_pretrained(model_name, torchscript=True) except Exception as e: print_error(f"Failed to load model/tokenizer: {e}") sys.exit(1) print_info("Model and tokenizer loaded successfully.") run_inference(model, tokenizer, batch_size, mode, n_models=neuroncore_per_device_count) print_info("main() completed all steps successfully.") if __name__ == "__main__": main() ================================================ FILE: test/images/neuron-training/Dockerfile ================================================ FROM public.ecr.aws/docker/library/ubuntu:22.04 ############################################################################### # 0) Arguments and environment ############################################################################### ARG DEBIAN_FRONTEND=noninteractive # Neuron SDK component versions (pin these precisely) # https://github.com/aws-neuron/deep-learning-containers/blob/main/docker/pytorch/training/2.5.1/Dockerfile.neuronx ARG NEURONX_CC_VERSION=2.22.12471.0+b4a00d10 ARG NEURONX_FRAMEWORK_VERSION=2.9.0.2.11.19912+e48cd891 ARG NEURONX_COLLECTIVES_LIB_VERSION=2.29.41.0-681fef5f5 ARG NEURONX_RUNTIME_LIB_VERSION=2.29.40.0-f954cd7a5 ARG NEURONX_TOOLS_VERSION=2.27.33.0-5d9c0b901 # Python ARG PYTHON=python3.10 ARG PYTHON_VERSION=3.10.12 ENV PYTHONDONTWRITEBYTECODE=1 \ PYTHONUNBUFFERED=1 \ PYTHONIOENCODING=UTF-8 \ LANG=C.UTF-8 \ LC_ALL=C.UTF-8 # Extend library paths for Neuron & EFA ENV 
LD_LIBRARY_PATH="/opt/aws/neuron/lib:/opt/amazon/efa/lib:/opt/amazon/efa/lib64:/usr/local/lib:${LD_LIBRARY_PATH}"
ENV PATH="/opt/aws/neuron/bin:${PATH}"

###############################################################################
# 1) Base system packages, user setup
###############################################################################
# Toolchain and -dev headers needed to build CPython from source in step 4,
# plus gnupg/curl/wget for the signed downloads and the OpenSSH client/server
# configured in step 6. The apt lists are removed in the same layer.
RUN apt-get update \
    && apt-get upgrade -y \
    && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        curl \
        git \
        jq \
        wget \
        unzip \
        vim \
        lcov \
        pkg-config \
        zlib1g-dev \
        openssl \
        libssl-dev \
        libsqlite3-dev \
        libgdbm-dev \
        libc6-dev \
        libbz2-dev \
        libncurses-dev \
        tk-dev \
        libffi-dev \
        gnupg2 \
        gpg-agent \
        libarchive13 \
        openssh-server \
        openssh-client \
        sudo \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean

###############################################################################
# 2) Neuron SDK
###############################################################################
# Registers the Neuron apt repository and installs the runtime, collectives and
# tools packages at the versions pinned by the NEURONX_* build arguments.
# NOTE(review): /etc/os-release is sourced but none of its variables are used;
# the repository suite is hard-coded to "focal" although the base image is
# ubuntu:22.04 -- confirm this is intentional.
RUN . /etc/os-release \
    && echo "deb https://apt.repos.neuron.amazonaws.com focal main" > /etc/apt/sources.list.d/neuron.list \
    && wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | apt-key add - \
    && apt-get update \
    && apt-get install -y \
        aws-neuronx-tools=${NEURONX_TOOLS_VERSION} \
        aws-neuronx-collectives=${NEURONX_COLLECTIVES_LIB_VERSION} \
        aws-neuronx-runtime-lib=${NEURONX_RUNTIME_LIB_VERSION} \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean

###############################################################################
# 3) EFA installer (for MPI-based jobs)
###############################################################################
# Downloads the "latest" EFA installer tarball together with its public key and
# detached signature, verifies the signature with gpg, then runs the installer
# and removes the unpacked files. The installer version is not pinned.
RUN apt-get update \
    && cd /tmp \
    && curl -O https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz \
    && wget https://efa-installer.amazonaws.com/aws-efa-installer.key \
    && gpg --import aws-efa-installer.key \
    && cat aws-efa-installer.key | gpg --fingerprint \
    && wget https://efa-installer.amazonaws.com/aws-efa-installer-latest.tar.gz.sig \
    && gpg --verify ./aws-efa-installer-latest.tar.gz.sig \
    && tar -xf aws-efa-installer-latest.tar.gz \
    && cd aws-efa-installer \
    && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \
    && cd /tmp \
    && rm -rf aws-efa-installer* \
    && rm -rf /var/lib/apt/lists/* \
    && apt-get clean

# Put /opt/amazon/openmpi (presumably installed by the EFA installer above)
# ahead of everything else on PATH and LD_LIBRARY_PATH.
ENV PATH="/opt/amazon/openmpi/bin:${PATH}"
ENV LD_LIBRARY_PATH="/opt/amazon/openmpi/lib64:${LD_LIBRARY_PATH}"

###############################################################################
# 4) Python 3.10 from source
###############################################################################
# Builds CPython ${PYTHON_VERSION} as a shared library under /usr/local
# (/usr/local/lib is already on LD_LIBRARY_PATH), removes the source tree and
# exposes `pip` and `python` through symlinks.
RUN wget -q https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz \
    && tar -xzf Python-${PYTHON_VERSION}.tgz \
    && cd Python-${PYTHON_VERSION} \
    && ./configure --enable-shared --prefix=/usr/local \
    && make -j $(nproc) && make install \
    && cd .. && rm -rf Python-${PYTHON_VERSION}* \
    && ln -s /usr/local/bin/pip3 /usr/bin/pip \
    && ln -s /usr/local/bin/${PYTHON} /usr/local/bin/python \
    && pip --no-cache-dir install --upgrade pip setuptools wheel

###############################################################################
# 5) Install pinned Python packages
###############################################################################
# The Neuron pip repository is added as an extra index so that torch-neuronx
# and neuronx-cc resolve at the versions pinned by the build arguments.
RUN pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com \
    && pip install --force-reinstall \
        "torch-neuronx==${NEURONX_FRAMEWORK_VERSION}" \
        "neuronx-cc==${NEURONX_CC_VERSION}" \
        "transformers==4.36.2"

###############################################################################
# 6) SSH and finalize
###############################################################################
# Configure SSH (auto-accept new host keys)
# Client side: StrictHostKeyChecking is forced to "no" and known hosts are
# discarded. Server side: StrictModes is switched to "no".
RUN mkdir -p /var/run/sshd && \
    sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \
    echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
    sed -i 's/#\(StrictModes
\).*/\1no/g' /etc/ssh/sshd_config WORKDIR /app/ COPY train.py /app/ ================================================ FILE: test/images/neuron-training/train.py ================================================ import os import time import random import torch import torch.distributed as dist # === torch_xla imports for device and parallel loader === import torch_xla.core.xla_model as xm import torch_xla.runtime as xr import torch_xla.distributed.xla_backend import torch_xla.distributed.parallel_loader as pl from torch.utils.data import DataLoader, TensorDataset, DistributedSampler from transformers import BertForPreTraining, BertTokenizer RANK = int(os.environ.get("RANK", 0)) WORLD_SIZE = int(os.environ.get("WORLD_SIZE", 1)) def create_dummy_data(tokenizer, num_samples=100, max_length=128): """ Creates dummy BERT pretraining data (MLM + NSP). """ print(f"Creating dummy data: {num_samples} samples, max_length={max_length}") sentences = [f"This is a dummy sentence number {i}" for i in range(num_samples)] encodings = tokenizer( sentences, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt", ) labels = encodings.input_ids.detach().clone() # Randomly mask some tokens for MLM mlm_probability = 0.15 input_ids, labels = mask_tokens(encodings.input_ids, tokenizer, mlm_probability) # Dummy next-sentence prediction labels next_sentence_labels = torch.randint(0, 2, (num_samples,)) return TensorDataset(input_ids, encodings.attention_mask, labels, next_sentence_labels) def mask_tokens(inputs, tokenizer, mlm_probability): """ Randomly mask tokens for MLM. Unmasked tokens => label = -100 so we don't compute loss on them. 
""" labels = inputs.clone() probability_matrix = torch.full(labels.shape, mlm_probability) special_tokens_mask = [ tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() ] probability_matrix.masked_fill_( torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0 ) masked_indices = torch.bernoulli(probability_matrix).bool() labels[~masked_indices] = -100 inputs[masked_indices] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token) return inputs, labels def complete_epoch(epoch, optimizer, parallel_loader, model): for step_idx, batch in enumerate(parallel_loader, start=1): optimizer.zero_grad() input_ids, attention_mask, mlm_labels, next_sentence_labels = batch outputs = model( input_ids=input_ids, attention_mask=attention_mask, labels=mlm_labels, next_sentence_label=next_sentence_labels, ) loss = outputs.loss loss.backward() xm.optimizer_step(optimizer) if step_idx % 10 == 0: print(f"[Rank {RANK}] - Epoch {epoch}, Step {step_idx}, Loss={loss.item():.4f}") def main(): dist.init_process_group( "xla", init_method="xla://" ) # print info with xla runtime functions to sanity check run context correctly propagates to backend print(f"Starting train.py with rank={xr.global_ordinal()}, world_size={xr.world_size()}") # Seed everything for reproducibility SEED = 42 random.seed(SEED) torch.manual_seed(SEED) device = xm.xla_device() # Preload model + tokenizer tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") model = BertForPreTraining.from_pretrained("bert-base-uncased") print(f"[Rank {RANK}]: Model & tokenizer loaded.") # Create dummy dataset dataset = create_dummy_data(tokenizer, num_samples=1000, max_length=128) # Shard dataset for each RANK sampler = DistributedSampler( dataset, num_replicas=WORLD_SIZE, rank=RANK, shuffle=True, drop_last=False, ) train_loader = DataLoader(dataset, batch_size=512, sampler=sampler) # XLA parallel data loader parallel_loader = pl.MpDeviceLoader(train_loader, device) # Move model to 
XLA device model = model.to(device) optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3) # Let's do 5 epochs epochs = 5 model.train() # TODO: precompile the model. This warmup is arbitrary based on observed behavior # neuronx-cc seems to recompile for the first 2 runs for some reason tbd print(f"[Rank {RANK}] - Starting warmup (2 repetitions of epoch 0)") warmup_start = time.time() complete_epoch(0, optimizer, parallel_loader, model) complete_epoch(0, optimizer, parallel_loader, model) warump_time = time.time() - warmup_start print(f"[Rank {RANK}] - Finished warmup in {warump_time:.2f}s") print(f"[Rank {RANK}] - Starting training for {epochs} epochs...") start_time = time.time() epoch_times = [] for epoch in range(1, epochs + 1): epoch_start_time = time.time() print(f"[Rank {RANK}] - Epoch {epoch}/{epochs}") complete_epoch(epoch, optimizer, parallel_loader, model) epoch_time = time.time() - epoch_start_time epoch_times.append(epoch_time) print(f"[Rank {RANK}] - Epoch {epoch} done in {epoch_time:.2f}s") # Total training time total_time = time.time() - start_time print(f"[Rank {RANK}] - All epochs complete in {total_time:.2f}s") # Each rank processes (dataset_size / WORLD_SIZE) * epochs samples local_samples = (len(dataset) / WORLD_SIZE) * epochs local_throughput = local_samples / total_time # Average epoch time (local) if epoch_times: avg_epoch_time = sum(epoch_times) / len(epoch_times) else: avg_epoch_time = 0.0 print( f"[Rank {RANK}] - local_samples={local_samples:.1f}, total_time={total_time:.2f}s, " f"local_throughput={local_throughput:.2f} samples/s, local_avg_epoch_time={avg_epoch_time:.2f}s" ) print(f"[Rank {RANK}] training complete. 
Exiting main().") if __name__ == "__main__": main() ================================================ FILE: test/images/nvidia/Dockerfile ================================================ ARG CUDA_MAJOR_VERSION=12 ARG CUDA_MINOR_VERSION=8 # Start with the NVIDIA CUDA base image FROM nvidia/cuda:$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION.0-devel-ubuntu22.04 ARG CUDA_MAJOR_VERSION ARG CUDA_MINOR_VERSION ENV DEBIAN_FRONTEND=noninteractive # Install necessary dependencies RUN apt update -y \ && apt upgrade -y \ && apt remove -y --allow-change-held-packages \ libmlx5-1 \ ibverbs-utils \ libibverbs-dev \ libibverbs1 \ libnccl2 \ libnccl-dev \ && rm -rf /opt/hpcx \ && rm -rf /usr/local/mpi \ && rm -rf /usr/local/ucx \ && rm -f /etc/ld.so.conf.d/hpcx.conf \ && apt install -y \ git \ gcc \ openssh-client \ openssh-server \ build-essential \ curl \ autoconf \ libtool \ automake \ cmake \ apt-utils \ libhwloc-dev \ freeglut3-dev \ libglu1-mesa-dev \ datacenter-gpu-manager-4-cuda12 \ datacenter-gpu-manager-4-cuda13 RUN ldconfig ENV LD_LIBRARY_PATH /opt/amazon/openmpi/lib:/opt/amazon/efa/lib:/opt/aws-ofi-nccl/install/lib:/usr/local/cuda/lib:/usr/local/lib/:/usr/lib64:/usr/lib/x86_64-linux-gnu/:/usr/lib/aarch64-linux-gnu/:$LD_LIBRARY_PATH ENV PATH /usr/local/cuda/bin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/sbin:/usr/bin:/usr/local/bin:$PATH RUN mkdir -p /var/run/sshd \ && sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config \ && echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \ && sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config # Build CUDA Samples RUN git clone https://github.com/NVIDIA/cuda-samples.git /tmp/cuda-samples \ --branch v$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION \ && cd /tmp/cuda-samples/Samples/0_Introduction/vectorAdd && cmake . && make -j$(nproc) && cp vectorAdd /usr/bin \ && cd /tmp/cuda-samples/Samples/1_Utilities/deviceQuery && cmake . 
&& make -j$(nproc) && cp deviceQuery /usr/bin \
    && cd && rm -rf /tmp/cuda-samples

# Install EFA
ARG EFA_INSTALLER_VERSION=latest
RUN curl -sL https://efa-installer.amazonaws.com/aws-efa-installer-$EFA_INSTALLER_VERSION.tar.gz | tar xvz -C /tmp \
    && cd /tmp/aws-efa-installer \
    && ./efa_installer.sh --yes --enable-gdr --skip-kmod --skip-limit-conf --no-verify --mpi openmpi4 \
    && cd && rm -rf /tmp/aws-efa-installer

# Build nvbandwidth
ARG NVBANDWIDTH_VERSION=v0.8
RUN apt install -y libboost-program-options-dev
# The clone is removed in the same layer once the binary has been copied, so
# the build tree does not end up in the image.
RUN git clone https://github.com/NVIDIA/nvbandwidth.git --branch $NVBANDWIDTH_VERSION /tmp/nvbandwidth \
    && cd /tmp/nvbandwidth \
    && cmake -DMULTINODE=1 . && make && cp nvbandwidth /usr/bin \
    && cd && rm -rf /tmp/nvbandwidth

# Install NCCL
ARG LIBNCCL_VERSION=2.28.7-1
RUN git clone https://github.com/NVIDIA/nccl.git --branch v$LIBNCCL_VERSION /tmp/nccl \
    && cd /tmp/nccl \
    && make -j $(nproc) \
    && make install \
    && cd && rm -rf /tmp/nccl

# Install AWS-OFI-NCCL plugin
ARG AWS_OFI_NCCL_VERSION=1.17.2
RUN curl -sL https://github.com/aws/aws-ofi-nccl/releases/download/v$AWS_OFI_NCCL_VERSION/aws-ofi-nccl-$AWS_OFI_NCCL_VERSION.tar.gz | tar xvz -C /tmp \
    && cd /tmp/aws-ofi-nccl-$AWS_OFI_NCCL_VERSION \
    && ./configure \
        --prefix=/opt/aws-ofi-nccl/install \
        --with-mpi=/opt/amazon/openmpi \
        --with-libfabric=/opt/amazon/efa \
        --with-cuda=/usr/local/cuda \
        --enable-platform-aws \
        --disable-tests \
    && make -j $(nproc) \
    && make install \
    && cd && rm -rf /tmp/aws-ofi-nccl-$AWS_OFI_NCCL_VERSION

# Install NCCL Tests
# TODO: automate pin with version bump
RUN git clone https://github.com/NVIDIA/nccl-tests /tmp/nccl-tests \
    && cd /tmp/nccl-tests \
    && make \
        MPI=1 \
        MPI_HOME=/opt/amazon/openmpi/ \
        CUDA_HOME=/usr/local/cuda \
        NCCL_HOME=/usr/local/lib \
    && mkdir -p /opt/nccl-tests \
    && mv build /opt/nccl-tests/build \
    && cd && rm -rf /tmp/nccl-tests

# Set a default command for debugging or modify as per requirements
ENV NCCL_PROTO simple

# see https://linux.die.net/man/8/ld.so for
usage. replaces LD_PRELOAD env.
RUN echo "/usr/local/lib/libnccl.so" >> /etc/ld.so.preload

RUN rm -rf /var/lib/apt/lists/*

WORKDIR /app
COPY test/images/nvidia/gpu_unit_tests ./gpu_unit_tests
RUN chmod +x ./gpu_unit_tests/unit_test


================================================
FILE: test/images/nvidia/gpu_unit_tests/README.md
================================================
# What

gpu_unit_tests is a set of unit tests for GPU-enabled platforms. The idea is to provide a compact set of tests that covers the most performance-critical aspects of GPU platforms. The tests are designed to run on a single instance.

# Usage

```
# Run tests
./unit_test
```

**Generate test data for new instance type**

Step 1: Copy the `gpu_unit_tests` folder to the EC2 instance where you want to generate the data.

Step 2: Execute the following command in the `gpu_unit_tests` directory on the EC2 instance:

```
GENERATE_DATA=1 ./unit_test
```

Step 3: Copy the files from `tests/test_sysinfo.sh.data` (e.g., `tests/test_sysinfo.sh.data/p3.2xlarge`) to your local repository.

Step 4: Create a PR with the new `tests/test_sysinfo.sh.data/xxx`

# Test list

- test_sysinfo.sh :: Validate basic system configuration by comparing it with the expected test config
  - test_numa_topo_topo :: checks the cpu/numa topology
  - test_nvidia_gpu_count :: fails if one of the GPUs is broken or not visible
  - test_nvidia_fabric_status :: fails if the fabric manager is not active
  - test_nvidia_smi_topo :: fails if the nvidia-smi topology differs from the expected one
  - test_nvidia_persistence_status :: validates the persistence state
  - test_nvidia_gpu_unused :: checks that no other processes are using the GPUs; a failure signals a system misconfiguration.
- test_basic.sh :: Execute trivial cuda binaries, fails if the cuda subsystem is not healthy
  Uses demo-suite binaries https://docs.nvidia.com/cuda/demo-suite/index.html and DCGM Diagnostics https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/dcgm-diagnostics.html#run-levels-and-tests
  If this test suite fails, it is a sign that the cuda subsystem is not usable at all.
  Usually this is a side effect of a system misconfiguration (driver or fabric manager is not loaded)
  - test_01_device_query
  - test_02_vector_add
  - test_03_nvbandwidth
  - test_04_dcgm_diagnostics


================================================
FILE: test/images/nvidia/gpu_unit_tests/bash_unit
================================================
#!/usr/bin/env bash
#
# bash unit testing enterprise edition framework for professionals
# Copyright (C) 2011-2016 Pascal Grange
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software Foundation, # Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA # # https://github.com/pgrange/bash_unit VERSION=v2.1.0 ESCAPE=$(printf "\033") NOCOLOR="${ESCAPE}[0m" RED="${ESCAPE}[91m" GREEN="${ESCAPE}[92m" YELLOW="${ESCAPE}[93m" BLUE="${ESCAPE}[94m" # Make bash_unit immune to some basic unix commands faking CAT="$(which cat)" SED="$(which sed)" GREP="$(which grep)" RM="$(which rm)" SHUF="$(which shuf)" fail() { local message=${1:-} local stdout=${2:-} local stderr=${3:-} notify_test_failed "$__bash_unit_current_test__" "$message" [[ ! -z $stdout ]] && [ -s "$stdout" ] && notify_stdout < "$stdout" [[ ! -z $stderr ]] && [ -s "$stderr" ] && notify_stderr < "$stderr" stacktrace | notify_stack exit 1 } skip() { local message=${1:-} notify_test_skipped "$__bash_unit_current_test__" "$message" echo "skipped $message" > $__bash_unit_test_skipped__ exit 0 } _notify_trace() { local caller_shift=$1 local message=${2} local stdout=${3:-} local stderr=${4:-} [ -z $trace_file ] && return caller_hdr="" cl=$((caller_shift + 2)) if [ -n ${BASH_SOURCE[$cl]} ] then caller_hdr="${BASH_SOURCE[$cl]}:${BASH_LINENO[$((cl-1))]}" fi echo "trace:${caller_hdr}> $message" >> $trace_file [[ ! -z $stdout ]] && [ -s "$stdout" ] && "$SED" 's:^:trace-out> :' < "$stdout" >> $trace_file [[ ! 
-z $stderr ]] && [ -s "$stderr" ] && "$SED" 's:^:trace-err> :' < "$stderr" >> $trace_file } notify_trace_dbg() { _notify_trace 0 "$1" } notify_trace_info() { [ -z $trace_file ] && return local message=${1:-} echo "info> $message" >> $trace_file } assert() { local assertion=$1 local message=${2:-} _assert_expression \ "$assertion" \ "[ \$status == 0 ]" \ "\"$message\"" } assert_fails() { local assertion=$1 local message=${2:-} _assert_expression \ "$assertion" \ "[ \$status != 0 ]" \ "\"$message\"" } assert_fail() { #deprecated, use assert_fails instead assert_fails "$@" } assert_status_code() { local expected_status=$1 local assertion="$2" local message="${3:-}" _assert_expression \ "$assertion" \ "[ \$status == $expected_status ]" \ "\"$message\" expected status code $expected_status but was \$status" } _assert_expression() { local assertion=$1 local condition=$2 local message=$3 ( local stdout=$(mktemp) local stderr=$(mktemp) trap "$RM -f \"$stdout\" \"$stderr\"" EXIT local status eval "($assertion)" >"$stdout" 2>"$stderr" && status=$? || status=$? _notify_trace 1 "assert_expression: exp: '$assertion', cond: '$condition', status: '$status'" "$stdout" "$stderr" if ! eval "$condition" then fail "$(eval echo $message)" "$stdout" "$stderr" fi ) || exit $? 
}

# assert_equals <expected> <actual> [message]
assert_equals() {
  local expected=$1
  local actual=$2
  local message=${3:-}
  [[ -z $message ]] || message="$message\n"

  notify_trace_dbg "assert_equals '$expected' == '$actual'"
  if [ "$expected" != "$actual" ]
  then
    fail "$message expected [$expected] but was [$actual]"
  fi
}

# assert_not_equals <unexpected> <actual> [message]
assert_not_equals() {
  local unexpected="$1"
  local actual="$2"
  local message=${3:-}
  [[ -z $message ]] || message="$message\n"

  notify_trace_dbg "assert_not_equals: '$unexpected' != '$actual'"
  [ "$unexpected" != "$actual" ] || \
    fail "$message expected different value than [$unexpected] but was the same"
}

# assert_matches <regex> <actual> [message]
assert_matches() {
  local expected=$1
  local actual=$2
  local message=${3:-}
  [[ -z $message ]] || message="$message\n"

  notify_trace_dbg "assert_matches: '$actual' =~ '$expected'"
  if [[ ! "${actual}" =~ ${expected} ]]; then
    fail "$message expected regex [$expected] to match [$actual]"
  fi
}

# assert_not_matches <regex> <actual> [message]
assert_not_matches() {
  local unexpected=$1
  local actual=$2
  local message=${3:-}
  [[ -z $message ]] || message="$message\n"

  _notify_trace 0 "assert_not_matches: ! '$actual' =~ '$unexpected'"
  if [[ "${actual}" =~ ${unexpected} ]]; then
    fail "$message expected regex [$unexpected] should not match but matched [$actual]"
  fi
}

# assert_within_delta <expected> <actual> <max_delta> [message]
# Fails when any of the three values is not an integer or when
# |expected - actual| exceeds <max_delta>.
assert_within_delta() {
  function abs() {
    local value=$1
    local sign=$(( value < 0 ? -1 : 1 ))
    echo $((value * sign))
  }
  function is_number() {
    local value=$1
    test $value -eq $value 2>/dev/null
  }
  local expected=$1
  local actual=$2
  local max_delta=$3
  # Declared before the is_number checks so that their failure messages carry
  # the caller-supplied prefix instead of whatever $message an outer scope holds.
  local message=${4:-}
  [[ -z $message ]] || message="$message\n"

  assert "is_number $expected" "$message expected value [$expected] is not a number"
  assert "is_number $actual" "$message actual value [$actual] is not a number"
  assert "is_number $max_delta" "$message max_delta [$max_delta] is not a number"

  local actual_delta="$(abs $(($expected - $actual)))"
  if (( $actual_delta > $max_delta )); then
    fail "$message expected value [$expected] to match [$actual] with a maximum delta of [$max_delta]"
  fi
}

# assert_no_diff <expected_file> <actual_file> [message]
assert_no_diff() {
  local expected=$1
  local actual=$2
  local message=${3:-}
  [[ -z $message ]] || message="$message\n"

  assert 'diff '"${expected}"' '"${actual}" \
    "$message expected '"${actual}"' to be identical to '"${expected}"' but was different"
}

# fake <command> [replacement...]
# Replaces <command> with a shell function. With extra arguments the function
# runs them (call arguments are exposed in FAKE_PARAMS); without, it echoes the
# text read from stdin at definition time.
fake() {
  local command=$1
  shift
  if [ $# -gt 0 ]
  then
    eval "function $command() { export FAKE_PARAMS=(\"\$@\") ; $@ ; }"
  else
    eval "function $command() { echo \"$($CAT)\" ; }"
  fi
  export -f $command
}

# Prints the call stack as file:line:function(), leaving out frames that
# belong to bash_unit itself.
stacktrace() {
  local i=1
  while ! [ -z "${BASH_SOURCE[$i]:-}" ]
  do
    echo ${BASH_SOURCE[$i]}:${BASH_LINENO[$((i-1))]}:${FUNCNAME[$i]}\(\)
    i=$((i + 1))
  done | "$GREP" -v "^$BASH_SOURCE"
}

# Runs setup_suite, the tests (only when setup succeeded) and teardown_suite;
# returns the first non-zero status.
run_test_suite() {
  local failure=0

  if run_setup_suite
  then
    run_tests || failure=$?
  else
    failure=$?
fi
  run_teardown_suite

  return $failure
}

# Calls setup_suite when the sourced test file defines it.
run_setup_suite() {
  if declare -F | "$GREP" ' setup_suite$' >/dev/null
  then
    setup_suite
  fi
}

# Shuffles stdin when random ordering was requested, otherwise passes it through.
maybe_shuffle() {
  ((randomise)) && $SHUF || $CAT
}

# Reports pending/todo functions, then runs every test* function matching
# $test_pattern in its own subshell (setup -> test -> teardown). Tests matching
# $skip_pattern are reported as skipped. Returns non-zero if any test failed.
run_tests() {
  local failure=0

  for pending_test in $(set | "$GREP" -E '^(pending|todo).* \(\)' | "$GREP" -E "$test_pattern" | "$SED" -e 's: .*::')
  do
    notify_test_starting "$pending_test"
    notify_test_pending "$pending_test"
  done

  for test in $(set | "$GREP" -E '^test.* \(\)' | "$GREP" -E "$test_pattern" | "$SED" -e 's: .*::' | maybe_shuffle)
  do
    (
      local status=0
      declare -F | "$GREP" ' setup$' >/dev/null && setup
      # Marker file: skip() writes to it so that a skipped test counts as passed.
      __bash_unit_test_skipped__=$(mktemp)
      # Remove the marker when this per-test subshell exits.
      trap "$RM -f \"$__bash_unit_test_skipped__\"" EXIT
      if [[ -n "$skip_pattern" && ("$test" =~ $skip_pattern) ]]; then
        # Name the test so the skip notification does not report an empty name.
        __bash_unit_current_test__="$test"
        skip "$test as specified in skip pattern: $skip_pattern"
      fi
      (__bash_unit_current_test__="$test" run_test) || status=$?
      test -s "$__bash_unit_test_skipped__" && status=0
      declare -F | "$GREP" ' teardown$' >/dev/null && teardown
      exit $status
    )
    failure=$(( $? || failure))
  done
  return $failure
}

# Runs the current test function and reports success when it returns 0.
run_test() {
  set -e
  notify_test_starting "$__bash_unit_current_test__"
  "$__bash_unit_current_test__" && notify_test_succeeded "$__bash_unit_current_test__"
}

# Calls teardown_suite when the sourced test file defines it.
run_teardown_suite() {
  if declare -F | "$GREP" ' teardown_suite$' >/dev/null
  then
    teardown_suite
  fi
}

# usage [error message]: prints the message and the command-line help to
# stderr, then exits with 1.
usage() {
  echo "$1" >&2
  echo "$0 [-f <output format>] [-p <pattern1>] [-p <pattern2>] [-s <skip pattern>] [-t <trace file>] [-r] ... <test_file1> <test_file2> ..." >&2
  echo >&2
  echo "Runs tests in test files that match <pattern>s" >&2
  echo "Skip tests in test files that match <skip pattern>s" >&2
  echo "<output format> is optional only supported value is tap" >&2
  echo "-t <trace file> to write a detailed trace of the run to <trace file>" >&2
  echo "-r to execute test cases in random order" >&2
  echo "-v to get current version information" >&2
  echo "See https://github.com/pgrange/bash_unit" >&2
  exit 1
}

# Formating

pretty_success() {
  pretty_format "$GREEN" "\u2713" "${1:-}"
}

pretty_warning() {
  pretty_format "$YELLOW" "\u2717" "$1"
}

pretty_failure() {
  pretty_format "$RED" "\u2717" "${1:-}"
}

# pretty_format <color> <utf8 symbol> [alt symbol]
# Copies stdin to stdout in <color>, followed by the UTF-8 symbol on a UTF-8
# terminal or by the alternative symbol (when given) otherwise.
pretty_format() {
  local color="$1"
  local pretty_symbol="$2"
  local alt_symbol="${3:-}"
  local term_utf8=false
  #env
  if is_terminal && [[ "${LANG:-}" =~ .*UTF-8.* ]]
  then
    term_utf8=true
  fi
  (
    $CAT
    if $term_utf8
    then
      echo -en " $pretty_symbol "
    else
      [[ ! -z "$alt_symbol" ]] && echo -en " $alt_symbol "
    fi
  ) | color "$color"
}

# color <color> [text...]: prints the text (or stdin when no text is given),
# wrapped in colour escapes when writing to a terminal.
color() {
  _start_color() {
    if is_terminal ; then echo -en "$color" ; fi
  }
  _stop_color() {
    if is_terminal ; then echo -en "$NOCOLOR" ; fi
  }
  local color=$1
  shift
  _start_color
  if [ $# -gt 0 ]
  then
    echo $*
  else
    $CAT
  fi
  _stop_color
}

# True when stdout is a terminal or FORCE_COLOR=true.
is_terminal() {
  [ -t 1 ] || [[ "${FORCE_COLOR:-}" == true ]]
}

# trace_* helpers: write one info record per lifecycle event to the trace file.
trace_suite_starting() {
  local test_file="$1"
  notify_trace_info "Running tests in $test_file"
}

trace_test_starting() {
  local test="$1"
  notify_trace_info "Running $test"
}

trace_test_pending() {
  local test="$1"
  notify_trace_info "Pending $test"
}

trace_test_skipped() {
  local test="$1"
  local message="$2"
  notify_trace_info "Skip $test message: $message"
}

trace_test_succeeded() {
  local test="$1"
  notify_trace_info "Success $test"
}

trace_test_failed() {
  local test="$1"
  local message="$2"
  notify_trace_info "$test with message: $message"
}

trace_suites_succeded() {
  notify_trace_info "Overall result: SUCCESS"
}

trace_suites_failed() {
  notify_trace_info "Overall result: FAILURE"
}

# Installs the notify_* callbacks for the human-readable text output.
text_format() {
  notify_suite_starting() {
    local test_file="$1"
    trace_suite_starting $test_file
    echo "Running tests in $test_file"
  }
  notify_test_starting() {
    local test="$1"
trace_test_starting $test echo -e -n "\tRunning $test ... " | color "$BLUE" } notify_test_pending() { local test="$1" trace_test_pending "$test" echo -n "PENDING" | pretty_warning echo } notify_test_skipped() { local test="$1" local message="$2" trace_test_skipped "$test" "$message" echo -n "SKIPPED" | pretty_warning [[ -z $message ]] || printf -- "$message\n" echo } notify_test_succeeded() { local test="$1" trace_test_succeeded "$test" echo -n "SUCCESS" | pretty_success echo } notify_test_failed() { local test="$1" local message="$2" trace_test_failed "$test" "$message" echo -n "FAILURE" | pretty_failure echo [[ -z $message ]] || printf -- "$message\n" } notify_stdout() { "$SED" 's:^:out> :' | color "$GREEN" } notify_stderr() { "$SED" 's:^:err> :' | color "$RED" } notify_stack() { color "$YELLOW" } notify_suites_succeded() { trace_suites_succeded echo -n "Overall result: SUCCESS" | pretty_success echo } notify_suites_failed() { trace_suites_failed echo -n "Overall result: FAILURE" | pretty_failure echo } } tap_format() { notify_suite_starting() { local test_file="$1" trace_suite_starting echo "# Running tests in $test_file" } notify_test_starting() { trace_test_starting $1 } notify_test_pending() { local test="$1" trace_test_pending "$test" echo -n "ok" | pretty_warning - echo -n "$test" | color "$BLUE" echo " # skip test to be written" | color "$YELLOW" } notify_test_skipped() { local test="$1" local message="$2" trace_test_skipped "$test" "$message" echo -n "ok" | pretty_warning - echo -n "$test" | color "$BLUE" echo " # skip ${message}" | color "$YELLOW" } notify_test_succeeded() { local test="$1" trace_test_succeeded "$test" echo -n "ok" | pretty_success - echo "$test" | color "$BLUE" } notify_test_failed() { local test="$1" local message="$2" trace_test_failed "$test" "$message" echo -n "not ok" | pretty_failure - echo "$test" | color "$BLUE" [[ -z $message ]] || printf -- "$message\n" | "$SED" -u -e 's/^/# /' } notify_stdout() { "$SED" 's:^:# out> :' | color 
"$GREEN" } notify_stderr() { "$SED" 's:^:# err> :' | color "$RED" } notify_stack() { "$SED" 's:^:# :' | color "$YELLOW" } notify_suites_succeded() { trace_suites_succeded } notify_suites_failed() { trace_suites_failed } } output_format=text test_pattern="" skip_pattern="" trace_file="" separator="" randomise=0 while getopts "vp:t:f:r:s:" option do case "$option" in p) test_pattern="${test_pattern}${separator}${OPTARG}" separator="|" ;; s) skip_pattern="${skip_pattern}${separator}${OPTARG}" separator="|" ;; t) trace_file="$(realpath ${OPTARG})" truncate -s0 "$trace_file" ;; f) output_format="${OPTARG}" ;; r) randomise=1 ;; v) echo "bash_unit $VERSION" exit ;; ?|:) usage ;; esac done shift $((OPTIND-1)) for test_file in "$@" do test -e "$test_file" || usage "file does not exist: $test_file" test -r "$test_file" || usage "can not read file: $test_file" done case "$output_format" in text) text_format ;; tap) tap_format ;; *) usage "unsupported output format: $output_format" ;; esac #run tests received as parameters failure=0 for test_file in "$@" do notify_suite_starting "$test_file" ( set -e # Ensure bash_unit will exit with failure # in case of syntax error. if [[ "${STICK_TO_CWD}" != true ]] then cd "$(dirname "$test_file")" source "$(basename "$test_file")" else source "$test_file" fi set +e run_test_suite ) failure=$(( $? 
|| failure))
done

if ((failure))
then
  notify_suites_failed
else
  notify_suites_succeded
fi
exit $failure


================================================
FILE: test/images/nvidia/gpu_unit_tests/tests/common.sh
================================================
#!/bin/bash

# Prints the EC2 instance type of the current host.
# FORCE_INSTANCE_TYPE, when set, is printed as-is and the metadata service is
# not contacted. Otherwise the instance metadata service is queried, with a
# session token when one can be obtained and without one as a fallback.
get_instance_type() {
  # Retrieve instance metadata: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html#instance-metadata-retrieval-examples
  if [ -n "$FORCE_INSTANCE_TYPE" ]; then
    echo "$FORCE_INSTANCE_TYPE"
    # Return here: falling through would print a second instance type after
    # the forced one.
    return
  fi
  local token=$(curl -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 21600" 2>/dev/null)
  if [ -n "$token" ]; then
    curl -H "X-aws-ec2-metadata-token: $token" http://169.254.169.254/latest/meta-data/instance-type
  else
    curl http://169.254.169.254/latest/meta-data/instance-type
  fi
}

# Fails the current test when nvidia-smi reports any compute process on a GPU.
assert_gpu_unused() {
  cmd="nvidia-smi --query-compute-apps timestamp,gpu_bus_id,gpu_uuid,pid,name,used_memory --format csv,noheader"
  assert_equals "" "`$cmd`" "gpu is busy by other task, system misconfig?"
}

# _assert_data <expected_file> <command> [message]
# Runs <command> through eval, stores its stdout under $ACTUAL_RESULTS using the
# base name of <expected_file>, and fails the test when `diff -up` between the
# expected file and that output prints anything.
# NOTE(review): only diff's stdout is inspected. If <expected_file> does not
# exist, diff reports the problem on stderr and the check passes silently --
# confirm whether that is intended for instance types without recorded data.
_assert_data() {
  local expected="$1"
  local cmd="$2"
  local message="${3:-}"
  local cmd_out="$ACTUAL_RESULTS/$(basename $expected)"
  [[ -z $message ]] || message="$message\n"

  eval "$cmd" > $cmd_out
  diff_cmd="diff -up $expected $cmd_out"
  diff_out="`$diff_cmd`"
  notify_trace_dbg "_assert_data $diff_cmd, out: $diff_out"
  if [ -n "$diff_out" ]
  then
    fail "$message test data value diff:\n$diff_out"
  fi
}

# assert_data <expected_file> <command> <message>
# Public entry point used by the test suites; test_sysinfo.sh redefines it to
# call generate_data when GENERATE_DATA is set.
assert_data() {
  _assert_data "$1" "$2" "$3"
}

# generate_data <expected_file> <command> <message>
# Overwrites <expected_file> with the current output of <command>, then runs
# the regular comparison against it.
generate_data() {
  local expected="$1"
  local cmd="$2"
  local msg="$3"
  # NOTE(review): cmd_out is assigned but never used in this function.
  local cmd_out="$ACTUAL_RESULTS/$(basename $expected)"

  eval "$cmd" > $expected
  _assert_data "$expected" "$cmd" "$msg"
}

# Returns 0 when the instance type (EC2_INSTANCE_TYPE, or the value printed by
# get_instance_type when unset) matches g6f.* or gr6f.*, and 1 otherwise.
function is_vgpu() {
  local instance_type=${EC2_INSTANCE_TYPE:-$(get_instance_type)}
  case "${instance_type}" in
    g6f.*|gr6f.*) return ;;
    *) return 1 ;; # Not supported
  esac
}


================================================
FILE: test/images/nvidia/gpu_unit_tests/tests/test_basic.sh
================================================
# Trivial cuda tests to validate that the GPU is functional
# Use demo-suite binaries https://docs.nvidia.com/cuda/demo-suite/index.html
# and DCGM Diagnostics https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/dcgm-diagnostics.html#run-levels-and-tests

# Loads the shared helpers, requires idle GPUs and resolves the directory that
# holds the demo binaries (overridable through DEMO_SUITE_DIR).
setup_suite() {
  source common.sh
  assert_gpu_unused
  DEMO_SUITE_DIR=${DEMO_SUITE_DIR:-$(realpath /usr/local/cuda/extras/demo_suite)}
}

# The GPUs must be idle again once the suite has finished.
teardown_suite() {
  assert_gpu_unused
}

test_01_device_query() {
  assert_status_code 0 "$DEMO_SUITE_DIR/deviceQuery"
}

test_02_vector_add() {
  assert_status_code 0 "$DEMO_SUITE_DIR/vectorAdd"
}

test_03_nvbandwidth() {
  assert_status_code 0 "$DEMO_SUITE_DIR/nvbandwidth"
}

test_04_dcgm_diagnostics() {
  # This test is not applicable for vGPU instance types.
if is_vgpu; then skip "This test does not apply to vGPU instances (g6f.*, gr6f.*)" fi # https://docs.nvidia.com/datacenter/dcgm/latest/user-guide/dcgm-diagnostics.html#run-levels-and-tests assert_status_code 0 "dcgmi diag -r 2" } ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh ================================================ # Validate basic system configuration by comparing with expected config # setup_suite() { source common.sh EC2_INSTANCE_TYPE=${EC2_INSTANCE_TYPE:-$(get_instance_type)} data=test_sysinfo.sh.data/$EC2_INSTANCE_TYPE ACTUAL_RESULTS=`mktemp -t -d test_sysinfo.sh.actual-data.XXX` assert_not_equals "" "$ACTUAL_RESULTS" notify_trace_info "ACTUAL_RESULTS: $ACTUAL_RESULTS" if [ -n "$GENERATE_DATA" ] then echo "GENERATE_DATA is enabled..." mkdir -p $data function assert_data() { generate_data "$@" } fi } teardown_suite() { assert "test -z \"$GENERATE_DATA\"" "GENERATE_DATA was enabled, fail full suite" assert_gpu_unused } test_numa_topo_topo() { assert_data $data/numa_topo.txt "grep . /sys/devices/system/node/node*/{cpulist,distance}" "Unexpected cpu topology" } test_nvidia_gpu_count() { #Just for logging purposesclear assert_status_code 0 "nvidia-smi -q" assert_data $data/gpu_count.txt "nvidia-smi --query-gpu=name,index,pci.bus_id --format csv" "Unexpected gpu count" } test_nvidia_smi_topo() { assert_data $data/nvidia_smi_topo.txt "nvidia-smi topo -m | grep GPU | cut -f 1-11" \ "Unexpected gpu topology, likely broken nvlinks" } test_nvidia_persistence_status() { assert_data $data/nvidia_persistence_status.txt "nvidia-smi --query-gpu=name,pci.bus_id,persistence_mode --format=csv" \ "Unexpected perfistance status, likely system configuration issue" } test_nvidia_gpu_unused() { assert_gpu_unused } test_nvidia_gpu_throttled() { # vGPU instances don't support GPU clock throttling detection. # This test is not applicable for vGPU instance types. 
if is_vgpu; then
    skip "This test does not apply to vGPU instances (g6f.*, gr6f.*)"
  fi
  # https://docs.nvidia.com/deploy/nvml-api/group__nvmlClocksEventReasons.html#group__nvmlClocksEventReasons
  # Lines reporting one of the three masks listed in the filter are dropped;
  # any remaining line means a GPU reports some other clock event reason.
  # Per the NVML reference above, 0x1 is nvmlClocksEventReasonGpuIdle; 0x0 is
  # presumably "no reason active" and 0x4 the software power cap -- TODO confirm
  # that tolerating 0x4 is intended.
  # The assertion expects exit status 1 from the pipeline, i.e. the filter
  # printed nothing.
  filter="egrep -v -e '(0x0000000000000000|0x0000000000000001|0x0000000000000004)'"
  cmd="nvidia-smi --query-gpu index,gpu_bus_id,gpu_uuid,clocks_throttle_reasons.active --format=csv,noheader"
  assert_status_code 1 "$cmd | $filter" "Throttled gpu detected"
}

# Only meaningful on vGPU instance types: compares the "vGPU Software" section
# of `nvidia-smi -q` with the recorded expectation.
test_nvidia_vgpu_license_status() {
  if ! is_vgpu; then
    skip "This test only applies to vGPU instances (g6f.*, gr6f.*)"
  fi

  assert_data $data/nvidia_vgpu_license_status.txt \
    "nvidia-smi -q | grep 'vGPU Software' -A 2" \
    "vGPU license status validation failed"
}


================================================
FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5.48xlarge/gpu_count.txt
================================================
name, index, pci.bus_id
NVIDIA A10G, 0, 00000000:00:16.0
NVIDIA A10G, 1, 00000000:00:17.0
NVIDIA A10G, 2, 00000000:00:18.0
NVIDIA A10G, 3, 00000000:00:19.0
NVIDIA A10G, 4, 00000000:00:1A.0
NVIDIA A10G, 5, 00000000:00:1B.0
NVIDIA A10G, 6, 00000000:00:1C.0
NVIDIA A10G, 7, 00000000:00:1D.0


================================================
FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5.48xlarge/numa_topo.txt
================================================
/sys/devices/system/node/node0/cpulist:0-47,96-143
/sys/devices/system/node/node1/cpulist:48-95,144-191
/sys/devices/system/node/node0/distance:10 32
/sys/devices/system/node/node1/distance:32 10


================================================
FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5.48xlarge/nvidia_persistence_status.txt
================================================
name, pci.bus_id, persistence_mode
NVIDIA A10G, 00000000:00:16.0, Enabled
NVIDIA A10G, 00000000:00:17.0, Enabled
NVIDIA A10G,
00000000:00:18.0, Enabled NVIDIA A10G, 00000000:00:19.0, Enabled NVIDIA A10G, 00000000:00:1A.0, Enabled NVIDIA A10G, 00000000:00:1B.0, Enabled NVIDIA A10G, 00000000:00:1C.0, Enabled NVIDIA A10G, 00000000:00:1D.0, Enabled ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5.48xlarge/nvidia_smi_topo.txt ================================================ GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 CPU Affinity NUMA Affinity GPU0 X PHB PHB PHB PHB PHB PHB PHB 0-191 0-1 GPU1 PHB X PHB PHB PHB PHB PHB PHB 0-191 0-1 GPU2 PHB PHB X PHB PHB PHB PHB PHB 0-191 0-1 GPU3 PHB PHB PHB X PHB PHB PHB PHB 0-191 0-1 GPU4 PHB PHB PHB PHB X PHB PHB PHB 0-191 0-1 GPU5 PHB PHB PHB PHB PHB X PHB PHB 0-191 0-1 GPU6 PHB PHB PHB PHB PHB PHB X PHB 0-191 0-1 GPU7 PHB PHB PHB PHB PHB PHB PHB X 0-191 0-1 ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5.8xlarge/gpu_count.txt ================================================ name, index, pci.bus_id NVIDIA A10G, 0, 00000000:00:1E.0 ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5.8xlarge/numa_topo.txt ================================================ /sys/devices/system/node/node0/cpulist:0-31 /sys/devices/system/node/node0/distance:10 ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5.8xlarge/nvidia_persistence_status.txt ================================================ name, pci.bus_id, persistence_mode NVIDIA A10G, 00000000:00:1E.0, Enabled ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5.8xlarge/nvidia_smi_topo.txt ================================================ GPU0 CPU Affinity NUMA Affinity GPU NUMA ID GPU0 X 0-31 0 N/A ================================================ FILE: 
test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5g.2xlarge/gpu_count.txt ================================================ name, index, pci.bus_id NVIDIA T4G, 0, 00000000:00:1F.0 ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5g.2xlarge/numa_topo.txt ================================================ /sys/devices/system/node/node0/cpulist:0-7 /sys/devices/system/node/node0/distance:10 ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5g.2xlarge/nvidia_persistence_status.txt ================================================ name, pci.bus_id, persistence_mode NVIDIA T4G, 00000000:00:1F.0, Enabled ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g5g.2xlarge/nvidia_smi_topo.txt ================================================ GPU0 CPU Affinity NUMA Affinity GPU NUMA ID GPU0 X 0-7 0 N/A ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.2xlarge/efa_count.txt ================================================ 0 ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.2xlarge/gpu_count.txt ================================================ name, index, pci.bus_id NVIDIA L4-6Q, 0, 00000000:31:00.0 ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.2xlarge/numa_topo.txt ================================================ /sys/devices/system/node/node0/cpulist:0-7 /sys/devices/system/node/node0/distance:10 ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.2xlarge/nvidia_persistence_status.txt ================================================ name, pci.bus_id, persistence_mode NVIDIA L4-6Q, 00000000:31:00.0, 
Enabled ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.2xlarge/nvidia_smi_topo.txt ================================================ GPU0 CPU Affinity NUMA Affinity GPU NUMA ID GPU0 X 0-7 0 N/A ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.2xlarge/nvidia_vgpu_license_status.txt ================================================ vGPU Software Licensed Product Product Name : NVIDIA RTX Virtual Workstation License Status : Licensed (Expiry: N/A) ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/efa_count.txt ================================================ 0 ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/gpu_count.txt ================================================ name, index, pci.bus_id NVIDIA L4-12Q, 0, 00000000:35:00.0 ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/numa_topo.txt ================================================ /sys/devices/system/node/node0/cpulist:0-15 /sys/devices/system/node/node0/distance:10 ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/nvidia_persistence_status.txt ================================================ name, pci.bus_id, persistence_mode NVIDIA L4-12Q, 00000000:35:00.0, Enabled ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/nvidia_smi_topo.txt ================================================ GPU0 CPU Affinity NUMA Affinity GPU NUMA ID GPU0 X 0-15 0 N/A ================================================ FILE: 
test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.4xlarge/nvidia_vgpu_license_status.txt ================================================ vGPU Software Licensed Product Product Name : NVIDIA RTX Virtual Workstation License Status : Licensed (Expiry: N/A) ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.large/efa_count.txt ================================================ 0 ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.large/gpu_count.txt ================================================ name, index, pci.bus_id NVIDIA L4-3Q, 0, 00000000:31:00.0 ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.large/numa_topo.txt ================================================ /sys/devices/system/node/node0/cpulist:0-1 /sys/devices/system/node/node0/distance:10 ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.large/nvidia_persistence_status.txt ================================================ name, pci.bus_id, persistence_mode NVIDIA L4-3Q, 00000000:31:00.0, Enabled ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.large/nvidia_smi_topo.txt ================================================ GPU0 CPU Affinity NUMA Affinity GPU NUMA ID GPU0 X 0-1 0 N/A ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.large/nvidia_vgpu_license_status.txt ================================================ vGPU Software Licensed Product Product Name : NVIDIA RTX Virtual Workstation License Status : Licensed (Expiry: N/A) ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.xlarge/efa_count.txt 
================================================ 0 ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.xlarge/gpu_count.txt ================================================ name, index, pci.bus_id NVIDIA L4-3Q, 0, 00000000:31:00.0 ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.xlarge/numa_topo.txt ================================================ /sys/devices/system/node/node0/cpulist:0-3 /sys/devices/system/node/node0/distance:10 ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.xlarge/nvidia_persistence_status.txt ================================================ name, pci.bus_id, persistence_mode NVIDIA L4-3Q, 00000000:31:00.0, Enabled ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.xlarge/nvidia_smi_topo.txt ================================================ GPU0 CPU Affinity NUMA Affinity GPU NUMA ID GPU0 X 0-3 0 N/A ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/g6f.xlarge/nvidia_vgpu_license_status.txt ================================================ vGPU Software Licensed Product Product Name : NVIDIA RTX Virtual Workstation License Status : Licensed (Expiry: N/A) ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p3.16xlarge/gpu_count.txt ================================================ name, index, pci.bus_id Tesla V100-SXM2-16GB, 0, 00000000:00:17.0 Tesla V100-SXM2-16GB, 1, 00000000:00:18.0 Tesla V100-SXM2-16GB, 2, 00000000:00:19.0 Tesla V100-SXM2-16GB, 3, 00000000:00:1A.0 Tesla V100-SXM2-16GB, 4, 00000000:00:1B.0 Tesla V100-SXM2-16GB, 5, 00000000:00:1C.0 Tesla V100-SXM2-16GB, 6, 00000000:00:1D.0 Tesla V100-SXM2-16GB, 7, 00000000:00:1E.0 
================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p3.16xlarge/numa_topo.txt ================================================ /sys/devices/system/node/node0/cpulist:0-15,32-47 /sys/devices/system/node/node1/cpulist:16-31,48-63 /sys/devices/system/node/node0/distance:10 21 /sys/devices/system/node/node1/distance:21 10 ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p3.16xlarge/nvidia_persistence_status.txt ================================================ name, pci.bus_id, persistence_mode Tesla V100-SXM2-16GB, 00000000:00:17.0, Enabled Tesla V100-SXM2-16GB, 00000000:00:18.0, Enabled Tesla V100-SXM2-16GB, 00000000:00:19.0, Enabled Tesla V100-SXM2-16GB, 00000000:00:1A.0, Enabled Tesla V100-SXM2-16GB, 00000000:00:1B.0, Enabled Tesla V100-SXM2-16GB, 00000000:00:1C.0, Enabled Tesla V100-SXM2-16GB, 00000000:00:1D.0, Enabled Tesla V100-SXM2-16GB, 00000000:00:1E.0, Enabled ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p3.16xlarge/nvidia_smi_topo.txt ================================================ GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 CPU Affinity NUMA Affinity GPU0 X NV1 NV1 NV2 NV2 PHB PHB PHB 0-63 0-1 GPU1 NV1 X NV2 NV1 PHB NV2 PHB PHB 0-63 0-1 GPU2 NV1 NV2 X NV2 PHB PHB NV1 PHB 0-63 0-1 GPU3 NV2 NV1 NV2 X PHB PHB PHB NV1 0-63 0-1 GPU4 NV2 PHB PHB PHB X NV1 NV1 NV2 0-63 0-1 GPU5 PHB NV2 PHB PHB NV1 X NV2 NV1 0-63 0-1 GPU6 PHB PHB NV1 PHB NV1 NV2 X NV2 0-63 0-1 GPU7 PHB PHB PHB NV1 NV2 NV1 NV2 X 0-63 0-1 ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p3.2xlarge/gpu_count.txt ================================================ name, index, pci.bus_id Tesla V100-SXM2-16GB, 0, 00000000:00:1E.0 ================================================ FILE: 
test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p3.2xlarge/numa_topo.txt ================================================ /sys/devices/system/node/node0/cpulist:0-7 /sys/devices/system/node/node0/distance:10 ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p3.2xlarge/nvidia_persistence_status.txt ================================================ name, pci.bus_id, persistence_mode Tesla V100-SXM2-16GB, 00000000:00:1E.0, Enabled ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p3.2xlarge/nvidia_smi_topo.txt ================================================ GPU0 CPU Affinity NUMA Affinity GPU NUMA ID GPU0 X 0-7 0 N/A ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4d.24xlarge/gpu_count.txt ================================================ name, index, pci.bus_id NVIDIA A100-SXM4-40GB, 0, 00000000:10:1C.0 NVIDIA A100-SXM4-40GB, 1, 00000000:10:1D.0 NVIDIA A100-SXM4-40GB, 2, 00000000:20:1C.0 NVIDIA A100-SXM4-40GB, 3, 00000000:20:1D.0 NVIDIA A100-SXM4-40GB, 4, 00000000:90:1C.0 NVIDIA A100-SXM4-40GB, 5, 00000000:90:1D.0 NVIDIA A100-SXM4-40GB, 6, 00000000:A0:1C.0 NVIDIA A100-SXM4-40GB, 7, 00000000:A0:1D.0 ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4d.24xlarge/numa_topo.txt ================================================ /sys/devices/system/node/node0/cpulist:0-23,48-71 /sys/devices/system/node/node1/cpulist:24-47,72-95 /sys/devices/system/node/node0/distance:10 21 /sys/devices/system/node/node1/distance:21 10 ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4d.24xlarge/nvidia_persistence_status.txt ================================================ name, pci.bus_id, persistence_mode NVIDIA A100-SXM4-40GB, 00000000:10:1C.0, 
Enabled NVIDIA A100-SXM4-40GB, 00000000:10:1D.0, Enabled NVIDIA A100-SXM4-40GB, 00000000:20:1C.0, Enabled NVIDIA A100-SXM4-40GB, 00000000:20:1D.0, Enabled NVIDIA A100-SXM4-40GB, 00000000:90:1C.0, Enabled NVIDIA A100-SXM4-40GB, 00000000:90:1D.0, Enabled NVIDIA A100-SXM4-40GB, 00000000:A0:1C.0, Enabled NVIDIA A100-SXM4-40GB, 00000000:A0:1D.0, Enabled ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4d.24xlarge/nvidia_smi_topo.txt ================================================ GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 CPU Affinity NUMA Affinity GPU0 X NV12 NV12 NV12 NV12 NV12 NV12 NV12 0-23,48-71 0 GPU1 NV12 X NV12 NV12 NV12 NV12 NV12 NV12 0-23,48-71 0 GPU2 NV12 NV12 X NV12 NV12 NV12 NV12 NV12 0-23,48-71 0 GPU3 NV12 NV12 NV12 X NV12 NV12 NV12 NV12 0-23,48-71 0 GPU4 NV12 NV12 NV12 NV12 X NV12 NV12 NV12 24-47,72-95 1 GPU5 NV12 NV12 NV12 NV12 NV12 X NV12 NV12 24-47,72-95 1 GPU6 NV12 NV12 NV12 NV12 NV12 NV12 X NV12 24-47,72-95 1 GPU7 NV12 NV12 NV12 NV12 NV12 NV12 NV12 X 24-47,72-95 1 ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4de.24xlarge/gpu_count.txt ================================================ name, index, pci.bus_id NVIDIA A100-SXM4-80GB, 0, 00000000:10:1C.0 NVIDIA A100-SXM4-80GB, 1, 00000000:10:1D.0 NVIDIA A100-SXM4-80GB, 2, 00000000:20:1C.0 NVIDIA A100-SXM4-80GB, 3, 00000000:20:1D.0 NVIDIA A100-SXM4-80GB, 4, 00000000:90:1C.0 NVIDIA A100-SXM4-80GB, 5, 00000000:90:1D.0 NVIDIA A100-SXM4-80GB, 6, 00000000:A0:1C.0 NVIDIA A100-SXM4-80GB, 7, 00000000:A0:1D.0 ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4de.24xlarge/numa_topo.txt ================================================ /sys/devices/system/node/node0/cpulist:0-23,48-71 /sys/devices/system/node/node1/cpulist:24-47,72-95 /sys/devices/system/node/node0/distance:10 21 
/sys/devices/system/node/node1/distance:21 10 ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4de.24xlarge/nvidia_persistence_status.txt ================================================ name, pci.bus_id, persistence_mode NVIDIA A100-SXM4-80GB, 00000000:10:1C.0, Enabled NVIDIA A100-SXM4-80GB, 00000000:10:1D.0, Enabled NVIDIA A100-SXM4-80GB, 00000000:20:1C.0, Enabled NVIDIA A100-SXM4-80GB, 00000000:20:1D.0, Enabled NVIDIA A100-SXM4-80GB, 00000000:90:1C.0, Enabled NVIDIA A100-SXM4-80GB, 00000000:90:1D.0, Enabled NVIDIA A100-SXM4-80GB, 00000000:A0:1C.0, Enabled NVIDIA A100-SXM4-80GB, 00000000:A0:1D.0, Enabled ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p4de.24xlarge/nvidia_smi_topo.txt ================================================ GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 CPU Affinity NUMA Affinity GPU0 X NV12 NV12 NV12 NV12 NV12 NV12 NV12 0-23,48-71 0 GPU1 NV12 X NV12 NV12 NV12 NV12 NV12 NV12 0-23,48-71 0 GPU2 NV12 NV12 X NV12 NV12 NV12 NV12 NV12 0-23,48-71 0 GPU3 NV12 NV12 NV12 X NV12 NV12 NV12 NV12 0-23,48-71 0 GPU4 NV12 NV12 NV12 NV12 X NV12 NV12 NV12 24-47,72-95 1 GPU5 NV12 NV12 NV12 NV12 NV12 X NV12 NV12 24-47,72-95 1 GPU6 NV12 NV12 NV12 NV12 NV12 NV12 X NV12 24-47,72-95 1 GPU7 NV12 NV12 NV12 NV12 NV12 NV12 NV12 X 24-47,72-95 1 ================================================ FILE: test/images/nvidia/gpu_unit_tests/tests/test_sysinfo.sh.data/p5.48xlarge/gpu_count.txt ================================================ name, index, pci.bus_id NVIDIA H100 80GB HBM3, 0, 00000000:53:00.0 NVIDIA H100 80GB HBM3, 1, 00000000:64:00.0 NVIDIA H100 80GB HBM3, 2, 00000000:75:00.0 NVIDIA H100 80GB HBM3, 3, 00000000:86:00.0 NVIDIA H100 80GB HBM3, 4, 00000000:97:00.0 NVIDIA H100 80GB HBM3, 5, 00000000:A8:00.0 NVIDIA H100 80GB HBM3, 6, 00000000:B9:00.0 NVIDIA H100 80GB HBM3, 7, 00000000:CA:00.0 ================================================ 
#!/usr/bin/env bash

# Runs every gpu_unit_tests/tests/*test*.sh suite through bash_unit with TAP
# output, bounded by an overall timeout.

set -o errexit
set -o nounset
set -o pipefail

TRACE_LOG=trace.log
# Overall time budget in seconds, passed to timeout(1).
TEST_TIMEOUT=3600
# Named BASH_BIN rather than BASH so that bash's own special BASH variable is
# not overwritten.
BASH_BIN="/usr/bin/bash"
# Optional extra bash_unit arguments supplied by the caller; defaults to empty.
SKIP_TESTS_SUBCOMMAND=${SKIP_TESTS_SUBCOMMAND:-""}

# SKIP_TESTS_SUBCOMMAND is intentionally left unquoted: it may hold several
# whitespace-separated arguments, or nothing at all.
# shellcheck disable=SC2086
timeout -k 10 "${TEST_TIMEOUT}" "${BASH_BIN}" gpu_unit_tests/bash_unit -f tap ${SKIP_TESTS_SUBCOMMAND} -t "gpu_unit_tests/${TRACE_LOG}" gpu_unit_tests/tests/*test*.sh
logging.basicConfig(
    level=logging.INFO,
    format='[%(asctime)s] [%(levelname)s] [%(name)s] %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)]
)
logger = logging.getLogger("BERTInference")


def create_dummy_data(tokenizer, batch_size, num_samples=100, max_length=128, seed=42):
    """
    Builds a small NSP-style dataset of sentence pairs.

    Roughly half of the pairs use the sentence that follows sentence A in the
    sample list; the other half use a randomly drawn second sentence.
    'num_samples' is rounded down to a multiple of 'batch_size' so that every
    batch is full.

    Labels follow the Hugging Face next-sentence convention: 0 means sentence B
    continues sentence A, 1 means sentence B is random. A randomly drawn
    sentence that happens to be the true continuation is labelled 0.

    Args:
        tokenizer: Callable taking (sentences_a, sentences_b, max_length=...,
            padding=..., truncation=..., return_tensors=...) and returning an
            object with 'input_ids' and 'attention_mask' tensors.
        batch_size: Positive batch size used to round 'num_samples'.
        num_samples: Requested number of sentence pairs.
        max_length: Padded/truncated token length passed to the tokenizer.
        seed: Seed for the private random generator.

    Returns:
        TensorDataset of (input_ids, attention_mask, nsp_labels).

    Raises:
        ValueError: If 'batch_size' is not positive or no full batch fits
            into 'num_samples'.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    # A private generator yields the same sequence as random.seed(seed) would,
    # without disturbing the process-wide random state.
    rng = random.Random(seed)

    if num_samples % batch_size != 0:
        adjusted = (num_samples // batch_size) * batch_size
        logger.info(
            f"[INFO] Adjusting num_samples from {num_samples} to {adjusted} "
            f"to ensure full batches."
        )
        num_samples = adjusted

    if num_samples <= 0:
        raise ValueError("num_samples must be at least batch_size to form one full batch")

    sample_sentences = [
        "The dog loves playing fetch in the park.",
        "Artificial intelligence is reshaping the future.",
        "Movies with complex storylines can be very engaging.",
        "This restaurant serves an amazing brunch on weekends.",
        "Many researchers are exploring neural network architectures.",
        "A day at the beach can reduce stress and improve well-being.",
        "ChatGPT is a popular large language model by OpenAI.",
        "The annual developer conference showcased innovative technologies.",
        "Hiking in the mountains offers both challenge and relaxation.",
        "Robotics and automation are revolutionizing many industries.",
    ]
    last_index = len(sample_sentences) - 1

    sentences_a = []
    sentences_b = []
    nsp_labels = []

    for _ in range(num_samples):
        idx_a = rng.randint(0, last_index)
        next_idx = (idx_a + 1) % len(sample_sentences)
        if rng.random() < 0.5:
            idx_b = next_idx
        else:
            idx_b = rng.randint(0, last_index)
        # 0 = B is the continuation of A, 1 = B is a random sentence.
        nsp_labels.append(0 if idx_b == next_idx else 1)

        sentences_a.append(sample_sentences[idx_a])
        sentences_b.append(sample_sentences[idx_b])

    tokenized_inputs = tokenizer(
        sentences_a,
        sentences_b,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

    return TensorDataset(
        tokenized_inputs.input_ids,
        tokenized_inputs.attention_mask,
        torch.tensor(nsp_labels, dtype=torch.long)
    )


def run_inference(model, tokenizer, batch_size, mode, device):
    """
    Runs a dummy BERT inference workload and logs timing statistics.

    The model is moved to 'device', switched to eval mode and fed the dataset
    from create_dummy_data() batch by batch. Only the forward pass is timed.
    On CUDA devices the device is synchronized before and after the forward
    pass, because kernels are launched asynchronously and an unsynchronized
    wall-clock reading would only capture the launch overhead.

    Args:
        model: Module accepting input_ids, attention_mask and
            next_sentence_label keyword arguments.
        tokenizer: Tokenizer forwarded to create_dummy_data().
        batch_size: Number of samples per batch.
        mode: Label included in the metrics log line.
        device: Target device (main() always passes a CUDA device).

    Raises:
        Exception: Any error from data creation or the forward pass is logged
            and re-raised.
    """
    model.to(device)
    model.eval()

    try:
        dataset = create_dummy_data(tokenizer, batch_size=batch_size, num_samples=100, max_length=128)
    except Exception:
        logger.exception("[ERROR] Failed to create dummy data.")
        raise

    dataloader = DataLoader(dataset, batch_size=batch_size)
    needs_sync = torch.device(device).type == "cuda"

    total_time = 0.0
    total_samples = 0
    total_batches = len(dataloader)

    with torch.no_grad():
        for batch_idx, batch in enumerate(dataloader):
            try:
                inputs, masks, next_sentence_labels = batch
                inputs, masks, next_sentence_labels = (
                    inputs.to(device),
                    masks.to(device),
                    next_sentence_labels.to(device),
                )

                # Drain pending work (e.g. the host-to-device copies) so it is
                # not attributed to the forward pass.
                if needs_sync:
                    torch.cuda.synchronize(device)
                start_time = time.perf_counter()
                _ = model(
                    input_ids=inputs,
                    attention_mask=masks,
                    next_sentence_label=next_sentence_labels
                )
                # Wait for the kernels to finish before reading the clock.
                if needs_sync:
                    torch.cuda.synchronize(device)
                end_time = time.perf_counter()
            except Exception:
                logger.exception(f"[ERROR] Inference failed on batch {batch_idx}.")
                raise

            total_time += (end_time - start_time)
            total_samples += inputs.size(0)

    if total_time == 0.0:
        # Nothing measurable was timed; avoid dividing by zero.
        avg_time_per_batch = float('inf')
        throughput = 0.0
    else:
        avg_time_per_batch = total_time / total_batches
        throughput = total_samples / total_time

    logger.info(
        "[BERT_INFERENCE_METRICS] "
        f"mode={mode} "
        f"avg_time_per_batch={avg_time_per_batch:.6f} "
        f"throughput_samples_per_sec={throughput:.6f}"
    )


def main():
    """
    Main entry point.

    Exits with status 1 when no GPU is available or the model/tokenizer cannot
    be loaded. Reads INFERENCE_MODE from the environment ("throughput" or
    "latency", default "throughput"; anything else falls back to "throughput"),
    picks batch size 1 for latency mode and 8 otherwise, then runs inference.
    """
    if not torch.cuda.is_available():
        logger.error("[ERROR] GPU is not available. Exiting.")
        sys.exit(1)

    device = torch.device("cuda")
    num_gpus = torch.cuda.device_count()
    logger.info(f"[INFO] Found {num_gpus} GPU(s). GPU is available.")

    mode = os.environ.get("INFERENCE_MODE", "throughput").lower()
    if mode not in ["throughput", "latency"]:
        logger.warning(
            f"[WARNING] Unrecognized INFERENCE_MODE '{mode}'. "
            "Falling back to 'throughput'."
        )
        mode = "throughput"

    batch_size = 1 if mode == "latency" else 8
    logger.info(f"[INFO] Running inference in {mode} mode with batch size {batch_size}.")

    try:
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        model = BertForPreTraining.from_pretrained("bert-base-uncased")
    except Exception:
        logger.exception("[ERROR] Failed to load model/tokenizer. Exiting.")
        sys.exit(1)

    run_inference(model, tokenizer, batch_size, mode, device)


if __name__ == "__main__":
    main()
make -j $(nproc) \ && make install \ && cd && rm -rf /tmp/Python-$PYTHON_VERSION RUN ln -s /usr/local/bin/pip3 /usr/bin/pip \ && ln -s /usr/local/bin/$PYTHON /usr/local/bin/python \ && pip --no-cache-dir install --upgrade pip setuptools # Install Pytorch from Source ARG PYTORCH_BRANCH=v2.6.0 ARG PYTORCH_BUILD_ENV="MAX_JOBS=8 BUILD_TEST=0" ENV CUDA_HOME=/usr/local/cuda ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64 ENV PATH=$PATH:$CUDA_HOME/bin ENV TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;8.7;8.9;9.0;10.0;12.0" RUN pip install typing-extensions sympy pyyaml RUN git clone https://github.com/pytorch/pytorch.git /tmp/pytorch \ --recursive \ --branch $PYTORCH_BRANCH \ && cd /tmp/pytorch \ && eval "$PYTORCH_BUILD_ENV python3 setup.py install" \ && cd && rm -rf /tmp/pytorch RUN apt-get update -y && \ apt-get remove -y --allow-change-held-packages \ libmlx5-1 ibverbs-utils libibverbs-dev libibverbs1 libnccl2 libnccl-dev && \ rm -rf /opt/hpcx /usr/local/mpi /usr/local/ucx /etc/ld.so.conf.d/hpcx.conf RUN apt-get install -y --allow-unauthenticated \ sudo git gcc vim kmod openssh-client openssh-server build-essential \ wget curl autoconf libtool gdb automake python3-distutils cmake \ apt-utils devscripts debhelper libsubunit-dev check pkg-config libhwloc-dev RUN ldconfig # SSH configuration RUN mkdir -p /var/run/sshd && \ sed -i 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' /etc/ssh/ssh_config && \ echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config # Set environment variables for OpenMPI, CUDA, EFA, and NCCL ENV LD_LIBRARY_PATH /opt/amazon/openmpi/lib64:/opt/amazon/openmpi/lib:/opt/amazon/efa/lib64:/opt/aws-ofi-nccl/install/lib:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib/:/usr/lib64:/usr/lib/x86_64-linux-gnu/:/usr/lib/aarch64-linux-gnu/:$LD_LIBRARY_PATH ENV PATH 
/usr/local/cuda/bin:/opt/amazon/openmpi/bin:/opt/amazon/efa/bin:/usr/sbin:/usr/bin:/usr/local/bin:$PATH # Install EFA ARG EFA_INSTALLER_VERSION=latest RUN curl -sL https://efa-installer.amazonaws.com/aws-efa-installer-$EFA_INSTALLER_VERSION.tar.gz | tar xvz -C /tmp \ && cd /tmp/aws-efa-installer \ && ./efa_installer.sh -y -g -d --skip-kmod --skip-limit-conf --no-verify \ && cd && rm -rf /tmp/aws-efa-installer # Install NCCL ARG LIBNCCL_VERSION=2.28.7-1 RUN git clone https://github.com/NVIDIA/nccl.git --branch v$LIBNCCL_VERSION /tmp/nccl \ && cd /tmp/nccl \ && make -j $(nproc) \ && make install \ && cd && rm -rf /tmp/nccl # Install AWS-OFI-NCCL plugin ARG AWS_OFI_NCCL_VERSION=1.17.2 RUN curl -sL https://github.com/aws/aws-ofi-nccl/releases/download/v$AWS_OFI_NCCL_VERSION/aws-ofi-nccl-$AWS_OFI_NCCL_VERSION.tar.gz | tar xvz -C /tmp \ && cd /tmp/aws-ofi-nccl-$AWS_OFI_NCCL_VERSION \ && ./configure \ --prefix=/opt/aws-ofi-nccl/install \ --with-mpi=/opt/amazon/openmpi \ --with-libfabric=/opt/amazon/efa \ --with-cuda=/usr/local/cuda \ --enable-platform-aws \ --disable-tests \ && make -j $(nproc) \ && make install \ && cd && rm -rf /tmp/aws-ofi-nccl-$AWS_OFI_NCCL_VERSION ENV NCCL_PROTO simple ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH RUN rm -rf /var/lib/apt/lists/* # Set the working directory in the container WORKDIR /app # Copy the training script and install requirements COPY train.py /app/ COPY requirements.txt /app/ RUN pip install --no-cache-dir -r requirements.txt ================================================ FILE: test/images/nvidia-training/requirements.txt ================================================ transformers==4.53.0 numpy==1.26 ================================================ FILE: test/images/nvidia-training/train.py ================================================ import os import time import torch import torch.distributed as dist from torch.nn.parallel import DistributedDataParallel as DDP from transformers import BertForPreTraining, 
BertTokenizer from torch.utils.data import DataLoader, TensorDataset import numpy as np def create_dummy_data(tokenizer, num_samples=100, max_length=128): sentences = [f"This is a dummy sentence number {i}" for i in range(num_samples)] tokenized_inputs = tokenizer( sentences, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt", ) labels = tokenized_inputs.input_ids.detach().clone() # MLM task: randomly mask some tokens mlm_probability = 0.15 input_ids, labels = mask_tokens(tokenized_inputs.input_ids, tokenizer, mlm_probability) # NSP task: create dummy pairs next_sentence_labels = torch.randint(0, 2, (num_samples,)) return TensorDataset(input_ids, tokenized_inputs.attention_mask, labels, next_sentence_labels) def mask_tokens(inputs, tokenizer, mlm_probability): labels = inputs.clone() probability_matrix = torch.full(labels.shape, mlm_probability) special_tokens_mask = [ tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() ] probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) masked_indices = torch.bernoulli(probability_matrix).bool() labels[~masked_indices] = -100 inputs[masked_indices] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token) return inputs, labels def setup(rank, world_size, local_rank): master_addr = os.environ["MASTER_ADDR"] master_port = os.environ["MASTER_PORT"] dist.init_process_group( "nccl", init_method=f"tcp://{master_addr}:{master_port}", rank=rank, world_size=world_size, ) torch.cuda.set_device(local_rank) print(f"Process {rank} initialized, using GPU {local_rank}") def cleanup(): dist.destroy_process_group() def train_bert(rank, world_size, local_rank, model, tokenizer): setup(rank, world_size, local_rank) model = model.to(local_rank) ddp_model = DDP(model, device_ids=[local_rank]) dataset = create_dummy_data(tokenizer) train_dataloader = DataLoader(dataset, batch_size=8) optimizer = 
torch.optim.AdamW(ddp_model.parameters(), lr=0.001) start_time = time.time() # Simple single-epoch training loop for epoch in range(1): ddp_model.train() for batch in train_dataloader: optimizer.zero_grad() inputs, masks, labels, next_sentence_labels = batch inputs = inputs.to(local_rank) masks = masks.to(local_rank) labels = labels.to(local_rank) next_sentence_labels = next_sentence_labels.to(local_rank) outputs = ddp_model( input_ids=inputs, attention_mask=masks, labels=labels, next_sentence_label=next_sentence_labels, ) loss = outputs.loss loss.backward() optimizer.step() end_time = time.time() training_time = end_time - start_time throughput = len(dataset) / training_time print(f"Process {rank} - Training time: {training_time:.2f} seconds") print(f"Process {rank} - Throughput: {throughput:.2f} samples/second") cleanup() return throughput def main(): # Retrieve environment variables rank = int(os.getenv("OMPI_COMM_WORLD_RANK", "0")) world_size = int(os.getenv("OMPI_COMM_WORLD_SIZE", "1")) local_rank = int(os.getenv("OMPI_COMM_WORLD_LOCAL_RANK", "0")) print(f"Process started for rank {rank} with local rank {local_rank}") # Pre-download model and tokenizer tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") model = BertForPreTraining.from_pretrained("bert-base-uncased") print(f"successfully downloaded model and tokenizer for rank: {rank}") throughput = train_bert(rank, world_size, local_rank, model, tokenizer) # Only rank 0 prints the "Average Throughput" line if rank == 0: print(f"Average Throughput: {throughput:.2f} samples/second") if __name__ == "__main__": main() ================================================ FILE: test/manifests/assets/cloudwatch-agent.yaml ================================================ apiVersion: v1 kind: ConfigMap metadata: name: prometheus-cwagentconfig namespace: amazon-cloudwatch data: cwagentconfig.json: | { "agent": { "debug": true }, "logs": { "metrics_collected": { "prometheus": { "prometheus_config_path": 
"/etc/prometheusconfig/prometheus.yaml", "emf_processor": { "metric_declaration": [ { "source_labels": ["job"], "label_matcher": "dcgm-exporter", "dimensions": [[{{.DimensionKeys}}]], "metric_selectors": [ "^DCGM_FI_DEV_GPU_UTIL$", "^DCGM_FI_DEV_MEM_COPY_UTIL$", "^DCGM_FI_DEV_FB_USED$", "^DCGM_FI_DEV_FB_FREE$", "^DCGM_FI_DEV_POWER_USAGE$" ] } ] } } }, "force_flush_interval": 5 } } --- apiVersion: v1 kind: ConfigMap metadata: name: prometheus-config namespace: amazon-cloudwatch data: prometheus.yaml: | global: scrape_interval: 1s scrape_timeout: 1s scrape_configs: - job_name: dcgm-exporter static_configs: - targets: - dcgm-exporter.kube-system.svc.cluster.local:9400 metrics_path: /metrics metric_relabel_configs: {{- range $key, $value := .MetricDimensions}} - {action: replace, target_label: {{$key}}, replacement: '{{$value}}'} {{- end}} --- apiVersion: apps/v1 kind: DaemonSet metadata: name: cwagent namespace: amazon-cloudwatch spec: selector: matchLabels: app: cwagent template: metadata: labels: app: cwagent spec: serviceAccountName: cwagent dnsPolicy: ClusterFirst containers: - name: cloudwatch-agent image: public.ecr.aws/cloudwatch-agent/cloudwatch-agent:latest imagePullPolicy: Always resources: limits: cpu: 1000m memory: 1000Mi requests: cpu: 200m memory: 200Mi volumeMounts: - name: prometheus-cwagentconfig mountPath: /etc/cwagentconfig - name: prometheus-config mountPath: /etc/prometheusconfig volumes: - name: prometheus-cwagentconfig configMap: name: prometheus-cwagentconfig - name: prometheus-config configMap: name: prometheus-config terminationGracePeriodSeconds: 60 --- ================================================ FILE: test/manifests/assets/dcgm-exporter.yaml ================================================ # Derived from: Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
apiVersion: apps/v1 kind: DaemonSet metadata: name: "dcgm-exporter" namespace: "kube-system" labels: app.kubernetes.io/name: "dcgm-exporter" app.kubernetes.io/version: "4.1.3" spec: updateStrategy: type: RollingUpdate selector: matchLabels: app.kubernetes.io/name: "dcgm-exporter" app.kubernetes.io/version: "4.1.3" template: metadata: labels: app.kubernetes.io/name: "dcgm-exporter" app.kubernetes.io/version: "4.1.3" name: "dcgm-exporter" spec: containers: - image: "nvcr.io/nvidia/k8s/dcgm-exporter:4.2.3-4.1.3-ubuntu22.04" env: - name: "DCGM_EXPORTER_LISTEN" value: ":9400" - name: "DCGM_EXPORTER_INTERVAL" value: "100" - name: "DCGM_EXPORTER_KUBERNETES" value: "true" name: "dcgm-exporter" ports: - name: "metrics" containerPort: 9400 securityContext: runAsNonRoot: false runAsUser: 0 capabilities: add: ["SYS_ADMIN"] volumeMounts: - name: "pod-gpu-resources" readOnly: true mountPath: "/var/lib/kubelet/pod-resources" volumes: - name: "pod-gpu-resources" hostPath: path: "/var/lib/kubelet/pod-resources" --- kind: Service apiVersion: v1 metadata: name: "dcgm-exporter" namespace: "kube-system" labels: app.kubernetes.io/name: "dcgm-exporter" app.kubernetes.io/version: "4.1.3" spec: clusterIP: "None" selector: app.kubernetes.io/name: "dcgm-exporter" app.kubernetes.io/version: "4.1.3" ports: - name: "metrics" port: 9400 ================================================ FILE: test/manifests/assets/dranet.yaml ================================================ --- # Source: aws-dranet/templates/serviceaccount.yaml apiVersion: v1 kind: ServiceAccount metadata: name: dranet-aws-dranet namespace: kube-system labels: helm.sh/chart: aws-dranet-1.0.0 app.kubernetes.io/name: aws-dranet app.kubernetes.io/instance: dranet app.kubernetes.io/version: "v1.2.0-eksbuild.2" app.kubernetes.io/managed-by: Helm --- # Source: aws-dranet/templates/clusterrole.yaml kind: ClusterRole apiVersion: rbac.authorization.k8s.io/v1 metadata: name: dranet-aws-dranet labels: helm.sh/chart: aws-dranet-1.0.0 
app.kubernetes.io/name: aws-dranet app.kubernetes.io/instance: dranet app.kubernetes.io/version: "v1.2.0-eksbuild.2" app.kubernetes.io/managed-by: Helm rules: - apiGroups: - "" resources: - nodes verbs: - get - apiGroups: - "resource.k8s.io" resources: - resourceslices verbs: - list - watch - create - update - delete - apiGroups: - "resource.k8s.io" resources: - resourceclaims - deviceclasses verbs: - get - apiGroups: - "resource.k8s.io" resources: - resourceclaims/status verbs: - patch - update --- # Source: aws-dranet/templates/clusterrolebinding.yaml kind: ClusterRoleBinding apiVersion: rbac.authorization.k8s.io/v1 metadata: name: dranet-aws-dranet labels: helm.sh/chart: aws-dranet-1.0.0 app.kubernetes.io/name: aws-dranet app.kubernetes.io/instance: dranet app.kubernetes.io/version: "v1.2.0-eksbuild.2" app.kubernetes.io/managed-by: Helm roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole name: dranet-aws-dranet subjects: - kind: ServiceAccount name: dranet-aws-dranet namespace: kube-system --- # Source: aws-dranet/templates/daemonset.yaml apiVersion: apps/v1 kind: DaemonSet metadata: name: dranet-aws-dranet namespace: kube-system labels: helm.sh/chart: aws-dranet-1.0.0 app.kubernetes.io/name: aws-dranet app.kubernetes.io/instance: dranet app.kubernetes.io/version: "v1.2.0-eksbuild.2" app.kubernetes.io/managed-by: Helm tier: node app: dranet-aws-dranet k8s-app: dranet-aws-dranet spec: selector: matchLabels: app: dranet-aws-dranet template: metadata: labels: app.kubernetes.io/name: aws-dranet app.kubernetes.io/instance: dranet tier: node app: dranet-aws-dranet k8s-app: dranet-aws-dranet spec: priorityClassName: "system-node-critical" hostNetwork: true hostPID: false tolerations: - key: CriticalAddonsOnly operator: Exists serviceAccountName: dranet-aws-dranet containers: - name: dranet args: - /dranet - --v=4 - --hostname-override=$(NODE_NAME) - "--bind-address=:9177" - --cloud-provider-hint=AWS - --filter="dra.net/pciDevice" in attributes && 
attributes["dra.net/pciDevice"].StringValue == "Elastic Fabric Adapter (EFA)" image: {{.RdmaDeviceDraDriverImage}} imagePullPolicy: IfNotPresent env: - name: NODE_NAME valueFrom: fieldRef: fieldPath: spec.nodeName resources: limits: cpu: 500m memory: 256Mi requests: cpu: 100m memory: 50Mi securityContext: allowPrivilegeEscalation: false capabilities: drop: - ALL privileged: false readOnlyRootFilesystem: true runAsGroup: 0 runAsUser: 0 seccompProfile: type: RuntimeDefault readinessProbe: httpGet: path: /healthz port: 9177 volumeMounts: - name: device-plugin mountPath: /var/lib/kubelet/plugins - name: plugin-registry mountPath: /var/lib/kubelet/plugins_registry - name: nri-plugin mountPath: /var/run/nri - name: netns mountPath: /var/run/netns mountPropagation: HostToContainer - name: infiniband mountPath: /dev/infiniband mountPropagation: HostToContainer - name: tmp mountPath: /tmp - name: dranet-run mountPath: /var/run/dranet volumes: - name: device-plugin hostPath: path: /var/lib/kubelet/plugins type: DirectoryOrCreate - name: plugin-registry hostPath: path: /var/lib/kubelet/plugins_registry type: DirectoryOrCreate - name: nri-plugin hostPath: path: /var/run/nri type: DirectoryOrCreate - name: netns hostPath: path: /var/run/netns type: DirectoryOrCreate - name: infiniband hostPath: path: /dev/infiniband type: DirectoryOrCreate - name: tmp emptyDir: medium: Memory sizeLimit: 10Mi - name: dranet-run hostPath: path: /var/run/dranet type: DirectoryOrCreate --- # Source: aws-dranet/templates/deviceclass.yaml apiVersion: resource.k8s.io/v1 kind: DeviceClass metadata: name: efa.networking.k8s.aws labels: helm.sh/chart: aws-dranet-1.0.0 app.kubernetes.io/name: aws-dranet app.kubernetes.io/instance: dranet app.kubernetes.io/version: "v1.2.0-eksbuild.2" app.kubernetes.io/managed-by: Helm spec: selectors: - cel: expression: | device.driver == "dra.net" && device.attributes["dra.net"].pciDevice == 'Elastic Fabric Adapter (EFA)' ================================================ 
FILE: test/manifests/assets/efa-device-plugin.yaml ================================================ # Source: https://raw.githubusercontent.com/aws-samples/aws-efa-eks/main/manifest/efa-k8s-device-plugin.yml apiVersion: apps/v1 kind: DaemonSet metadata: name: aws-efa-k8s-device-plugin-daemonset namespace: kube-system spec: selector: matchLabels: name: aws-efa-k8s-device-plugin updateStrategy: type: RollingUpdate template: metadata: labels: name: aws-efa-k8s-device-plugin spec: serviceAccount: default tolerations: - key: CriticalAddonsOnly operator: Exists - key: aws.amazon.com/efa operator: Exists effect: NoSchedule # Mark this pod as a critical add-on; when enabled, the critical add-on # scheduler reserves resources for critical add-on pods so that they can # be rescheduled after a failure. # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ priorityClassName: "system-node-critical" hostNetwork: true containers: - image: 602401143452.dkr.ecr.us-west-2.amazonaws.com/eks/aws-efa-k8s-device-plugin:v0.5.8 name: aws-efa-k8s-device-plugin securityContext: allowPrivilegeEscalation: false capabilities: drop: ["ALL"] runAsNonRoot: false volumeMounts: - name: device-plugin mountPath: /var/lib/kubelet/device-plugins - name: infiniband-volume mountPath: /dev/infiniband resources: requests: cpu: 10m memory: 20Mi volumes: - name: device-plugin hostPath: path: /var/lib/kubelet/device-plugins - name: infiniband-volume hostPath: path: /dev/infiniband ================================================ FILE: test/manifests/assets/k8s-neuron-device-plugin-rbac.yml ================================================ # Source: https://github.com/aws-neuron/aws-neuron-sdk/blob/master/src/k8/k8s-neuron-device-plugin-rbac.yml kind: ClusterRole apiVersion: rbac.authorization.k8s.io/v1 metadata: name: neuron-device-plugin rules: - apiGroups: - "" resources: - nodes verbs: - get - list - watch - apiGroups: - "" resources: - events verbs: - create - 
patch - apiGroups: - "" resources: - pods verbs: - update - patch - get - list - watch - apiGroups: - "" resources: - nodes/status verbs: - patch - update --- apiVersion: v1 kind: ServiceAccount metadata: name: neuron-device-plugin namespace: kube-system --- kind: ClusterRoleBinding apiVersion: rbac.authorization.k8s.io/v1 metadata: name: neuron-device-plugin namespace: kube-system roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole name: neuron-device-plugin subjects: - kind: ServiceAccount name: neuron-device-plugin namespace: kube-system ================================================ FILE: test/manifests/assets/k8s-neuron-device-plugin.yml ================================================ # Source: https://github.com/aws-neuron/aws-neuron-sdk/blob/master/src/k8/k8s-neuron-device-plugin.yml apiVersion: apps/v1 kind: DaemonSet metadata: name: neuron-device-plugin-daemonset namespace: kube-system spec: selector: matchLabels: name: neuron-device-plugin-ds updateStrategy: type: RollingUpdate template: metadata: # Uncomment the annotation below if k8s version is 1.13 or lower # annotations: # scheduler.alpha.kubernetes.io/critical-pod: "" labels: name: neuron-device-plugin-ds spec: serviceAccount: neuron-device-plugin tolerations: - key: CriticalAddonsOnly operator: Exists - key: aws.amazon.com/neuron operator: Exists effect: NoSchedule # Mark this pod as a critical add-on; when enabled, the critical add-on # scheduler reserves resources for critical add-on pods so that they can # be rescheduled after a failure. 
# See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ priorityClassName: "system-node-critical" affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - matchExpressions: - key: "node.kubernetes.io/instance-type" operator: In values: - inf1.xlarge - inf1.2xlarge - inf1.6xlarge - inf1.24xlarge - inf2.xlarge - inf2.8xlarge - inf2.24xlarge - inf2.48xlarge - trn1.2xlarge - trn1.32xlarge - trn1n.32xlarge - trn2.48xlarge - trn2u.48xlarge containers: # Find all neuron-device-plugin images at https://gallery.ecr.aws/neuron/neuron-device-plugin - image: public.ecr.aws/neuron/neuron-device-plugin:2.26.26.0 imagePullPolicy: Always name: neuron-device-plugin env: - name: KUBECONFIG value: /etc/kubernetes/kubelet.conf - name: NODE_NAME valueFrom: fieldRef: fieldPath: spec.nodeName securityContext: allowPrivilegeEscalation: false capabilities: drop: ["ALL"] volumeMounts: - name: device-plugin mountPath: /var/lib/kubelet/device-plugins - name: infa-map mountPath: /run volumes: - name: device-plugin hostPath: path: /var/lib/kubelet/device-plugins - name: infa-map hostPath: path: /run ================================================ FILE: test/manifests/assets/mpi-operator.yaml ================================================ # -------------------------------------------------- # - Single configuration deployment YAML for MPI-Operator # - Includes: # CRD # Namespace # RBAC # Controller deployment # -------------------------------------------------- --- apiVersion: v1 kind: Namespace metadata: labels: app: mpi-operator app.kubernetes.io/component: mpijob app.kubernetes.io/name: mpi-operator kustomize.component: mpi-operator name: mpi-operator --- apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: controller-gen.kubebuilder.io/version: v0.20.1 labels: app: mpi-operator app.kubernetes.io/component: mpijob app.kubernetes.io/name: mpi-operator kustomize.component: 
mpi-operator name: mpijobs.kubeflow.org spec: group: kubeflow.org names: kind: MPIJob listKind: MPIJobList plural: mpijobs singular: mpijob scope: Namespaced versions: - name: v2beta1 schema: openAPIV3Schema: properties: apiVersion: description: |- APIVersion defines the versioned schema of this representation of an object. Servers should convert recognized schemas to the latest internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources type: string kind: description: |- Kind is a string value representing the REST resource this object represents. Servers may infer this from the endpoint the client submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds type: string metadata: type: object spec: properties: launcherCreationPolicy: default: AtStartup description: launcherCreationPolicy if WaitForWorkersReady, the launcher is created only after all workers are in Ready state. Defaults to AtStartup. type: string mpiImplementation: default: OpenMPI description: |- MPIImplementation is the MPI implementation. Options are "OpenMPI" (default), "Intel" and "MPICH". enum: - OpenMPI - Intel - MPICH type: string mpiReplicaSpecs: additionalProperties: description: ReplicaSpec is a description of the replica properties: replicas: description: |- Replicas is the desired number of replicas of the given template. If unspecified, defaults to 1. format: int32 type: integer restartPolicy: description: |- Restart policy for all replicas within the job. One of Always, OnFailure, Never and ExitCode. Default to Never. type: string template: description: |- Template is the object that describes the pod that will be created for this replica. RestartPolicy in PodTemplateSpec will be overide by RestartPolicy in ReplicaSpec properties: metadata: description: |- Standard object's metadata. 
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#metadata properties: annotations: additionalProperties: type: string type: object finalizers: items: type: string type: array labels: additionalProperties: type: string type: object name: type: string namespace: type: string type: object spec: description: |- Specification of the desired behavior of the pod. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#spec-and-status properties: activeDeadlineSeconds: description: |- Optional duration in seconds the pod may be active on the node relative to StartTime before the system will actively try to mark it failed and kill associated containers. Value must be a positive integer. format: int64 type: integer affinity: description: If specified, the pod's scheduling constraints properties: nodeAffinity: description: Describes node affinity scheduling rules for the pod. properties: preferredDuringSchedulingIgnoredDuringExecution: description: |- The scheduler will prefer to schedule pods to nodes that satisfy the affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding "weight" to the sum if the node matches the corresponding matchExpressions; the node(s) with the highest sum are the most preferred. items: description: |- An empty preferred scheduling term matches all objects with implicit weight 0 (i.e. it's a no-op). A null preferred scheduling term matches no objects (i.e. is also a no-op). properties: preference: description: A node selector term, associated with the corresponding weight. 
properties: matchExpressions: description: A list of node selector requirements by node's labels. items: description: |- A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values. properties: key: description: The label key that the selector applies to. type: string operator: description: |- Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. type: string values: description: |- An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch. items: type: string type: array x-kubernetes-list-type: atomic required: - key - operator type: object type: array x-kubernetes-list-type: atomic matchFields: description: A list of node selector requirements by node's fields. items: description: |- A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values. properties: key: description: The label key that the selector applies to. type: string operator: description: |- Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. type: string values: description: |- An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch. 
items: type: string type: array x-kubernetes-list-type: atomic required: - key - operator type: object type: array x-kubernetes-list-type: atomic type: object x-kubernetes-map-type: atomic weight: description: Weight associated with matching the corresponding nodeSelectorTerm, in the range 1-100. format: int32 type: integer required: - preference - weight type: object type: array x-kubernetes-list-type: atomic requiredDuringSchedulingIgnoredDuringExecution: description: |- If the affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to an update), the system may or may not try to eventually evict the pod from its node. properties: nodeSelectorTerms: description: Required. A list of node selector terms. The terms are ORed. items: description: |- A null or empty node selector term matches no objects. The requirements of them are ANDed. The TopologySelectorTerm type implements a subset of the NodeSelectorTerm. properties: matchExpressions: description: A list of node selector requirements by node's labels. items: description: |- A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values. properties: key: description: The label key that the selector applies to. type: string operator: description: |- Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. type: string values: description: |- An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch. 
items: type: string type: array x-kubernetes-list-type: atomic required: - key - operator type: object type: array x-kubernetes-list-type: atomic matchFields: description: A list of node selector requirements by node's fields. items: description: |- A node selector requirement is a selector that contains values, a key, and an operator that relates the key and values. properties: key: description: The label key that the selector applies to. type: string operator: description: |- Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. type: string values: description: |- An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch. items: type: string type: array x-kubernetes-list-type: atomic required: - key - operator type: object type: array x-kubernetes-list-type: atomic type: object x-kubernetes-map-type: atomic type: array x-kubernetes-list-type: atomic required: - nodeSelectorTerms type: object x-kubernetes-map-type: atomic type: object podAffinity: description: Describes pod affinity scheduling rules (e.g. co-locate this pod in the same node, zone, etc. as some other pod(s)). properties: preferredDuringSchedulingIgnoredDuringExecution: description: |- The scheduler will prefer to schedule pods to nodes that satisfy the affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. 
for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling affinity expressions, etc.), compute a sum by iterating through the elements of this field and adding "weight" to the sum if the node has pods which matches the corresponding podAffinityTerm; the node(s) with the highest sum are the most preferred. items: description: The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s) properties: podAffinityTerm: description: Required. A pod affinity term, associated with the corresponding weight. properties: labelSelector: description: |- A label query over a set of resources, in this case pods. If it's null, this PodAffinityTerm matches with no Pods. properties: matchExpressions: description: matchExpressions is a list of label selector requirements. The requirements are ANDed. items: description: |- A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. properties: key: description: key is the label key that the selector applies to. type: string operator: description: |- operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. type: string values: description: |- values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. items: type: string type: array x-kubernetes-list-type: atomic required: - key - operator type: object type: array x-kubernetes-list-type: atomic matchLabels: additionalProperties: type: string description: |- matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". 
The requirements are ANDed. type: object type: object x-kubernetes-map-type: atomic matchLabelKeys: description: |- MatchLabelKeys is a set of pod label keys to select which pods will be taken into consideration. The keys are used to lookup values from the incoming pod labels, those key-value labels are merged with `labelSelector` as `key in (value)` to select the group of existing pods which pods will be taken into consideration for the incoming pod's pod (anti) affinity. Keys that don't exist in the incoming pod labels will be ignored. The default value is empty. The same key is forbidden to exist in both matchLabelKeys and labelSelector. Also, matchLabelKeys cannot be set when labelSelector isn't set. items: type: string type: array x-kubernetes-list-type: atomic mismatchLabelKeys: description: |- MismatchLabelKeys is a set of pod label keys to select which pods will be taken into consideration. The keys are used to lookup values from the incoming pod labels, those key-value labels are merged with `labelSelector` as `key notin (value)` to select the group of existing pods which pods will be taken into consideration for the incoming pod's pod (anti) affinity. Keys that don't exist in the incoming pod labels will be ignored. The default value is empty. The same key is forbidden to exist in both mismatchLabelKeys and labelSelector. Also, mismatchLabelKeys cannot be set when labelSelector isn't set. items: type: string type: array x-kubernetes-list-type: atomic namespaceSelector: description: |- A label query over the set of namespaces that the term applies to. The term is applied to the union of the namespaces selected by this field and the ones listed in the namespaces field. null selector and null or empty namespaces list means "this pod's namespace". An empty selector ({}) matches all namespaces. properties: matchExpressions: description: matchExpressions is a list of label selector requirements. The requirements are ANDed. 
items: description: |- A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. properties: key: description: key is the label key that the selector applies to. type: string operator: description: |- operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. type: string values: description: |- values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. items: type: string type: array x-kubernetes-list-type: atomic required: - key - operator type: object type: array x-kubernetes-list-type: atomic matchLabels: additionalProperties: type: string description: |- matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. type: object type: object x-kubernetes-map-type: atomic namespaces: description: |- namespaces specifies a static list of namespace names that the term applies to. The term is applied to the union of the namespaces listed in this field and the ones selected by namespaceSelector. null or empty namespaces list and null namespaceSelector means "this pod's namespace". items: type: string type: array x-kubernetes-list-type: atomic topologyKey: description: |- This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed. 
type: string required: - topologyKey type: object weight: description: |- weight associated with matching the corresponding podAffinityTerm, in the range 1-100. format: int32 type: integer required: - podAffinityTerm - weight type: object type: array x-kubernetes-list-type: atomic requiredDuringSchedulingIgnoredDuringExecution: description: |- If the affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to a pod label update), the system may or may not try to eventually evict the pod from its node. When there are multiple elements, the lists of nodes corresponding to each podAffinityTerm are intersected, i.e. all terms must be satisfied. items: description: |- Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key matches that of any node on which a pod of the set of pods is running properties: labelSelector: description: |- A label query over a set of resources, in this case pods. If it's null, this PodAffinityTerm matches with no Pods. properties: matchExpressions: description: matchExpressions is a list of label selector requirements. The requirements are ANDed. items: description: |- A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. properties: key: description: key is the label key that the selector applies to. type: string operator: description: |- operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. type: string values: description: |- values is an array of string values. 
If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. items: type: string type: array x-kubernetes-list-type: atomic required: - key - operator type: object type: array x-kubernetes-list-type: atomic matchLabels: additionalProperties: type: string description: |- matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. type: object type: object x-kubernetes-map-type: atomic matchLabelKeys: description: |- MatchLabelKeys is a set of pod label keys to select which pods will be taken into consideration. The keys are used to lookup values from the incoming pod labels, those key-value labels are merged with `labelSelector` as `key in (value)` to select the group of existing pods which pods will be taken into consideration for the incoming pod's pod (anti) affinity. Keys that don't exist in the incoming pod labels will be ignored. The default value is empty. The same key is forbidden to exist in both matchLabelKeys and labelSelector. Also, matchLabelKeys cannot be set when labelSelector isn't set. items: type: string type: array x-kubernetes-list-type: atomic mismatchLabelKeys: description: |- MismatchLabelKeys is a set of pod label keys to select which pods will be taken into consideration. The keys are used to lookup values from the incoming pod labels, those key-value labels are merged with `labelSelector` as `key notin (value)` to select the group of existing pods which pods will be taken into consideration for the incoming pod's pod (anti) affinity. Keys that don't exist in the incoming pod labels will be ignored. The default value is empty. The same key is forbidden to exist in both mismatchLabelKeys and labelSelector. 
Also, mismatchLabelKeys cannot be set when labelSelector isn't set. items: type: string type: array x-kubernetes-list-type: atomic namespaceSelector: description: |- A label query over the set of namespaces that the term applies to. The term is applied to the union of the namespaces selected by this field and the ones listed in the namespaces field. null selector and null or empty namespaces list means "this pod's namespace". An empty selector ({}) matches all namespaces. properties: matchExpressions: description: matchExpressions is a list of label selector requirements. The requirements are ANDed. items: description: |- A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. properties: key: description: key is the label key that the selector applies to. type: string operator: description: |- operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. type: string values: description: |- values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. items: type: string type: array x-kubernetes-list-type: atomic required: - key - operator type: object type: array x-kubernetes-list-type: atomic matchLabels: additionalProperties: type: string description: |- matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. type: object type: object x-kubernetes-map-type: atomic namespaces: description: |- namespaces specifies a static list of namespace names that the term applies to. 
The term is applied to the union of the namespaces listed in this field and the ones selected by namespaceSelector. null or empty namespaces list and null namespaceSelector means "this pod's namespace". items: type: string type: array x-kubernetes-list-type: atomic topologyKey: description: |- This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed. type: string required: - topologyKey type: object type: array x-kubernetes-list-type: atomic type: object podAntiAffinity: description: Describes pod anti-affinity scheduling rules (e.g. avoid putting this pod in the same node, zone, etc. as some other pod(s)). properties: preferredDuringSchedulingIgnoredDuringExecution: description: |- The scheduler will prefer to schedule pods to nodes that satisfy the anti-affinity expressions specified by this field, but it may choose a node that violates one or more of the expressions. The node that is most preferred is the one with the greatest sum of weights, i.e. for each node that meets all of the scheduling requirements (resource request, requiredDuringScheduling anti-affinity expressions, etc.), compute a sum by iterating through the elements of this field and subtracting "weight" from the sum if the node has pods which matches the corresponding podAffinityTerm; the node(s) with the highest sum are the most preferred. items: description: The weights of all of the matched WeightedPodAffinityTerm fields are added per-node to find the most preferred node(s) properties: podAffinityTerm: description: Required. A pod affinity term, associated with the corresponding weight. properties: labelSelector: description: |- A label query over a set of resources, in this case pods. 
If it's null, this PodAffinityTerm matches with no Pods. properties: matchExpressions: description: matchExpressions is a list of label selector requirements. The requirements are ANDed. items: description: |- A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. properties: key: description: key is the label key that the selector applies to. type: string operator: description: |- operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. type: string values: description: |- values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. items: type: string type: array x-kubernetes-list-type: atomic required: - key - operator type: object type: array x-kubernetes-list-type: atomic matchLabels: additionalProperties: type: string description: |- matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. type: object type: object x-kubernetes-map-type: atomic matchLabelKeys: description: |- MatchLabelKeys is a set of pod label keys to select which pods will be taken into consideration. The keys are used to lookup values from the incoming pod labels, those key-value labels are merged with `labelSelector` as `key in (value)` to select the group of existing pods which pods will be taken into consideration for the incoming pod's pod (anti) affinity. Keys that don't exist in the incoming pod labels will be ignored. The default value is empty. The same key is forbidden to exist in both matchLabelKeys and labelSelector. Also, matchLabelKeys cannot be set when labelSelector isn't set. 
items: type: string type: array x-kubernetes-list-type: atomic mismatchLabelKeys: description: |- MismatchLabelKeys is a set of pod label keys to select which pods will be taken into consideration. The keys are used to lookup values from the incoming pod labels, those key-value labels are merged with `labelSelector` as `key notin (value)` to select the group of existing pods which pods will be taken into consideration for the incoming pod's pod (anti) affinity. Keys that don't exist in the incoming pod labels will be ignored. The default value is empty. The same key is forbidden to exist in both mismatchLabelKeys and labelSelector. Also, mismatchLabelKeys cannot be set when labelSelector isn't set. items: type: string type: array x-kubernetes-list-type: atomic namespaceSelector: description: |- A label query over the set of namespaces that the term applies to. The term is applied to the union of the namespaces selected by this field and the ones listed in the namespaces field. null selector and null or empty namespaces list means "this pod's namespace". An empty selector ({}) matches all namespaces. properties: matchExpressions: description: matchExpressions is a list of label selector requirements. The requirements are ANDed. items: description: |- A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. properties: key: description: key is the label key that the selector applies to. type: string operator: description: |- operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. type: string values: description: |- values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. 
items: type: string type: array x-kubernetes-list-type: atomic required: - key - operator type: object type: array x-kubernetes-list-type: atomic matchLabels: additionalProperties: type: string description: |- matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. type: object type: object x-kubernetes-map-type: atomic namespaces: description: |- namespaces specifies a static list of namespace names that the term applies to. The term is applied to the union of the namespaces listed in this field and the ones selected by namespaceSelector. null or empty namespaces list and null namespaceSelector means "this pod's namespace". items: type: string type: array x-kubernetes-list-type: atomic topologyKey: description: |- This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed. type: string required: - topologyKey type: object weight: description: |- weight associated with matching the corresponding podAffinityTerm, in the range 1-100. format: int32 type: integer required: - podAffinityTerm - weight type: object type: array x-kubernetes-list-type: atomic requiredDuringSchedulingIgnoredDuringExecution: description: |- If the anti-affinity requirements specified by this field are not met at scheduling time, the pod will not be scheduled onto the node. If the anti-affinity requirements specified by this field cease to be met at some point during pod execution (e.g. due to a pod label update), the system may or may not try to eventually evict the pod from its node. 
When there are multiple elements, the lists of nodes corresponding to each podAffinityTerm are intersected, i.e. all terms must be satisfied. items: description: |- Defines a set of pods (namely those matching the labelSelector relative to the given namespace(s)) that this pod should be co-located (affinity) or not co-located (anti-affinity) with, where co-located is defined as running on a node whose value of the label with key matches that of any node on which a pod of the set of pods is running properties: labelSelector: description: |- A label query over a set of resources, in this case pods. If it's null, this PodAffinityTerm matches with no Pods. properties: matchExpressions: description: matchExpressions is a list of label selector requirements. The requirements are ANDed. items: description: |- A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. properties: key: description: key is the label key that the selector applies to. type: string operator: description: |- operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. type: string values: description: |- values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. items: type: string type: array x-kubernetes-list-type: atomic required: - key - operator type: object type: array x-kubernetes-list-type: atomic matchLabels: additionalProperties: type: string description: |- matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. 
type: object type: object x-kubernetes-map-type: atomic matchLabelKeys: description: |- MatchLabelKeys is a set of pod label keys to select which pods will be taken into consideration. The keys are used to lookup values from the incoming pod labels, those key-value labels are merged with `labelSelector` as `key in (value)` to select the group of existing pods which pods will be taken into consideration for the incoming pod's pod (anti) affinity. Keys that don't exist in the incoming pod labels will be ignored. The default value is empty. The same key is forbidden to exist in both matchLabelKeys and labelSelector. Also, matchLabelKeys cannot be set when labelSelector isn't set. items: type: string type: array x-kubernetes-list-type: atomic mismatchLabelKeys: description: |- MismatchLabelKeys is a set of pod label keys to select which pods will be taken into consideration. The keys are used to lookup values from the incoming pod labels, those key-value labels are merged with `labelSelector` as `key notin (value)` to select the group of existing pods which pods will be taken into consideration for the incoming pod's pod (anti) affinity. Keys that don't exist in the incoming pod labels will be ignored. The default value is empty. The same key is forbidden to exist in both mismatchLabelKeys and labelSelector. Also, mismatchLabelKeys cannot be set when labelSelector isn't set. items: type: string type: array x-kubernetes-list-type: atomic namespaceSelector: description: |- A label query over the set of namespaces that the term applies to. The term is applied to the union of the namespaces selected by this field and the ones listed in the namespaces field. null selector and null or empty namespaces list means "this pod's namespace". An empty selector ({}) matches all namespaces. properties: matchExpressions: description: matchExpressions is a list of label selector requirements. The requirements are ANDed. 
items: description: |- A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. properties: key: description: key is the label key that the selector applies to. type: string operator: description: |- operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. type: string values: description: |- values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. items: type: string type: array x-kubernetes-list-type: atomic required: - key - operator type: object type: array x-kubernetes-list-type: atomic matchLabels: additionalProperties: type: string description: |- matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. type: object type: object x-kubernetes-map-type: atomic namespaces: description: |- namespaces specifies a static list of namespace names that the term applies to. The term is applied to the union of the namespaces listed in this field and the ones selected by namespaceSelector. null or empty namespaces list and null namespaceSelector means "this pod's namespace". items: type: string type: array x-kubernetes-list-type: atomic topologyKey: description: |- This pod should be co-located (affinity) or not co-located (anti-affinity) with the pods matching the labelSelector in the specified namespaces, where co-located is defined as running on a node whose value of the label with key topologyKey matches that of any node on which any of the selected pods is running. Empty topologyKey is not allowed. 
type: string required: - topologyKey type: object type: array x-kubernetes-list-type: atomic type: object type: object automountServiceAccountToken: description: AutomountServiceAccountToken indicates whether a service account token should be automatically mounted. type: boolean containers: description: |- List of containers belonging to the pod. Containers cannot currently be added or removed. There must be at least one container in a Pod. Cannot be updated. items: description: A single application container that you want to run within a pod. properties: args: description: |- Arguments to the entrypoint. The container image's CMD is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell items: type: string type: array x-kubernetes-list-type: atomic command: description: |- Entrypoint array. Not executed within a shell. The container image's ENTRYPOINT is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. 
More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell items: type: string type: array x-kubernetes-list-type: atomic env: description: |- List of environment variables to set in the container. Cannot be updated. items: description: EnvVar represents an environment variable present in a Container. properties: name: description: |- Name of the environment variable. May consist of any printable ASCII characters except '='. type: string value: description: |- Variable references $(VAR_NAME) are expanded using the previously defined environment variables in the container and any service environment variables. If a variable cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless of whether the variable exists or not. Defaults to "". type: string valueFrom: description: Source for the environment variable's value. Cannot be used if value is not empty. properties: configMapKeyRef: description: Selects a key of a ConfigMap. properties: key: description: The key to select. type: string name: default: "" description: |- Name of the referent. This field is effectively required, but due to backwards compatibility is allowed to be empty. Instances of this type with an empty value here are almost certainly wrong. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names type: string optional: description: Specify whether the ConfigMap or its key must be defined type: boolean required: - key type: object x-kubernetes-map-type: atomic fieldRef: description: |- Selects a field of the pod: supports metadata.name, metadata.namespace, `metadata.labels['<KEY>']`, `metadata.annotations['<KEY>']`, spec.nodeName, spec.serviceAccountName, status.hostIP, status.podIP, status.podIPs. properties: apiVersion: description: Version of the schema the FieldPath is written in terms of, defaults to "v1". type: string fieldPath: description: Path of the field to select in the specified API version. type: string required: - fieldPath type: object x-kubernetes-map-type: atomic fileKeyRef: description: |- FileKeyRef selects a key of the env file. Requires the EnvFiles feature gate to be enabled. properties: key: description: |- The key within the env file. An invalid key will prevent the pod from starting. The keys defined within a source may consist of any printable ASCII characters except '='. During Alpha stage of the EnvFiles feature gate, the key size is limited to 128 characters. type: string optional: default: false description: |- Specify whether the file or its key must be defined. If the file or key does not exist, then the env var is not published. If optional is set to true and the specified key does not exist, the environment variable will not be set in the Pod's containers. If optional is set to false and the specified key does not exist, an error will be returned during Pod creation. type: boolean path: description: |- The path within the volume from which to select the file. Must be relative and may not contain the '..' path or start with '..'. type: string volumeName: description: The name of the volume mount containing the env file.
type: string required: - key - path - volumeName type: object x-kubernetes-map-type: atomic resourceFieldRef: description: |- Selects a resource of the container: only resources limits and requests (limits.cpu, limits.memory, limits.ephemeral-storage, requests.cpu, requests.memory and requests.ephemeral-storage) are currently supported. properties: containerName: description: 'Container name: required for volumes, optional for env vars' type: string divisor: anyOf: - type: integer - type: string description: Specifies the output format of the exposed resources, defaults to "1" pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true resource: description: 'Required: resource to select' type: string required: - resource type: object x-kubernetes-map-type: atomic secretKeyRef: description: Selects a key of a secret in the pod's namespace properties: key: description: The key of the secret to select from. Must be a valid secret key. type: string name: default: "" description: |- Name of the referent. This field is effectively required, but due to backwards compatibility is allowed to be empty. Instances of this type with an empty value here are almost certainly wrong. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names type: string optional: description: Specify whether the Secret or its key must be defined type: boolean required: - key type: object x-kubernetes-map-type: atomic type: object required: - name type: object type: array x-kubernetes-list-map-keys: - name x-kubernetes-list-type: map envFrom: description: |- List of sources to populate environment variables in the container. The keys defined within a source may consist of any printable ASCII characters except '='. When a key exists in multiple sources, the value associated with the last source will take precedence. 
Values defined by an Env with a duplicate key will take precedence. Cannot be updated. items: description: EnvFromSource represents the source of a set of ConfigMaps or Secrets properties: configMapRef: description: The ConfigMap to select from properties: name: default: "" description: |- Name of the referent. This field is effectively required, but due to backwards compatibility is allowed to be empty. Instances of this type with an empty value here are almost certainly wrong. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names type: string optional: description: Specify whether the ConfigMap must be defined type: boolean type: object x-kubernetes-map-type: atomic prefix: description: |- Optional text to prepend to the name of each environment variable. May consist of any printable ASCII characters except '='. type: string secretRef: description: The Secret to select from properties: name: default: "" description: |- Name of the referent. This field is effectively required, but due to backwards compatibility is allowed to be empty. Instances of this type with an empty value here are almost certainly wrong. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names type: string optional: description: Specify whether the Secret must be defined type: boolean type: object x-kubernetes-map-type: atomic type: object type: array x-kubernetes-list-type: atomic image: description: |- Container image name. More info: https://kubernetes.io/docs/concepts/containers/images This field is optional to allow higher level config management to default or override container images in workload controllers like Deployments and StatefulSets. type: string imagePullPolicy: description: |- Image pull policy. One of Always, Never, IfNotPresent. Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. Cannot be updated. 
More info: https://kubernetes.io/docs/concepts/containers/images#updating-images type: string lifecycle: description: |- Actions that the management system should take in response to container lifecycle events. Cannot be updated. properties: postStart: description: |- PostStart is called immediately after a container is created. If the handler fails, the container is terminated and restarted according to its restart policy. Other management of the container blocks until the hook completes. More info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks properties: exec: description: Exec specifies a command to execute in the container. properties: command: description: |- Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy. items: type: string type: array x-kubernetes-list-type: atomic type: object httpGet: description: HTTPGet specifies an HTTP GET request to perform. properties: host: description: |- Host name to connect to, defaults to the pod IP. You probably want to set "Host" in httpHeaders instead. type: string httpHeaders: description: Custom headers to set in the request. HTTP allows repeated headers. items: description: HTTPHeader describes a custom header to be used in HTTP probes properties: name: description: |- The header field name. This will be canonicalized upon output, so case-variant names will be understood as the same header. type: string value: description: The header field value type: string required: - name - value type: object type: array x-kubernetes-list-type: atomic path: description: Path to access on the HTTP server. 
type: string port: anyOf: - type: integer - type: string description: |- Name or number of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true scheme: description: |- Scheme to use for connecting to the host. Defaults to HTTP. type: string required: - port type: object sleep: description: Sleep represents a duration that the container should sleep. properties: seconds: description: Seconds is the number of seconds to sleep. format: int64 type: integer required: - seconds type: object tcpSocket: description: |- Deprecated. TCPSocket is NOT supported as a LifecycleHandler and kept for backward compatibility. There is no validation of this field and lifecycle hooks will fail at runtime when it is specified. properties: host: description: 'Optional: Host name to connect to, defaults to the pod IP.' type: string port: anyOf: - type: integer - type: string description: |- Number or name of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true required: - port type: object type: object preStop: description: |- PreStop is called immediately before a container is terminated due to an API request or management event such as liveness/startup probe failure, preemption, resource contention, etc. The handler is not called if the container crashes or exits. The Pod's termination grace period countdown begins before the PreStop hook is executed. Regardless of the outcome of the handler, the container will eventually terminate within the Pod's termination grace period (unless delayed by finalizers). Other management of the container blocks until the hook completes or until the termination grace period is reached. More info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks properties: exec: description: Exec specifies a command to execute in the container. 
properties: command: description: |- Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy. items: type: string type: array x-kubernetes-list-type: atomic type: object httpGet: description: HTTPGet specifies an HTTP GET request to perform. properties: host: description: |- Host name to connect to, defaults to the pod IP. You probably want to set "Host" in httpHeaders instead. type: string httpHeaders: description: Custom headers to set in the request. HTTP allows repeated headers. items: description: HTTPHeader describes a custom header to be used in HTTP probes properties: name: description: |- The header field name. This will be canonicalized upon output, so case-variant names will be understood as the same header. type: string value: description: The header field value type: string required: - name - value type: object type: array x-kubernetes-list-type: atomic path: description: Path to access on the HTTP server. type: string port: anyOf: - type: integer - type: string description: |- Name or number of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true scheme: description: |- Scheme to use for connecting to the host. Defaults to HTTP. type: string required: - port type: object sleep: description: Sleep represents a duration that the container should sleep. properties: seconds: description: Seconds is the number of seconds to sleep. format: int64 type: integer required: - seconds type: object tcpSocket: description: |- Deprecated. TCPSocket is NOT supported as a LifecycleHandler and kept for backward compatibility. 
There is no validation of this field and lifecycle hooks will fail at runtime when it is specified. properties: host: description: 'Optional: Host name to connect to, defaults to the pod IP.' type: string port: anyOf: - type: integer - type: string description: |- Number or name of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true required: - port type: object type: object stopSignal: description: |- StopSignal defines which signal will be sent to a container when it is being stopped. If not specified, the default is defined by the container runtime in use. StopSignal can only be set for Pods with a non-empty .spec.os.name type: string type: object livenessProbe: description: |- Periodic probe of container liveness. Container will be restarted if the probe fails. Cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes properties: exec: description: Exec specifies a command to execute in the container. properties: command: description: |- Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy. items: type: string type: array x-kubernetes-list-type: atomic type: object failureThreshold: description: |- Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1. format: int32 type: integer grpc: description: GRPC specifies a GRPC HealthCheckRequest. properties: port: description: Port number of the gRPC service. Number must be in the range 1 to 65535. 
format: int32 type: integer service: default: "" description: |- Service is the name of the service to place in the gRPC HealthCheckRequest (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). If this is not specified, the default behavior is defined by gRPC. type: string required: - port type: object httpGet: description: HTTPGet specifies an HTTP GET request to perform. properties: host: description: |- Host name to connect to, defaults to the pod IP. You probably want to set "Host" in httpHeaders instead. type: string httpHeaders: description: Custom headers to set in the request. HTTP allows repeated headers. items: description: HTTPHeader describes a custom header to be used in HTTP probes properties: name: description: |- The header field name. This will be canonicalized upon output, so case-variant names will be understood as the same header. type: string value: description: The header field value type: string required: - name - value type: object type: array x-kubernetes-list-type: atomic path: description: Path to access on the HTTP server. type: string port: anyOf: - type: integer - type: string description: |- Name or number of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true scheme: description: |- Scheme to use for connecting to the host. Defaults to HTTP. type: string required: - port type: object initialDelaySeconds: description: |- Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes format: int32 type: integer periodSeconds: description: |- How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1. format: int32 type: integer successThreshold: description: |- Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. 
Must be 1 for liveness and startup. Minimum value is 1. format: int32 type: integer tcpSocket: description: TCPSocket specifies a connection to a TCP port. properties: host: description: 'Optional: Host name to connect to, defaults to the pod IP.' type: string port: anyOf: - type: integer - type: string description: |- Number or name of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true required: - port type: object terminationGracePeriodSeconds: description: |- Optional duration in seconds the pod needs to terminate gracefully upon probe failure. The grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal. Set this value longer than the expected cleanup time for your process. If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this value overrides the value provided by the pod spec. Value must be non-negative integer. The value zero indicates stop immediately via the kill signal (no opportunity to shut down). This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. format: int64 type: integer timeoutSeconds: description: |- Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes format: int32 type: integer type: object name: description: |- Name of the container specified as a DNS_LABEL. Each container in a pod must have a unique name (DNS_LABEL). Cannot be updated. type: string ports: description: |- List of ports to expose from the container. Not specifying a port here DOES NOT prevent that port from being exposed. 
Any port which is listening on the default "0.0.0.0" address inside a container will be accessible from the network. Modifying this array with strategic merge patch may corrupt the data. For more information See https://github.com/kubernetes/kubernetes/issues/108255. Cannot be updated. items: description: ContainerPort represents a network port in a single container. properties: containerPort: description: |- Number of port to expose on the pod's IP address. This must be a valid port number, 0 < x < 65536. format: int32 type: integer hostIP: description: What host IP to bind the external port to. type: string hostPort: description: |- Number of port to expose on the host. If specified, this must be a valid port number, 0 < x < 65536. If HostNetwork is specified, this must match ContainerPort. Most containers do not need this. format: int32 type: integer name: description: |- If specified, this must be an IANA_SVC_NAME and unique within the pod. Each named port in a pod must have a unique name. Name for the port that can be referred to by services. type: string protocol: default: TCP description: |- Protocol for port. Must be UDP, TCP, or SCTP. Defaults to "TCP". type: string required: - containerPort type: object type: array x-kubernetes-list-map-keys: - containerPort - protocol x-kubernetes-list-type: map readinessProbe: description: |- Periodic probe of container service readiness. Container will be removed from service endpoints if the probe fails. Cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes properties: exec: description: Exec specifies a command to execute in the container. properties: command: description: |- Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. 
To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy. items: type: string type: array x-kubernetes-list-type: atomic type: object failureThreshold: description: |- Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1. format: int32 type: integer grpc: description: GRPC specifies a GRPC HealthCheckRequest. properties: port: description: Port number of the gRPC service. Number must be in the range 1 to 65535. format: int32 type: integer service: default: "" description: |- Service is the name of the service to place in the gRPC HealthCheckRequest (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). If this is not specified, the default behavior is defined by gRPC. type: string required: - port type: object httpGet: description: HTTPGet specifies an HTTP GET request to perform. properties: host: description: |- Host name to connect to, defaults to the pod IP. You probably want to set "Host" in httpHeaders instead. type: string httpHeaders: description: Custom headers to set in the request. HTTP allows repeated headers. items: description: HTTPHeader describes a custom header to be used in HTTP probes properties: name: description: |- The header field name. This will be canonicalized upon output, so case-variant names will be understood as the same header. type: string value: description: The header field value type: string required: - name - value type: object type: array x-kubernetes-list-type: atomic path: description: Path to access on the HTTP server. type: string port: anyOf: - type: integer - type: string description: |- Name or number of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true scheme: description: |- Scheme to use for connecting to the host. Defaults to HTTP. 
type: string required: - port type: object initialDelaySeconds: description: |- Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes format: int32 type: integer periodSeconds: description: |- How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1. format: int32 type: integer successThreshold: description: |- Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. format: int32 type: integer tcpSocket: description: TCPSocket specifies a connection to a TCP port. properties: host: description: 'Optional: Host name to connect to, defaults to the pod IP.' type: string port: anyOf: - type: integer - type: string description: |- Number or name of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true required: - port type: object terminationGracePeriodSeconds: description: |- Optional duration in seconds the pod needs to terminate gracefully upon probe failure. The grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal. Set this value longer than the expected cleanup time for your process. If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this value overrides the value provided by the pod spec. Value must be non-negative integer. The value zero indicates stop immediately via the kill signal (no opportunity to shut down). This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. 
format: int64 type: integer timeoutSeconds: description: |- Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes format: int32 type: integer type: object resizePolicy: description: |- Resources resize policy for the container. This field cannot be set on ephemeral containers. items: description: ContainerResizePolicy represents resource resize policy for the container. properties: resourceName: description: |- Name of the resource to which this resource resize policy applies. Supported values: cpu, memory. type: string restartPolicy: description: |- Restart policy to apply when specified resource is resized. If not specified, it defaults to NotRequired. type: string required: - resourceName - restartPolicy type: object type: array x-kubernetes-list-type: atomic resources: description: |- Compute Resources required by this container. Cannot be updated. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ properties: claims: description: |- Claims lists the names of resources, defined in spec.resourceClaims, that are used by this container. This field depends on the DynamicResourceAllocation feature gate. This field is immutable. It can only be set for containers. items: description: ResourceClaim references one entry in PodSpec.ResourceClaims. properties: name: description: |- Name must match the name of one entry in pod.spec.resourceClaims of the Pod where this field is used. It makes that resource available inside a container. type: string request: description: |- Request is the name chosen for a request in the referenced claim. If empty, everything from the claim is made available, otherwise only the result of this request. 
type: string required: - name type: object type: array x-kubernetes-list-map-keys: - name x-kubernetes-list-type: map limits: additionalProperties: anyOf: - type: integer - type: string pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true description: |- Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ type: object requests: additionalProperties: anyOf: - type: integer - type: string pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true description: |- Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. Requests cannot exceed Limits. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ type: object type: object restartPolicy: description: |- RestartPolicy defines the restart behavior of individual containers in a pod. This overrides the pod-level restart policy. When this field is not specified, the restart behavior is defined by the Pod's restart policy and the container type. Additionally, setting the RestartPolicy as "Always" for the init container will have the following effect: this init container will be continually restarted on exit until all regular containers have terminated. Once all regular containers have completed, all init containers with restartPolicy "Always" will be shut down. This lifecycle differs from normal init containers and is often referred to as a "sidecar" container. Although this init container still starts in the init container sequence, it does not wait for the container to complete before proceeding to the next init container. 
Instead, the next init container starts immediately after this init container is started, or after any startupProbe has successfully completed. type: string restartPolicyRules: description: |- Represents a list of rules to be checked to determine if the container should be restarted on exit. The rules are evaluated in order. Once a rule matches a container exit condition, the remaining rules are ignored. If no rule matches the container exit condition, the Container-level restart policy determines whether the container is restarted or not. Constraints on the rules: - At most 20 rules are allowed. - Rules can have the same action. - Identical rules are not forbidden in validations. When rules are specified, container MUST set RestartPolicy explicitly even if it matches the Pod's RestartPolicy. items: description: ContainerRestartRule describes how a container exit is handled. properties: action: description: |- Specifies the action taken on a container exit if the requirements are satisfied. The only possible value is "Restart" to restart the container. type: string exitCodes: description: Represents the exit codes to check on container exits. properties: operator: description: |- Represents the relationship between the container exit code(s) and the specified values. Possible values are: - In: the requirement is satisfied if the container exit code is in the set of specified values. - NotIn: the requirement is satisfied if the container exit code is not in the set of specified values. type: string values: description: |- Specifies the set of values to check for container exit codes. At most 255 elements are allowed. items: format: int32 type: integer type: array x-kubernetes-list-type: set required: - operator type: object required: - action type: object type: array x-kubernetes-list-type: atomic securityContext: description: |- SecurityContext defines the security options the container should be run with.
If set, the fields of SecurityContext override the equivalent fields of PodSecurityContext. More info: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ properties: allowPrivilegeEscalation: description: |- AllowPrivilegeEscalation controls whether a process can gain more privileges than its parent process. This bool directly controls if the no_new_privs flag will be set on the container process. AllowPrivilegeEscalation is true always when the container is: 1) run as Privileged 2) has CAP_SYS_ADMIN Note that this field cannot be set when spec.os.name is windows. type: boolean appArmorProfile: description: |- appArmorProfile is the AppArmor options to use by this container. If set, this profile overrides the pod's appArmorProfile. Note that this field cannot be set when spec.os.name is windows. properties: localhostProfile: description: |- localhostProfile indicates a profile loaded on the node that should be used. The profile must be preconfigured on the node to work. Must match the loaded name of the profile. Must be set if and only if type is "Localhost". type: string type: description: |- type indicates which kind of AppArmor profile will be applied. Valid options are: Localhost - a profile pre-loaded on the node. RuntimeDefault - the container runtime's default profile. Unconfined - no AppArmor enforcement. type: string required: - type type: object capabilities: description: |- The capabilities to add/drop when running containers. Defaults to the default set of capabilities granted by the container runtime. Note that this field cannot be set when spec.os.name is windows. 
properties: add: description: Added capabilities items: description: Capability represent POSIX capabilities type type: string type: array x-kubernetes-list-type: atomic drop: description: Removed capabilities items: description: Capability represent POSIX capabilities type type: string type: array x-kubernetes-list-type: atomic type: object privileged: description: |- Run container in privileged mode. Processes in privileged containers are essentially equivalent to root on the host. Defaults to false. Note that this field cannot be set when spec.os.name is windows. type: boolean procMount: description: |- procMount denotes the type of proc mount to use for the containers. The default value is Default which uses the container runtime defaults for readonly paths and masked paths. This requires the ProcMountType feature flag to be enabled. Note that this field cannot be set when spec.os.name is windows. type: string readOnlyRootFilesystem: description: |- Whether this container has a read-only root filesystem. Default is false. Note that this field cannot be set when spec.os.name is windows. type: boolean runAsGroup: description: |- The GID to run the entrypoint of the container process. Uses runtime default if unset. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. Note that this field cannot be set when spec.os.name is windows. format: int64 type: integer runAsNonRoot: description: |- Indicates that the container must run as a non-root user. If true, the Kubelet will validate the image at runtime to ensure that it does not run as UID 0 (root) and fail to start the container if it does. If unset or false, no such validation will be performed. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. 
type: boolean runAsUser: description: |- The UID to run the entrypoint of the container process. Defaults to user specified in image metadata if unspecified. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. Note that this field cannot be set when spec.os.name is windows. format: int64 type: integer seLinuxOptions: description: |- The SELinux context to be applied to the container. If unspecified, the container runtime will allocate a random SELinux context for each container. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. Note that this field cannot be set when spec.os.name is windows. properties: level: description: Level is SELinux level label that applies to the container. type: string role: description: Role is a SELinux role label that applies to the container. type: string type: description: Type is a SELinux type label that applies to the container. type: string user: description: User is a SELinux user label that applies to the container. type: string type: object seccompProfile: description: |- The seccomp options to use by this container. If seccomp options are provided at both the pod & container level, the container options override the pod options. Note that this field cannot be set when spec.os.name is windows. properties: localhostProfile: description: |- localhostProfile indicates a profile defined in a file on the node should be used. The profile must be preconfigured on the node to work. Must be a descending path, relative to the kubelet's configured seccomp profile location. Must be set if type is "Localhost". Must NOT be set for any other type. type: string type: description: |- type indicates which kind of seccomp profile will be applied. Valid options are: Localhost - a profile defined in a file on the node should be used. 
RuntimeDefault - the container runtime default profile should be used. Unconfined - no profile should be applied. type: string required: - type type: object windowsOptions: description: |- The Windows specific settings applied to all containers. If unspecified, the options from the PodSecurityContext will be used. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. Note that this field cannot be set when spec.os.name is linux. properties: gmsaCredentialSpec: description: |- GMSACredentialSpec is where the GMSA admission webhook (https://github.com/kubernetes-sigs/windows-gmsa) inlines the contents of the GMSA credential spec named by the GMSACredentialSpecName field. type: string gmsaCredentialSpecName: description: GMSACredentialSpecName is the name of the GMSA credential spec to use. type: string hostProcess: description: |- HostProcess determines if a container should be run as a 'Host Process' container. All of a Pod's containers must have the same effective HostProcess value (it is not allowed to have a mix of HostProcess containers and non-HostProcess containers). In addition, if HostProcess is true then HostNetwork must also be set to true. type: boolean runAsUserName: description: |- The UserName in Windows to run the entrypoint of the container process. Defaults to the user specified in image metadata if unspecified. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. type: string type: object type: object startupProbe: description: |- StartupProbe indicates that the Pod has successfully initialized. If specified, no other probes are executed until this completes successfully. If this probe fails, the Pod will be restarted, just as if the livenessProbe failed. 
This can be used to provide different probe parameters at the beginning of a Pod's lifecycle, when it might take a long time to load data or warm a cache, than during steady-state operation. This cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes properties: exec: description: Exec specifies a command to execute in the container. properties: command: description: |- Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy. items: type: string type: array x-kubernetes-list-type: atomic type: object failureThreshold: description: |- Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1. format: int32 type: integer grpc: description: GRPC specifies a GRPC HealthCheckRequest. properties: port: description: Port number of the gRPC service. Number must be in the range 1 to 65535. format: int32 type: integer service: default: "" description: |- Service is the name of the service to place in the gRPC HealthCheckRequest (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). If this is not specified, the default behavior is defined by gRPC. type: string required: - port type: object httpGet: description: HTTPGet specifies an HTTP GET request to perform. properties: host: description: |- Host name to connect to, defaults to the pod IP. You probably want to set "Host" in httpHeaders instead. type: string httpHeaders: description: Custom headers to set in the request. HTTP allows repeated headers. 
items: description: HTTPHeader describes a custom header to be used in HTTP probes properties: name: description: |- The header field name. This will be canonicalized upon output, so case-variant names will be understood as the same header. type: string value: description: The header field value type: string required: - name - value type: object type: array x-kubernetes-list-type: atomic path: description: Path to access on the HTTP server. type: string port: anyOf: - type: integer - type: string description: |- Name or number of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true scheme: description: |- Scheme to use for connecting to the host. Defaults to HTTP. type: string required: - port type: object initialDelaySeconds: description: |- Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes format: int32 type: integer periodSeconds: description: |- How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1. format: int32 type: integer successThreshold: description: |- Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. format: int32 type: integer tcpSocket: description: TCPSocket specifies a connection to a TCP port. properties: host: description: 'Optional: Host name to connect to, defaults to the pod IP.' type: string port: anyOf: - type: integer - type: string description: |- Number or name of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true required: - port type: object terminationGracePeriodSeconds: description: |- Optional duration in seconds the pod needs to terminate gracefully upon probe failure. 
The grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal. Set this value longer than the expected cleanup time for your process. If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this value overrides the value provided by the pod spec. Value must be non-negative integer. The value zero indicates stop immediately via the kill signal (no opportunity to shut down). This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. format: int64 type: integer timeoutSeconds: description: |- Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes format: int32 type: integer type: object stdin: description: |- Whether this container should allocate a buffer for stdin in the container runtime. If this is not set, reads from stdin in the container will always result in EOF. Default is false. type: boolean stdinOnce: description: |- Whether the container runtime should close the stdin channel after it has been opened by a single attach. When stdin is true the stdin stream will remain open across multiple attach sessions. If stdinOnce is set to true, stdin is opened on container start, is empty until the first client attaches to stdin, and then remains open and accepts data until the client disconnects, at which time stdin is closed and remains closed until the container is restarted. If this flag is false, a container process that reads from stdin will never receive an EOF. Default is false. type: boolean terminationMessagePath: description: |- Optional: Path at which the file to which the container's termination message will be written is mounted into the container's filesystem.
Message written is intended to be brief final status, such as an assertion failure message. Will be truncated by the node if greater than 4096 bytes. The total message length across all containers will be limited to 12kb. Defaults to /dev/termination-log. Cannot be updated. type: string terminationMessagePolicy: description: |- Indicate how the termination message should be populated. File will use the contents of terminationMessagePath to populate the container status message on both success and failure. FallbackToLogsOnError will use the last chunk of container log output if the termination message file is empty and the container exited with an error. The log output is limited to 2048 bytes or 80 lines, whichever is smaller. Defaults to File. Cannot be updated. type: string tty: description: |- Whether this container should allocate a TTY for itself, also requires 'stdin' to be true. Default is false. type: boolean volumeDevices: description: volumeDevices is the list of block devices to be used by the container. items: description: volumeDevice describes a mapping of a raw block device within a container. properties: devicePath: description: devicePath is the path inside of the container that the device will be mapped to. type: string name: description: name must match the name of a persistentVolumeClaim in the pod type: string required: - devicePath - name type: object type: array x-kubernetes-list-map-keys: - devicePath x-kubernetes-list-type: map volumeMounts: description: |- Pod volumes to mount into the container's filesystem. Cannot be updated. items: description: VolumeMount describes a mounting of a Volume within a container. properties: mountPath: description: |- Path within the container at which the volume should be mounted. Must not contain ':'. type: string mountPropagation: description: |- mountPropagation determines how mounts are propagated from the host to container and the other way around. When not set, MountPropagationNone is used. 
This field is beta in 1.10. When RecursiveReadOnly is set to IfPossible or to Enabled, MountPropagation must be None or unspecified (which defaults to None). type: string name: description: This must match the Name of a Volume. type: string readOnly: description: |- Mounted read-only if true, read-write otherwise (false or unspecified). Defaults to false. type: boolean recursiveReadOnly: description: |- RecursiveReadOnly specifies whether read-only mounts should be handled recursively. If ReadOnly is false, this field has no meaning and must be unspecified. If ReadOnly is true, and this field is set to Disabled, the mount is not made recursively read-only. If this field is set to IfPossible, the mount is made recursively read-only, if it is supported by the container runtime. If this field is set to Enabled, the mount is made recursively read-only if it is supported by the container runtime, otherwise the pod will not be started and an error will be generated to indicate the reason. If this field is set to IfPossible or Enabled, MountPropagation must be set to None (or be unspecified, which defaults to None). If this field is not specified, it is treated as an equivalent of Disabled. type: string subPath: description: |- Path within the volume from which the container's volume should be mounted. Defaults to "" (volume's root). type: string subPathExpr: description: |- Expanded path within the volume from which the container's volume should be mounted. Behaves similarly to SubPath but environment variable references $(VAR_NAME) are expanded using the container's environment. Defaults to "" (volume's root). SubPathExpr and SubPath are mutually exclusive. type: string required: - mountPath - name type: object type: array x-kubernetes-list-map-keys: - mountPath x-kubernetes-list-type: map workingDir: description: |- Container's working directory. If not specified, the container runtime's default will be used, which might be configured in the container image. 
Cannot be updated. type: string required: - name type: object type: array x-kubernetes-list-map-keys: - name x-kubernetes-list-type: map dnsConfig: description: |- Specifies the DNS parameters of a pod. Parameters specified here will be merged to the generated DNS configuration based on DNSPolicy. properties: nameservers: description: |- A list of DNS name server IP addresses. This will be appended to the base nameservers generated from DNSPolicy. Duplicated nameservers will be removed. items: type: string type: array x-kubernetes-list-type: atomic options: description: |- A list of DNS resolver options. This will be merged with the base options generated from DNSPolicy. Duplicated entries will be removed. Resolution options given in Options will override those that appear in the base DNSPolicy. items: description: PodDNSConfigOption defines DNS resolver options of a pod. properties: name: description: |- Name is this DNS resolver option's name. Required. type: string value: description: Value is this DNS resolver option's value. type: string type: object type: array x-kubernetes-list-type: atomic searches: description: |- A list of DNS search domains for host-name lookup. This will be appended to the base search paths generated from DNSPolicy. Duplicated search paths will be removed. items: type: string type: array x-kubernetes-list-type: atomic type: object dnsPolicy: description: |- Set DNS policy for the pod. Defaults to "ClusterFirst". Valid values are 'ClusterFirstWithHostNet', 'ClusterFirst', 'Default' or 'None'. DNS parameters given in DNSConfig will be merged with the policy selected with DNSPolicy. To have DNS options set along with hostNetwork, you have to specify DNS policy explicitly to 'ClusterFirstWithHostNet'. type: string enableServiceLinks: description: |- EnableServiceLinks indicates whether information about services should be injected into pod's environment variables, matching the syntax of Docker links. Optional: Defaults to true. 
type: boolean ephemeralContainers: description: |- List of ephemeral containers run in this pod. Ephemeral containers may be run in an existing pod to perform user-initiated actions such as debugging. This list cannot be specified when creating a pod, and it cannot be modified by updating the pod spec. In order to add an ephemeral container to an existing pod, use the pod's ephemeralcontainers subresource. items: description: |- An EphemeralContainer is a temporary container that you may add to an existing Pod for user-initiated activities such as debugging. Ephemeral containers have no resource or scheduling guarantees, and they will not be restarted when they exit or when a Pod is removed or restarted. The kubelet may evict a Pod if an ephemeral container causes the Pod to exceed its resource allocation. To add an ephemeral container, use the ephemeralcontainers subresource of an existing Pod. Ephemeral containers may not be removed or restarted. properties: args: description: |- Arguments to the entrypoint. The image's CMD is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell items: type: string type: array x-kubernetes-list-type: atomic command: description: |- Entrypoint array. Not executed within a shell. The image's ENTRYPOINT is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. 
Double $$ are reduced to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell items: type: string type: array x-kubernetes-list-type: atomic env: description: |- List of environment variables to set in the container. Cannot be updated. items: description: EnvVar represents an environment variable present in a Container. properties: name: description: |- Name of the environment variable. May consist of any printable ASCII characters except '='. type: string value: description: |- Variable references $(VAR_NAME) are expanded using the previously defined environment variables in the container and any service environment variables. If a variable cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless of whether the variable exists or not. Defaults to "". type: string valueFrom: description: Source for the environment variable's value. Cannot be used if value is not empty. properties: configMapKeyRef: description: Selects a key of a ConfigMap. properties: key: description: The key to select. type: string name: default: "" description: |- Name of the referent. This field is effectively required, but due to backwards compatibility is allowed to be empty. Instances of this type with an empty value here are almost certainly wrong. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names type: string optional: description: Specify whether the ConfigMap or its key must be defined type: boolean required: - key type: object x-kubernetes-map-type: atomic fieldRef: description: |- Selects a field of the pod: supports metadata.name, metadata.namespace, `metadata.labels['']`, `metadata.annotations['']`, spec.nodeName, spec.serviceAccountName, status.hostIP, status.podIP, status.podIPs. properties: apiVersion: description: Version of the schema the FieldPath is written in terms of, defaults to "v1". type: string fieldPath: description: Path of the field to select in the specified API version. type: string required: - fieldPath type: object x-kubernetes-map-type: atomic fileKeyRef: description: |- FileKeyRef selects a key of the env file. Requires the EnvFiles feature gate to be enabled. properties: key: description: |- The key within the env file. An invalid key will prevent the pod from starting. The keys defined within a source may consist of any printable ASCII characters except '='. During Alpha stage of the EnvFiles feature gate, the key size is limited to 128 characters. type: string optional: default: false description: |- Specify whether the file or its key must be defined. If the file or key does not exist, then the env var is not published. If optional is set to true and the specified key does not exist, the environment variable will not be set in the Pod's containers. If optional is set to false and the specified key does not exist, an error will be returned during Pod creation. type: boolean path: description: |- The path within the volume from which to select the file. Must be relative and may not contain the '..' path or start with '..'. type: string volumeName: description: The name of the volume mount containing the env file. 
type: string required: - key - path - volumeName type: object x-kubernetes-map-type: atomic resourceFieldRef: description: |- Selects a resource of the container: only resources limits and requests (limits.cpu, limits.memory, limits.ephemeral-storage, requests.cpu, requests.memory and requests.ephemeral-storage) are currently supported. properties: containerName: description: 'Container name: required for volumes, optional for env vars' type: string divisor: anyOf: - type: integer - type: string description: Specifies the output format of the exposed resources, defaults to "1" pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true resource: description: 'Required: resource to select' type: string required: - resource type: object x-kubernetes-map-type: atomic secretKeyRef: description: Selects a key of a secret in the pod's namespace properties: key: description: The key of the secret to select from. Must be a valid secret key. type: string name: default: "" description: |- Name of the referent. This field is effectively required, but due to backwards compatibility is allowed to be empty. Instances of this type with an empty value here are almost certainly wrong. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names type: string optional: description: Specify whether the Secret or its key must be defined type: boolean required: - key type: object x-kubernetes-map-type: atomic type: object required: - name type: object type: array x-kubernetes-list-map-keys: - name x-kubernetes-list-type: map envFrom: description: |- List of sources to populate environment variables in the container. The keys defined within a source may consist of any printable ASCII characters except '='. When a key exists in multiple sources, the value associated with the last source will take precedence. 
Values defined by an Env with a duplicate key will take precedence. Cannot be updated. items: description: EnvFromSource represents the source of a set of ConfigMaps or Secrets properties: configMapRef: description: The ConfigMap to select from properties: name: default: "" description: |- Name of the referent. This field is effectively required, but due to backwards compatibility is allowed to be empty. Instances of this type with an empty value here are almost certainly wrong. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names type: string optional: description: Specify whether the ConfigMap must be defined type: boolean type: object x-kubernetes-map-type: atomic prefix: description: |- Optional text to prepend to the name of each environment variable. May consist of any printable ASCII characters except '='. type: string secretRef: description: The Secret to select from properties: name: default: "" description: |- Name of the referent. This field is effectively required, but due to backwards compatibility is allowed to be empty. Instances of this type with an empty value here are almost certainly wrong. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names type: string optional: description: Specify whether the Secret must be defined type: boolean type: object x-kubernetes-map-type: atomic type: object type: array x-kubernetes-list-type: atomic image: description: |- Container image name. More info: https://kubernetes.io/docs/concepts/containers/images type: string imagePullPolicy: description: |- Image pull policy. One of Always, Never, IfNotPresent. Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. Cannot be updated. More info: https://kubernetes.io/docs/concepts/containers/images#updating-images type: string lifecycle: description: Lifecycle is not allowed for ephemeral containers. 
properties: postStart: description: |- PostStart is called immediately after a container is created. If the handler fails, the container is terminated and restarted according to its restart policy. Other management of the container blocks until the hook completes. More info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks properties: exec: description: Exec specifies a command to execute in the container. properties: command: description: |- Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy. items: type: string type: array x-kubernetes-list-type: atomic type: object httpGet: description: HTTPGet specifies an HTTP GET request to perform. properties: host: description: |- Host name to connect to, defaults to the pod IP. You probably want to set "Host" in httpHeaders instead. type: string httpHeaders: description: Custom headers to set in the request. HTTP allows repeated headers. items: description: HTTPHeader describes a custom header to be used in HTTP probes properties: name: description: |- The header field name. This will be canonicalized upon output, so case-variant names will be understood as the same header. type: string value: description: The header field value type: string required: - name - value type: object type: array x-kubernetes-list-type: atomic path: description: Path to access on the HTTP server. type: string port: anyOf: - type: integer - type: string description: |- Name or number of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. 
x-kubernetes-int-or-string: true scheme: description: |- Scheme to use for connecting to the host. Defaults to HTTP. type: string required: - port type: object sleep: description: Sleep represents a duration that the container should sleep. properties: seconds: description: Seconds is the number of seconds to sleep. format: int64 type: integer required: - seconds type: object tcpSocket: description: |- Deprecated. TCPSocket is NOT supported as a LifecycleHandler and kept for backward compatibility. There is no validation of this field and lifecycle hooks will fail at runtime when it is specified. properties: host: description: 'Optional: Host name to connect to, defaults to the pod IP.' type: string port: anyOf: - type: integer - type: string description: |- Number or name of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true required: - port type: object type: object preStop: description: |- PreStop is called immediately before a container is terminated due to an API request or management event such as liveness/startup probe failure, preemption, resource contention, etc. The handler is not called if the container crashes or exits. The Pod's termination grace period countdown begins before the PreStop hook is executed. Regardless of the outcome of the handler, the container will eventually terminate within the Pod's termination grace period (unless delayed by finalizers). Other management of the container blocks until the hook completes or until the termination grace period is reached. More info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks properties: exec: description: Exec specifies a command to execute in the container. properties: command: description: |- Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. 
The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy. items: type: string type: array x-kubernetes-list-type: atomic type: object httpGet: description: HTTPGet specifies an HTTP GET request to perform. properties: host: description: |- Host name to connect to, defaults to the pod IP. You probably want to set "Host" in httpHeaders instead. type: string httpHeaders: description: Custom headers to set in the request. HTTP allows repeated headers. items: description: HTTPHeader describes a custom header to be used in HTTP probes properties: name: description: |- The header field name. This will be canonicalized upon output, so case-variant names will be understood as the same header. type: string value: description: The header field value type: string required: - name - value type: object type: array x-kubernetes-list-type: atomic path: description: Path to access on the HTTP server. type: string port: anyOf: - type: integer - type: string description: |- Name or number of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true scheme: description: |- Scheme to use for connecting to the host. Defaults to HTTP. type: string required: - port type: object sleep: description: Sleep represents a duration that the container should sleep. properties: seconds: description: Seconds is the number of seconds to sleep. format: int64 type: integer required: - seconds type: object tcpSocket: description: |- Deprecated. TCPSocket is NOT supported as a LifecycleHandler and kept for backward compatibility. There is no validation of this field and lifecycle hooks will fail at runtime when it is specified. properties: host: description: 'Optional: Host name to connect to, defaults to the pod IP.' 
type: string port: anyOf: - type: integer - type: string description: |- Number or name of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true required: - port type: object type: object stopSignal: description: |- StopSignal defines which signal will be sent to a container when it is being stopped. If not specified, the default is defined by the container runtime in use. StopSignal can only be set for Pods with a non-empty .spec.os.name type: string type: object livenessProbe: description: Probes are not allowed for ephemeral containers. properties: exec: description: Exec specifies a command to execute in the container. properties: command: description: |- Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy. items: type: string type: array x-kubernetes-list-type: atomic type: object failureThreshold: description: |- Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1. format: int32 type: integer grpc: description: GRPC specifies a GRPC HealthCheckRequest. properties: port: description: Port number of the gRPC service. Number must be in the range 1 to 65535. format: int32 type: integer service: default: "" description: |- Service is the name of the service to place in the gRPC HealthCheckRequest (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). If this is not specified, the default behavior is defined by gRPC. type: string required: - port type: object httpGet: description: HTTPGet specifies an HTTP GET request to perform. 
properties: host: description: |- Host name to connect to, defaults to the pod IP. You probably want to set "Host" in httpHeaders instead. type: string httpHeaders: description: Custom headers to set in the request. HTTP allows repeated headers. items: description: HTTPHeader describes a custom header to be used in HTTP probes properties: name: description: |- The header field name. This will be canonicalized upon output, so case-variant names will be understood as the same header. type: string value: description: The header field value type: string required: - name - value type: object type: array x-kubernetes-list-type: atomic path: description: Path to access on the HTTP server. type: string port: anyOf: - type: integer - type: string description: |- Name or number of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true scheme: description: |- Scheme to use for connecting to the host. Defaults to HTTP. type: string required: - port type: object initialDelaySeconds: description: |- Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes format: int32 type: integer periodSeconds: description: |- How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1. format: int32 type: integer successThreshold: description: |- Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. format: int32 type: integer tcpSocket: description: TCPSocket specifies a connection to a TCP port. properties: host: description: 'Optional: Host name to connect to, defaults to the pod IP.' type: string port: anyOf: - type: integer - type: string description: |- Number or name of the port to access on the container. 
Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true required: - port type: object terminationGracePeriodSeconds: description: |- Optional duration in seconds the pod needs to terminate gracefully upon probe failure. The grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal. Set this value longer than the expected cleanup time for your process. If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this value overrides the value provided by the pod spec. Value must be non-negative integer. The value zero indicates stop immediately via the kill signal (no opportunity to shut down). This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. format: int64 type: integer timeoutSeconds: description: |- Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes format: int32 type: integer type: object name: description: |- Name of the ephemeral container specified as a DNS_LABEL. This name must be unique among all containers, init containers and ephemeral containers. type: string ports: description: Ports are not allowed for ephemeral containers. items: description: ContainerPort represents a network port in a single container. properties: containerPort: description: |- Number of port to expose on the pod's IP address. This must be a valid port number, 0 < x < 65536. format: int32 type: integer hostIP: description: What host IP to bind the external port to. type: string hostPort: description: |- Number of port to expose on the host. If specified, this must be a valid port number, 0 < x < 65536. 
If HostNetwork is specified, this must match ContainerPort. Most containers do not need this. format: int32 type: integer name: description: |- If specified, this must be an IANA_SVC_NAME and unique within the pod. Each named port in a pod must have a unique name. Name for the port that can be referred to by services. type: string protocol: default: TCP description: |- Protocol for port. Must be UDP, TCP, or SCTP. Defaults to "TCP". type: string required: - containerPort type: object type: array x-kubernetes-list-map-keys: - containerPort - protocol x-kubernetes-list-type: map readinessProbe: description: Probes are not allowed for ephemeral containers. properties: exec: description: Exec specifies a command to execute in the container. properties: command: description: |- Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy. items: type: string type: array x-kubernetes-list-type: atomic type: object failureThreshold: description: |- Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1. format: int32 type: integer grpc: description: GRPC specifies a GRPC HealthCheckRequest. properties: port: description: Port number of the gRPC service. Number must be in the range 1 to 65535. format: int32 type: integer service: default: "" description: |- Service is the name of the service to place in the gRPC HealthCheckRequest (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). If this is not specified, the default behavior is defined by gRPC. 
type: string required: - port type: object httpGet: description: HTTPGet specifies an HTTP GET request to perform. properties: host: description: |- Host name to connect to, defaults to the pod IP. You probably want to set "Host" in httpHeaders instead. type: string httpHeaders: description: Custom headers to set in the request. HTTP allows repeated headers. items: description: HTTPHeader describes a custom header to be used in HTTP probes properties: name: description: |- The header field name. This will be canonicalized upon output, so case-variant names will be understood as the same header. type: string value: description: The header field value type: string required: - name - value type: object type: array x-kubernetes-list-type: atomic path: description: Path to access on the HTTP server. type: string port: anyOf: - type: integer - type: string description: |- Name or number of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true scheme: description: |- Scheme to use for connecting to the host. Defaults to HTTP. type: string required: - port type: object initialDelaySeconds: description: |- Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes format: int32 type: integer periodSeconds: description: |- How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1. format: int32 type: integer successThreshold: description: |- Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. format: int32 type: integer tcpSocket: description: TCPSocket specifies a connection to a TCP port. properties: host: description: 'Optional: Host name to connect to, defaults to the pod IP.' 
type: string port: anyOf: - type: integer - type: string description: |- Number or name of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true required: - port type: object terminationGracePeriodSeconds: description: |- Optional duration in seconds the pod needs to terminate gracefully upon probe failure. The grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal. Set this value longer than the expected cleanup time for your process. If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this value overrides the value provided by the pod spec. Value must be non-negative integer. The value zero indicates stop immediately via the kill signal (no opportunity to shut down). This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. format: int64 type: integer timeoutSeconds: description: |- Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes format: int32 type: integer type: object resizePolicy: description: Resources resize policy for the container. items: description: ContainerResizePolicy represents resource resize policy for the container. properties: resourceName: description: |- Name of the resource to which this resource resize policy applies. Supported values: cpu, memory. type: string restartPolicy: description: |- Restart policy to apply when specified resource is resized. If not specified, it defaults to NotRequired. 
type: string required: - resourceName - restartPolicy type: object type: array x-kubernetes-list-type: atomic resources: description: |- Resources are not allowed for ephemeral containers. Ephemeral containers use spare resources already allocated to the pod. properties: claims: description: |- Claims lists the names of resources, defined in spec.resourceClaims, that are used by this container. This field depends on the DynamicResourceAllocation feature gate. This field is immutable. It can only be set for containers. items: description: ResourceClaim references one entry in PodSpec.ResourceClaims. properties: name: description: |- Name must match the name of one entry in pod.spec.resourceClaims of the Pod where this field is used. It makes that resource available inside a container. type: string request: description: |- Request is the name chosen for a request in the referenced claim. If empty, everything from the claim is made available, otherwise only the result of this request. type: string required: - name type: object type: array x-kubernetes-list-map-keys: - name x-kubernetes-list-type: map limits: additionalProperties: anyOf: - type: integer - type: string pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true description: |- Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ type: object requests: additionalProperties: anyOf: - type: integer - type: string pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true description: |- Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. 
Requests cannot exceed Limits. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ type: object type: object restartPolicy: description: |- Restart policy for the container to manage the restart behavior of each container within a pod. You cannot set this field on ephemeral containers. type: string restartPolicyRules: description: |- Represents a list of rules to be checked to determine if the container should be restarted on exit. You cannot set this field on ephemeral containers. items: description: ContainerRestartRule describes how a container exit is handled. properties: action: description: |- Specifies the action taken on a container exit if the requirements are satisfied. The only possible value is "Restart" to restart the container. type: string exitCodes: description: Represents the exit codes to check on container exits. properties: operator: description: |- Represents the relationship between the container exit code(s) and the specified values. Possible values are: - In: the requirement is satisfied if the container exit code is in the set of specified values. - NotIn: the requirement is satisfied if the container exit code is not in the set of specified values. type: string values: description: |- Specifies the set of values to check for container exit codes. At most 255 elements are allowed. items: format: int32 type: integer type: array x-kubernetes-list-type: set required: - operator type: object required: - action type: object type: array x-kubernetes-list-type: atomic securityContext: description: |- Optional: SecurityContext defines the security options the ephemeral container should be run with. If set, the fields of SecurityContext override the equivalent fields of PodSecurityContext. properties: allowPrivilegeEscalation: description: |- AllowPrivilegeEscalation controls whether a process can gain more privileges than its parent process. 
This bool directly controls if the no_new_privs flag will be set on the container process. AllowPrivilegeEscalation is true always when the container is: 1) run as Privileged 2) has CAP_SYS_ADMIN Note that this field cannot be set when spec.os.name is windows. type: boolean appArmorProfile: description: |- appArmorProfile is the AppArmor options to use by this container. If set, this profile overrides the pod's appArmorProfile. Note that this field cannot be set when spec.os.name is windows. properties: localhostProfile: description: |- localhostProfile indicates a profile loaded on the node that should be used. The profile must be preconfigured on the node to work. Must match the loaded name of the profile. Must be set if and only if type is "Localhost". type: string type: description: |- type indicates which kind of AppArmor profile will be applied. Valid options are: Localhost - a profile pre-loaded on the node. RuntimeDefault - the container runtime's default profile. Unconfined - no AppArmor enforcement. type: string required: - type type: object capabilities: description: |- The capabilities to add/drop when running containers. Defaults to the default set of capabilities granted by the container runtime. Note that this field cannot be set when spec.os.name is windows. properties: add: description: Added capabilities items: description: Capability represent POSIX capabilities type type: string type: array x-kubernetes-list-type: atomic drop: description: Removed capabilities items: description: Capability represent POSIX capabilities type type: string type: array x-kubernetes-list-type: atomic type: object privileged: description: |- Run container in privileged mode. Processes in privileged containers are essentially equivalent to root on the host. Defaults to false. Note that this field cannot be set when spec.os.name is windows. type: boolean procMount: description: |- procMount denotes the type of proc mount to use for the containers. 
The default value is Default which uses the container runtime defaults for readonly paths and masked paths. This requires the ProcMountType feature flag to be enabled. Note that this field cannot be set when spec.os.name is windows. type: string readOnlyRootFilesystem: description: |- Whether this container has a read-only root filesystem. Default is false. Note that this field cannot be set when spec.os.name is windows. type: boolean runAsGroup: description: |- The GID to run the entrypoint of the container process. Uses runtime default if unset. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. Note that this field cannot be set when spec.os.name is windows. format: int64 type: integer runAsNonRoot: description: |- Indicates that the container must run as a non-root user. If true, the Kubelet will validate the image at runtime to ensure that it does not run as UID 0 (root) and fail to start the container if it does. If unset or false, no such validation will be performed. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. type: boolean runAsUser: description: |- The UID to run the entrypoint of the container process. Defaults to user specified in image metadata if unspecified. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. Note that this field cannot be set when spec.os.name is windows. format: int64 type: integer seLinuxOptions: description: |- The SELinux context to be applied to the container. If unspecified, the container runtime will allocate a random SELinux context for each container. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. 
Note that this field cannot be set when spec.os.name is windows. properties: level: description: Level is SELinux level label that applies to the container. type: string role: description: Role is a SELinux role label that applies to the container. type: string type: description: Type is a SELinux type label that applies to the container. type: string user: description: User is a SELinux user label that applies to the container. type: string type: object seccompProfile: description: |- The seccomp options to use by this container. If seccomp options are provided at both the pod & container level, the container options override the pod options. Note that this field cannot be set when spec.os.name is windows. properties: localhostProfile: description: |- localhostProfile indicates a profile defined in a file on the node should be used. The profile must be preconfigured on the node to work. Must be a descending path, relative to the kubelet's configured seccomp profile location. Must be set if type is "Localhost". Must NOT be set for any other type. type: string type: description: |- type indicates which kind of seccomp profile will be applied. Valid options are: Localhost - a profile defined in a file on the node should be used. RuntimeDefault - the container runtime default profile should be used. Unconfined - no profile should be applied. type: string required: - type type: object windowsOptions: description: |- The Windows specific settings applied to all containers. If unspecified, the options from the PodSecurityContext will be used. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. Note that this field cannot be set when spec.os.name is linux. properties: gmsaCredentialSpec: description: |- GMSACredentialSpec is where the GMSA admission webhook (https://github.com/kubernetes-sigs/windows-gmsa) inlines the contents of the GMSA credential spec named by the GMSACredentialSpecName field. 
type: string gmsaCredentialSpecName: description: GMSACredentialSpecName is the name of the GMSA credential spec to use. type: string hostProcess: description: |- HostProcess determines if a container should be run as a 'Host Process' container. All of a Pod's containers must have the same effective HostProcess value (it is not allowed to have a mix of HostProcess containers and non-HostProcess containers). In addition, if HostProcess is true then HostNetwork must also be set to true. type: boolean runAsUserName: description: |- The UserName in Windows to run the entrypoint of the container process. Defaults to the user specified in image metadata if unspecified. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. type: string type: object type: object startupProbe: description: Probes are not allowed for ephemeral containers. properties: exec: description: Exec specifies a command to execute in the container. properties: command: description: |- Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy. items: type: string type: array x-kubernetes-list-type: atomic type: object failureThreshold: description: |- Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1. format: int32 type: integer grpc: description: GRPC specifies a GRPC HealthCheckRequest. properties: port: description: Port number of the gRPC service. Number must be in the range 1 to 65535. 
format: int32 type: integer service: default: "" description: |- Service is the name of the service to place in the gRPC HealthCheckRequest (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). If this is not specified, the default behavior is defined by gRPC. type: string required: - port type: object httpGet: description: HTTPGet specifies an HTTP GET request to perform. properties: host: description: |- Host name to connect to, defaults to the pod IP. You probably want to set "Host" in httpHeaders instead. type: string httpHeaders: description: Custom headers to set in the request. HTTP allows repeated headers. items: description: HTTPHeader describes a custom header to be used in HTTP probes properties: name: description: |- The header field name. This will be canonicalized upon output, so case-variant names will be understood as the same header. type: string value: description: The header field value type: string required: - name - value type: object type: array x-kubernetes-list-type: atomic path: description: Path to access on the HTTP server. type: string port: anyOf: - type: integer - type: string description: |- Name or number of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true scheme: description: |- Scheme to use for connecting to the host. Defaults to HTTP. type: string required: - port type: object initialDelaySeconds: description: |- Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes format: int32 type: integer periodSeconds: description: |- How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1. format: int32 type: integer successThreshold: description: |- Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. 
Must be 1 for liveness and startup. Minimum value is 1. format: int32 type: integer tcpSocket: description: TCPSocket specifies a connection to a TCP port. properties: host: description: 'Optional: Host name to connect to, defaults to the pod IP.' type: string port: anyOf: - type: integer - type: string description: |- Number or name of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true required: - port type: object terminationGracePeriodSeconds: description: |- Optional duration in seconds the pod needs to terminate gracefully upon probe failure. The grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal. Set this value longer than the expected cleanup time for your process. If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this value overrides the value provided by the pod spec. Value must be non-negative integer. The value zero indicates stop immediately via the kill signal (no opportunity to shut down). This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. format: int64 type: integer timeoutSeconds: description: |- Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes format: int32 type: integer type: object stdin: description: |- Whether this container should allocate a buffer for stdin in the container runtime. If this is not set, reads from stdin in the container will always result in EOF. Default is false. type: boolean stdinOnce: description: |- Whether the container runtime should close the stdin channel after it has been opened by a single attach. 
When stdin is true the stdin stream will remain open across multiple attach sessions. If stdinOnce is set to true, stdin is opened on container start, is empty until the first client attaches to stdin, and then remains open and accepts data until the client disconnects, at which time stdin is closed and remains closed until the container is restarted. If this flag is false, a container processes that reads from stdin will never receive an EOF. Default is false type: boolean targetContainerName: description: |- If set, the name of the container from PodSpec that this ephemeral container targets. The ephemeral container will be run in the namespaces (IPC, PID, etc) of this container. If not set then the ephemeral container uses the namespaces configured in the Pod spec. The container runtime must implement support for this feature. If the runtime does not support namespace targeting then the result of setting this field is undefined. type: string terminationMessagePath: description: |- Optional: Path at which the file to which the container's termination message will be written is mounted into the container's filesystem. Message written is intended to be brief final status, such as an assertion failure message. Will be truncated by the node if greater than 4096 bytes. The total message length across all containers will be limited to 12kb. Defaults to /dev/termination-log. Cannot be updated. type: string terminationMessagePolicy: description: |- Indicate how the termination message should be populated. File will use the contents of terminationMessagePath to populate the container status message on both success and failure. FallbackToLogsOnError will use the last chunk of container log output if the termination message file is empty and the container exited with an error. The log output is limited to 2048 bytes or 80 lines, whichever is smaller. Defaults to File. Cannot be updated. 
type: string tty: description: |- Whether this container should allocate a TTY for itself, also requires 'stdin' to be true. Default is false. type: boolean volumeDevices: description: volumeDevices is the list of block devices to be used by the container. items: description: volumeDevice describes a mapping of a raw block device within a container. properties: devicePath: description: devicePath is the path inside of the container that the device will be mapped to. type: string name: description: name must match the name of a persistentVolumeClaim in the pod type: string required: - devicePath - name type: object type: array x-kubernetes-list-map-keys: - devicePath x-kubernetes-list-type: map volumeMounts: description: |- Pod volumes to mount into the container's filesystem. Subpath mounts are not allowed for ephemeral containers. Cannot be updated. items: description: VolumeMount describes a mounting of a Volume within a container. properties: mountPath: description: |- Path within the container at which the volume should be mounted. Must not contain ':'. type: string mountPropagation: description: |- mountPropagation determines how mounts are propagated from the host to container and the other way around. When not set, MountPropagationNone is used. This field is beta in 1.10. When RecursiveReadOnly is set to IfPossible or to Enabled, MountPropagation must be None or unspecified (which defaults to None). type: string name: description: This must match the Name of a Volume. type: string readOnly: description: |- Mounted read-only if true, read-write otherwise (false or unspecified). Defaults to false. type: boolean recursiveReadOnly: description: |- RecursiveReadOnly specifies whether read-only mounts should be handled recursively. If ReadOnly is false, this field has no meaning and must be unspecified. If ReadOnly is true, and this field is set to Disabled, the mount is not made recursively read-only. 
If this field is set to IfPossible, the mount is made recursively read-only, if it is supported by the container runtime. If this field is set to Enabled, the mount is made recursively read-only if it is supported by the container runtime, otherwise the pod will not be started and an error will be generated to indicate the reason. If this field is set to IfPossible or Enabled, MountPropagation must be set to None (or be unspecified, which defaults to None). If this field is not specified, it is treated as an equivalent of Disabled. type: string subPath: description: |- Path within the volume from which the container's volume should be mounted. Defaults to "" (volume's root). type: string subPathExpr: description: |- Expanded path within the volume from which the container's volume should be mounted. Behaves similarly to SubPath but environment variable references $(VAR_NAME) are expanded using the container's environment. Defaults to "" (volume's root). SubPathExpr and SubPath are mutually exclusive. type: string required: - mountPath - name type: object type: array x-kubernetes-list-map-keys: - mountPath x-kubernetes-list-type: map workingDir: description: |- Container's working directory. If not specified, the container runtime's default will be used, which might be configured in the container image. Cannot be updated. type: string required: - name type: object type: array x-kubernetes-list-map-keys: - name x-kubernetes-list-type: map hostAliases: description: |- HostAliases is an optional list of hosts and IPs that will be injected into the pod's hosts file if specified. items: description: |- HostAlias holds the mapping between IP and hostnames that will be injected as an entry in the pod's hosts file. properties: hostnames: description: Hostnames for the above IP address. items: type: string type: array x-kubernetes-list-type: atomic ip: description: IP address of the host file entry. 
type: string required: - ip type: object type: array x-kubernetes-list-map-keys: - ip x-kubernetes-list-type: map hostIPC: description: |- Use the host's ipc namespace. Optional: Default to false. type: boolean hostNetwork: description: |- Host networking requested for this pod. Use the host's network namespace. When using HostNetwork you should specify ports so the scheduler is aware. When `hostNetwork` is true, specified `hostPort` fields in port definitions must match `containerPort`, and unspecified `hostPort` fields in port definitions are defaulted to match `containerPort`. Default to false. type: boolean hostPID: description: |- Use the host's pid namespace. Optional: Default to false. type: boolean hostUsers: description: |- Use the host's user namespace. Optional: Default to true. If set to true or not present, the pod will be run in the host user namespace, useful for when the pod needs a feature only available to the host user namespace, such as loading a kernel module with CAP_SYS_MODULE. When set to false, a new userns is created for the pod. Setting false is useful for mitigating container breakout vulnerabilities even allowing users to run their containers as root without actually having root privileges on the host. This field is alpha-level and is only honored by servers that enable the UserNamespacesSupport feature. type: boolean hostname: description: |- Specifies the hostname of the Pod If not specified, the pod's hostname will be set to a system-defined value. type: string hostnameOverride: description: |- HostnameOverride specifies an explicit override for the pod's hostname as perceived by the pod. This field only specifies the pod's hostname and does not affect its DNS records. When this field is set to a non-empty string: - It takes precedence over the values set in `hostname` and `subdomain`. - The Pod's hostname will be set to this value. - `setHostnameAsFQDN` must be nil or set to false. - `hostNetwork` must be set to false. 
This field must be a valid DNS subdomain as defined in RFC 1123 and contain at most 64 characters. Requires the HostnameOverride feature gate to be enabled. type: string imagePullSecrets: description: |- ImagePullSecrets is an optional list of references to secrets in the same namespace to use for pulling any of the images used by this PodSpec. If specified, these secrets will be passed to individual puller implementations for them to use. More info: https://kubernetes.io/docs/concepts/containers/images#specifying-imagepullsecrets-on-a-pod items: description: |- LocalObjectReference contains enough information to let you locate the referenced object inside the same namespace. properties: name: default: "" description: |- Name of the referent. This field is effectively required, but due to backwards compatibility is allowed to be empty. Instances of this type with an empty value here are almost certainly wrong. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names type: string type: object x-kubernetes-map-type: atomic type: array x-kubernetes-list-map-keys: - name x-kubernetes-list-type: map initContainers: description: |- List of initialization containers belonging to the pod. Init containers are executed in order prior to containers being started. If any init container fails, the pod is considered to have failed and is handled according to its restartPolicy. The name for an init container or normal container must be unique among all containers. Init containers may not have Lifecycle actions, Readiness probes, Liveness probes, or Startup probes. The resourceRequirements of an init container are taken into account during scheduling by finding the highest request/limit for each resource type, and then using the max of that value or the sum of the normal containers. Limits are applied to init containers in a similar fashion. Init containers cannot currently be added or removed. Cannot be updated. 
More info: https://kubernetes.io/docs/concepts/workloads/pods/init-containers/ items: description: A single application container that you want to run within a pod. properties: args: description: |- Arguments to the entrypoint. The container image's CMD is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell items: type: string type: array x-kubernetes-list-type: atomic command: description: |- Entrypoint array. Not executed within a shell. The container image's ENTRYPOINT is used if this is not provided. Variable references $(VAR_NAME) are expanded using the container's environment. If a variable cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless of whether the variable exists or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell items: type: string type: array x-kubernetes-list-type: atomic env: description: |- List of environment variables to set in the container. Cannot be updated. items: description: EnvVar represents an environment variable present in a Container. properties: name: description: |- Name of the environment variable. May consist of any printable ASCII characters except '='. 
type: string value: description: |- Variable references $(VAR_NAME) are expanded using the previously defined environment variables in the container and any service environment variables. If a variable cannot be resolved, the reference in the input string will be unchanged. Double $$ are reduced to a single $, which allows for escaping the $(VAR_NAME) syntax: i.e. "$$(VAR_NAME)" will produce the string literal "$(VAR_NAME)". Escaped references will never be expanded, regardless of whether the variable exists or not. Defaults to "". type: string valueFrom: description: Source for the environment variable's value. Cannot be used if value is not empty. properties: configMapKeyRef: description: Selects a key of a ConfigMap. properties: key: description: The key to select. type: string name: default: "" description: |- Name of the referent. This field is effectively required, but due to backwards compatibility is allowed to be empty. Instances of this type with an empty value here are almost certainly wrong. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names type: string optional: description: Specify whether the ConfigMap or its key must be defined type: boolean required: - key type: object x-kubernetes-map-type: atomic fieldRef: description: |- Selects a field of the pod: supports metadata.name, metadata.namespace, `metadata.labels['']`, `metadata.annotations['']`, spec.nodeName, spec.serviceAccountName, status.hostIP, status.podIP, status.podIPs. properties: apiVersion: description: Version of the schema the FieldPath is written in terms of, defaults to "v1". type: string fieldPath: description: Path of the field to select in the specified API version. type: string required: - fieldPath type: object x-kubernetes-map-type: atomic fileKeyRef: description: |- FileKeyRef selects a key of the env file. Requires the EnvFiles feature gate to be enabled. properties: key: description: |- The key within the env file. 
An invalid key will prevent the pod from starting. The keys defined within a source may consist of any printable ASCII characters except '='. During Alpha stage of the EnvFiles feature gate, the key size is limited to 128 characters. type: string optional: default: false description: |- Specify whether the file or its key must be defined. If the file or key does not exist, then the env var is not published. If optional is set to true and the specified key does not exist, the environment variable will not be set in the Pod's containers. If optional is set to false and the specified key does not exist, an error will be returned during Pod creation. type: boolean path: description: |- The path within the volume from which to select the file. Must be relative and may not contain the '..' path or start with '..'. type: string volumeName: description: The name of the volume mount containing the env file. type: string required: - key - path - volumeName type: object x-kubernetes-map-type: atomic resourceFieldRef: description: |- Selects a resource of the container: only resources limits and requests (limits.cpu, limits.memory, limits.ephemeral-storage, requests.cpu, requests.memory and requests.ephemeral-storage) are currently supported. properties: containerName: description: 'Container name: required for volumes, optional for env vars' type: string divisor: anyOf: - type: integer - type: string description: Specifies the output format of the exposed resources, defaults to "1" pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true resource: description: 'Required: resource to select' type: string required: - resource type: object x-kubernetes-map-type: atomic secretKeyRef: description: Selects a key of a secret in the pod's namespace properties: key: description: The key of the secret to select from. Must be a valid secret key. 
type: string name: default: "" description: |- Name of the referent. This field is effectively required, but due to backwards compatibility is allowed to be empty. Instances of this type with an empty value here are almost certainly wrong. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names type: string optional: description: Specify whether the Secret or its key must be defined type: boolean required: - key type: object x-kubernetes-map-type: atomic type: object required: - name type: object type: array x-kubernetes-list-map-keys: - name x-kubernetes-list-type: map envFrom: description: |- List of sources to populate environment variables in the container. The keys defined within a source may consist of any printable ASCII characters except '='. When a key exists in multiple sources, the value associated with the last source will take precedence. Values defined by an Env with a duplicate key will take precedence. Cannot be updated. items: description: EnvFromSource represents the source of a set of ConfigMaps or Secrets properties: configMapRef: description: The ConfigMap to select from properties: name: default: "" description: |- Name of the referent. This field is effectively required, but due to backwards compatibility is allowed to be empty. Instances of this type with an empty value here are almost certainly wrong. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names type: string optional: description: Specify whether the ConfigMap must be defined type: boolean type: object x-kubernetes-map-type: atomic prefix: description: |- Optional text to prepend to the name of each environment variable. May consist of any printable ASCII characters except '='. type: string secretRef: description: The Secret to select from properties: name: default: "" description: |- Name of the referent. This field is effectively required, but due to backwards compatibility is allowed to be empty. 
Instances of this type with an empty value here are almost certainly wrong. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names type: string optional: description: Specify whether the Secret must be defined type: boolean type: object x-kubernetes-map-type: atomic type: object type: array x-kubernetes-list-type: atomic image: description: |- Container image name. More info: https://kubernetes.io/docs/concepts/containers/images This field is optional to allow higher level config management to default or override container images in workload controllers like Deployments and StatefulSets. type: string imagePullPolicy: description: |- Image pull policy. One of Always, Never, IfNotPresent. Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. Cannot be updated. More info: https://kubernetes.io/docs/concepts/containers/images#updating-images type: string lifecycle: description: |- Actions that the management system should take in response to container lifecycle events. Cannot be updated. properties: postStart: description: |- PostStart is called immediately after a container is created. If the handler fails, the container is terminated and restarted according to its restart policy. Other management of the container blocks until the hook completes. More info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks properties: exec: description: Exec specifies a command to execute in the container. properties: command: description: |- Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy. 
items: type: string type: array x-kubernetes-list-type: atomic type: object httpGet: description: HTTPGet specifies an HTTP GET request to perform. properties: host: description: |- Host name to connect to, defaults to the pod IP. You probably want to set "Host" in httpHeaders instead. type: string httpHeaders: description: Custom headers to set in the request. HTTP allows repeated headers. items: description: HTTPHeader describes a custom header to be used in HTTP probes properties: name: description: |- The header field name. This will be canonicalized upon output, so case-variant names will be understood as the same header. type: string value: description: The header field value type: string required: - name - value type: object type: array x-kubernetes-list-type: atomic path: description: Path to access on the HTTP server. type: string port: anyOf: - type: integer - type: string description: |- Name or number of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true scheme: description: |- Scheme to use for connecting to the host. Defaults to HTTP. type: string required: - port type: object sleep: description: Sleep represents a duration that the container should sleep. properties: seconds: description: Seconds is the number of seconds to sleep. format: int64 type: integer required: - seconds type: object tcpSocket: description: |- Deprecated. TCPSocket is NOT supported as a LifecycleHandler and kept for backward compatibility. There is no validation of this field and lifecycle hooks will fail at runtime when it is specified. properties: host: description: 'Optional: Host name to connect to, defaults to the pod IP.' type: string port: anyOf: - type: integer - type: string description: |- Number or name of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. 
x-kubernetes-int-or-string: true required: - port type: object type: object preStop: description: |- PreStop is called immediately before a container is terminated due to an API request or management event such as liveness/startup probe failure, preemption, resource contention, etc. The handler is not called if the container crashes or exits. The Pod's termination grace period countdown begins before the PreStop hook is executed. Regardless of the outcome of the handler, the container will eventually terminate within the Pod's termination grace period (unless delayed by finalizers). Other management of the container blocks until the hook completes or until the termination grace period is reached. More info: https://kubernetes.io/docs/concepts/containers/container-lifecycle-hooks/#container-hooks properties: exec: description: Exec specifies a command to execute in the container. properties: command: description: |- Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy. items: type: string type: array x-kubernetes-list-type: atomic type: object httpGet: description: HTTPGet specifies an HTTP GET request to perform. properties: host: description: |- Host name to connect to, defaults to the pod IP. You probably want to set "Host" in httpHeaders instead. type: string httpHeaders: description: Custom headers to set in the request. HTTP allows repeated headers. items: description: HTTPHeader describes a custom header to be used in HTTP probes properties: name: description: |- The header field name. This will be canonicalized upon output, so case-variant names will be understood as the same header. 
type: string value: description: The header field value type: string required: - name - value type: object type: array x-kubernetes-list-type: atomic path: description: Path to access on the HTTP server. type: string port: anyOf: - type: integer - type: string description: |- Name or number of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true scheme: description: |- Scheme to use for connecting to the host. Defaults to HTTP. type: string required: - port type: object sleep: description: Sleep represents a duration that the container should sleep. properties: seconds: description: Seconds is the number of seconds to sleep. format: int64 type: integer required: - seconds type: object tcpSocket: description: |- Deprecated. TCPSocket is NOT supported as a LifecycleHandler and kept for backward compatibility. There is no validation of this field and lifecycle hooks will fail at runtime when it is specified. properties: host: description: 'Optional: Host name to connect to, defaults to the pod IP.' type: string port: anyOf: - type: integer - type: string description: |- Number or name of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true required: - port type: object type: object stopSignal: description: |- StopSignal defines which signal will be sent to a container when it is being stopped. If not specified, the default is defined by the container runtime in use. StopSignal can only be set for Pods with a non-empty .spec.os.name type: string type: object livenessProbe: description: |- Periodic probe of container liveness. Container will be restarted if the probe fails. Cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes properties: exec: description: Exec specifies a command to execute in the container. 
properties: command: description: |- Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy. items: type: string type: array x-kubernetes-list-type: atomic type: object failureThreshold: description: |- Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1. format: int32 type: integer grpc: description: GRPC specifies a GRPC HealthCheckRequest. properties: port: description: Port number of the gRPC service. Number must be in the range 1 to 65535. format: int32 type: integer service: default: "" description: |- Service is the name of the service to place in the gRPC HealthCheckRequest (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). If this is not specified, the default behavior is defined by gRPC. type: string required: - port type: object httpGet: description: HTTPGet specifies an HTTP GET request to perform. properties: host: description: |- Host name to connect to, defaults to the pod IP. You probably want to set "Host" in httpHeaders instead. type: string httpHeaders: description: Custom headers to set in the request. HTTP allows repeated headers. items: description: HTTPHeader describes a custom header to be used in HTTP probes properties: name: description: |- The header field name. This will be canonicalized upon output, so case-variant names will be understood as the same header. type: string value: description: The header field value type: string required: - name - value type: object type: array x-kubernetes-list-type: atomic path: description: Path to access on the HTTP server. 
type: string port: anyOf: - type: integer - type: string description: |- Name or number of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true scheme: description: |- Scheme to use for connecting to the host. Defaults to HTTP. type: string required: - port type: object initialDelaySeconds: description: |- Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes format: int32 type: integer periodSeconds: description: |- How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1. format: int32 type: integer successThreshold: description: |- Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. format: int32 type: integer tcpSocket: description: TCPSocket specifies a connection to a TCP port. properties: host: description: 'Optional: Host name to connect to, defaults to the pod IP.' type: string port: anyOf: - type: integer - type: string description: |- Number or name of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true required: - port type: object terminationGracePeriodSeconds: description: |- Optional duration in seconds the pod needs to terminate gracefully upon probe failure. The grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal. Set this value longer than the expected cleanup time for your process. If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this value overrides the value provided by the pod spec. Value must be non-negative integer. 
The value zero indicates stop immediately via the kill signal (no opportunity to shut down). This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. format: int64 type: integer timeoutSeconds: description: |- Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes format: int32 type: integer type: object name: description: |- Name of the container specified as a DNS_LABEL. Each container in a pod must have a unique name (DNS_LABEL). Cannot be updated. type: string ports: description: |- List of ports to expose from the container. Not specifying a port here DOES NOT prevent that port from being exposed. Any port which is listening on the default "0.0.0.0" address inside a container will be accessible from the network. Modifying this array with strategic merge patch may corrupt the data. For more information See https://github.com/kubernetes/kubernetes/issues/108255. Cannot be updated. items: description: ContainerPort represents a network port in a single container. properties: containerPort: description: |- Number of port to expose on the pod's IP address. This must be a valid port number, 0 < x < 65536. format: int32 type: integer hostIP: description: What host IP to bind the external port to. type: string hostPort: description: |- Number of port to expose on the host. If specified, this must be a valid port number, 0 < x < 65536. If HostNetwork is specified, this must match ContainerPort. Most containers do not need this. format: int32 type: integer name: description: |- If specified, this must be an IANA_SVC_NAME and unique within the pod. Each named port in a pod must have a unique name. Name for the port that can be referred to by services. type: string protocol: default: TCP description: |- Protocol for port. 
Must be UDP, TCP, or SCTP. Defaults to "TCP". type: string required: - containerPort type: object type: array x-kubernetes-list-map-keys: - containerPort - protocol x-kubernetes-list-type: map readinessProbe: description: |- Periodic probe of container service readiness. Container will be removed from service endpoints if the probe fails. Cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes properties: exec: description: Exec specifies a command to execute in the container. properties: command: description: |- Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy. items: type: string type: array x-kubernetes-list-type: atomic type: object failureThreshold: description: |- Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1. format: int32 type: integer grpc: description: GRPC specifies a GRPC HealthCheckRequest. properties: port: description: Port number of the gRPC service. Number must be in the range 1 to 65535. format: int32 type: integer service: default: "" description: |- Service is the name of the service to place in the gRPC HealthCheckRequest (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). If this is not specified, the default behavior is defined by gRPC. type: string required: - port type: object httpGet: description: HTTPGet specifies an HTTP GET request to perform. properties: host: description: |- Host name to connect to, defaults to the pod IP. You probably want to set "Host" in httpHeaders instead. 
type: string httpHeaders: description: Custom headers to set in the request. HTTP allows repeated headers. items: description: HTTPHeader describes a custom header to be used in HTTP probes properties: name: description: |- The header field name. This will be canonicalized upon output, so case-variant names will be understood as the same header. type: string value: description: The header field value type: string required: - name - value type: object type: array x-kubernetes-list-type: atomic path: description: Path to access on the HTTP server. type: string port: anyOf: - type: integer - type: string description: |- Name or number of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true scheme: description: |- Scheme to use for connecting to the host. Defaults to HTTP. type: string required: - port type: object initialDelaySeconds: description: |- Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes format: int32 type: integer periodSeconds: description: |- How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1. format: int32 type: integer successThreshold: description: |- Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. format: int32 type: integer tcpSocket: description: TCPSocket specifies a connection to a TCP port. properties: host: description: 'Optional: Host name to connect to, defaults to the pod IP.' type: string port: anyOf: - type: integer - type: string description: |- Number or name of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. 
x-kubernetes-int-or-string: true required: - port type: object terminationGracePeriodSeconds: description: |- Optional duration in seconds the pod needs to terminate gracefully upon probe failure. The grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal. Set this value longer than the expected cleanup time for your process. If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this value overrides the value provided by the pod spec. Value must be non-negative integer. The value zero indicates stop immediately via the kill signal (no opportunity to shut down). This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. format: int64 type: integer timeoutSeconds: description: |- Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes format: int32 type: integer type: object resizePolicy: description: |- Resources resize policy for the container. This field cannot be set on ephemeral containers. items: description: ContainerResizePolicy represents resource resize policy for the container. properties: resourceName: description: |- Name of the resource to which this resource resize policy applies. Supported values: cpu, memory. type: string restartPolicy: description: |- Restart policy to apply when specified resource is resized. If not specified, it defaults to NotRequired. type: string required: - resourceName - restartPolicy type: object type: array x-kubernetes-list-type: atomic resources: description: |- Compute Resources required by this container. Cannot be updated. 
More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ properties: claims: description: |- Claims lists the names of resources, defined in spec.resourceClaims, that are used by this container. This field depends on the DynamicResourceAllocation feature gate. This field is immutable. It can only be set for containers. items: description: ResourceClaim references one entry in PodSpec.ResourceClaims. properties: name: description: |- Name must match the name of one entry in pod.spec.resourceClaims of the Pod where this field is used. It makes that resource available inside a container. type: string request: description: |- Request is the name chosen for a request in the referenced claim. If empty, everything from the claim is made available, otherwise only the result of this request. type: string required: - name type: object type: array x-kubernetes-list-map-keys: - name x-kubernetes-list-type: map limits: additionalProperties: anyOf: - type: integer - type: string pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true description: |- Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ type: object requests: additionalProperties: anyOf: - type: integer - type: string pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true description: |- Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. Requests cannot exceed Limits. 
More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ type: object type: object restartPolicy: description: |- RestartPolicy defines the restart behavior of individual containers in a pod. This overrides the pod-level restart policy. When this field is not specified, the restart behavior is defined by the Pod's restart policy and the container type. Additionally, setting the RestartPolicy as "Always" for the init container will have the following effect: this init container will be continually restarted on exit until all regular containers have terminated. Once all regular containers have completed, all init containers with restartPolicy "Always" will be shut down. This lifecycle differs from normal init containers and is often referred to as a "sidecar" container. Although this init container still starts in the init container sequence, it does not wait for the container to complete before proceeding to the next init container. Instead, the next init container starts immediately after this init container is started, or after any startupProbe has successfully completed. type: string restartPolicyRules: description: |- Represents a list of rules to be checked to determine if the container should be restarted on exit. The rules are evaluated in order. Once a rule matches a container exit condition, the remaining rules are ignored. If no rule matches the container exit condition, the Container-level restart policy determines the whether the container is restarted or not. Constraints on the rules: - At most 20 rules are allowed. - Rules can have the same action. - Identical rules are not forbidden in validations. When rules are specified, container MUST set RestartPolicy explicitly even it if matches the Pod's RestartPolicy. items: description: ContainerRestartRule describes how a container exit is handled. properties: action: description: |- Specifies the action taken on a container exit if the requirements are satisfied. 
The only possible value is "Restart" to restart the container. type: string exitCodes: description: Represents the exit codes to check on container exits. properties: operator: description: |- Represents the relationship between the container exit code(s) and the specified values. Possible values are: - In: the requirement is satisfied if the container exit code is in the set of specified values. - NotIn: the requirement is satisfied if the container exit code is not in the set of specified values. type: string values: description: |- Specifies the set of values to check for container exit codes. At most 255 elements are allowed. items: format: int32 type: integer type: array x-kubernetes-list-type: set required: - operator type: object required: - action type: object type: array x-kubernetes-list-type: atomic securityContext: description: |- SecurityContext defines the security options the container should be run with. If set, the fields of SecurityContext override the equivalent fields of PodSecurityContext. More info: https://kubernetes.io/docs/tasks/configure-pod-container/security-context/ properties: allowPrivilegeEscalation: description: |- AllowPrivilegeEscalation controls whether a process can gain more privileges than its parent process. This bool directly controls if the no_new_privs flag will be set on the container process. AllowPrivilegeEscalation is true always when the container is: 1) run as Privileged 2) has CAP_SYS_ADMIN Note that this field cannot be set when spec.os.name is windows. type: boolean appArmorProfile: description: |- appArmorProfile is the AppArmor options to use by this container. If set, this profile overrides the pod's appArmorProfile. Note that this field cannot be set when spec.os.name is windows. properties: localhostProfile: description: |- localhostProfile indicates a profile loaded on the node that should be used. The profile must be preconfigured on the node to work. Must match the loaded name of the profile. 
Must be set if and only if type is "Localhost". type: string type: description: |- type indicates which kind of AppArmor profile will be applied. Valid options are: Localhost - a profile pre-loaded on the node. RuntimeDefault - the container runtime's default profile. Unconfined - no AppArmor enforcement. type: string required: - type type: object capabilities: description: |- The capabilities to add/drop when running containers. Defaults to the default set of capabilities granted by the container runtime. Note that this field cannot be set when spec.os.name is windows. properties: add: description: Added capabilities items: description: Capability represent POSIX capabilities type type: string type: array x-kubernetes-list-type: atomic drop: description: Removed capabilities items: description: Capability represent POSIX capabilities type type: string type: array x-kubernetes-list-type: atomic type: object privileged: description: |- Run container in privileged mode. Processes in privileged containers are essentially equivalent to root on the host. Defaults to false. Note that this field cannot be set when spec.os.name is windows. type: boolean procMount: description: |- procMount denotes the type of proc mount to use for the containers. The default value is Default which uses the container runtime defaults for readonly paths and masked paths. This requires the ProcMountType feature flag to be enabled. Note that this field cannot be set when spec.os.name is windows. type: string readOnlyRootFilesystem: description: |- Whether this container has a read-only root filesystem. Default is false. Note that this field cannot be set when spec.os.name is windows. type: boolean runAsGroup: description: |- The GID to run the entrypoint of the container process. Uses runtime default if unset. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. 
Note that this field cannot be set when spec.os.name is windows. format: int64 type: integer runAsNonRoot: description: |- Indicates that the container must run as a non-root user. If true, the Kubelet will validate the image at runtime to ensure that it does not run as UID 0 (root) and fail to start the container if it does. If unset or false, no such validation will be performed. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. type: boolean runAsUser: description: |- The UID to run the entrypoint of the container process. Defaults to user specified in image metadata if unspecified. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. Note that this field cannot be set when spec.os.name is windows. format: int64 type: integer seLinuxOptions: description: |- The SELinux context to be applied to the container. If unspecified, the container runtime will allocate a random SELinux context for each container. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. Note that this field cannot be set when spec.os.name is windows. properties: level: description: Level is SELinux level label that applies to the container. type: string role: description: Role is a SELinux role label that applies to the container. type: string type: description: Type is a SELinux type label that applies to the container. type: string user: description: User is a SELinux user label that applies to the container. type: string type: object seccompProfile: description: |- The seccomp options to use by this container. If seccomp options are provided at both the pod & container level, the container options override the pod options. Note that this field cannot be set when spec.os.name is windows. 
properties: localhostProfile: description: |- localhostProfile indicates a profile defined in a file on the node should be used. The profile must be preconfigured on the node to work. Must be a descending path, relative to the kubelet's configured seccomp profile location. Must be set if type is "Localhost". Must NOT be set for any other type. type: string type: description: |- type indicates which kind of seccomp profile will be applied. Valid options are: Localhost - a profile defined in a file on the node should be used. RuntimeDefault - the container runtime default profile should be used. Unconfined - no profile should be applied. type: string required: - type type: object windowsOptions: description: |- The Windows specific settings applied to all containers. If unspecified, the options from the PodSecurityContext will be used. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. Note that this field cannot be set when spec.os.name is linux. properties: gmsaCredentialSpec: description: |- GMSACredentialSpec is where the GMSA admission webhook (https://github.com/kubernetes-sigs/windows-gmsa) inlines the contents of the GMSA credential spec named by the GMSACredentialSpecName field. type: string gmsaCredentialSpecName: description: GMSACredentialSpecName is the name of the GMSA credential spec to use. type: string hostProcess: description: |- HostProcess determines if a container should be run as a 'Host Process' container. All of a Pod's containers must have the same effective HostProcess value (it is not allowed to have a mix of HostProcess containers and non-HostProcess containers). In addition, if HostProcess is true then HostNetwork must also be set to true. type: boolean runAsUserName: description: |- The UserName in Windows to run the entrypoint of the container process. Defaults to the user specified in image metadata if unspecified. May also be set in PodSecurityContext. 
If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. type: string type: object type: object startupProbe: description: |- StartupProbe indicates that the Pod has successfully initialized. If specified, no other probes are executed until this completes successfully. If this probe fails, the Pod will be restarted, just as if the livenessProbe failed. This can be used to provide different probe parameters at the beginning of a Pod's lifecycle, when it might take a long time to load data or warm a cache, than during steady-state operation. This cannot be updated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes properties: exec: description: Exec specifies a command to execute in the container. properties: command: description: |- Command is the command line to execute inside the container, the working directory for the command is root ('/') in the container's filesystem. The command is simply exec'd, it is not run inside a shell, so traditional shell instructions ('|', etc) won't work. To use a shell, you need to explicitly call out to that shell. Exit status of 0 is treated as live/healthy and non-zero is unhealthy. items: type: string type: array x-kubernetes-list-type: atomic type: object failureThreshold: description: |- Minimum consecutive failures for the probe to be considered failed after having succeeded. Defaults to 3. Minimum value is 1. format: int32 type: integer grpc: description: GRPC specifies a GRPC HealthCheckRequest. properties: port: description: Port number of the gRPC service. Number must be in the range 1 to 65535. format: int32 type: integer service: default: "" description: |- Service is the name of the service to place in the gRPC HealthCheckRequest (see https://github.com/grpc/grpc/blob/master/doc/health-checking.md). If this is not specified, the default behavior is defined by gRPC. 
type: string required: - port type: object httpGet: description: HTTPGet specifies an HTTP GET request to perform. properties: host: description: |- Host name to connect to, defaults to the pod IP. You probably want to set "Host" in httpHeaders instead. type: string httpHeaders: description: Custom headers to set in the request. HTTP allows repeated headers. items: description: HTTPHeader describes a custom header to be used in HTTP probes properties: name: description: |- The header field name. This will be canonicalized upon output, so case-variant names will be understood as the same header. type: string value: description: The header field value type: string required: - name - value type: object type: array x-kubernetes-list-type: atomic path: description: Path to access on the HTTP server. type: string port: anyOf: - type: integer - type: string description: |- Name or number of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true scheme: description: |- Scheme to use for connecting to the host. Defaults to HTTP. type: string required: - port type: object initialDelaySeconds: description: |- Number of seconds after the container has started before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes format: int32 type: integer periodSeconds: description: |- How often (in seconds) to perform the probe. Default to 10 seconds. Minimum value is 1. format: int32 type: integer successThreshold: description: |- Minimum consecutive successes for the probe to be considered successful after having failed. Defaults to 1. Must be 1 for liveness and startup. Minimum value is 1. format: int32 type: integer tcpSocket: description: TCPSocket specifies a connection to a TCP port. properties: host: description: 'Optional: Host name to connect to, defaults to the pod IP.' 
type: string port: anyOf: - type: integer - type: string description: |- Number or name of the port to access on the container. Number must be in the range 1 to 65535. Name must be an IANA_SVC_NAME. x-kubernetes-int-or-string: true required: - port type: object terminationGracePeriodSeconds: description: |- Optional duration in seconds the pod needs to terminate gracefully upon probe failure. The grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal. Set this value longer than the expected cleanup time for your process. If this value is nil, the pod's terminationGracePeriodSeconds will be used. Otherwise, this value overrides the value provided by the pod spec. Value must be non-negative integer. The value zero indicates stop immediately via the kill signal (no opportunity to shut down). This is a beta field and requires enabling ProbeTerminationGracePeriod feature gate. Minimum value is 1. spec.terminationGracePeriodSeconds is used if unset. format: int64 type: integer timeoutSeconds: description: |- Number of seconds after which the probe times out. Defaults to 1 second. Minimum value is 1. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes format: int32 type: integer type: object stdin: description: |- Whether this container should allocate a buffer for stdin in the container runtime. If this is not set, reads from stdin in the container will always result in EOF. Default is false. type: boolean stdinOnce: description: |- Whether the container runtime should close the stdin channel after it has been opened by a single attach. When stdin is true the stdin stream will remain open across multiple attach sessions. 
If stdinOnce is set to true, stdin is opened on container start, is empty until the first client attaches to stdin, and then remains open and accepts data until the client disconnects, at which time stdin is closed and remains closed until the container is restarted. If this flag is false, a container process that reads from stdin will never receive an EOF. Default is false type: boolean terminationMessagePath: description: |- Optional: Path at which the file to which the container's termination message will be written is mounted into the container's filesystem. Message written is intended to be brief final status, such as an assertion failure message. Will be truncated by the node if greater than 4096 bytes. The total message length across all containers will be limited to 12kb. Defaults to /dev/termination-log. Cannot be updated. type: string terminationMessagePolicy: description: |- Indicate how the termination message should be populated. File will use the contents of terminationMessagePath to populate the container status message on both success and failure. FallbackToLogsOnError will use the last chunk of container log output if the termination message file is empty and the container exited with an error. The log output is limited to 2048 bytes or 80 lines, whichever is smaller. Defaults to File. Cannot be updated. type: string tty: description: |- Whether this container should allocate a TTY for itself, also requires 'stdin' to be true. Default is false. type: boolean volumeDevices: description: volumeDevices is the list of block devices to be used by the container. items: description: volumeDevice describes a mapping of a raw block device within a container. properties: devicePath: description: devicePath is the path inside of the container that the device will be mapped to.
type: string name: description: name must match the name of a persistentVolumeClaim in the pod type: string required: - devicePath - name type: object type: array x-kubernetes-list-map-keys: - devicePath x-kubernetes-list-type: map volumeMounts: description: |- Pod volumes to mount into the container's filesystem. Cannot be updated. items: description: VolumeMount describes a mounting of a Volume within a container. properties: mountPath: description: |- Path within the container at which the volume should be mounted. Must not contain ':'. type: string mountPropagation: description: |- mountPropagation determines how mounts are propagated from the host to container and the other way around. When not set, MountPropagationNone is used. This field is beta in 1.10. When RecursiveReadOnly is set to IfPossible or to Enabled, MountPropagation must be None or unspecified (which defaults to None). type: string name: description: This must match the Name of a Volume. type: string readOnly: description: |- Mounted read-only if true, read-write otherwise (false or unspecified). Defaults to false. type: boolean recursiveReadOnly: description: |- RecursiveReadOnly specifies whether read-only mounts should be handled recursively. If ReadOnly is false, this field has no meaning and must be unspecified. If ReadOnly is true, and this field is set to Disabled, the mount is not made recursively read-only. If this field is set to IfPossible, the mount is made recursively read-only, if it is supported by the container runtime. If this field is set to Enabled, the mount is made recursively read-only if it is supported by the container runtime, otherwise the pod will not be started and an error will be generated to indicate the reason. If this field is set to IfPossible or Enabled, MountPropagation must be set to None (or be unspecified, which defaults to None). If this field is not specified, it is treated as an equivalent of Disabled. 
type: string subPath: description: |- Path within the volume from which the container's volume should be mounted. Defaults to "" (volume's root). type: string subPathExpr: description: |- Expanded path within the volume from which the container's volume should be mounted. Behaves similarly to SubPath but environment variable references $(VAR_NAME) are expanded using the container's environment. Defaults to "" (volume's root). SubPathExpr and SubPath are mutually exclusive. type: string required: - mountPath - name type: object type: array x-kubernetes-list-map-keys: - mountPath x-kubernetes-list-type: map workingDir: description: |- Container's working directory. If not specified, the container runtime's default will be used, which might be configured in the container image. Cannot be updated. type: string required: - name type: object type: array x-kubernetes-list-map-keys: - name x-kubernetes-list-type: map nodeName: description: |- NodeName indicates in which node this pod is scheduled. If empty, this pod is a candidate for scheduling by the scheduler defined in schedulerName. Once this field is set, the kubelet for this node becomes responsible for the lifecycle of this pod. This field should not be used to express a desire for the pod to be scheduled on a specific node. https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#nodename type: string nodeSelector: additionalProperties: type: string description: |- NodeSelector is a selector which must be true for the pod to fit on a node. Selector which must match a node's labels for the pod to be scheduled on that node. More info: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/ type: object x-kubernetes-map-type: atomic os: description: |- Specifies the OS of the containers in the pod. Some pod and container fields are restricted if this is set. 
If the OS field is set to linux, the following fields must be unset: -securityContext.windowsOptions If the OS field is set to windows, following fields must be unset: - spec.hostPID - spec.hostIPC - spec.hostUsers - spec.resources - spec.securityContext.appArmorProfile - spec.securityContext.seLinuxOptions - spec.securityContext.seccompProfile - spec.securityContext.fsGroup - spec.securityContext.fsGroupChangePolicy - spec.securityContext.sysctls - spec.shareProcessNamespace - spec.securityContext.runAsUser - spec.securityContext.runAsGroup - spec.securityContext.supplementalGroups - spec.securityContext.supplementalGroupsPolicy - spec.containers[*].securityContext.appArmorProfile - spec.containers[*].securityContext.seLinuxOptions - spec.containers[*].securityContext.seccompProfile - spec.containers[*].securityContext.capabilities - spec.containers[*].securityContext.readOnlyRootFilesystem - spec.containers[*].securityContext.privileged - spec.containers[*].securityContext.allowPrivilegeEscalation - spec.containers[*].securityContext.procMount - spec.containers[*].securityContext.runAsUser - spec.containers[*].securityContext.runAsGroup properties: name: description: |- Name is the name of the operating system. The currently supported values are linux and windows. Additional value may be defined in future and can be one of: https://github.com/opencontainers/runtime-spec/blob/master/config.md#platform-specific-configuration Clients should expect to handle additional values and treat unrecognized values in this field as os: null type: string required: - name type: object overhead: additionalProperties: anyOf: - type: integer - type: string pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true description: |- Overhead represents the resource overhead associated with running a pod for a given RuntimeClass. 
This field will be autopopulated at admission time by the RuntimeClass admission controller. If the RuntimeClass admission controller is enabled, overhead must not be set in Pod create requests. The RuntimeClass admission controller will reject Pod create requests which have the overhead already set. If RuntimeClass is configured and selected in the PodSpec, Overhead will be set to the value defined in the corresponding RuntimeClass, otherwise it will remain unset and treated as zero. More info: https://git.k8s.io/enhancements/keps/sig-node/688-pod-overhead/README.md type: object preemptionPolicy: description: |- PreemptionPolicy is the Policy for preempting pods with lower priority. One of Never, PreemptLowerPriority. Defaults to PreemptLowerPriority if unset. type: string priority: description: |- The priority value. Various system components use this field to find the priority of the pod. When Priority Admission Controller is enabled, it prevents users from setting this field. The admission controller populates this field from PriorityClassName. The higher the value, the higher the priority. format: int32 type: integer priorityClassName: description: |- If specified, indicates the pod's priority. "system-node-critical" and "system-cluster-critical" are two special keywords which indicate the highest priorities with the former being the highest priority. Any other name must be defined by creating a PriorityClass object with that name. If not specified, the pod priority will be default or zero if there is no default. type: string readinessGates: description: |- If specified, all readiness gates will be evaluated for pod readiness. 
A pod is ready when all its containers are ready AND all conditions specified in the readiness gates have status equal to "True" More info: https://git.k8s.io/enhancements/keps/sig-network/580-pod-readiness-gates items: description: PodReadinessGate contains the reference to a pod condition properties: conditionType: description: ConditionType refers to a condition in the pod's condition list with matching type. type: string required: - conditionType type: object type: array x-kubernetes-list-type: atomic resourceClaims: description: |- ResourceClaims defines which ResourceClaims must be allocated and reserved before the Pod is allowed to start. The resources will be made available to those containers which consume them by name. This is a stable field but requires that the DynamicResourceAllocation feature gate is enabled. This field is immutable. items: description: |- PodResourceClaim references exactly one ResourceClaim, either directly or by naming a ResourceClaimTemplate which is then turned into a ResourceClaim for the pod. It adds a name to it that uniquely identifies the ResourceClaim inside the Pod. Containers that need access to the ResourceClaim reference it with this name. properties: name: description: |- Name uniquely identifies this resource claim inside the pod. This must be a DNS_LABEL. type: string resourceClaimName: description: |- ResourceClaimName is the name of a ResourceClaim object in the same namespace as this pod. Exactly one of ResourceClaimName and ResourceClaimTemplateName must be set. type: string resourceClaimTemplateName: description: |- ResourceClaimTemplateName is the name of a ResourceClaimTemplate object in the same namespace as this pod. The template will be used to create a new ResourceClaim, which will be bound to this pod. When this pod is deleted, the ResourceClaim will also be deleted. 
The pod name and resource name, along with a generated component, will be used to form a unique name for the ResourceClaim, which will be recorded in pod.status.resourceClaimStatuses. This field is immutable and no changes will be made to the corresponding ResourceClaim by the control plane after creating the ResourceClaim. Exactly one of ResourceClaimName and ResourceClaimTemplateName must be set. type: string required: - name type: object type: array x-kubernetes-list-map-keys: - name x-kubernetes-list-type: map resources: description: |- Resources is the total amount of CPU and Memory resources required by all containers in the pod. It supports specifying Requests and Limits for "cpu", "memory" and "hugepages-" resource names only. ResourceClaims are not supported. This field enables fine-grained control over resource allocation for the entire pod, allowing resource sharing among containers in a pod. This is an alpha field and requires enabling the PodLevelResources feature gate. properties: claims: description: |- Claims lists the names of resources, defined in spec.resourceClaims, that are used by this container. This field depends on the DynamicResourceAllocation feature gate. This field is immutable. It can only be set for containers. items: description: ResourceClaim references one entry in PodSpec.ResourceClaims. properties: name: description: |- Name must match the name of one entry in pod.spec.resourceClaims of the Pod where this field is used. It makes that resource available inside a container. type: string request: description: |- Request is the name chosen for a request in the referenced claim. If empty, everything from the claim is made available, otherwise only the result of this request. 
type: string required: - name type: object type: array x-kubernetes-list-map-keys: - name x-kubernetes-list-type: map limits: additionalProperties: anyOf: - type: integer - type: string pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true description: |- Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ type: object requests: additionalProperties: anyOf: - type: integer - type: string pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true description: |- Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. Requests cannot exceed Limits. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ type: object type: object restartPolicy: description: |- Restart policy for all containers within the pod. One of Always, OnFailure, Never. In some contexts, only a subset of those values may be permitted. Default to Always. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#restart-policy type: string runtimeClassName: description: |- RuntimeClassName refers to a RuntimeClass object in the node.k8s.io group, which should be used to run this pod. If no RuntimeClass resource matches the named class, the pod will not be run. If unset or empty, the "legacy" RuntimeClass will be used, which is an implicit class with an empty definition that uses the default runtime handler. More info: https://git.k8s.io/enhancements/keps/sig-node/585-runtime-class type: string schedulerName: description: |- If specified, the pod will be dispatched by specified scheduler. 
If not specified, the pod will be dispatched by default scheduler. type: string schedulingGates: description: |- SchedulingGates is an opaque list of values that if specified will block scheduling the pod. If schedulingGates is not empty, the pod will stay in the SchedulingGated state and the scheduler will not attempt to schedule the pod. SchedulingGates can only be set at pod creation time, and be removed only afterwards. items: description: PodSchedulingGate is associated to a Pod to guard its scheduling. properties: name: description: |- Name of the scheduling gate. Each scheduling gate must have a unique name field. type: string required: - name type: object type: array x-kubernetes-list-map-keys: - name x-kubernetes-list-type: map securityContext: description: |- SecurityContext holds pod-level security attributes and common container settings. Optional: Defaults to empty. See type description for default values of each field. properties: appArmorProfile: description: |- appArmorProfile is the AppArmor options to use by the containers in this pod. Note that this field cannot be set when spec.os.name is windows. properties: localhostProfile: description: |- localhostProfile indicates a profile loaded on the node that should be used. The profile must be preconfigured on the node to work. Must match the loaded name of the profile. Must be set if and only if type is "Localhost". type: string type: description: |- type indicates which kind of AppArmor profile will be applied. Valid options are: Localhost - a profile pre-loaded on the node. RuntimeDefault - the container runtime's default profile. Unconfined - no AppArmor enforcement. type: string required: - type type: object fsGroup: description: |- A special supplemental group that applies to all containers in a pod. Some volume types allow the Kubelet to change the ownership of that volume to be owned by the pod: 1. The owning GID will be the FSGroup 2. 
The setgid bit is set (new files created in the volume will be owned by FSGroup) 3. The permission bits are OR'd with rw-rw---- If unset, the Kubelet will not modify the ownership and permissions of any volume. Note that this field cannot be set when spec.os.name is windows. format: int64 type: integer fsGroupChangePolicy: description: |- fsGroupChangePolicy defines behavior of changing ownership and permission of the volume before being exposed inside Pod. This field will only apply to volume types which support fsGroup based ownership(and permissions). It will have no effect on ephemeral volume types such as: secret, configmaps and emptydir. Valid values are "OnRootMismatch" and "Always". If not specified, "Always" is used. Note that this field cannot be set when spec.os.name is windows. type: string runAsGroup: description: |- The GID to run the entrypoint of the container process. Uses runtime default if unset. May also be set in SecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence for that container. Note that this field cannot be set when spec.os.name is windows. format: int64 type: integer runAsNonRoot: description: |- Indicates that the container must run as a non-root user. If true, the Kubelet will validate the image at runtime to ensure that it does not run as UID 0 (root) and fail to start the container if it does. If unset or false, no such validation will be performed. May also be set in SecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. type: boolean runAsUser: description: |- The UID to run the entrypoint of the container process. Defaults to user specified in image metadata if unspecified. May also be set in SecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence for that container. 
Note that this field cannot be set when spec.os.name is windows. format: int64 type: integer seLinuxChangePolicy: description: |- seLinuxChangePolicy defines how the container's SELinux label is applied to all volumes used by the Pod. It has no effect on nodes that do not support SELinux or on volumes that do not support SELinux. Valid values are "MountOption" and "Recursive". "Recursive" means relabeling of all files on all Pod volumes by the container runtime. This may be slow for large volumes, but allows mixing privileged and unprivileged Pods sharing the same volume on the same node. "MountOption" mounts all eligible Pod volumes with `-o context` mount option. This requires all Pods that share the same volume to use the same SELinux label. It is not possible to share the same volume among privileged and unprivileged Pods. Eligible volumes are in-tree FibreChannel and iSCSI volumes, and all CSI volumes whose CSI driver announces SELinux support by setting spec.seLinuxMount: true in their CSIDriver instance. Other volumes are always re-labelled recursively. "MountOption" value is allowed only when SELinuxMount feature gate is enabled. If not specified and SELinuxMount feature gate is enabled, "MountOption" is used. If not specified and SELinuxMount feature gate is disabled, "MountOption" is used for ReadWriteOncePod volumes and "Recursive" for all other volumes. This field affects only Pods that have SELinux label set, either in PodSecurityContext or in SecurityContext of all containers. All Pods that use the same volume should use the same seLinuxChangePolicy, otherwise some pods can get stuck in ContainerCreating state. Note that this field cannot be set when spec.os.name is windows. type: string seLinuxOptions: description: |- The SELinux context to be applied to all containers. If unspecified, the container runtime will allocate a random SELinux context for each container. May also be set in SecurityContext.
If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence for that container. Note that this field cannot be set when spec.os.name is windows. properties: level: description: Level is SELinux level label that applies to the container. type: string role: description: Role is a SELinux role label that applies to the container. type: string type: description: Type is a SELinux type label that applies to the container. type: string user: description: User is a SELinux user label that applies to the container. type: string type: object seccompProfile: description: |- The seccomp options to use by the containers in this pod. Note that this field cannot be set when spec.os.name is windows. properties: localhostProfile: description: |- localhostProfile indicates a profile defined in a file on the node should be used. The profile must be preconfigured on the node to work. Must be a descending path, relative to the kubelet's configured seccomp profile location. Must be set if type is "Localhost". Must NOT be set for any other type. type: string type: description: |- type indicates which kind of seccomp profile will be applied. Valid options are: Localhost - a profile defined in a file on the node should be used. RuntimeDefault - the container runtime default profile should be used. Unconfined - no profile should be applied. type: string required: - type type: object supplementalGroups: description: |- A list of groups applied to the first process run in each container, in addition to the container's primary GID and fsGroup (if specified). If the SupplementalGroupsPolicy feature is enabled, the supplementalGroupsPolicy field determines whether these are in addition to or instead of any group memberships defined in the container image. If unspecified, no additional groups are added, though group memberships defined in the container image may still be used, depending on the supplementalGroupsPolicy field. 
Note that this field cannot be set when spec.os.name is windows. items: format: int64 type: integer type: array x-kubernetes-list-type: atomic supplementalGroupsPolicy: description: |- Defines how supplemental groups of the first container processes are calculated. Valid values are "Merge" and "Strict". If not specified, "Merge" is used. (Alpha) Using the field requires the SupplementalGroupsPolicy feature gate to be enabled and the container runtime must implement support for this feature. Note that this field cannot be set when spec.os.name is windows. type: string sysctls: description: |- Sysctls hold a list of namespaced sysctls used for the pod. Pods with unsupported sysctls (by the container runtime) might fail to launch. Note that this field cannot be set when spec.os.name is windows. items: description: Sysctl defines a kernel parameter to be set properties: name: description: Name of a property to set type: string value: description: Value of a property to set type: string required: - name - value type: object type: array x-kubernetes-list-type: atomic windowsOptions: description: |- The Windows specific settings applied to all containers. If unspecified, the options within a container's SecurityContext will be used. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. Note that this field cannot be set when spec.os.name is linux. properties: gmsaCredentialSpec: description: |- GMSACredentialSpec is where the GMSA admission webhook (https://github.com/kubernetes-sigs/windows-gmsa) inlines the contents of the GMSA credential spec named by the GMSACredentialSpecName field. type: string gmsaCredentialSpecName: description: GMSACredentialSpecName is the name of the GMSA credential spec to use. type: string hostProcess: description: |- HostProcess determines if a container should be run as a 'Host Process' container. 
All of a Pod's containers must have the same effective HostProcess value (it is not allowed to have a mix of HostProcess containers and non-HostProcess containers). In addition, if HostProcess is true then HostNetwork must also be set to true. type: boolean runAsUserName: description: |- The UserName in Windows to run the entrypoint of the container process. Defaults to the user specified in image metadata if unspecified. May also be set in PodSecurityContext. If set in both SecurityContext and PodSecurityContext, the value specified in SecurityContext takes precedence. type: string type: object type: object serviceAccount: description: |- DeprecatedServiceAccount is a deprecated alias for ServiceAccountName. Deprecated: Use serviceAccountName instead. type: string serviceAccountName: description: |- ServiceAccountName is the name of the ServiceAccount to use to run this pod. More info: https://kubernetes.io/docs/tasks/configure-pod-container/configure-service-account/ type: string setHostnameAsFQDN: description: |- If true the pod's hostname will be configured as the pod's FQDN, rather than the leaf name (the default). In Linux containers, this means setting the FQDN in the hostname field of the kernel (the nodename field of struct utsname). In Windows containers, this means setting the registry value of hostname for the registry key HKEY_LOCAL_MACHINE\\SYSTEM\\CurrentControlSet\\Services\\Tcpip\\Parameters to FQDN. If a pod does not have FQDN, this has no effect. Default to false. type: boolean shareProcessNamespace: description: |- Share a single process namespace between all of the containers in a pod. When this is set containers will be able to view and signal processes from other containers in the same pod, and the first process in each container will not be assigned PID 1. HostPID and ShareProcessNamespace cannot both be set. Optional: Default to false. type: boolean subdomain: description: |- If specified, the fully qualified Pod hostname will be "<hostname>.<subdomain>.<pod namespace>.svc.<cluster domain>".
If not specified, the pod will not have a domainname at all. type: string terminationGracePeriodSeconds: description: |- Optional duration in seconds the pod needs to terminate gracefully. May be decreased in delete request. Value must be non-negative integer. The value zero indicates stop immediately via the kill signal (no opportunity to shut down). If this value is nil, the default grace period will be used instead. The grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal. Set this value longer than the expected cleanup time for your process. Defaults to 30 seconds. format: int64 type: integer tolerations: description: If specified, the pod's tolerations. items: description: |- The pod this Toleration is attached to tolerates any taint that matches the triple <key,value,effect> using the matching operator <operator>. properties: effect: description: |- Effect indicates the taint effect to match. Empty means match all taint effects. When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. type: string key: description: |- Key is the taint key that the toleration applies to. Empty means match all taint keys. If the key is empty, operator must be Exists; this combination means to match all values and all keys. type: string operator: description: |- Operator represents a key's relationship to the value. Valid operators are Exists, Equal, Lt, and Gt. Defaults to Equal. Exists is equivalent to wildcard for value, so that a pod can tolerate all taints of a particular category. Lt and Gt perform numeric comparisons (requires feature gate TaintTolerationComparisonOperators). type: string tolerationSeconds: description: |- TolerationSeconds represents the period of time the toleration (which must be of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, it is not set, which means tolerate the taint forever (do not evict).
Zero and negative values will be treated as 0 (evict immediately) by the system. format: int64 type: integer value: description: |- Value is the taint value the toleration matches to. If the operator is Exists, the value should be empty, otherwise just a regular string. type: string type: object type: array x-kubernetes-list-type: atomic topologySpreadConstraints: description: |- TopologySpreadConstraints describes how a group of pods ought to spread across topology domains. Scheduler will schedule pods in a way which abides by the constraints. All topologySpreadConstraints are ANDed. items: description: TopologySpreadConstraint specifies how to spread matching pods among the given topology. properties: labelSelector: description: |- LabelSelector is used to find matching pods. Pods that match this label selector are counted to determine the number of pods in their corresponding topology domain. properties: matchExpressions: description: matchExpressions is a list of label selector requirements. The requirements are ANDed. items: description: |- A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. properties: key: description: key is the label key that the selector applies to. type: string operator: description: |- operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. type: string values: description: |- values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. items: type: string type: array x-kubernetes-list-type: atomic required: - key - operator type: object type: array x-kubernetes-list-type: atomic matchLabels: additionalProperties: type: string description: |- matchLabels is a map of {key,value} pairs. 
A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. type: object type: object x-kubernetes-map-type: atomic matchLabelKeys: description: |- MatchLabelKeys is a set of pod label keys to select the pods over which spreading will be calculated. The keys are used to lookup values from the incoming pod labels, those key-value labels are ANDed with labelSelector to select the group of existing pods over which spreading will be calculated for the incoming pod. The same key is forbidden to exist in both MatchLabelKeys and LabelSelector. MatchLabelKeys cannot be set when LabelSelector isn't set. Keys that don't exist in the incoming pod labels will be ignored. A null or empty list means only match against labelSelector. This is a beta field and requires the MatchLabelKeysInPodTopologySpread feature gate to be enabled (enabled by default). items: type: string type: array x-kubernetes-list-type: atomic maxSkew: description: |- MaxSkew describes the degree to which pods may be unevenly distributed. When `whenUnsatisfiable=DoNotSchedule`, it is the maximum permitted difference between the number of matching pods in the target topology and the global minimum. The global minimum is the minimum number of matching pods in an eligible domain or zero if the number of eligible domains is less than MinDomains. For example, in a 3-zone cluster, MaxSkew is set to 1, and pods with the same labelSelector spread as 2/2/1: In this case, the global minimum is 1. | zone1 | zone2 | zone3 | | P P | P P | P | - if MaxSkew is 1, incoming pod can only be scheduled to zone3 to become 2/2/2; scheduling it onto zone1(zone2) would make the ActualSkew(3-1) on zone1(zone2) violate MaxSkew(1). - if MaxSkew is 2, incoming pod can be scheduled onto any zone. 
When `whenUnsatisfiable=ScheduleAnyway`, it is used to give higher precedence to topologies that satisfy it. It's a required field. Default value is 1 and 0 is not allowed. format: int32 type: integer minDomains: description: |- MinDomains indicates a minimum number of eligible domains. When the number of eligible domains with matching topology keys is less than minDomains, Pod Topology Spread treats "global minimum" as 0, and then the calculation of Skew is performed. And when the number of eligible domains with matching topology keys equals or greater than minDomains, this value has no effect on scheduling. As a result, when the number of eligible domains is less than minDomains, scheduler won't schedule more than maxSkew Pods to those domains. If value is nil, the constraint behaves as if MinDomains is equal to 1. Valid values are integers greater than 0. When value is not nil, WhenUnsatisfiable must be DoNotSchedule. For example, in a 3-zone cluster, MaxSkew is set to 2, MinDomains is set to 5 and pods with the same labelSelector spread as 2/2/2: | zone1 | zone2 | zone3 | | P P | P P | P P | The number of domains is less than 5(MinDomains), so "global minimum" is treated as 0. In this situation, new pod with the same labelSelector cannot be scheduled, because computed skew will be 3(3 - 0) if new Pod is scheduled to any of the three zones, it will violate MaxSkew. format: int32 type: integer nodeAffinityPolicy: description: |- NodeAffinityPolicy indicates how we will treat Pod's nodeAffinity/nodeSelector when calculating pod topology spread skew. Options are: - Honor: only nodes matching nodeAffinity/nodeSelector are included in the calculations. - Ignore: nodeAffinity/nodeSelector are ignored. All nodes are included in the calculations. If this value is nil, the behavior is equivalent to the Honor policy. type: string nodeTaintsPolicy: description: |- NodeTaintsPolicy indicates how we will treat node taints when calculating pod topology spread skew. 
Options are: - Honor: nodes without taints, along with tainted nodes for which the incoming pod has a toleration, are included. - Ignore: node taints are ignored. All nodes are included. If this value is nil, the behavior is equivalent to the Ignore policy. type: string topologyKey: description: |- TopologyKey is the key of node labels. Nodes that have a label with this key and identical values are considered to be in the same topology. We consider each <key, value> as a "bucket", and try to put balanced number of pods into each bucket. We define a domain as a particular instance of a topology. Also, we define an eligible domain as a domain whose nodes meet the requirements of nodeAffinityPolicy and nodeTaintsPolicy. e.g. If TopologyKey is "kubernetes.io/hostname", each Node is a domain of that topology. And, if TopologyKey is "topology.kubernetes.io/zone", each zone is a domain of that topology. It's a required field. type: string whenUnsatisfiable: description: |- WhenUnsatisfiable indicates how to deal with a pod if it doesn't satisfy the spread constraint. - DoNotSchedule (default) tells the scheduler not to schedule it. - ScheduleAnyway tells the scheduler to schedule the pod in any location, but giving higher precedence to topologies that would help reduce the skew. A constraint is considered "Unsatisfiable" for an incoming pod if and only if every possible node assignment for that pod would violate "MaxSkew" on some topology. For example, in a 3-zone cluster, MaxSkew is set to 1, and pods with the same labelSelector spread as 3/1/1: | zone1 | zone2 | zone3 | | P P P | P | P | If WhenUnsatisfiable is set to DoNotSchedule, incoming pod can only be scheduled to zone2(zone3) to become 3/2/1(3/1/2) as ActualSkew(2-1) on zone2(zone3) satisfies MaxSkew(1). In other words, the cluster can still be imbalanced, but scheduler won't make it *more* imbalanced. It's a required field.
type: string required: - maxSkew - topologyKey - whenUnsatisfiable type: object type: array x-kubernetes-list-map-keys: - topologyKey - whenUnsatisfiable x-kubernetes-list-type: map volumes: description: |- List of volumes that can be mounted by containers belonging to the pod. More info: https://kubernetes.io/docs/concepts/storage/volumes items: description: Volume represents a named volume in a pod that may be accessed by any container in the pod. properties: awsElasticBlockStore: description: |- awsElasticBlockStore represents an AWS Disk resource that is attached to a kubelet's host machine and then exposed to the pod. Deprecated: AWSElasticBlockStore is deprecated. All operations for the in-tree awsElasticBlockStore type are redirected to the ebs.csi.aws.com CSI driver. More info: https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore properties: fsType: description: |- fsType is the filesystem type of the volume that you want to mount. Tip: Ensure that the filesystem type is supported by the host operating system. Examples: "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. More info: https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore type: string partition: description: |- partition is the partition in the volume that you want to mount. If omitted, the default is to mount by volume name. Examples: For volume /dev/sda1, you specify the partition as "1". Similarly, the volume partition for /dev/sda is "0" (or you can leave the property empty). format: int32 type: integer readOnly: description: |- readOnly value true will force the readOnly setting in VolumeMounts. More info: https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore type: boolean volumeID: description: |- volumeID is unique ID of the persistent disk resource in AWS (Amazon EBS volume). 
More info: https://kubernetes.io/docs/concepts/storage/volumes#awselasticblockstore type: string required: - volumeID type: object azureDisk: description: |- azureDisk represents an Azure Data Disk mount on the host and bind mount to the pod. Deprecated: AzureDisk is deprecated. All operations for the in-tree azureDisk type are redirected to the disk.csi.azure.com CSI driver. properties: cachingMode: description: 'cachingMode is the Host Caching mode: None, Read Only, Read Write.' type: string diskName: description: diskName is the Name of the data disk in the blob storage type: string diskURI: description: diskURI is the URI of data disk in the blob storage type: string fsType: default: ext4 description: |- fsType is Filesystem type to mount. Must be a filesystem type supported by the host operating system. Ex. "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. type: string kind: description: 'kind expected values are Shared: multiple blob disks per storage account Dedicated: single blob disk per storage account Managed: azure managed data disk (only in managed availability set). defaults to shared' type: string readOnly: default: false description: |- readOnly Defaults to false (read/write). ReadOnly here will force the ReadOnly setting in VolumeMounts. type: boolean required: - diskName - diskURI type: object azureFile: description: |- azureFile represents an Azure File Service mount on the host and bind mount to the pod. Deprecated: AzureFile is deprecated. All operations for the in-tree azureFile type are redirected to the file.csi.azure.com CSI driver. properties: readOnly: description: |- readOnly defaults to false (read/write). ReadOnly here will force the ReadOnly setting in VolumeMounts. 
type: boolean secretName: description: secretName is the name of secret that contains Azure Storage Account Name and Key type: string shareName: description: shareName is the azure share Name type: string required: - secretName - shareName type: object cephfs: description: |- cephFS represents a Ceph FS mount on the host that shares a pod's lifetime. Deprecated: CephFS is deprecated and the in-tree cephfs type is no longer supported. properties: monitors: description: |- monitors is Required: Monitors is a collection of Ceph monitors More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it items: type: string type: array x-kubernetes-list-type: atomic path: description: 'path is Optional: Used as the mounted root, rather than the full Ceph tree, default is /' type: string readOnly: description: |- readOnly is Optional: Defaults to false (read/write). ReadOnly here will force the ReadOnly setting in VolumeMounts. More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it type: boolean secretFile: description: |- secretFile is Optional: SecretFile is the path to key ring for User, default is /etc/ceph/user.secret More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it type: string secretRef: description: |- secretRef is Optional: SecretRef is reference to the authentication secret for User, default is empty. More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it properties: name: default: "" description: |- Name of the referent. This field is effectively required, but due to backwards compatibility is allowed to be empty. Instances of this type with an empty value here are almost certainly wrong. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names type: string type: object x-kubernetes-map-type: atomic user: description: |- user is optional: User is the rados user name, default is admin More info: https://examples.k8s.io/volumes/cephfs/README.md#how-to-use-it type: string required: - monitors type: object cinder: description: |- cinder represents a cinder volume attached and mounted on kubelets host machine. Deprecated: Cinder is deprecated. All operations for the in-tree cinder type are redirected to the cinder.csi.openstack.org CSI driver. More info: https://examples.k8s.io/mysql-cinder-pd/README.md properties: fsType: description: |- fsType is the filesystem type to mount. Must be a filesystem type supported by the host operating system. Examples: "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. More info: https://examples.k8s.io/mysql-cinder-pd/README.md type: string readOnly: description: |- readOnly defaults to false (read/write). ReadOnly here will force the ReadOnly setting in VolumeMounts. More info: https://examples.k8s.io/mysql-cinder-pd/README.md type: boolean secretRef: description: |- secretRef is optional: points to a secret object containing parameters used to connect to OpenStack. properties: name: default: "" description: |- Name of the referent. This field is effectively required, but due to backwards compatibility is allowed to be empty. Instances of this type with an empty value here are almost certainly wrong. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names type: string type: object x-kubernetes-map-type: atomic volumeID: description: |- volumeID used to identify the volume in cinder. 
More info: https://examples.k8s.io/mysql-cinder-pd/README.md type: string required: - volumeID type: object configMap: description: configMap represents a configMap that should populate this volume properties: defaultMode: description: |- defaultMode is optional: mode bits used to set permissions on created files by default. Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. Defaults to 0644. Directories within the path are not affected by this setting. This might be in conflict with other options that affect the file mode, like fsGroup, and the result can be other mode bits set. format: int32 type: integer items: description: |- items if unspecified, each key-value pair in the Data field of the referenced ConfigMap will be projected into the volume as a file whose name is the key and content is the value. If specified, the listed keys will be projected into the specified paths, and unlisted keys will not be present. If a key is specified which is not present in the ConfigMap, the volume setup will error unless it is marked optional. Paths must be relative and may not contain the '..' path or start with '..'. items: description: Maps a string key to a path within a volume. properties: key: description: key is the key to project. type: string mode: description: |- mode is Optional: mode bits used to set permissions on this file. Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. If not specified, the volume defaultMode will be used. This might be in conflict with other options that affect the file mode, like fsGroup, and the result can be other mode bits set. format: int32 type: integer path: description: |- path is the relative path of the file to map the key to. May not be an absolute path. May not contain the path element '..'. 
May not start with the string '..'. type: string required: - key - path type: object type: array x-kubernetes-list-type: atomic name: default: "" description: |- Name of the referent. This field is effectively required, but due to backwards compatibility is allowed to be empty. Instances of this type with an empty value here are almost certainly wrong. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names type: string optional: description: optional specify whether the ConfigMap or its keys must be defined type: boolean type: object x-kubernetes-map-type: atomic csi: description: csi (Container Storage Interface) represents ephemeral storage that is handled by certain external CSI drivers. properties: driver: description: |- driver is the name of the CSI driver that handles this volume. Consult with your admin for the correct name as registered in the cluster. type: string fsType: description: |- fsType to mount. Ex. "ext4", "xfs", "ntfs". If not provided, the empty value is passed to the associated CSI driver which will determine the default filesystem to apply. type: string nodePublishSecretRef: description: |- nodePublishSecretRef is a reference to the secret object containing sensitive information to pass to the CSI driver to complete the CSI NodePublishVolume and NodeUnpublishVolume calls. This field is optional, and may be empty if no secret is required. If the secret object contains more than one secret, all secret references are passed. properties: name: default: "" description: |- Name of the referent. This field is effectively required, but due to backwards compatibility is allowed to be empty. Instances of this type with an empty value here are almost certainly wrong. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names type: string type: object x-kubernetes-map-type: atomic readOnly: description: |- readOnly specifies a read-only configuration for the volume. 
Defaults to false (read/write). type: boolean volumeAttributes: additionalProperties: type: string description: |- volumeAttributes stores driver-specific properties that are passed to the CSI driver. Consult your driver's documentation for supported values. type: object required: - driver type: object downwardAPI: description: downwardAPI represents downward API about the pod that should populate this volume properties: defaultMode: description: |- Optional: mode bits used to set permissions on created files by default. Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. Defaults to 0644. Directories within the path are not affected by this setting. This might be in conflict with other options that affect the file mode, like fsGroup, and the result can be other mode bits set. format: int32 type: integer items: description: Items is a list of downward API volume file items: description: DownwardAPIVolumeFile represents information to create the file containing the pod field properties: fieldRef: description: 'Required: Selects a field of the pod: only annotations, labels, name, namespace and uid are supported.' properties: apiVersion: description: Version of the schema the FieldPath is written in terms of, defaults to "v1". type: string fieldPath: description: Path of the field to select in the specified API version. type: string required: - fieldPath type: object x-kubernetes-map-type: atomic mode: description: |- Optional: mode bits used to set permissions on this file, must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. If not specified, the volume defaultMode will be used.
This might be in conflict with other options that affect the file mode, like fsGroup, and the result can be other mode bits set. format: int32 type: integer path: description: 'Required: Path is the relative path name of the file to be created. Must not be absolute or contain the ''..'' path. Must be utf-8 encoded. The first item of the relative path must not start with ''..''' type: string resourceFieldRef: description: |- Selects a resource of the container: only resources limits and requests (limits.cpu, limits.memory, requests.cpu and requests.memory) are currently supported. properties: containerName: description: 'Container name: required for volumes, optional for env vars' type: string divisor: anyOf: - type: integer - type: string description: Specifies the output format of the exposed resources, defaults to "1" pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true resource: description: 'Required: resource to select' type: string required: - resource type: object x-kubernetes-map-type: atomic required: - path type: object type: array x-kubernetes-list-type: atomic type: object emptyDir: description: |- emptyDir represents a temporary directory that shares a pod's lifetime. More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir properties: medium: description: |- medium represents what type of storage medium should back this directory. The default is "" which means to use the node's default medium. Must be an empty string (default) or Memory. More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir type: string sizeLimit: anyOf: - type: integer - type: string description: |- sizeLimit is the total amount of local storage required for this EmptyDir volume. The size limit is also applicable for memory medium. 
The maximum usage on memory medium EmptyDir would be the minimum value between the SizeLimit specified here and the sum of memory limits of all containers in a pod. The default is nil which means that the limit is undefined. More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true type: object ephemeral: description: |- ephemeral represents a volume that is handled by a cluster storage driver. The volume's lifecycle is tied to the pod that defines it - it will be created before the pod starts, and deleted when the pod is removed. Use this if: a) the volume is only needed while the pod runs, b) features of normal volumes like restoring from snapshot or capacity tracking are needed, c) the storage driver is specified through a storage class, and d) the storage driver supports dynamic volume provisioning through a PersistentVolumeClaim (see EphemeralVolumeSource for more information on the connection between this volume type and PersistentVolumeClaim). Use PersistentVolumeClaim or one of the vendor-specific APIs for volumes that persist for longer than the lifecycle of an individual pod. Use CSI for light-weight local ephemeral volumes if the CSI driver is meant to be used that way - see the documentation of the driver for more information. A pod can use both types of ephemeral volumes and persistent volumes at the same time. properties: volumeClaimTemplate: description: |- Will be used to create a stand-alone PVC to provision the volume. The pod in which this EphemeralVolumeSource is embedded will be the owner of the PVC, i.e. the PVC will be deleted together with the pod. The name of the PVC will be `<pod name>-<volume name>` where `<volume name>` is the name from the `PodSpec.Volumes` array entry. Pod validation will reject the pod if the concatenated name is not valid for a PVC (for example, too long).
An existing PVC with that name that is not owned by the pod will *not* be used for the pod to avoid using an unrelated volume by mistake. Starting the pod is then blocked until the unrelated PVC is removed. If such a pre-created PVC is meant to be used by the pod, the PVC has to be updated with an owner reference to the pod once the pod exists. Normally this should not be necessary, but it may be useful when manually reconstructing a broken cluster. This field is read-only and no changes will be made by Kubernetes to the PVC after it has been created. Required, must not be nil. properties: metadata: description: |- May contain labels and annotations that will be copied into the PVC when creating it. No other fields are allowed and will be rejected during validation. properties: annotations: additionalProperties: type: string type: object finalizers: items: type: string type: array labels: additionalProperties: type: string type: object name: type: string namespace: type: string type: object spec: description: |- The specification for the PersistentVolumeClaim. The entire content is copied unchanged into the PVC that gets created from this template. The same fields as in a PersistentVolumeClaim are also valid here. properties: accessModes: description: |- accessModes contains the desired access modes the volume should have. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1 items: type: string type: array x-kubernetes-list-type: atomic dataSource: description: |- dataSource field can be used to specify either: * An existing VolumeSnapshot object (snapshot.storage.k8s.io/VolumeSnapshot) * An existing PVC (PersistentVolumeClaim) If the provisioner or an external controller can support the specified data source, it will create a new volume based on the contents of the specified data source.
When the AnyVolumeDataSource feature gate is enabled, dataSource contents will be copied to dataSourceRef, and dataSourceRef contents will be copied to dataSource when dataSourceRef.namespace is not specified. If the namespace is specified, then dataSourceRef will not be copied to dataSource. properties: apiGroup: description: |- APIGroup is the group for the resource being referenced. If APIGroup is not specified, the specified Kind must be in the core API group. For any other third-party types, APIGroup is required. type: string kind: description: Kind is the type of resource being referenced type: string name: description: Name is the name of resource being referenced type: string required: - kind - name type: object x-kubernetes-map-type: atomic dataSourceRef: description: |- dataSourceRef specifies the object from which to populate the volume with data, if a non-empty volume is desired. This may be any object from a non-empty API group (non core object) or a PersistentVolumeClaim object. When this field is specified, volume binding will only succeed if the type of the specified object matches some installed volume populator or dynamic provisioner. This field will replace the functionality of the dataSource field and as such if both fields are non-empty, they must have the same value. For backwards compatibility, when namespace isn't specified in dataSourceRef, both fields (dataSource and dataSourceRef) will be set to the same value automatically if one of them is empty and the other is non-empty. When namespace is specified in dataSourceRef, dataSource isn't set to the same value and must be empty. There are three important differences between dataSource and dataSourceRef: * While dataSource only allows two specific types of objects, dataSourceRef allows any non-core object, as well as PersistentVolumeClaim objects. 
* While dataSource ignores disallowed values (dropping them), dataSourceRef preserves all values, and generates an error if a disallowed value is specified. * While dataSource only allows local objects, dataSourceRef allows objects in any namespaces. (Beta) Using this field requires the AnyVolumeDataSource feature gate to be enabled. (Alpha) Using the namespace field of dataSourceRef requires the CrossNamespaceVolumeDataSource feature gate to be enabled. properties: apiGroup: description: |- APIGroup is the group for the resource being referenced. If APIGroup is not specified, the specified Kind must be in the core API group. For any other third-party types, APIGroup is required. type: string kind: description: Kind is the type of resource being referenced type: string name: description: Name is the name of resource being referenced type: string namespace: description: |- Namespace is the namespace of resource being referenced Note that when a namespace is specified, a gateway.networking.k8s.io/ReferenceGrant object is required in the referent namespace to allow that namespace's owner to accept the reference. See the ReferenceGrant documentation for details. (Alpha) This field requires the CrossNamespaceVolumeDataSource feature gate to be enabled. type: string required: - kind - name type: object resources: description: |- resources represents the minimum resources the volume should have. Users are allowed to specify resource requirements that are lower than previous value but must still be higher than capacity recorded in the status field of the claim. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#resources properties: limits: additionalProperties: anyOf: - type: integer - type: string pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true description: |- Limits describes the maximum amount of compute resources allowed. 
More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ type: object requests: additionalProperties: anyOf: - type: integer - type: string pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true description: |- Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise to an implementation-defined value. Requests cannot exceed Limits. More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ type: object type: object selector: description: selector is a label query over volumes to consider for binding. properties: matchExpressions: description: matchExpressions is a list of label selector requirements. The requirements are ANDed. items: description: |- A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. properties: key: description: key is the label key that the selector applies to. type: string operator: description: |- operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. type: string values: description: |- values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. items: type: string type: array x-kubernetes-list-type: atomic required: - key - operator type: object type: array x-kubernetes-list-type: atomic matchLabels: additionalProperties: type: string description: |- matchLabels is a map of {key,value} pairs. 
A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. type: object type: object x-kubernetes-map-type: atomic storageClassName: description: |- storageClassName is the name of the StorageClass required by the claim. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#class-1 type: string volumeAttributesClassName: description: |- volumeAttributesClassName may be used to set the VolumeAttributesClass used by this claim. If specified, the CSI driver will create or update the volume with the attributes defined in the corresponding VolumeAttributesClass. This has a different purpose than storageClassName, it can be changed after the claim is created. An empty string or nil value indicates that no VolumeAttributesClass will be applied to the claim. If the claim enters an Infeasible error state, this field can be reset to its previous value (including nil) to cancel the modification. If the resource referred to by volumeAttributesClass does not exist, this PersistentVolumeClaim will be set to a Pending state, as reflected by the modifyVolumeStatus field, until such a resource exists. More info: https://kubernetes.io/docs/concepts/storage/volume-attributes-classes/ type: string volumeMode: description: |- volumeMode defines what type of volume is required by the claim. Value of Filesystem is implied when not included in claim spec. type: string volumeName: description: volumeName is the binding reference to the PersistentVolume backing this claim. type: string type: object required: - spec type: object type: object fc: description: fc represents a Fibre Channel resource that is attached to a kubelet's host machine and then exposed to the pod. properties: fsType: description: |- fsType is the filesystem type to mount. Must be a filesystem type supported by the host operating system. Ex.
"ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. type: string lun: description: 'lun is Optional: FC target lun number' format: int32 type: integer readOnly: description: |- readOnly is Optional: Defaults to false (read/write). ReadOnly here will force the ReadOnly setting in VolumeMounts. type: boolean targetWWNs: description: 'targetWWNs is Optional: FC target worldwide names (WWNs)' items: type: string type: array x-kubernetes-list-type: atomic wwids: description: |- wwids Optional: FC volume world wide identifiers (wwids) Either wwids or combination of targetWWNs and lun must be set, but not both simultaneously. items: type: string type: array x-kubernetes-list-type: atomic type: object flexVolume: description: |- flexVolume represents a generic volume resource that is provisioned/attached using an exec based plugin. Deprecated: FlexVolume is deprecated. Consider using a CSIDriver instead. properties: driver: description: driver is the name of the driver to use for this volume. type: string fsType: description: |- fsType is the filesystem type to mount. Must be a filesystem type supported by the host operating system. Ex. "ext4", "xfs", "ntfs". The default filesystem depends on FlexVolume script. type: string options: additionalProperties: type: string description: 'options is Optional: this field holds extra command options if any.' type: object readOnly: description: |- readOnly is Optional: defaults to false (read/write). ReadOnly here will force the ReadOnly setting in VolumeMounts. type: boolean secretRef: description: |- secretRef is Optional: secretRef is reference to the secret object containing sensitive information to pass to the plugin scripts. This may be empty if no secret object is specified. If the secret object contains more than one secret, all secrets are passed to the plugin scripts. properties: name: default: "" description: |- Name of the referent. 
This field is effectively required, but due to backwards compatibility is allowed to be empty. Instances of this type with an empty value here are almost certainly wrong. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names type: string type: object x-kubernetes-map-type: atomic required: - driver type: object flocker: description: |- flocker represents a Flocker volume attached to a kubelet's host machine. This depends on the Flocker control service being running. Deprecated: Flocker is deprecated and the in-tree flocker type is no longer supported. properties: datasetName: description: |- datasetName is Name of the dataset stored as metadata -> name on the dataset for Flocker should be considered as deprecated type: string datasetUUID: description: datasetUUID is the UUID of the dataset. This is unique identifier of a Flocker dataset type: string type: object gcePersistentDisk: description: |- gcePersistentDisk represents a GCE Disk resource that is attached to a kubelet's host machine and then exposed to the pod. Deprecated: GCEPersistentDisk is deprecated. All operations for the in-tree gcePersistentDisk type are redirected to the pd.csi.storage.gke.io CSI driver. More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk properties: fsType: description: |- fsType is filesystem type of the volume that you want to mount. Tip: Ensure that the filesystem type is supported by the host operating system. Examples: "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk type: string partition: description: |- partition is the partition in the volume that you want to mount. If omitted, the default is to mount by volume name. Examples: For volume /dev/sda1, you specify the partition as "1". Similarly, the volume partition for /dev/sda is "0" (or you can leave the property empty). 
More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk format: int32 type: integer pdName: description: |- pdName is unique name of the PD resource in GCE. Used to identify the disk in GCE. More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk type: string readOnly: description: |- readOnly here will force the ReadOnly setting in VolumeMounts. Defaults to false. More info: https://kubernetes.io/docs/concepts/storage/volumes#gcepersistentdisk type: boolean required: - pdName type: object gitRepo: description: |- gitRepo represents a git repository at a particular revision. Deprecated: GitRepo is deprecated. To provision a container with a git repo, mount an EmptyDir into an InitContainer that clones the repo using git, then mount the EmptyDir into the Pod's container. properties: directory: description: |- directory is the target directory name. Must not contain or start with '..'. If '.' is supplied, the volume directory will be the git repository. Otherwise, if specified, the volume will contain the git repository in the subdirectory with the given name. type: string repository: description: repository is the URL type: string revision: description: revision is the commit hash for the specified revision. type: string required: - repository type: object glusterfs: description: |- glusterfs represents a Glusterfs mount on the host that shares a pod's lifetime. Deprecated: Glusterfs is deprecated and the in-tree glusterfs type is no longer supported. properties: endpoints: description: endpoints is the endpoint name that details Glusterfs topology. type: string path: description: |- path is the Glusterfs volume path. More info: https://examples.k8s.io/volumes/glusterfs/README.md#create-a-pod type: string readOnly: description: |- readOnly here will force the Glusterfs volume to be mounted with read-only permissions. Defaults to false. 
More info: https://examples.k8s.io/volumes/glusterfs/README.md#create-a-pod type: boolean required: - endpoints - path type: object hostPath: description: |- hostPath represents a pre-existing file or directory on the host machine that is directly exposed to the container. This is generally used for system agents or other privileged things that are allowed to see the host machine. Most containers will NOT need this. More info: https://kubernetes.io/docs/concepts/storage/volumes#hostpath properties: path: description: |- path of the directory on the host. If the path is a symlink, it will follow the link to the real path. More info: https://kubernetes.io/docs/concepts/storage/volumes#hostpath type: string type: description: |- type for HostPath Volume Defaults to "" More info: https://kubernetes.io/docs/concepts/storage/volumes#hostpath type: string required: - path type: object image: description: |- image represents an OCI object (a container image or artifact) pulled and mounted on the kubelet's host machine. The volume is resolved at pod startup depending on which PullPolicy value is provided: - Always: the kubelet always attempts to pull the reference. Container creation will fail If the pull fails. - Never: the kubelet never pulls the reference and only uses a local image or artifact. Container creation will fail if the reference isn't present. - IfNotPresent: the kubelet pulls if the reference isn't already present on disk. Container creation will fail if the reference isn't present and the pull fails. The volume gets re-resolved if the pod gets deleted and recreated, which means that new remote content will become available on pod recreation. A failure to resolve or pull the image during pod startup will block containers from starting and may add significant latency. Failures will be retried using normal volume backoff and will be reported on the pod reason and message. 
The types of objects that may be mounted by this volume are defined by the container runtime implementation on a host machine and at minimum must include all valid types supported by the container image field. The OCI object gets mounted in a single directory (spec.containers[*].volumeMounts.mountPath) by merging the manifest layers in the same way as for container images. The volume will be mounted read-only (ro) and non-executable files (noexec). Sub path mounts for containers are not supported (spec.containers[*].volumeMounts.subpath) before 1.33. The field spec.securityContext.fsGroupChangePolicy has no effect on this volume type. properties: pullPolicy: description: |- Policy for pulling OCI objects. Possible values are: Always: the kubelet always attempts to pull the reference. Container creation will fail If the pull fails. Never: the kubelet never pulls the reference and only uses a local image or artifact. Container creation will fail if the reference isn't present. IfNotPresent: the kubelet pulls if the reference isn't already present on disk. Container creation will fail if the reference isn't present and the pull fails. Defaults to Always if :latest tag is specified, or IfNotPresent otherwise. type: string reference: description: |- Required: Image or artifact reference to be used. Behaves in the same way as pod.spec.containers[*].image. Pull secrets will be assembled in the same way as for the container image by looking up node credentials, SA image pull secrets, and pod spec image pull secrets. More info: https://kubernetes.io/docs/concepts/containers/images This field is optional to allow higher level config management to default or override container images in workload controllers like Deployments and StatefulSets. type: string type: object iscsi: description: |- iscsi represents an ISCSI Disk resource that is attached to a kubelet's host machine and then exposed to the pod. 
More info: https://kubernetes.io/docs/concepts/storage/volumes/#iscsi properties: chapAuthDiscovery: description: chapAuthDiscovery defines whether support iSCSI Discovery CHAP authentication type: boolean chapAuthSession: description: chapAuthSession defines whether support iSCSI Session CHAP authentication type: boolean fsType: description: |- fsType is the filesystem type of the volume that you want to mount. Tip: Ensure that the filesystem type is supported by the host operating system. Examples: "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. More info: https://kubernetes.io/docs/concepts/storage/volumes#iscsi type: string initiatorName: description: |- initiatorName is the custom iSCSI Initiator Name. If initiatorName is specified with iscsiInterface simultaneously, new iSCSI interface : will be created for the connection. type: string iqn: description: iqn is the target iSCSI Qualified Name. type: string iscsiInterface: default: default description: |- iscsiInterface is the interface Name that uses an iSCSI transport. Defaults to 'default' (tcp). type: string lun: description: lun represents iSCSI Target Lun number. format: int32 type: integer portals: description: |- portals is the iSCSI Target Portal List. The portal is either an IP or ip_addr:port if the port is other than default (typically TCP ports 860 and 3260). items: type: string type: array x-kubernetes-list-type: atomic readOnly: description: |- readOnly here will force the ReadOnly setting in VolumeMounts. Defaults to false. type: boolean secretRef: description: secretRef is the CHAP Secret for iSCSI target and initiator authentication properties: name: default: "" description: |- Name of the referent. This field is effectively required, but due to backwards compatibility is allowed to be empty. Instances of this type with an empty value here are almost certainly wrong. 
More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names type: string type: object x-kubernetes-map-type: atomic targetPortal: description: |- targetPortal is iSCSI Target Portal. The Portal is either an IP or ip_addr:port if the port is other than default (typically TCP ports 860 and 3260). type: string required: - iqn - lun - targetPortal type: object name: description: |- name of the volume. Must be a DNS_LABEL and unique within the pod. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names type: string nfs: description: |- nfs represents an NFS mount on the host that shares a pod's lifetime More info: https://kubernetes.io/docs/concepts/storage/volumes#nfs properties: path: description: |- path that is exported by the NFS server. More info: https://kubernetes.io/docs/concepts/storage/volumes#nfs type: string readOnly: description: |- readOnly here will force the NFS export to be mounted with read-only permissions. Defaults to false. More info: https://kubernetes.io/docs/concepts/storage/volumes#nfs type: boolean server: description: |- server is the hostname or IP address of the NFS server. More info: https://kubernetes.io/docs/concepts/storage/volumes#nfs type: string required: - path - server type: object persistentVolumeClaim: description: |- persistentVolumeClaimVolumeSource represents a reference to a PersistentVolumeClaim in the same namespace. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#persistentvolumeclaims properties: claimName: description: |- claimName is the name of a PersistentVolumeClaim in the same namespace as the pod using this volume. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#persistentvolumeclaims type: string readOnly: description: |- readOnly Will force the ReadOnly setting in VolumeMounts. Default false. 
type: boolean required: - claimName type: object photonPersistentDisk: description: |- photonPersistentDisk represents a PhotonController persistent disk attached and mounted on kubelets host machine. Deprecated: PhotonPersistentDisk is deprecated and the in-tree photonPersistentDisk type is no longer supported. properties: fsType: description: |- fsType is the filesystem type to mount. Must be a filesystem type supported by the host operating system. Ex. "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. type: string pdID: description: pdID is the ID that identifies Photon Controller persistent disk type: string required: - pdID type: object portworxVolume: description: |- portworxVolume represents a portworx volume attached and mounted on kubelets host machine. Deprecated: PortworxVolume is deprecated. All operations for the in-tree portworxVolume type are redirected to the pxd.portworx.com CSI driver when the CSIMigrationPortworx feature-gate is on. properties: fsType: description: |- fSType represents the filesystem type to mount Must be a filesystem type supported by the host operating system. Ex. "ext4", "xfs". Implicitly inferred to be "ext4" if unspecified. type: string readOnly: description: |- readOnly defaults to false (read/write). ReadOnly here will force the ReadOnly setting in VolumeMounts. type: boolean volumeID: description: volumeID uniquely identifies a Portworx volume type: string required: - volumeID type: object projected: description: projected items for all in one resources secrets, configmaps, and downward API properties: defaultMode: description: |- defaultMode are the mode bits used to set permissions on created files by default. Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. Directories within the path are not affected by this setting. 
This might be in conflict with other options that affect the file mode, like fsGroup, and the result can be other mode bits set. format: int32 type: integer sources: description: |- sources is the list of volume projections. Each entry in this list handles one source. items: description: |- Projection that may be projected along with other supported volume types. Exactly one of these fields must be set. properties: clusterTrustBundle: description: |- ClusterTrustBundle allows a pod to access the `.spec.trustBundle` field of ClusterTrustBundle objects in an auto-updating file. Alpha, gated by the ClusterTrustBundleProjection feature gate. ClusterTrustBundle objects can either be selected by name, or by the combination of signer name and a label selector. Kubelet performs aggressive normalization of the PEM contents written into the pod filesystem. Esoteric PEM features such as inter-block comments and block headers are stripped. Certificates are deduplicated. The ordering of certificates within the file is arbitrary, and Kubelet may change the order over time. properties: labelSelector: description: |- Select all ClusterTrustBundles that match this label selector. Only has effect if signerName is set. Mutually-exclusive with name. If unset, interpreted as "match nothing". If set but empty, interpreted as "match everything". properties: matchExpressions: description: matchExpressions is a list of label selector requirements. The requirements are ANDed. items: description: |- A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. properties: key: description: key is the label key that the selector applies to. type: string operator: description: |- operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. type: string values: description: |- values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. 
If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. items: type: string type: array x-kubernetes-list-type: atomic required: - key - operator type: object type: array x-kubernetes-list-type: atomic matchLabels: additionalProperties: type: string description: |- matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". The requirements are ANDed. type: object type: object x-kubernetes-map-type: atomic name: description: |- Select a single ClusterTrustBundle by object name. Mutually-exclusive with signerName and labelSelector. type: string optional: description: |- If true, don't block pod startup if the referenced ClusterTrustBundle(s) aren't available. If using name, then the named ClusterTrustBundle is allowed not to exist. If using signerName, then the combination of signerName and labelSelector is allowed to match zero ClusterTrustBundles. type: boolean path: description: Relative path from the volume root to write the bundle. type: string signerName: description: |- Select all ClusterTrustBundles that match this signer name. Mutually-exclusive with name. The contents of all selected ClusterTrustBundles will be unified and deduplicated. type: string required: - path type: object configMap: description: configMap information about the configMap data to project properties: items: description: |- items if unspecified, each key-value pair in the Data field of the referenced ConfigMap will be projected into the volume as a file whose name is the key and content is the value. If specified, the listed keys will be projected into the specified paths, and unlisted keys will not be present. If a key is specified which is not present in the ConfigMap, the volume setup will error unless it is marked optional. 
Paths must be relative and may not contain the '..' path or start with '..'. items: description: Maps a string key to a path within a volume. properties: key: description: key is the key to project. type: string mode: description: |- mode is Optional: mode bits used to set permissions on this file. Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. If not specified, the volume defaultMode will be used. This might be in conflict with other options that affect the file mode, like fsGroup, and the result can be other mode bits set. format: int32 type: integer path: description: |- path is the relative path of the file to map the key to. May not be an absolute path. May not contain the path element '..'. May not start with the string '..'. type: string required: - key - path type: object type: array x-kubernetes-list-type: atomic name: default: "" description: |- Name of the referent. This field is effectively required, but due to backwards compatibility is allowed to be empty. Instances of this type with an empty value here are almost certainly wrong. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names type: string optional: description: optional specify whether the ConfigMap or its keys must be defined type: boolean type: object x-kubernetes-map-type: atomic downwardAPI: description: downwardAPI information about the downwardAPI data to project properties: items: description: Items is a list of DownwardAPIVolume file items: description: DownwardAPIVolumeFile represents information to create the file containing the pod field properties: fieldRef: description: 'Required: Selects a field of the pod: only annotations, labels, name, namespace and uid are supported.' properties: apiVersion: description: Version of the schema the FieldPath is written in terms of, defaults to "v1". 
type: string fieldPath: description: Path of the field to select in the specified API version. type: string required: - fieldPath type: object x-kubernetes-map-type: atomic mode: description: |- Optional: mode bits used to set permissions on this file, must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. If not specified, the volume defaultMode will be used. This might be in conflict with other options that affect the file mode, like fsGroup, and the result can be other mode bits set. format: int32 type: integer path: description: 'Required: Path is the relative path name of the file to be created. Must not be absolute or contain the ''..'' path. Must be utf-8 encoded. The first item of the relative path must not start with ''..''' type: string resourceFieldRef: description: |- Selects a resource of the container: only resources limits and requests (limits.cpu, limits.memory, requests.cpu and requests.memory) are currently supported. properties: containerName: description: 'Container name: required for volumes, optional for env vars' type: string divisor: anyOf: - type: integer - type: string description: Specifies the output format of the exposed resources, defaults to "1" pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true resource: description: 'Required: resource to select' type: string required: - resource type: object x-kubernetes-map-type: atomic required: - path type: object type: array x-kubernetes-list-type: atomic type: object podCertificate: description: |- Projects an auto-rotating credential bundle (private key and certificate chain) that the pod can use either as a TLS client or server. Kubelet generates a private key and uses it to send a PodCertificateRequest to the named signer. 
Once the signer approves the request and issues a certificate chain, Kubelet writes the key and certificate chain to the pod filesystem. The pod does not start until certificates have been issued for each podCertificate projected volume source in its spec. Kubelet will begin trying to rotate the certificate at the time indicated by the signer using the PodCertificateRequest.Status.BeginRefreshAt timestamp. Kubelet can write a single file, indicated by the credentialBundlePath field, or separate files, indicated by the keyPath and certificateChainPath fields. The credential bundle is a single file in PEM format. The first PEM entry is the private key (in PKCS#8 format), and the remaining PEM entries are the certificate chain issued by the signer (typically, signers will return their certificate chain in leaf-to-root order). Prefer using the credential bundle format, since your application code can read it atomically. If you use keyPath and certificateChainPath, your application must make two separate file reads. If these coincide with a certificate rotation, it is possible that the private key and leaf certificate you read may not correspond to each other. Your application will need to check for this condition, and re-read until they are consistent. The named signer controls the format of the certificate it issues; consult the signer implementation's documentation to learn how to use the certificates it issues. properties: certificateChainPath: description: |- Write the certificate chain at this path in the projected volume. Most applications should use credentialBundlePath. When using keyPath and certificateChainPath, your application needs to check that the key and leaf certificate are consistent, because it is possible to read the files mid-rotation. type: string credentialBundlePath: description: |- Write the credential bundle at this path in the projected volume. The credential bundle is a single file that contains multiple PEM blocks.
The first PEM block is a PRIVATE KEY block, containing a PKCS#8 private key. The remaining blocks are CERTIFICATE blocks, containing the issued certificate chain from the signer (leaf and any intermediates). Using credentialBundlePath lets your Pod's application code make a single atomic read that retrieves a consistent key and certificate chain. If you project them to separate files, your application code will need to additionally check that the leaf certificate was issued to the key. type: string keyPath: description: |- Write the key at this path in the projected volume. Most applications should use credentialBundlePath. When using keyPath and certificateChainPath, your application needs to check that the key and leaf certificate are consistent, because it is possible to read the files mid-rotation. type: string keyType: description: |- The type of keypair Kubelet will generate for the pod. Valid values are "RSA3072", "RSA4096", "ECDSAP256", "ECDSAP384", "ECDSAP521", and "ED25519". type: string maxExpirationSeconds: description: |- maxExpirationSeconds is the maximum lifetime permitted for the certificate. Kubelet copies this value verbatim into the PodCertificateRequests it generates for this projection. If omitted, kube-apiserver will set it to 86400(24 hours). kube-apiserver will reject values shorter than 3600 (1 hour). The maximum allowable value is 7862400 (91 days). The signer implementation is then free to issue a certificate with any lifetime *shorter* than MaxExpirationSeconds, but no shorter than 3600 seconds (1 hour). This constraint is enforced by kube-apiserver. `kubernetes.io` signers will never issue certificates with a lifetime longer than 24 hours. format: int32 type: integer signerName: description: Kubelet's generated CSRs will be addressed to this signer. type: string userAnnotations: additionalProperties: type: string description: |- userAnnotations allow pod authors to pass additional information to the signer implementation. 
Kubernetes does not restrict or validate this metadata in any way. These values are copied verbatim into the `spec.unverifiedUserAnnotations` field of the PodCertificateRequest objects that Kubelet creates. Entries are subject to the same validation as object metadata annotations, with the addition that all keys must be domain-prefixed. No restrictions are placed on values, except an overall size limitation on the entire field. Signers should document the keys and values they support. Signers should deny requests that contain keys they do not recognize. type: object required: - keyType - signerName type: object secret: description: secret information about the secret data to project properties: items: description: |- items if unspecified, each key-value pair in the Data field of the referenced Secret will be projected into the volume as a file whose name is the key and content is the value. If specified, the listed keys will be projected into the specified paths, and unlisted keys will not be present. If a key is specified which is not present in the Secret, the volume setup will error unless it is marked optional. Paths must be relative and may not contain the '..' path or start with '..'. items: description: Maps a string key to a path within a volume. properties: key: description: key is the key to project. type: string mode: description: |- mode is Optional: mode bits used to set permissions on this file. Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. If not specified, the volume defaultMode will be used. This might be in conflict with other options that affect the file mode, like fsGroup, and the result can be other mode bits set. format: int32 type: integer path: description: |- path is the relative path of the file to map the key to. May not be an absolute path. May not contain the path element '..'. May not start with the string '..'. 
type: string required: - key - path type: object type: array x-kubernetes-list-type: atomic name: default: "" description: |- Name of the referent. This field is effectively required, but due to backwards compatibility is allowed to be empty. Instances of this type with an empty value here are almost certainly wrong. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names type: string optional: description: optional field specify whether the Secret or its key must be defined type: boolean type: object x-kubernetes-map-type: atomic serviceAccountToken: description: serviceAccountToken is information about the serviceAccountToken data to project properties: audience: description: |- audience is the intended audience of the token. A recipient of a token must identify itself with an identifier specified in the audience of the token, and otherwise should reject the token. The audience defaults to the identifier of the apiserver. type: string expirationSeconds: description: |- expirationSeconds is the requested duration of validity of the service account token. As the token approaches expiration, the kubelet volume plugin will proactively rotate the service account token. The kubelet will start trying to rotate the token if the token is older than 80 percent of its time to live or if the token is older than 24 hours. Defaults to 1 hour and must be at least 10 minutes. format: int64 type: integer path: description: |- path is the path relative to the mount point of the file to project the token into. type: string required: - path type: object type: object type: array x-kubernetes-list-type: atomic type: object quobyte: description: |- quobyte represents a Quobyte mount on the host that shares a pod's lifetime. Deprecated: Quobyte is deprecated and the in-tree quobyte type is no longer supported.
properties: group: description: |- group to map volume access to Default is no group type: string readOnly: description: |- readOnly here will force the Quobyte volume to be mounted with read-only permissions. Defaults to false. type: boolean registry: description: |- registry represents a single or multiple Quobyte Registry services specified as a string as host:port pair (multiple entries are separated with commas) which acts as the central registry for volumes type: string tenant: description: |- tenant owning the given Quobyte volume in the Backend Used with dynamically provisioned Quobyte volumes, value is set by the plugin type: string user: description: |- user to map volume access to Defaults to serivceaccount user type: string volume: description: volume is a string that references an already created Quobyte volume by name. type: string required: - registry - volume type: object rbd: description: |- rbd represents a Rados Block Device mount on the host that shares a pod's lifetime. Deprecated: RBD is deprecated and the in-tree rbd type is no longer supported. properties: fsType: description: |- fsType is the filesystem type of the volume that you want to mount. Tip: Ensure that the filesystem type is supported by the host operating system. Examples: "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. More info: https://kubernetes.io/docs/concepts/storage/volumes#rbd type: string image: description: |- image is the rados image name. More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it type: string keyring: default: /etc/ceph/keyring description: |- keyring is the path to key ring for RBDUser. Default is /etc/ceph/keyring. More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it type: string monitors: description: |- monitors is a collection of Ceph monitors. 
More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it items: type: string type: array x-kubernetes-list-type: atomic pool: default: rbd description: |- pool is the rados pool name. Default is rbd. More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it type: string readOnly: description: |- readOnly here will force the ReadOnly setting in VolumeMounts. Defaults to false. More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it type: boolean secretRef: description: |- secretRef is name of the authentication secret for RBDUser. If provided overrides keyring. Default is nil. More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it properties: name: default: "" description: |- Name of the referent. This field is effectively required, but due to backwards compatibility is allowed to be empty. Instances of this type with an empty value here are almost certainly wrong. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names type: string type: object x-kubernetes-map-type: atomic user: default: admin description: |- user is the rados user name. Default is admin. More info: https://examples.k8s.io/volumes/rbd/README.md#how-to-use-it type: string required: - image - monitors type: object scaleIO: description: |- scaleIO represents a ScaleIO persistent volume attached and mounted on Kubernetes nodes. Deprecated: ScaleIO is deprecated and the in-tree scaleIO type is no longer supported. properties: fsType: default: xfs description: |- fsType is the filesystem type to mount. Must be a filesystem type supported by the host operating system. Ex. "ext4", "xfs", "ntfs". Default is "xfs". type: string gateway: description: gateway is the host address of the ScaleIO API Gateway. type: string protectionDomain: description: protectionDomain is the name of the ScaleIO Protection Domain for the configured storage. type: string readOnly: description: |- readOnly Defaults to false (read/write). 
ReadOnly here will force the ReadOnly setting in VolumeMounts. type: boolean secretRef: description: |- secretRef references to the secret for ScaleIO user and other sensitive information. If this is not provided, Login operation will fail. properties: name: default: "" description: |- Name of the referent. This field is effectively required, but due to backwards compatibility is allowed to be empty. Instances of this type with an empty value here are almost certainly wrong. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names type: string type: object x-kubernetes-map-type: atomic sslEnabled: description: sslEnabled Flag enable/disable SSL communication with Gateway, default false type: boolean storageMode: default: ThinProvisioned description: |- storageMode indicates whether the storage for a volume should be ThickProvisioned or ThinProvisioned. Default is ThinProvisioned. type: string storagePool: description: storagePool is the ScaleIO Storage Pool associated with the protection domain. type: string system: description: system is the name of the storage system as configured in ScaleIO. type: string volumeName: description: |- volumeName is the name of a volume already created in the ScaleIO system that is associated with this volume source. type: string required: - gateway - secretRef - system type: object secret: description: |- secret represents a secret that should populate this volume. More info: https://kubernetes.io/docs/concepts/storage/volumes#secret properties: defaultMode: description: |- defaultMode is Optional: mode bits used to set permissions on created files by default. Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. Defaults to 0644. Directories within the path are not affected by this setting. 
This might be in conflict with other options that affect the file mode, like fsGroup, and the result can be other mode bits set. format: int32 type: integer items: description: |- items If unspecified, each key-value pair in the Data field of the referenced Secret will be projected into the volume as a file whose name is the key and content is the value. If specified, the listed keys will be projected into the specified paths, and unlisted keys will not be present. If a key is specified which is not present in the Secret, the volume setup will error unless it is marked optional. Paths must be relative and may not contain the '..' path or start with '..'. items: description: Maps a string key to a path within a volume. properties: key: description: key is the key to project. type: string mode: description: |- mode is Optional: mode bits used to set permissions on this file. Must be an octal value between 0000 and 0777 or a decimal value between 0 and 511. YAML accepts both octal and decimal values, JSON requires decimal values for mode bits. If not specified, the volume defaultMode will be used. This might be in conflict with other options that affect the file mode, like fsGroup, and the result can be other mode bits set. format: int32 type: integer path: description: |- path is the relative path of the file to map the key to. May not be an absolute path. May not contain the path element '..'. May not start with the string '..'. type: string required: - key - path type: object type: array x-kubernetes-list-type: atomic optional: description: optional field specify whether the Secret or its keys must be defined type: boolean secretName: description: |- secretName is the name of the secret in the pod's namespace to use. More info: https://kubernetes.io/docs/concepts/storage/volumes#secret type: string type: object storageos: description: |- storageOS represents a StorageOS volume attached and mounted on Kubernetes nodes. 
Deprecated: StorageOS is deprecated and the in-tree storageos type is no longer supported. properties: fsType: description: |- fsType is the filesystem type to mount. Must be a filesystem type supported by the host operating system. Ex. "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. type: string readOnly: description: |- readOnly defaults to false (read/write). ReadOnly here will force the ReadOnly setting in VolumeMounts. type: boolean secretRef: description: |- secretRef specifies the secret to use for obtaining the StorageOS API credentials. If not specified, default values will be attempted. properties: name: default: "" description: |- Name of the referent. This field is effectively required, but due to backwards compatibility is allowed to be empty. Instances of this type with an empty value here are almost certainly wrong. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names type: string type: object x-kubernetes-map-type: atomic volumeName: description: |- volumeName is the human-readable name of the StorageOS volume. Volume names are only unique within a namespace. type: string volumeNamespace: description: |- volumeNamespace specifies the scope of the volume within StorageOS. If no namespace is specified then the Pod's namespace will be used. This allows the Kubernetes name scoping to be mirrored within StorageOS for tighter integration. Set VolumeName to any name to override the default behaviour. Set to "default" if you are not using namespaces within StorageOS. Namespaces that do not pre-exist within StorageOS will be created. type: string type: object vsphereVolume: description: |- vsphereVolume represents a vSphere volume attached and mounted on kubelets host machine. Deprecated: VsphereVolume is deprecated. All operations for the in-tree vsphereVolume type are redirected to the csi.vsphere.vmware.com CSI driver. properties: fsType: description: |- fsType is filesystem type to mount. 
Must be a filesystem type supported by the host operating system. Ex. "ext4", "xfs", "ntfs". Implicitly inferred to be "ext4" if unspecified. type: string storagePolicyID: description: storagePolicyID is the storage Policy Based Management (SPBM) profile ID associated with the StoragePolicyName. type: string storagePolicyName: description: storagePolicyName is the storage Policy Based Management (SPBM) profile name. type: string volumePath: description: volumePath is the path that identifies vSphere volume vmdk type: string required: - volumePath type: object required: - name type: object type: array x-kubernetes-list-map-keys: - name x-kubernetes-list-type: map workloadRef: description: |- WorkloadRef provides a reference to the Workload object that this Pod belongs to. This field is used by the scheduler to identify the PodGroup and apply the correct group scheduling policies. The Workload object referenced by this field may not exist at the time the Pod is created. This field is immutable, but a Workload object with the same name may be recreated with different policies. Doing this during pod scheduling may result in the placement not conforming to the expected policies. properties: name: description: |- Name defines the name of the Workload object this Pod belongs to. Workload must be in the same namespace as the Pod. If it doesn't match any existing Workload, the Pod will remain unschedulable until a Workload object is created and observed by the kube-scheduler. It must be a DNS subdomain. type: string podGroup: description: |- PodGroup is the name of the PodGroup within the Workload that this Pod belongs to. If it doesn't match any existing PodGroup within the Workload, the Pod will remain unschedulable until the Workload object is recreated and observed by the kube-scheduler. It must be a DNS label. type: string podGroupReplicaKey: description: |- PodGroupReplicaKey specifies the replica key of the PodGroup to which this Pod belongs. 
It is used to distinguish pods belonging to different replicas of the same pod group. The pod group policy is applied separately to each replica. When set, it must be a DNS label. type: string required: - name - podGroup type: object required: - containers type: object type: object type: object description: |- MPIReplicaSpecs contains maps from `MPIReplicaType` to `ReplicaSpec` that specify the MPI replicas to run. type: object runLauncherAsWorker: default: false description: |- RunLauncherAsWorker indicates whether to run worker process in launcher Defaults to false. type: boolean runPolicy: description: RunPolicy encapsulates various runtime policies of the job. properties: activeDeadlineSeconds: description: |- Specifies the duration in seconds relative to the startTime that the job may be active before the system tries to terminate it; value must be positive integer. format: int64 type: integer backoffLimit: description: Optional number of retries before marking this job failed. format: int32 type: integer cleanPodPolicy: description: |- CleanPodPolicy defines the policy to kill pods after the job completes. Default to Running. type: string managedBy: description: |- ManagedBy is used to indicate the controller or entity that manages a MPIJob. The value must be either empty, 'kubeflow.org/mpi-operator' or 'kueue.x-k8s.io/multikueue'. The mpi-operator reconciles a MPIJob which doesn't have this field at all or the field value is the reserved string 'kubeflow.org/mpi-operator', but delegates reconciling the MPIJob with 'kueue.x-k8s.io/multikueue' to the Kueue. The field is immutable. type: string schedulingPolicy: description: SchedulingPolicy defines the policy related to scheduling, e.g. gang-scheduling properties: minAvailable: description: |- MinAvailable defines the minimal number of member to run the PodGroup. If the gang-scheduling isn't empty, input is passed to `.spec.minMember` in PodGroup. 
Note that, when using this field, you need to make sure the application supports resizing (e.g., Elastic Horovod). If not set, it defaults to the number of workers. format: int32 type: integer minResources: additionalProperties: anyOf: - type: integer - type: string pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true description: |- MinResources defines the minimal resources of members to run the PodGroup. If the gang-scheduling isn't empty, input is passed to `.spec.minResources` in PodGroup for scheduler-plugins. type: object priorityClass: description: |- PriorityClass defines the PodGroup's PriorityClass. If the gang-scheduling is set to the volcano, input is passed to `.spec.priorityClassName` in PodGroup for volcano, and if it is set to the scheduler-plugins, input isn't passed to PodGroup for scheduler-plugins. type: string queue: description: |- Queue defines the queue name to allocate resource for PodGroup. If the gang-scheduling is set to the volcano, input is passed to `.spec.queue` in PodGroup for the volcano, and if it is set to the scheduler-plugins, input isn't passed to PodGroup. type: string scheduleTimeoutSeconds: description: |- SchedulerTimeoutSeconds defines the maximal time of members to wait before run the PodGroup. If the gang-scheduling is set to the scheduler-plugins, input is passed to `.spec.scheduleTimeoutSeconds` in PodGroup for the scheduler-plugins, and if it is set to the volcano, input isn't passed to PodGroup. format: int32 type: integer type: object suspend: default: false description: |- suspend specifies whether the MPIJob controller should create Pods or not. If a MPIJob is created with suspend set to true, no Pods are created by the MPIJob controller. If a MPIJob is suspended after creation (i.e. the flag goes from false to true), the MPIJob controller will delete all active Pods and PodGroups associated with this MPIJob. 
Also, it will suspend the Launcher Job. Users must design their workload to gracefully handle this. Suspending a Job will reset the StartTime field of the MPIJob. Defaults to false. type: boolean ttlSecondsAfterFinished: description: |- TTLSecondsAfterFinished is the TTL to clean up jobs. It may take extra ReconcilePeriod seconds for the cleanup, since reconcile gets called periodically. Default to infinite. format: int32 type: integer type: object slotsPerWorker: default: 1 description: |- Specifies the number of slots per worker used in hostfile. Defaults to 1. format: int32 type: integer sshAuthMountPath: default: /root/.ssh description: |- SSHAuthMountPath is the directory where SSH keys are mounted. Defaults to "/root/.ssh". type: string required: - mpiReplicaSpecs type: object status: description: JobStatus represents the current observed state of the training Job. properties: completionTime: description: |- Represents time when the job was completed. It is not guaranteed to be set in happens-before order across separate operations. It is represented in RFC3339 form and is in UTC. format: date-time type: string conditions: description: conditions is a list of current observed job conditions. items: description: JobCondition describes the state of the job at a certain point. properties: lastTransitionTime: description: Last time the condition transitioned from one status to another. format: date-time type: string lastUpdateTime: description: The last time this condition was updated. format: date-time type: string message: description: A human-readable message indicating details about the transition. type: string reason: description: The reason for the condition's last transition. type: string status: description: status of the condition, one of True, False, Unknown. enum: - "True" - "False" - Unknown type: string type: description: type of job condition. 
type: string required: - status - type type: object type: array x-kubernetes-list-map-keys: - type x-kubernetes-list-type: map lastReconcileTime: description: |- Represents last time when the job was reconciled. It is not guaranteed to be set in happens-before order across separate operations. It is represented in RFC3339 form and is in UTC. format: date-time type: string replicaStatuses: additionalProperties: description: ReplicaStatus represents the current observed state of the replica. properties: active: description: The number of actively running pods. format: int32 type: integer failed: description: The number of pods which reached phase failed. format: int32 type: integer labelSelector: description: 'Deprecated: Use selector instead' properties: matchExpressions: description: matchExpressions is a list of label selector requirements. The requirements are ANDed. items: description: |- A label selector requirement is a selector that contains values, a key, and an operator that relates the key and values. properties: key: description: key is the label key that the selector applies to. type: string operator: description: |- operator represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists and DoesNotExist. type: string values: description: |- values is an array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. This array is replaced during a strategic merge patch. items: type: string type: array x-kubernetes-list-type: atomic required: - key - operator type: object type: array x-kubernetes-list-type: atomic matchLabels: additionalProperties: type: string description: |- matchLabels is a map of {key,value} pairs. A single {key,value} in the matchLabels map is equivalent to an element of matchExpressions, whose key field is "key", the operator is "In", and the values array contains only "value". 
The requirements are ANDed. type: object type: object x-kubernetes-map-type: atomic selector: description: |- A selector is a label query over a set of resources. The result of matchLabels and matchExpressions are ANDed. An empty selector matches all objects. A null selector matches no objects. type: string succeeded: description: The number of pods which reached phase succeeded. format: int32 type: integer type: object description: |- replicaStatuses is map of ReplicaType and ReplicaStatus, specifies the status of each replica. type: object startTime: description: |- Represents time when the job was acknowledged by the job controller. It is not guaranteed to be set in happens-before order across separate operations. It is represented in RFC3339 form and is in UTC. format: date-time type: string type: object type: object served: true storage: true subresources: status: {} --- apiVersion: v1 kind: ServiceAccount metadata: labels: app: mpi-operator app.kubernetes.io/component: mpijob app.kubernetes.io/name: mpi-operator kustomize.component: mpi-operator name: mpi-operator namespace: mpi-operator --- aggregationRule: clusterRoleSelectors: - matchLabels: rbac.authorization.kubeflow.org/aggregate-to-kubeflow-mpijobs-admin: "true" apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: app: mpi-operator app.kubernetes.io/component: mpijob app.kubernetes.io/name: mpi-operator kustomize.component: mpi-operator rbac.authorization.kubeflow.org/aggregate-to-kubeflow-admin: "true" name: kubeflow-mpijobs-admin --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: app: mpi-operator app.kubernetes.io/component: mpijob app.kubernetes.io/name: mpi-operator kustomize.component: mpi-operator rbac.authorization.kubeflow.org/aggregate-to-kubeflow-edit: "true" rbac.authorization.kubeflow.org/aggregate-to-kubeflow-mpijobs-admin: "true" name: kubeflow-mpijobs-edit rules: - apiGroups: - kubeflow.org resources: - mpijobs - mpijobs/status 
verbs: - get - list - watch - create - delete - deletecollection - patch - update --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: app: mpi-operator app.kubernetes.io/component: mpijob app.kubernetes.io/name: mpi-operator kustomize.component: mpi-operator rbac.authorization.kubeflow.org/aggregate-to-kubeflow-view: "true" name: kubeflow-mpijobs-view rules: - apiGroups: - kubeflow.org resources: - mpijobs - mpijobs/status verbs: - get - list - watch --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: app: mpi-operator app.kubernetes.io/component: mpijob app.kubernetes.io/name: mpi-operator kustomize.component: mpi-operator name: mpi-operator rules: - apiGroups: - "" resources: - configmaps - secrets - services verbs: - create - list - watch - update - apiGroups: - "" resources: - pods verbs: - create - get - list - watch - delete - update - patch - apiGroups: - "" resources: - pods/exec verbs: - create - apiGroups: - "" resources: - endpoints verbs: - create - get - update - apiGroups: - "" resources: - events verbs: - create - patch - apiGroups: - batch resources: - jobs verbs: - create - list - update - watch - apiGroups: - apiextensions.k8s.io resources: - customresourcedefinitions verbs: - create - get - apiGroups: - kubeflow.org resources: - mpijobs - mpijobs/finalizers - mpijobs/status verbs: - '*' - apiGroups: - coordination.k8s.io resources: - leases verbs: - '*' - apiGroups: - scheduling.incubator.k8s.io - scheduling.sigs.dev - scheduling.volcano.sh resources: - queues - podgroups verbs: - '*' - apiGroups: - scheduling.x-k8s.io resources: - podgroups verbs: - '*' - apiGroups: - scheduling.k8s.io resources: - priorityclasses verbs: - get - list - watch --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: labels: app: mpi-operator app.kubernetes.io/component: mpijob app.kubernetes.io/name: mpi-operator kustomize.component: mpi-operator name: mpi-operator roleRef: apiGroup: 
rbac.authorization.k8s.io kind: ClusterRole name: mpi-operator subjects: - kind: ServiceAccount name: mpi-operator namespace: mpi-operator --- apiVersion: apps/v1 kind: Deployment metadata: labels: app: mpi-operator app.kubernetes.io/component: mpijob app.kubernetes.io/name: mpi-operator kustomize.component: mpi-operator name: mpi-operator namespace: mpi-operator spec: replicas: 1 selector: matchLabels: app: mpi-operator app.kubernetes.io/component: mpijob app.kubernetes.io/name: mpi-operator kustomize.component: mpi-operator template: metadata: annotations: sidecar.istio.io/inject: "false" labels: app: mpi-operator app.kubernetes.io/component: mpijob app.kubernetes.io/name: mpi-operator kustomize.component: mpi-operator spec: containers: - args: - -alsologtostderr - --lock-namespace=mpi-operator image: mpioperator/mpi-operator:0.8.0 name: mpi-operator serviceAccountName: mpi-operator ================================================ FILE: test/manifests/assets/nvidia-device-plugin.yaml ================================================ # Source: https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/main/deployments/static/nvidia-device-plugin.yml # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
apiVersion: apps/v1 kind: DaemonSet metadata: name: nvidia-device-plugin-daemonset namespace: kube-system spec: selector: matchLabels: name: nvidia-device-plugin-ds updateStrategy: type: RollingUpdate template: metadata: labels: name: nvidia-device-plugin-ds spec: tolerations: - key: nvidia.com/gpu operator: Exists effect: NoSchedule # Mark this pod as a critical add-on; when enabled, the critical add-on # scheduler reserves resources for critical add-on pods so that they can # be rescheduled after a failure. # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/ priorityClassName: "system-node-critical" containers: - image: nvcr.io/nvidia/k8s-device-plugin:v0.17.2 name: nvidia-device-plugin-ctr env: - name: FAIL_ON_INIT_ERROR value: "false" securityContext: allowPrivilegeEscalation: false capabilities: drop: ["ALL"] volumeMounts: - name: device-plugin mountPath: /var/lib/kubelet/device-plugins volumes: - name: device-plugin hostPath: path: /var/lib/kubelet/device-plugins ================================================ FILE: test/manifests/raw.go ================================================ package manifests import ( _ "embed" ) var ( //go:embed assets/nvidia-device-plugin.yaml NvidiaDevicePluginManifest []byte //go:embed assets/mpi-operator.yaml MpiOperatorManifest []byte //go:embed assets/efa-device-plugin.yaml EfaDevicePluginManifest []byte //go:embed assets/k8s-neuron-device-plugin-rbac.yml NeuronDevicePluginRbacManifest []byte //go:embed assets/k8s-neuron-device-plugin.yml NeuronDevicePluginManifest []byte //go:embed assets/dranet.yaml DranetManifest []byte //go:embed assets/dcgm-exporter.yaml DCGMExporterManifest []byte //go:embed assets/cloudwatch-agent.yaml cloudWatchAgentManifestTemplate []byte ) ================================================ FILE: test/manifests/rendered.go ================================================ package manifests import ( "html/template" "strings" fwext 
"github.com/aws/aws-k8s-tester/internal/e2e" ) // RenderCloudWatchAgentManifest renders the CloudWatch Agent manifest with dynamic dimensions func RenderCloudWatchAgentManifest(metricDimensions map[string]string) ([]byte, error) { var keys []string for key := range metricDimensions { keys = append(keys, `"`+key+`"`) } dimensionsStr := strings.Join(keys, ", ") return fwext.RenderManifests(cloudWatchAgentManifestTemplate, map[string]interface{}{ "MetricDimensions": metricDimensions, "DimensionKeys": template.HTML(dimensionsStr), }) }