Repository: intel/cri-resource-manager
Branch: master
Commit: 886388e7a4a7
Files: 555
Total size: 2.2 MB

Directory structure:
gitextract_8uskpwqi/
├── .githooks/
│   ├── pre-commit.d/
│   │   ├── 00-gofmt
│   │   ├── 10-shellcheck
│   │   └── 20-go-version
│   └── run-hooks
├── .github/
│   ├── ISSUE_TEMPLATE/
│   │   ├── bug_report.md
│   │   ├── feature_request.md
│   │   └── new-release.md
│   └── workflows/
│       ├── common-build-docs.yaml
│       ├── common-build-images.yaml
│       ├── common-codeql.yaml
│       ├── common-trivy.yaml
│       ├── common-verify-code.yaml
│       ├── publish-devel-images.yaml
│       ├── publish-docs.yml
│       ├── release.yaml
│       ├── trivy-csv.tpl
│       ├── verify-periodic.yaml
│       ├── verify-pr-code.yaml
│       └── verify-pr-docs.yaml
├── .gitignore
├── CODEOWNERS
├── Jenkinsfile
├── LICENSE
├── Makefile
├── README.md
├── SECURITY.md
├── cmd/
│   ├── cri-resmgr/
│   │   ├── cri-resource-manager.service.in
│   │   ├── cri-resource-manager.sysconf
│   │   ├── fallback.cfg.sample
│   │   └── main.go
│   ├── cri-resmgr-agent/
│   │   ├── Dockerfile
│   │   ├── agent-deployment.yaml
│   │   └── main.go
│   ├── cri-resmgr-agent-probe/
│   │   └── main.go
│   └── cri-resmgr-webhook/
│       ├── Dockerfile
│       ├── handlers.go
│       ├── main.go
│       ├── mutating-webhook-config.yaml
│       ├── webhook-deployment.yaml
│       └── webhook.go
├── demo/
│   ├── blockio/
│   │   ├── bb-scanner.yaml
│   │   ├── cri-resmgr-config.default.yaml
│   │   └── run.sh
│   └── lib/
│       ├── command.bash
│       ├── distro.bash
│       ├── host.bash
│       ├── numactlH2numajson.py
│       ├── topology.py
│       ├── topology2qemuopts.py
│       └── vm.bash
├── dockerfiles/
│   └── cross-build/
│       ├── Dockerfile.debian-11
│       ├── Dockerfile.debian-12
│       ├── Dockerfile.debian-sid
│       ├── Dockerfile.fedora
│       ├── Dockerfile.opensuse-leap-15.6
│       ├── Dockerfile.ubuntu-18.04
│       ├── Dockerfile.ubuntu-20.04
│       ├── Dockerfile.ubuntu-22.04
│       └── Dockerfile.ubuntu-24.04
├── docs/
│   ├── Dockerfile
│   ├── _templates/
│   │   └── layout.html
│   ├── conf.py
│   ├── contributing.md
│   ├── demos/
│   │   ├── blockio.md
│   │   └── index.rst
│   ├── developers-guide/
│   │   ├── architecture.md
│   │   ├── cri-test.md
│   │   ├── e2e-test.md
│   │   ├── index.rst
│   │   ├── policy-writers-guide.md
│   │   ├── testing.rst
│   │   └── unit-test.md
│   ├── index.html
│   ├── index.rst
│   ├── installation.md
│   ├── introduction.md
│   ├── migration-to-NRI.md
│   ├── node-agent.md
│   ├── policy/
│   │   ├── balloons.md
│   │   ├── blockio.md
│   │   ├── container-affinity.md
│   │   ├── cpu-allocator.md
│   │   ├── dynamic-pools.md
│   │   ├── index.rst
│   │   ├── podpools.md
│   │   ├── rdt.md
│   │   ├── static-pools.md
│   │   └── topology-aware.md
│   ├── quick-start.md
│   ├── reference/
│   │   ├── agent-command-line-reference.md
│   │   ├── configuration-reference.md
│   │   ├── index.rst
│   │   └── resmgr-command-line-reference.md
│   ├── releases/
│   │   ├── conf.py
│   │   └── index.md
│   ├── requirements.txt
│   ├── security.md
│   ├── setup.md
│   └── webhook.md
├── elf/
│   └── avx512.c
├── go.mod
├── go.sum
├── packaging/
│   ├── deb.in/
│   │   ├── changelog
│   │   ├── compat
│   │   ├── control
│   │   └── rules
│   └── rpm/
│       └── cri-resource-manager.spec.in
├── pkg/
│   ├── agent/
│   │   ├── agent.go
│   │   ├── api/
│   │   │   └── v1/
│   │   │       ├── api.go
│   │   │       ├── api.pb.go
│   │   │       ├── api.proto
│   │   │       ├── api_grpc.pb.go
│   │   │       └── constants.go
│   │   ├── config-updater.go
│   │   ├── flags.go
│   │   ├── kubernetes.go
│   │   ├── server.go
│   │   └── watcher.go
│   ├── apis/
│   │   └── resmgr/
│   │       ├── expression.go
│   │       ├── expression_test.go
│   │       ├── generated/
│   │       │   ├── clientset/
│   │       │   │   └── versioned/
│   │       │   │       ├── clientset.go
│   │       │   │       ├── doc.go
│   │       │   │       ├── fake/
│   │       │   │       │   ├── clientset_generated.go
│   │       │   │       │   ├── doc.go
│   │       │   │       │   └── register.go
│   │       │   │       ├── scheme/
│   │       │   │       │   ├── doc.go
│   │       │   │       │   └── register.go
│   │       │   │       └── typed/
│   │       │   │           └── resmgr/
│   │       │   │               └── v1alpha1/
│   │       │   │                   ├── adjustment.go
│   │       │   │                   ├── doc.go
│   │       │   │                   ├── fake/
│   │       │   │                   │   ├── doc.go
│   │       │   │                   │   ├── fake_adjustment.go
│   │       │   │                   │   └── fake_resmgr_client.go
│   │       │   │                   ├── generated_expansion.go
│   │       │   │                   └── resmgr_client.go
│   │       │   ├── informers/
│   │       │   │   └── externalversions/
│   │       │   │       ├── factory.go
│   │       │   │       ├── generic.go
│   │       │   │       ├── internalinterfaces/
│   │       │   │       │   └── factory_interfaces.go
│   │       │   │       └── resmgr/
│   │       │   │           ├── interface.go
│   │       │   │           └── v1alpha1/
│   │       │   │               ├── adjustment.go
│   │       │   │               └── interface.go
│   │       │   └── listers/
│   │       │       └── resmgr/
│   │       │           └── v1alpha1/
│   │       │               ├── adjustment.go
│   │       │               └── expansion_generated.go
│   │       └── v1alpha1/
│   │           ├── adjustment-schema.yaml
│   │           ├── adjustment.go
│   │           ├── doc.go
│   │           ├── register.go
│   │           ├── types.go
│   │           └── zz_generated.deepcopy.go
│   ├── avx/
│   │   ├── collector.go
│   │   ├── elfdump.go
│   │   └── register.go
│   ├── blockio/
│   │   ├── blockio.go
│   │   ├── blockio_test.go
│   │   └── config.go
│   ├── cgroups/
│   │   ├── cgroupblkio.go
│   │   ├── cgroupblkio_test.go
│   │   ├── cgroupcontrol.go
│   │   ├── cgroupid.go
│   │   ├── cgrouppath.go
│   │   └── cgroupstats.go
│   ├── cgroupstats/
│   │   └── collector.go
│   ├── config/
│   │   ├── config.go
│   │   ├── data.go
│   │   ├── duration.go
│   │   ├── error.go
│   │   ├── help.go
│   │   ├── log.go
│   │   └── options.go
│   ├── cpuallocator/
│   │   ├── allocator.go
│   │   └── cpuallocator_test.go
│   ├── cri/
│   │   ├── client/
│   │   │   ├── client.go
│   │   │   └── v1/
│   │   │       └── client.go
│   │   ├── relay/
│   │   │   ├── image-service.go
│   │   │   ├── relay.go
│   │   │   └── runtime-service.go
│   │   ├── resource-manager/
│   │   │   ├── agent/
│   │   │   │   └── agent.go
│   │   │   ├── builtin-policies.go
│   │   │   ├── cache/
│   │   │   │   ├── affinity.go
│   │   │   │   ├── affinity_test.go
│   │   │   │   ├── cache.go
│   │   │   │   ├── cache_test.go
│   │   │   │   ├── container.go
│   │   │   │   ├── container_test.go
│   │   │   │   ├── error.go
│   │   │   │   ├── pod.go
│   │   │   │   └── utils.go
│   │   │   ├── config/
│   │   │   │   ├── api/
│   │   │   │   │   └── v1/
│   │   │   │   │       ├── api.pb.go
│   │   │   │   │       ├── api.proto
│   │   │   │   │       └── api_grpc.pb.go
│   │   │   │   ├── config.go
│   │   │   │   └── server.go
│   │   │   ├── control/
│   │   │   │   ├── blockio/
│   │   │   │   │   └── blockio.go
│   │   │   │   ├── control.go
│   │   │   │   ├── cpu/
│   │   │   │   │   ├── api.go
│   │   │   │   │   ├── cache.go
│   │   │   │   │   └── cpu.go
│   │   │   │   ├── cri/
│   │   │   │   │   └── cri.go
│   │   │   │   ├── flags.go
│   │   │   │   ├── memory/
│   │   │   │   │   └── memory.go
│   │   │   │   ├── page-migrate/
│   │   │   │   │   ├── demoter.go
│   │   │   │   │   ├── demoter_test.go
│   │   │   │   │   ├── flags.go
│   │   │   │   │   ├── page-migrate.go
│   │   │   │   │   └── page-mover.go
│   │   │   │   └── rdt/
│   │   │   │       └── rdt.go
│   │   │   ├── controllers.go
│   │   │   ├── error.go
│   │   │   ├── events/
│   │   │   │   └── events.go
│   │   │   ├── events.go
│   │   │   ├── flags.go
│   │   │   ├── introspect/
│   │   │   │   └── introspect.go
│   │   │   ├── kubernetes/
│   │   │   │   ├── kubernetes.go
│   │   │   │   └── resources.go
│   │   │   ├── metrics/
│   │   │   │   ├── avx.go
│   │   │   │   ├── metrics.go
│   │   │   │   └── prometheus.go
│   │   │   ├── no-test-api.go
│   │   │   ├── policy/
│   │   │   │   ├── builtin/
│   │   │   │   │   ├── balloons/
│   │   │   │   │   │   ├── balloons-policy.go
│   │   │   │   │   │   ├── balloons-policy_test.go
│   │   │   │   │   │   ├── cputree.go
│   │   │   │   │   │   ├── cputree_test.go
│   │   │   │   │   │   ├── fillmethod.go
│   │   │   │   │   │   ├── flags.go
│   │   │   │   │   │   └── metrics.go
│   │   │   │   │   ├── dynamic-pools/
│   │   │   │   │   │   ├── cpu.go
│   │   │   │   │   │   ├── dyp.go
│   │   │   │   │   │   ├── dyp_test.go
│   │   │   │   │   │   ├── flags.go
│   │   │   │   │   │   └── metrics.go
│   │   │   │   │   ├── none/
│   │   │   │   │   │   └── none-policy.go
│   │   │   │   │   ├── podpools/
│   │   │   │   │   │   ├── flags.go
│   │   │   │   │   │   ├── metrics.go
│   │   │   │   │   │   ├── podpools-policy.go
│   │   │   │   │   │   └── podpools-policy_test.go
│   │   │   │   │   ├── static/
│   │   │   │   │   │   ├── flags.go
│   │   │   │   │   │   └── static-policy.go
│   │   │   │   │   ├── static-plus/
│   │   │   │   │   │   └── static-plus-policy.go
│   │   │   │   │   ├── static-pools/
│   │   │   │   │   │   ├── config.go
│   │   │   │   │   │   ├── node.go
│   │   │   │   │   │   ├── stp-policy.go
│   │   │   │   │   │   └── stp-policy_test.go
│   │   │   │   │   └── topology-aware/
│   │   │   │   │       ├── affinity.go
│   │   │   │   │       ├── cache.go
│   │   │   │   │       ├── cache_test.go
│   │   │   │   │       ├── coldstart.go
│   │   │   │   │       ├── coldstart_test.go
│   │   │   │   │       ├── error.go
│   │   │   │   │       ├── flags.go
│   │   │   │   │       ├── hint.go
│   │   │   │   │       ├── hint_test.go
│   │   │   │   │       ├── logging.go
│   │   │   │   │       ├── mocks_test.go
│   │   │   │   │       ├── node.go
│   │   │   │   │       ├── pod-preferences.go
│   │   │   │   │       ├── pod-preferences_test.go
│   │   │   │   │       ├── pools.go
│   │   │   │   │       ├── pools_test.go
│   │   │   │   │       ├── resources.go
│   │   │   │   │       └── topology-aware-policy.go
│   │   │   │   ├── error.go
│   │   │   │   ├── flags.go
│   │   │   │   └── policy.go
│   │   │   ├── requests.go
│   │   │   ├── resource-manager.go
│   │   │   ├── sockets/
│   │   │   │   └── sockets.go
│   │   │   ├── test-api.go
│   │   │   └── visualizer/
│   │   │       ├── bubbles/
│   │   │       │   ├── assets/
│   │   │       │   │   ├── css/
│   │   │       │   │   │   └── style.css
│   │   │       │   │   ├── index.html
│   │   │       │   │   └── js/
│   │   │       │   │       ├── ui-json-adapter.js
│   │   │       │   │       └── ui.js
│   │   │       │   ├── assets.go
│   │   │       │   ├── assets_generate.go
│   │   │       │   └── doc.go
│   │   │       ├── builtins.go
│   │   │       ├── flags.go
│   │   │       └── visualizer.go
│   │   └── server/
│   │       ├── server.go
│   │       └── services.go
│   ├── dump/
│   │   ├── doc.go
│   │   ├── dump.go
│   │   ├── dump_test.go
│   │   └── flags.go
│   ├── instrumentation/
│   │   ├── flags.go
│   │   ├── grpc.go
│   │   ├── http/
│   │   │   ├── http.go
│   │   │   └── http_test.go
│   │   ├── instrumentation.go
│   │   ├── instrumentation_test.go
│   │   ├── jaeger.go
│   │   ├── prometheus.go
│   │   └── service.go
│   ├── log/
│   │   ├── default.go
│   │   ├── flags.go
│   │   ├── grpc-logger.go
│   │   ├── klogcontrol/
│   │   │   └── klogcontrol.go
│   │   ├── log.go
│   │   ├── ratelimit.go
│   │   ├── ratelimit_test.go
│   │   ├── signal.go
│   │   └── stdlog-logger.go
│   ├── metrics/
│   │   ├── metrics.go
│   │   └── register/
│   │       ├── register_metrics.go
│   │       └── register_metrics_avx.go
│   ├── pidfile/
│   │   ├── pidfile.go
│   │   └── pidfile_test.go
│   ├── policycollector/
│   │   └── collector.go
│   ├── procstats/
│   │   └── procstats.go
│   ├── sysfs/
│   │   ├── error.go
│   │   ├── parsers.go
│   │   ├── system.go
│   │   └── utils.go
│   ├── testutils/
│   │   └── verify.go
│   ├── topology/
│   │   ├── go.mod
│   │   ├── test-cleanup.sh
│   │   ├── test-setup.sh
│   │   ├── topology.go
│   │   └── topology_test.go
│   ├── utils/
│   │   ├── cpuset/
│   │   │   ├── cpuset.go
│   │   │   └── cpuset_test.go
│   │   ├── json.go
│   │   ├── net.go
│   │   ├── parse.go
│   │   ├── sort.go
│   │   └── tar.go
│   └── version/
│       └── version.go
├── runtime-deps.csv
├── sample-configs/
│   ├── balloons-policy.cfg
│   ├── blockio.cfg
│   ├── cri-full-message-dump.cfg
│   ├── cri-resmgr-configmap.example.yaml
│   ├── external-adjustment.yaml
│   ├── podpools-policy.cfg
│   ├── static-policy.cfg
│   ├── static-pools-policy.conf.example
│   └── topology-aware-policy.cfg
├── scripts/
│   ├── build/
│   │   ├── docker-build-image
│   │   ├── get-buildid
│   │   └── update-gh-pages.sh
│   ├── code-generator/
│   │   ├── boilerplate.go.txt
│   │   └── generate-groups.sh
│   ├── hack/
│   │   ├── create-webhook-secrets.sh
│   │   ├── go-mod-replace-helper.sh
│   │   ├── go-mod-tree
│   │   └── install-protobuf
│   └── testing/
│       ├── crictl
│       ├── jaeger
│       ├── kube-cgroups
│       ├── pairwise
│       ├── prometheus
│       ├── prometheus.yaml
│       └── set-path
└── test/
    ├── critest/
    │   ├── run.sh
    │   ├── topology-aware-policy.cfg
    │   └── tsl
    ├── e2e/
    │   ├── benchmarks.test-suite/
    │   │   └── memtier_benchmark/
    │   │       ├── cri-resmgr.cfg
    │   │       ├── memtier-benchmark-02.yaml.in
    │   │       ├── memtier-benchmark.yaml.in
    │   │       ├── n4c16/
    │   │       │   ├── test01-memtier-stress-ng/
    │   │       │   │   ├── code.var.sh
    │   │       │   │   └── post-process.sh
    │   │       │   ├── test02-multi-memtier/
    │   │       │   │   └── code.var.sh
    │   │       │   └── topology.var.json
    │   │       ├── redis-secret.yaml.in
    │   │       ├── redis-service.yaml.in
    │   │       ├── redis.yaml.in
    │   │       ├── stress-ng-benchmark.yaml.in
    │   │       └── stress-ng.yaml.in
    │   ├── besteffort.yaml.in
    │   ├── blockio.test-suite/
    │   │   ├── blockio/
    │   │   │   └── n4c16/
    │   │   │       ├── test00-slowreader/
    │   │   │       │   └── code.var.sh
    │   │   │       ├── topology.var.json
    │   │   │       └── vm-files/
    │   │   │           └── etc/
    │   │   │               ├── containers/
    │   │   │               │   └── blockio.yaml
    │   │   │               └── crio/
    │   │   │                   └── crio.conf.d/
    │   │   │                       └── 55-blockio
    │   │   ├── containerd_src.var.in.sh
    │   │   ├── crio_src.var.in.sh
    │   │   ├── k8scri.var.in.sh
    │   │   └── omit_cri_resmgr.var.sh
    │   ├── burstable.yaml.in
    │   ├── cri-resmgr-topology-aware.cfg
    │   ├── guaranteed.yaml.in
    │   ├── packages.test-suite/
    │   │   ├── debian-11/
    │   │   │   ├── binsrc.var
    │   │   │   ├── cri-resmgr.cfg
    │   │   │   ├── distro.var
    │   │   │   ├── pkgtest/
    │   │   │   │   ├── test01-systemd/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   └── topology.var.json
    │   │   │   └── reinstall_cri_resmgr.var
    │   │   ├── debian-12/
    │   │   │   ├── binsrc.var
    │   │   │   ├── cri-resmgr.cfg
    │   │   │   ├── distro.var
    │   │   │   ├── pkgtest/
    │   │   │   │   ├── test01-systemd/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   └── topology.var.json
    │   │   │   └── reinstall_cri_resmgr.var
    │   │   ├── debian-sid/
    │   │   │   ├── binsrc.var
    │   │   │   ├── cri-resmgr.cfg
    │   │   │   ├── distro.var
    │   │   │   ├── pkgtest/
    │   │   │   │   ├── test01-systemd/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   └── topology.var.json
    │   │   │   └── reinstall_cri_resmgr.var
    │   │   ├── fedora/
    │   │   │   ├── binsrc.var
    │   │   │   ├── cri-resmgr.cfg
    │   │   │   ├── distro.var
    │   │   │   ├── pkgtest/
    │   │   │   │   ├── test01-systemd/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   └── topology.var.json
    │   │   │   └── reinstall_cri_resmgr.var
    │   │   ├── opensuse-15.6/
    │   │   │   ├── binsrc.var
    │   │   │   ├── cri-resmgr.cfg
    │   │   │   ├── distro.var
    │   │   │   ├── pkgtest/
    │   │   │   │   ├── test01-systemd/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   └── topology.var.json
    │   │   │   └── reinstall_cri_resmgr.var
    │   │   ├── ubuntu-18.04/
    │   │   │   ├── binsrc.var
    │   │   │   ├── cri-resmgr.cfg
    │   │   │   ├── distro.var
    │   │   │   ├── pkgtest/
    │   │   │   │   ├── test01-systemd/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   └── topology.var.json
    │   │   │   └── reinstall_cri_resmgr.var
    │   │   ├── ubuntu-20.04/
    │   │   │   ├── binsrc.var
    │   │   │   ├── cri-resmgr.cfg
    │   │   │   ├── distro.var
    │   │   │   ├── pkgtest/
    │   │   │   │   ├── test01-systemd/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   └── topology.var.json
    │   │   │   └── reinstall_cri_resmgr.var
    │   │   ├── ubuntu-22.04/
    │   │   │   ├── binsrc.var
    │   │   │   ├── cri-resmgr.cfg
    │   │   │   ├── distro.var
    │   │   │   ├── pkgtest/
    │   │   │   │   ├── test01-systemd/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   └── topology.var.json
    │   │   │   └── reinstall_cri_resmgr.var
    │   │   └── ubuntu-24.04/
    │   │       ├── binsrc.var
    │   │       ├── cri-resmgr.cfg
    │   │       ├── distro.var
    │   │       ├── pkgtest/
    │   │       │   ├── test01-systemd/
    │   │       │   │   └── code.var.sh
    │   │       │   └── topology.var.json
    │   │       └── reinstall_cri_resmgr.var
    │   ├── policies.test-suite/
    │   │   ├── balloons/
    │   │   │   ├── balloons-busybox.yaml.in
    │   │   │   ├── balloons-configmap.yaml.in
    │   │   │   ├── cri-resmgr.cfg
    │   │   │   ├── n4c16/
    │   │   │   │   ├── test01-basic-placement/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test02-prometheus-metrics/
    │   │   │   │   │   ├── balloons-metrics.cfg
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test03-reserved/
    │   │   │   │   │   ├── balloons-reserved.cfg
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test05-namespace/
    │   │   │   │   │   ├── balloons-namespace.cfg
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test06-update-configmap/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test07-maxballoons/
    │   │   │   │   │   ├── balloons-maxballoons-impossible.cfg
    │   │   │   │   │   ├── balloons-maxballoons.cfg
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test08-numa/
    │   │   │   │   │   ├── balloons-numa.cfg
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test09-isolated/
    │   │   │   │   │   ├── balloons-isolated.cfg
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test10-allocator-opts/
    │   │   │   │   │   ├── balloons-allocator-opts.cfg
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   └── topology.var.json
    │   │   │   ├── n4c32/
    │   │   │   │   ├── test01-dynamic-baloons/
    │   │   │   │   │   ├── balloons-dynamic.cfg
    │   │   │   │   │   ├── code.var.sh
    │   │   │   │   │   └── multicontainerpod.yaml.in
    │   │   │   │   └── topology.var.json
    │   │   │   └── verify.source.sh
    │   │   ├── check-correct-policy.source.sh
    │   │   ├── dynamic-pools/
    │   │   │   ├── cri-resmgr.cfg
    │   │   │   ├── dyp-busybox.yaml.in
    │   │   │   ├── dyp-configmap.yaml.in
    │   │   │   ├── n4c16/
    │   │   │   │   ├── test01-basic-placement/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test02-prometheus-metrics/
    │   │   │   │   │   ├── code.var.sh
    │   │   │   │   │   └── dyp-metrics.cfg
    │   │   │   │   ├── test03-rebalancing/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test04-reserved/
    │   │   │   │   │   ├── code.var.sh
    │   │   │   │   │   └── dyp-reserved.cfg
    │   │   │   │   ├── test05-namespace/
    │   │   │   │   │   ├── code.var.sh
    │   │   │   │   │   └── dyp-namespace.cfg
    │   │   │   │   ├── test06-update-configmap/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test07-numa/
    │   │   │   │   │   ├── code.var.sh
    │   │   │   │   │   └── dyp-numa.cfg
    │   │   │   │   └── topology.var.json
    │   │   │   └── verify.source.sh
    │   │   ├── podpools/
    │   │   │   ├── cri-resmgr.cfg
    │   │   │   ├── n4c16/
    │   │   │   │   ├── podpools-configmap.yaml.in
    │   │   │   │   ├── py_consts.var.py
    │   │   │   │   ├── test01-basic-placement/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test02-fill-order/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test03-qos/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test04-overbook-cpus/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test05-agent-updates-config/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test06-prometheus-metrics/
    │   │   │   │   │   ├── code.var.sh
    │   │   │   │   │   └── podpools-metrics.cfg
    │   │   │   │   ├── test07-custom-default-pool/
    │   │   │   │   │   ├── code.var.sh
    │   │   │   │   │   └── podpools-custom-default.cfg
    │   │   │   │   └── topology.var.json
    │   │   │   └── podpools-busybox.yaml.in
    │   │   ├── static-pools/
    │   │   │   ├── README.txt
    │   │   │   ├── cmk-exclusive.yaml.in
    │   │   │   ├── cmk-isolate.yaml.in
    │   │   │   ├── cmk-tolerating-guaranteed.yaml.in
    │   │   │   ├── cri-resmgr.cfg
    │   │   │   ├── n4c16/
    │   │   │   │   ├── cri-resmgr-static-pools.cfg
    │   │   │   │   ├── py_consts.var.py
    │   │   │   │   ├── test00-node-status/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test01-exclusive-pods/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test02-pods-without-cmk/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test03-cmk-isolate/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test04-cmk-isolate-noaffinity/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test05-negative-tests/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── test99-cleanup/
    │   │   │   │   │   └── code.var.sh
    │   │   │   │   ├── topology.var.json
    │   │   │   │   └── vm-files/
    │   │   │   │       └── etc/
    │   │   │   │           └── cmk/
    │   │   │   │               └── pools.conf
    │   │   │   └── static-pools-lib.source.sh
    │   │   └── topology-aware/
    │   │       ├── c4pmem4/
    │   │       │   ├── test01-pmem-node-assigning/
    │   │       │   │   └── code.var.sh
    │   │       │   ├── test02-annotation-memory-type/
    │   │       │   │   ├── code.var.sh
    │   │       │   │   └── memtype-guaranteed.yaml.in
    │   │       │   ├── test02-annotation-memory-type-deprecated-syntax/
    │   │       │   │   ├── code.var.sh
    │   │       │   │   └── memtype-guaranteed.yaml.in
    │   │       │   ├── test03-coldstart/
    │   │       │   │   ├── bb-coldstart.yaml.in
    │   │       │   │   └── code.var.sh
    │   │       │   ├── test03-coldstart-deprecated-syntax/
    │   │       │   │   ├── bb-coldstart.yaml.in
    │   │       │   │   └── code.var.sh
    │   │       │   ├── test04-dynamic-page-demotion/
    │   │       │   │   ├── bb-memload.yaml.in
    │   │       │   │   ├── code.var.sh
    │   │       │   │   └── cri-resmgr-dynamic-page-demotion.cfg
    │   │       │   ├── test04-dynamic-page-demotion-deprecated-syntax/
    │   │       │   │   ├── bb-memload.yaml.in
    │   │       │   │   ├── code.var.sh
    │   │       │   │   └── cri-resmgr-dynamic-page-demotion.cfg
    │   │       │   ├── test05-guarantee-memory/
    │   │       │   │   └── code.var.sh
    │   │       │   └── topology.var.json
    │   │       ├── cri-resmgr.cfg
    │   │       └── n4c16/
    │   │           ├── test00-basic-placement/
    │   │           │   ├── code.var.sh
    │   │           │   └── cri-resmgr.cfg.in
    │   │           ├── test01-always-fits/
    │   │           │   └── code.var.sh
    │   │           ├── test02-shrink-and-grow-shared/
    │   │           │   └── code.var.sh
    │   │           ├── test03-simple-affinity/
    │   │           │   ├── code.var.sh
    │   │           │   └── guaranteed+affinity.yaml.in
    │   │           ├── test04-available-resources/
    │   │           │   ├── code.var.sh
    │   │           │   └── cri-resmgr-available-resources.cfg.in
    │   │           ├── test05-reserved-resources/
    │   │           │   ├── code.var.sh
    │   │           │   └── cri-resmgr-reserved.cfg.in
    │   │           ├── test06-fuzz/
    │   │           │   ├── code.var.sh
    │   │           │   ├── codelib.sh
    │   │           │   ├── fuzz.aal
    │   │           │   ├── fuzz.fmbt.conf
    │   │           │   └── generate.sh
    │   │           ├── test07-mixed-allocations/
    │   │           │   ├── code.var.sh
    │   │           │   └── guaranteed-annotated.yaml.in
    │   │           ├── test08-isolcpus/
    │   │           │   ├── code.var.sh
    │   │           │   └── guaranteed-annotated.yaml.in
    │   │           ├── test09-container-exit/
    │   │           │   └── code.var.sh
    │   │           ├── test10-additional-reserved-namespaces/
    │   │           │   ├── code.var.sh
    │   │           │   └── cri-resmgr-reserved-namespaces.cfg.in
    │   │           ├── test11-reserved-cpu-annotations/
    │   │           │   ├── code.var.sh
    │   │           │   ├── cri-resmgr-reserved-annotations.cfg.in
    │   │           │   └── reserved-annotated.yaml.in
    │   │           └── topology.var.json
    │   ├── run.sh
    │   ├── run_all_configurations.sh
    │   └── run_tests.sh
    └── functional/
        ├── e2e_test.go
        └── fake_cri_server_test.go

================================================
FILE CONTENTS
================================================

================================================
FILE: .githooks/pre-commit.d/00-gofmt
================================================
#!/bin/bash
# Copyright 2012 The Go Authors. All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.

# git gofmt pre-commit hook
#
# To use, store as .git/hooks/pre-commit inside your repository and make sure
# it has execute permissions.
#
# This script does not handle file names that contain spaces.

if [ -z "$(command -v gofmt)" ]; then
    echo >&2 "WARNING: Cannot check/enforce Go code formatting: can't find gofmt."
    echo >&2 "WARNING: Please consider installing gofmt."
    exit 0
fi

gofiles=$(git diff --cached --name-only --diff-filter=ACM | grep '\.go$')
[ -z "$gofiles" ] && exit 0

# shellcheck disable=SC2086
unformatted=$(gofmt -l $gofiles)
[ -z "$unformatted" ] && exit 0

# Some files are not gofmt'd. Print message and fail.
echo >&2 "Go files must be formatted with gofmt. Please run:"
for fn in $unformatted; do
    echo >&2 "  gofmt -w $PWD/$fn"
done
exit 1

================================================
FILE: .githooks/pre-commit.d/10-shellcheck
================================================
#!/bin/bash
# git shellcheck pre-commit hook
#
# To use, store as .git/hooks/pre-commit/shellcheck inside your repository
# and make sure it has execute permissions.
#
# This script does not handle file names that contain spaces.
#

if [ -z "$(command -v shellcheck)" ]; then
    echo >&2 "WARNING: Cannot shellcheck scripts: can't find shellcheck."
    echo >&2 "WARNING: Please consider installing shellcheck."
    exit 0
fi

shfiles=$(git diff --cached --name-only --diff-filter=ACM -- '*.sh' '*.bash')
#echo >&2 "[$0: shfiles: $shfiles]"
for f in $(git diff --cached --name-only --diff-filter=ACM); do
    if grep -EHn '^#!/bin/.*sh *' "$f" | grep -q ':1:#!'; then
        shfiles="$shfiles $f"
    fi
done
shfiles="$(echo "$shfiles" | tr -s '\t ' '\n' | sort | uniq)"
#echo >&2 "[$0: shfiles: $shfiles]"

# shellcheck disable=SC2086
if [ -z "$shfiles" ] || shellcheck $shfiles; then
    exit 0
fi

# Some files do not pass ShellCheck. Print message and fail.
echo >&2 "shell scripts must pass ShellCheck. Please fix them."
exit 1

================================================
FILE: .githooks/pre-commit.d/20-go-version
================================================
#!/bin/bash

WORKFLOWS=".github/workflows/verify.yml"

if git diff --cached go.mod | grep -q '^+go '; then
    gomod=$(go list -m -f '{{.GoVersion}}')
else
    exit 0
fi

status=0
for wf in $WORKFLOWS; do
    workflow=$(grep 'go-version:' $wf | sed 's/^.*: //')
    if [ "$gomod" != "$workflow" ]; then
        echo >&2 "ERROR: inconsistent golang versions, $gomod in go.mod but $workflow in $wf..."
        status=1
    fi
done

if [ "$status" != 0 ]; then
    echo >&2 "Please consider fixing these inconsistencies before committing..."
fi

exit $status

================================================
FILE: .githooks/run-hooks
================================================
#!/bin/bash

type=${0##*/}
hdir=$0.d
orig=${0%/*}/../.git/hooks/$type

exec 1>&2

for hlet in "$hdir"/???*; do
    case $hlet in
        *~|*.swp) continue ;;
        [0-9][0-9]-*) ;;
    esac
    if [ ! -x "$hlet" ]; then
        continue
    fi
    echo ""
    $hlet
    r=$?
    if [ $r != 0 ]; then
        exit $r
    fi
done

if [ -x "$orig" ]; then
    echo ""
    $orig
    exit $?
fi
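The dispatcher derives the hook type and hooklet directory from $0, so one plausible way to wire it up (an assumption inferred from the script, not documented in this tree) is a pre-commit entry next to pre-commit.d:

# Sketch: enable the hooklets via git's core.hooksPath (git >= 2.9).
# run-hooks resolves type=${0##*/} and hdir=$0.d, so invoking it as
# .githooks/pre-commit makes it run every .githooks/pre-commit.d/NN-* hooklet,
# then chain to any original .git/hooks/pre-commit.
git config core.hooksPath .githooks
ln -s run-hooks .githooks/pre-commit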
================================================
FILE: .github/ISSUE_TEMPLATE/bug_report.md
================================================
---
name: Bug report
about: Create a report to help us improve
title: ''
labels: ''
assignees: ''

---

**Describe the bug**

**Expected behavior**

**To Reproduce**

**Environment**

**Additional context**

================================================
FILE: .github/ISSUE_TEMPLATE/feature_request.md
================================================
---
name: Feature request
about: Suggest an idea for this project
title: ''
labels: ''
assignees: ''

---

**Describe the solution you'd like**

**Why this is needed**

================================================
FILE: .github/ISSUE_TEMPLATE/new-release.md
================================================
---
name: New release
about: Propose a new release
title: Release v0.0.0
labels: ''
assignees: ''

---

## Release Process

- [ ] In the issue description, add a changelog section, describing changes since the last release.
- Local release preparations
  - [ ] Perform mandatory internal release checks and preparations.
  - [ ] Run `make release-tests` to run an extended set of tests prior to a release.
  - [ ] Sync/tidy up dependencies.
    - [ ] Run `go mod tidy`.
    - [ ] Run `git commit -m 'go.mod,go.sum: update dependencies.' go.{mod,sum}`, if necessary.
  - [ ] Run `git tag -a -m "CRI Resource Manager release $VERSION" $VERSION`.
- Publishing
  - [ ] Push the tag with `git push $VERSION`. This will automatically build container images and release assets and upload the release assets to a new draft release.
  - [ ] Check that release assets were created for the tag
    - Container images are published
      - https://hub.docker.com/r/intel/cri-resmgr-agent/tags
      - https://hub.docker.com/r/intel/cri-resmgr-webhook/tags
    - Release assets are uploaded to the draft release
      - RPM packages
      - DEB package
      - Binary tarball
      - Source+dependencies tarball (vendored dist)
  - [ ] Update the automatically created draft release corresponding to the tag.
    - [ ] Write the change log to the release.
    - [ ] Mark the release as a non-production pre-release if necessary.
    - [ ] Save as draft.
  - [ ] Get the change log OK'd by other maintainers.
  - [ ] Publish the draft as a release.
- [ ] Add a link to the tagged release in this issue.
- [ ] Close this issue.

## Changelog

### Major changes

### Detailed changelog

================================================
FILE: .github/workflows/common-build-docs.yaml
================================================
name: Build documentation

on:
  workflow_call:
    inputs:
      publish:
        default: false
        required: false
        type: boolean

permissions:
  contents: read

jobs:
  update-gh-pages:
    runs-on: ubuntu-22.04
    permissions:
      contents: write
    steps:
      - uses: actions/checkout@v4

      - name: Fetch gh-pages
        run: git fetch --no-tags --prune --depth=1 origin refs/heads/gh-pages:refs/heads/gh-pages

      - name: Install build dependencies
        run: |
          pip3 install --user -r docs/requirements.txt
          echo "`python3 -m site --user-base`/bin" >> $GITHUB_PATH

      - name: Add docs from this revision to gh-pages
        run: |
          git config user.name "Github"
          git config user.email "no-reply@github.com"
          ./scripts/build/update-gh-pages.sh

      - name: Publish gh-pages
        if: ${{ inputs.publish }}
        shell: bash
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          git push https://${GITHUB_ACTOR}:${GITHUB_TOKEN}@github.com/${GITHUB_REPOSITORY}.git gh-pages
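The update-gh-pages job can be approximated locally; a rough sketch using only commands the workflow itself runs:

# Approximate the docs build locally (a sketch, not a supported entry point).
git fetch --no-tags --prune --depth=1 origin refs/heads/gh-pages:refs/heads/gh-pages
pip3 install --user -r docs/requirements.txt
export PATH="$(python3 -m site --user-base)/bin:$PATH"
./scripts/build/update-gh-pages.sh   # commits the rebuilt docs onto the local gh-pages branch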
================================================
FILE: .github/workflows/common-build-images.yaml
================================================
name: Build container images

on:
  workflow_call:
    inputs:
      image-tag:
        default: ${{ github.ref_name }}
        required: false
        type: string
      publish:
        default: false
        required: false
        type: boolean
      github-environment:
        default: null
        required: false
        type: string

permissions:
  contents: read

jobs:
  build-images:
    name: Build and publish container images
    runs-on: ubuntu-22.04
    environment: ${{ inputs.github-environment }}
    env:
      IMAGE_REPO: intel
      IMAGE_VERSION: ${{ inputs.image-tag }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Build images
        run: "make images IMAGE_VERSION=${IMAGE_VERSION} Q="

      - name: Login to Docker Hub
        if: ${{ inputs.publish }}
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}

      - name: Push images
        if: ${{ inputs.publish }}
        run: "make images-push IMAGE_VERSION=${IMAGE_VERSION} Q="

================================================
FILE: .github/workflows/common-codeql.yaml
================================================
name: CodeQL scanning

on:
  workflow_call:
    inputs:
      export-report:
        default: false
        required: false
        type: boolean

permissions:
  contents: read

jobs:
  codeql-scan:
    runs-on: ubuntu-22.04
    permissions:
      security-events: write
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version-file: go.mod

      - name: Initialize CodeQL
        uses: github/codeql-action/init@v3
        with:
          languages: go

      - name: Perform CodeQL Analysis
        uses: github/codeql-action/analyze@v3

      - name: Generate CodeQL Security Report
        if: ${{ inputs.export-report }}
        uses: rsdmike/github-security-report-action@v3.0.4
        with:
          template: report
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Upload PDF report as an artifact
        if: ${{ inputs.export-report }}
        uses: actions/upload-artifact@v4
        with:
          name: codeql-report
          path: report.pdf

================================================
FILE: .github/workflows/common-trivy.yaml
================================================
name: Trivy scanning

on:
  workflow_call:
    inputs:
      upload-to-github-security-tab:
        default: false
        required: false
        type: boolean
      export-csv:
        default: false
        required: false
        type: boolean

permissions:
  contents: read

jobs:
  trivy-scan-licenses:
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Run Trivy in fs mode
        uses: aquasecurity/trivy-action@master
        with:
          scan-type: fs
          scan-ref: .
          exit-code: 1
          scanners: license
          severity: "UNKNOWN,MEDIUM,HIGH,CRITICAL"

  trivy-scan-vulns:
    runs-on: ubuntu-22.04
    permissions:
      security-events: write
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Run Trivy in fs mode
        continue-on-error: true
        uses: aquasecurity/trivy-action@master
        with:
          scan-type: fs
          scan-ref: .
          exit-code: 1
          list-all-pkgs: true
          format: json
          output: trivy-report.json

      - name: Show report in human-readable format
        uses: aquasecurity/trivy-action@master
        with:
          scan-type: convert
          vuln-type: ''
          severity: ''
          image-ref: trivy-report.json
          format: table

      - name: Convert report to sarif
        if: ${{ inputs.upload-to-github-security-tab }}
        uses: aquasecurity/trivy-action@master
        with:
          scan-type: convert
          vuln-type: ''
          severity: ''
          image-ref: trivy-report.json
          format: sarif
          output: trivy-report.sarif

      - name: Upload sarif report to GitHub Security tab
        if: ${{ inputs.upload-to-github-security-tab }}
        uses: github/codeql-action/upload-sarif@v3
        with:
          sarif_file: trivy-report.sarif

      - name: Convert report to csv
        if: ${{ inputs.export-csv }}
        uses: aquasecurity/trivy-action@master
        with:
          scan-type: convert
          vuln-type: ''
          severity: ''
          image-ref: trivy-report.json
          format: template
          template: "@.github/workflows/trivy-csv.tpl"
          output: trivy-report.csv

      - name: Upload CSV report as an artifact
        if: ${{ inputs.export-csv }}
        uses: actions/upload-artifact@v4
        with:
          name: trivy-report
          path: trivy-report.csv

================================================
FILE: .github/workflows/common-verify-code.yaml
================================================
name: Verify code

on:
  - workflow_call

permissions:
  contents: read

jobs:
  build-and-test:
    runs-on: ubuntu-22.04
    steps:
      - name: Check out code
        uses: actions/checkout@v4

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version-file: go.mod
        id: go

      - name: Install golangci-lint
        run: curl -sfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(go env GOPATH)/bin v1.64.7

      - name: Gofmt
        run: make format

      - name: Build
        run: make

      - name: Test
        run: make test

      - name: Golangci-lint
        run: |
          export PATH=$PATH:$(go env GOPATH)/bin
          make golangci-lint

      - name: Codecov report
        run: bash <(curl -s https://codecov.io/bash)

  trivy-scan:
    uses: "./.github/workflows/common-trivy.yaml"
    permissions:
      contents: read
      security-events: write
    with:
      upload-to-github-security-tab: true

  codeql-scan:
    uses: "./.github/workflows/common-codeql.yaml"
    permissions:
      contents: read
      security-events: write
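The same gate can be run locally before opening a PR; a sketch built from the exact commands of the build-and-test job above:

# Rough local equivalent of the build-and-test job.
curl -sfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | \
    sh -s -- -b "$(go env GOPATH)/bin" v1.64.7
make format && make && make test
PATH="$PATH:$(go env GOPATH)/bin" make golangci-lint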
================================================
FILE: .github/workflows/publish-devel-images.yaml
================================================
name: Build and publish devel container images

on:
  push:
    branches: ["master"]

permissions:
  contents: read

concurrency:
  group: ${{ github.workflow }}-${{ github.ref_name }}
  cancel-in-progress: true

jobs:
  trivy-scan:
    uses: "./.github/workflows/common-trivy.yaml"
    permissions:
      contents: read
      security-events: write

  publish-images:
    uses: "./.github/workflows/common-build-images.yaml"
    needs: [trivy-scan]
    secrets: inherit
    with:
      publish: true
      image-tag: "devel"
      github-environment: "staging"

================================================
FILE: .github/workflows/publish-docs.yml
================================================
name: Publish documentation

on:
  push:
    branches:
      - master
      - release-*
    # Path filters are ignored for tags
    paths:
      - "docs/**"
      - "Makefile"
    tags:
      - v*

permissions:
  contents: read

concurrency:
  group: ${{ github.workflow }}
  cancel-in-progress: false

jobs:
  update-gh-pages:
    uses: "./.github/workflows/common-build-docs.yaml"
    permissions:
      contents: write
    with:
      publish: true

================================================
FILE: .github/workflows/release.yaml
================================================
name: Build and publish release artifacts

on:
  push:
    tags: [ 'v*' ]

permissions:
  contents: read

concurrency:
  group: ${{ github.workflow }}-${{ github.ref_name }}
  cancel-in-progress: true

jobs:
  trivy-scan:
    uses: "./.github/workflows/common-trivy.yaml"
    permissions:
      contents: read
      security-events: write
    with:
      export-csv: true

  codeql:
    uses: "./.github/workflows/common-codeql.yaml"
    permissions:
      contents: read
      security-events: write
    with:
      export-report: true

  publish-images:
    uses: "./.github/workflows/common-build-images.yaml"
    needs: [trivy-scan]
    secrets: inherit
    with:
      publish: true
      image-tag: ${{ github.ref_name }}
      github-environment: "release"

  build-packages:
    needs: [trivy-scan]
    permissions:
      contents: write
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Build packages
        run: "make cross-packages Q="

      - name: Build vendored dist tarball
        run: "make vendored-dist Q="

      - name: Upload release assets
        uses: softprops/action-gh-release@v1
        with:
          name: ${{ github.ref_name }}
          draft: true
          append_body: true
          files: |
            packages/release-assets/*
            vendored-cri-resource-manager-*.tar.gz

================================================
FILE: .github/workflows/trivy-csv.tpl
================================================
{{ range . }}
Trivy Vulnerability Scan Results ({{- .Target -}})
VulnerabilityID,Severity,CVSS Score,Title,Library,Vulnerable Version,Fixed Version,Information URL,Triage Information
{{ range .Vulnerabilities }}
{{- .VulnerabilityID }},
{{- .Severity }},
{{- range $key, $value := .CVSS }}
{{- if (eq $key "nvd") }}
{{- .V3Score -}}
{{- end }}
{{- end }},
{{- quote .Title }},
{{- quote .PkgName }},
{{- quote .InstalledVersion }},
{{- quote .FixedVersion }},
{{- .PrimaryURL }}
{{ else -}}
No vulnerabilities found at this time.
{{ end }}
Trivy Dependency Scan Results ({{ .Target }})
ID,Name,Version,Notes
{{ range .Packages -}}
{{- quote .ID }},
{{- quote .Name }},
{{- quote .Version }}
{{ else -}}
No dependencies found at this time.
{{ end }}
{{ end }}
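The template can also be exercised with a local trivy CLI; a sketch assuming trivy's convert subcommand (the workflow drives the same conversion through trivy-action's scan-type: convert):

# Sketch: produce a JSON report, then render it through the CSV template.
trivy fs --list-all-pkgs --format json --output trivy-report.json .
trivy convert --format template --template "@.github/workflows/trivy-csv.tpl" \
    --output trivy-report.csv trivy-report.json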
"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: Makefile ================================================ # We use bashisms in this Makefile. SHELL := /bin/bash # Go compiler/toolchain and extra related binaries we ues/need. GO_PARALLEL := GO_CMD := go GO_BUILD := $(GO_CMD) build $(GO_PARALLEL) GO_GEN := $(GO_CMD) generate -x GO_INSTALL := $(GO_CMD) install GO_FMT := gofmt GO_CYCLO := gocyclo GO_LINT := golint GO_CILINT := golangci-lint GO_VERSION ?= 1.24.1 GOLICENSES_VERSION ?= v1.5.0 # TEST_TAGS is the set of extra build tags passed for tests. # We disable AVX collector for tests by default. TEST_TAGS := noavx,test GO_TEST := $(GO_CMD) test $(GO_PARALLEL) -tags $(TEST_TAGS) GO_VET := $(GO_CMD) vet -tags $(TEST_TAGS) TEST_SETUP := test-setup.sh TEST_CLEANUP := test-cleanup.sh # Disable some golangci_lint checkers for now until we have an more acceptable baseline... GO_CILINT_CHECKERS := -D unused,staticcheck,errcheck,deadcode,structcheck,gosimple,revive -E gofmt GO_CILINT_RUNFLAGS := --build-tags $(TEST_TAGS) # Protoc compiler and protobuf definitions we might need to recompile. 
# Protoc compiler and protobuf definitions we might need to recompile.
PROTOC := $(shell command -v protoc;)
PROTOBUFS = $(shell find cmd pkg -name \*.proto)
PROTOCODE := $(patsubst %.proto,%.pb.go,$(PROTOBUFS))
PROTO_INCLUDE = -I$(PWD):/usr/local/include:/usr/include
PROTO_OPTIONS = --proto_path=. $(PROTO_INCLUDE) \
    --go_opt=paths=source_relative --go_out=. \
    --go-grpc_opt=paths=source_relative --go-grpc_out=.
PROTO_COMPILE = $(PROTOC) $(PROTO_OPTIONS)

# ShellCheck for checking shell scripts.
SHELLCHECK := shellcheck

CLANG := clang
KERNEL_VERSION ?= $(shell uname -r)
KERNEL_HEADERS_DIR ?= /lib/modules/$(KERNEL_VERSION)/source
KERNEL_BUILD_DIR ?= /lib/modules/$(KERNEL_VERSION)/build
# Directory for full kernel sources
KERNEL_SRC_DIR ?= /usr/src/linux

# Binaries and directories for installation.
INSTALL := install
PREFIX ?= /usr
BINDIR ?= $(PREFIX)/bin
UNITDIR ?= $(PREFIX)/lib/systemd/system
DOCDIR ?= $(PREFIX)/share/doc/cri-resource-manager
SYSCONFDIR ?= /etc
CONFIGDIR ?= /etc/cri-resmgr
DEFAULTDIR ?= $(shell \
    [ -d /etc/rpm ] && { echo /etc/sysconfig; exit 0; }; \
    [ -f /etc/debian_version ] && { echo /etc/default; exit 0; }; \
    echo unknown; exit 1)

# Directories (in cmd) with go code we'll want to build and install.
BUILD_DIRS = $(shell find cmd -name \*.go | sed 's:cmd/::g;s:/.*::g' | uniq)
BUILD_BINS = $(foreach dir,$(BUILD_DIRS),bin/$(dir))

# Directories (in cmd) with go code we'll want to create Docker images from.
IMAGE_DIRS = $(shell find cmd -name Dockerfile | sed 's:cmd/::g;s:/.*::g' | uniq)
IMAGE_VERSION := $(shell git describe --dirty 2> /dev/null || echo unknown)
ifdef IMAGE_REPO
    override IMAGE_REPO := $(IMAGE_REPO)/
endif

# List of our active go modules.
GO_LIST_MODULES := $(GO_CMD) list ./... | grep -v vendor/
GO_PKG_SRC = $(shell find pkg -name \*.go)

# List of visualizer collateral files to go generate.
UI_ASSETS := $(shell for i in pkg/cri/resource-manager/visualizer/*; do \
    if [ -d "$$i" -a -e "$$i/assets_generate.go" ]; then \
        echo $$i/assets_gendata.go; \
    fi; \
done)

# Right now we don't depend on libexec/%.o on purpose so make sure the file
# is always up-to-date when elf/avx512.c is changed.
GEN_TARGETS := pkg/avx/programbytes_gendata.go $(PROTOCODE)

# Determine binary version and buildid, and versions for rpm, deb, and tar packages.
BUILD_VERSION := $(shell scripts/build/get-buildid --version --shell=no)
BUILD_BUILDID := $(shell scripts/build/get-buildid --buildid --shell=no)
RPM_VERSION := $(shell scripts/build/get-buildid --rpm --shell=no)
DEB_VERSION := $(shell scripts/build/get-buildid --deb --shell=no)
TAR_VERSION := $(shell scripts/build/get-buildid --tar --shell=no)

# Kubernetes version we pull in as modules and our external API versions.
KUBERNETES_VERSION := $(shell grep 'k8s.io/kubernetes ' go.mod | sed 's/^.* //')
RESMGR_API_VERSION := $(shell ls pkg/apis/resmgr | grep '^v[0-9]*')

# Git (tagged) version and revisions we'll use to linker-tag our binaries with.
RANDOM_ID := "$(shell head -c20 /dev/urandom | od -An -tx1 | tr -d ' \n')"

ifdef STATIC
    STATIC_LDFLAGS:=-extldflags=-static
    BUILD_TAGS:=-tags osusergo,netgo
endif

LDFLAGS = \
    -ldflags "$(STATIC_LDFLAGS) -X=github.com/intel/cri-resource-manager/pkg/version.Version=$(BUILD_VERSION) \
        -X=github.com/intel/cri-resource-manager/pkg/version.Build=$(BUILD_BUILDID) \
        -B 0x$(RANDOM_ID)"

# Build non-optimized version for debugging on make DEBUG=1.
DEBUG ?= 0
ifeq ($(DEBUG),1)
    GCFLAGS=-gcflags "all=-N -l"
else
    GCFLAGS=
endif
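Spelled out, the LDFLAGS above stamp pkg/version at link time; roughly what the bin/% rule further below ends up running for one binary (an illustrative expansion, not an exact transcript):

# Illustrative expansion of a single-binary build with version stamping.
go build \
    -ldflags "-X=github.com/intel/cri-resource-manager/pkg/version.Version=$(scripts/build/get-buildid --version --shell=no) \
        -X=github.com/intel/cri-resource-manager/pkg/version.Build=$(scripts/build/get-buildid --buildid --shell=no)" \
    -o bin/cri-resmgr ./cmd/cri-resmgr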
# Release/end-to-end testing. Specify E2E_TESTS to override the default test set.
E2E_RUN := reinstall_cri_resmgr=1 test/e2e/run_tests.sh

# tar-related commands and options.
TAR := tar
TAR_UPDATE := $(TAR) -uf
GZIP := gzip
GZIP_DC := gzip -dc
GZEXT := .gz

# Metadata for packages, changelog, etc.
USER_NAME ?= $(shell git config user.name)
USER_EMAIL ?= $(shell git config user.email)
BUILD_DATE ?= $(shell date -R)

# RPM spec files we might want to generate.
SPEC_FILES = $(shell find packaging -name \*.spec.in | sed 's/.spec.in/.spec/g' | uniq)

# Systemd collateral.
SYSTEMD_DIRS = $(shell find cmd -name \*.service -o -name \*.socket | sed 's:cmd/::g;s:/.*::g'|uniq)
SYSCONF_DIRS = $(shell find cmd -name \*.sysconf | sed 's:cmd/::g;s:/.*::g' | uniq)

DOCKER := docker
# Extra options to pass to docker (for instance --network host).
DOCKER_OPTIONS =
# Set this to empty to prevent 'docker build' from trying to pull all image refs.
DOCKER_PULL := --pull

# Docker boilerplate/commands to build debian/ubuntu packages.
DOCKER_DEB_BUILD := \
    cd /build && \
    tar -xvf /build/input/cri-resource-manager-$(TAR_VERSION).tar.gz && \
    cd cri-resource-manager-$(TAR_VERSION) && \
    cp -r /build/input/debian . && \
    dpkg-buildpackage -uc && \
    cp ../*.{buildinfo,changes,deb,dsc} /output

# Docker boilerplate/commands to build rpm packages.
DOCKER_RPM_BUILD := \
    mkdir -p ~/rpmbuild/{SOURCES,SPECS} && \
    cp -v /build/input/*.spec ~/rpmbuild/SPECS && \
    cp -v /build/input/*.tar.* ~/rpmbuild/SOURCES && \
    for spec in ~/rpmbuild/SPECS/*.spec; do \
        rpmbuild -bb $$spec; \
    done && \
    cp -v $$(rpm --eval %{_rpmdir}/%{_arch})/*.rpm /output

# Docker boilerplate/commands to build binary tarballs.
DOCKER_TAR_BUILD := \
    cd ~ && \
    $(GZIP_DC) /build/input/cri-resource-manager-$(TAR_VERSION).tar$(GZEXT) | \
        $(TAR) -xf - && \
    cd cri-resource-manager-$(TAR_VERSION) && \
    $(MAKE) OUTPUT=/output/ binary-dist

# Docker boilerplate/commands to build binaries.
DOCKER_BIN_BUILD := \
    mkdir ~/build && cd ~/build && \
    tar -xvzf /build/input/cri-resource-manager-$(TAR_VERSION).tar$(GZEXT) && \
    cd cri-resource-manager-$(TAR_VERSION) && \
    make && \
    cp -v bin/* /output

# Documentation-related variables
SPHINXOPTS ?= -W
SPHINXBUILD = sphinx-build
SITE_BUILDDIR ?= _build

# Docker base command for working with html documentation.
DOCKER_SITE_BUILDER_IMAGE := cri-resmgr-site-builder
DOCKER_SITE_CMD := $(DOCKER) run --rm -v "`pwd`:/docs" --user=`id -u`:`id -g` \
    -p 8081:8081 \
    -e SITE_BUILDDIR=$(SITE_BUILDDIR) -e SPHINXOPTS=$(SPHINXOPTS)

# Supported distros with debian native packaging format.
SUPPORTED_DEB_DISTROS := $(shell \
    grep -l 'apt-get ' dockerfiles/cross-build/Dockerfile.* | \
    egrep -v '((~)|(swp))$$' | \
    sed 's:^.*Dockerfile.::g')

# Supported distros with rpm native packaging format.
SUPPORTED_RPM_DISTROS := $(shell \
    egrep -l '(dnf )|(yum )|(zypper )' dockerfiles/cross-build/Dockerfile.* | \
    egrep -v '((~)|(swp))$$' | \
    sed 's:^.*Dockerfile.::g')

# Directory to leave built distro packages and collateral in.
PACKAGES_DIR := packages
# Directory to leave build distro binaries in.
BINARIES_DIR := binaries
# Directory to use to build distro packages.
BUILD_DIR := build

# dist tarball target name
ifneq ($(wildcard .git/.),)
    DIST_TARGET = dist-git
else
    DIST_TARGET = dist-cwd
endif

# Paths to exclude from tarballs generated by dist-cwd.
DIST_EXCLUDE := \
    --exclude="./$$tarball*" \
    --exclude='./cri-resource-manager-*' \
    --exclude='./$(PACKAGES_DIR)*' \
    --exclude='./$(BUILD_DIR)*'
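The Docker build snippets above are consumed by the cross-packaging targets that follow; typical entry points (make targets as defined below, plus vendored-dist as invoked by release.yaml):

# Typical packaging invocations built on the snippets above.
make packages            # native packages for the host distro
make cross-packages Q=   # rpm + deb + tar for all supported distros (used by release.yaml)
make vendored-dist Q=    # vendored source tarball (also invoked by release.yaml)
make dist                # dist tarball via dist-git or dist-cwd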
# Path name transformations for tarballs generated by dist-cwd.
DIST_TRANSFORM := \
    --transform='s:^.:cri-resource-manager-$(TAR_VERSION):'

# Determine distro ID, version and package type.
DISTRO_ID := $(shell . /etc/os-release; echo "$${ID:-unknown}")
DISTRO_VERSION := $(shell . /etc/os-release; echo "$${VERSION_ID:-unknown}")
DISTRO_PACKAGE := $(shell echo $(DISTRO_ID) | tr -d ' \t' | \
    sed -E 's/.*((fedora)|(suse)).*/rpm/;s/.*((ubuntu)|(debian)).*/deb/')

# Be quiet by default but let folks override it with Q= or V=1 on the command line.
ifneq ($(V),1)
    Q := @
endif

# Default target: just build everything.
all: build

#
# Generic targets: build, install, clean, build images.
#

build: $(BUILD_BINS)

build-static:
	$(MAKE) STATIC=1 build

install: $(BUILD_BINS) $(foreach dir,$(BUILD_DIRS),install-bin-$(dir)) \
    $(foreach dir,$(BUILD_DIRS),install-systemd-$(dir)) \
    $(foreach dir,$(BUILD_DIRS),install-sysconf-$(dir)) \
    $(foreach dir,$(BUILD_DIRS),install-config-$(dir))

clean: clean-bin clean-spec clean-deb clean-ui-assets clean-html

images: $(foreach dir,$(IMAGE_DIRS),image-$(dir))

images-push: $(foreach dir,$(IMAGE_DIRS),image-push-$(dir))

#
# Rules for building and installing binaries, or building docker images, and cleaning up.
#

KERNEL_INCLUDE_DIRS = /include \
    /include/uapi \
    /include/generated/uapi \
    /arch/x86/include \
    /arch/x86/include/uapi \
    /arch/x86/include/generated/uapi

KERNEL_INCLUDES := $(strip $(foreach kernel_dir,$(KERNEL_HEADERS_DIR) $(KERNEL_BUILD_DIR),$(addprefix -I,$(wildcard $(addprefix $(kernel_dir),$(KERNEL_INCLUDE_DIRS))))))

libexec/%.o: elf/%.c
	$(Q)if [ -z "$(KERNEL_INCLUDES)" ]; then echo "Cannot build $@: invalid KERNEL_HEADERS_DIR=$(KERNEL_HEADERS_DIR)"; exit 1; fi
	$(Q)echo "Building $@"
	$(Q)mkdir -p libexec
	$(Q)$(CLANG) -nostdinc -D __KERNEL__ $(KERNEL_INCLUDES) -O2 -Wall -target bpf -c $< -o $@

bin/%: .static.%.$(STATIC)
	$(Q)bin=$(notdir $@); src=./cmd/$$bin; \
	echo "Building $$([ -n "$(STATIC)" ] && echo 'static ')$@ (version $(BUILD_VERSION), build $(BUILD_BUILDID))..."; \
	mkdir -p bin && \
	$(GO_BUILD) $(BUILD_TAGS) $(LDFLAGS) $(GCFLAGS) -o bin/ $$src

.static.%.$(STATIC):
	$(Q)if [ ! -f "$@" ]; then \
	    touch "$@"; \
	fi; \
	old="$@"; old="$${old%.*}"; \
	if [ -n "$(STATIC)" ]; then \
	    rm -f "$$old."; \
	else \
	    rm -f "$$old.1"; \
	fi

.PRECIOUS: $(foreach dir,$(BUILD_DIRS),.static.$(dir).1 .static.$(dir).)
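Common invocations of the build machinery above, per the variables the rules honor (the .static.% marker files exist only to force a rebuild when STATIC toggles between runs):

# Common build invocations.
make                          # build all binaries under bin/
make STATIC=1 build           # statically linked binaries (osusergo,netgo build tags)
make DEBUG=1 bin/cri-resmgr   # non-optimized build of one binary for debugging
make V=1 build                # verbose build; V=1 clears the Q=@ command prefix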
install-bin-%: bin/% $(Q)bin=$(patsubst install-bin-%,%,$@); dir=cmd/$$bin; \ echo "Installing $$bin in $(DESTDIR)$(BINDIR)..."; \ $(INSTALL) -d $(DESTDIR)$(BINDIR) && \ $(INSTALL) -m 0755 -t $(DESTDIR)$(BINDIR) bin/$$bin; \ install-systemd-%: $(Q)bin=$(patsubst install-systemd-%,%,$@); dir=cmd/$$bin; \ echo "Installing systemd collateral for $$bin..."; \ $(INSTALL) -d $(DESTDIR)$(UNITDIR) && \ for f in $$(find $$dir -name \*.service -o -name \*.socket); do \ echo " $$f in $(DESTDIR)$(UNITDIR)..."; \ $(INSTALL) -m 0644 -t $(DESTDIR)$(UNITDIR) $$f.in; \ done; \ for f in $$(find $$dir -name \*.service.in -o -name \*.socket.in); do \ echo " $$f in $(DESTDIR)$(UNITDIR)..."; \ df=$${f##*/}; df=$${df%.in}; \ $(INSTALL) -m 0644 -T $$f $(DESTDIR)$(UNITDIR)/$$df; \ sed -E -i -e "s:__DEFAULTDIR__:$(DEFAULTDIR):g" \ -e "s:__BINDIR__:$(BINDIR):g" $(DESTDIR)$(UNITDIR)/$$df; \ done install-sysconf-%: $(Q)bin=$(patsubst install-sysconf-%,%,$@); dir=cmd/$$bin; \ echo "Installing sysconf/default collateral for $$bin..."; \ $(INSTALL) -d $(DESTDIR)$(DEFAULTDIR) && \ for f in $$(find $$dir -name \*.sysconf); do \ echo " $$f in $(DESTDIR)$(DEFAULTDIR)..."; \ df=$${f##*/}; df=$${df%.sysconf}; \ $(INSTALL) -m 0644 -T $$f $(DESTDIR)$(DEFAULTDIR)/$$df; \ done install-config-%: $(Q)bin=$(patsubst install-config-%,%,$@); dir=cmd/$$bin; \ echo "Installing sample configuration collateral for $$bin..."; \ $(INSTALL) -d $(DESTDIR)$(CONFIGDIR) && \ for f in $$(find $$dir -name \*.cfg.sample); do \ echo " $$f in $(DESTDIR)$(CONFIGDIR)..."; \ df=$${f##*/}; \ $(INSTALL) -m 0644 -T $$f $(DESTDIR)$(CONFIGDIR)/$${df}; \ done install-minimal-docs: $(Q)echo "Installing minimal documentation to $(DOCDIR)..."; \ $(INSTALL) -d $(DESTDIR)$(DOCDIR) && \ for f in LICENSE docs/security.md; do \ echo " $$f in $(DESTDIR)$(DOCDIR)..."; \ df=$${f##*/}; \ $(INSTALL) -m 0644 -T $$f $(DESTDIR)$(DOCDIR)/$${df}; \ done install-licenses: $(Q)for cmd in $(BUILD_DIRS); do \ install -D LICENSE $(DESTDIR)/licenses/$$cmd/LICENSE && \ go-licenses save ./cmd/$$cmd \ --ignore github.com/intel/cri-resource-manager \ --save_path $(DESTDIR)/licenses/$$cmd/go-licenses; \ done clean-bin: $(foreach dir,$(BUILD_DIRS),clean-$(dir)) $(Q)rm -f .static.* clean-%: $(Q)bin=$(patsubst clean-%,%,$@); src=cmd/$$bin; \ echo "Cleaning up $$bin..."; \ rm -f bin/$$bin clean-gen: $(Q)rm -f $(GEN_TARGETS) image-%: $(Q)bin=$(patsubst image-%,%,$@); \ $(DOCKER) build . -f "cmd/$$bin/Dockerfile" \ --build-arg GO_VERSION=$(GO_VERSION) \ --build-arg GOLICENSES_VERSION=$(GOLICENSES_VERSION) \ -t $(IMAGE_REPO)$$bin:$(IMAGE_VERSION) image-push-%: $(Q)bin=$(patsubst image-push-%,%,$@); \ if [ -z "$(IMAGE_REPO)" ]; then echo "ERROR: no IMAGE_REPO specified"; exit 1; fi; \ $(DOCKER) push $(IMAGE_REPO)$$bin:$(IMAGE_VERSION) # # Rules for format checking, various code quality and complexity checks and measures. # format: $(Q)report=`$(GO_FMT) -s -d -w $$(find cmd pkg test/functional -name \*.go)`; \ if [ -n "$$report" ]; then \ echo "$$report"; \ exit 1; \ fi vet: $(Q)$(GO_VET) $(shell $(GO_LIST_MODULES)) cyclomatic-check: $(Q)report=`$(GO_CYCLO) -over 15 cmd pkg`; \ if [ -n "$$report" ]; then \ echo "Complexity is over 15 in"; \ echo "$$report"; \ exit 1; \ fi lint: $(Q)rc=0; \ for f in $$(find -name \*.go | grep -v \.\/vendor); do \ $(GO_LINT) -set_exit_status $$f || rc=1; \ done; \ exit $$rc golangci-lint: $(Q)$(GO_CILINT) run $(GO_CILINT_RUNFLAGS) $(GO_CILINT_CHECKERS) shellcheck: $(Q)for f in $$(git grep -n '^#!/bin/.*sh *' | grep ':1:#!' 
| sed 's/:1:.*//'); do \ echo "shellchecking $$f..."; \ $(SHELLCHECK) $$f; \ done # # Rules for running unit/module tests. # test: test-setup test-run test-cleanup race-test racetest: test-setup racetest-run test-cleanup test-setup: $(Q)for i in $$(find . -name $(TEST_SETUP)); do \ echo "+ Running test setup $$i..."; \ (cd $${i%/*}; \ if [ -x "$(TEST_SETUP)" ]; then \ ./$(TEST_SETUP); \ fi); \ done test-cleanup: $(Q)for i in $$(find . -name $(TEST_CLEANUP)); do \ echo "- Running test cleanup $$i..."; \ (cd $${i%/*}; \ if [ -x "$(TEST_CLEANUP)" ]; then \ ./$(TEST_CLEANUP); \ fi); \ done test-run: ifndef WHAT $(Q)$(GO_TEST) -race -coverprofile=coverage.txt -covermode=atomic \ $(shell $(GO_LIST_MODULES)) else $(Q)if [ -n '$(TESTS)' ]; then \ run="-run $(TESTS)"; \ fi; \ cd $(WHAT) && \ $(GO_TEST) $$run -v -cover -coverprofile cover.out || rc=1; \ $(GO_CMD) tool cover -html=cover.out -o coverage.html; \ rm cover.out; \ echo "Coverage report: file://$$(realpath coverage.html)"; \ exit $$rc endif racetest-run: ifndef WHAT $(Q)$(GO_TEST) -race -coverprofile=coverage.txt -covermode=atomic \ $(shell $(GO_LIST_MODULES)) else $(Q)cd $(WHAT) && \ $(GO_TEST) -race -coverprofile=cover.out -covermode=atomic || rc=1; \ $(GO_CMD) tool cover -html=cover.out -o coverage.html; \ rm cover.out; \ echo "Coverage report: file://$$(realpath coverage.html)"; \ exit $$rc endif release-tests: e2e-tests e2e-tests: build-static $(Q)tests="$(if $(E2E_TESTS),$(E2E_TESTS),test/e2e/policies.test-suite)"; \ $(E2E_RUN) $$tests; \ if [ "$$?" != "0" ]; then \ echo "You will drop into interactive mode on failures if you run the e2e tests as"; \ echo " on_verify_fail=interactive $(E2E_RUN) $$tests"; \ exit 1; \ fi packaging-tests: cross-packages $(Q)cleanup=1 omit_agent=1 $(E2E_RUN) test/e2e/packages.test-suite # # Rules for building distro packages. # ifneq ($(DISTRO_ID),fedora) packages: cross-$(DISTRO_PACKAGE).$(DISTRO_ID)-$(DISTRO_VERSION) else packages: cross-$(DISTRO_PACKAGE).$(DISTRO_ID) endif cross-packages: cross-rpm cross-deb cross-tar cross-rpm: $(foreach d,$(SUPPORTED_RPM_DISTROS),cross-rpm.$(d)) cross-deb: $(foreach d,$(SUPPORTED_DEB_DISTROS),cross-deb.$(d)) cross-bin: $(foreach d,$(SUPPORTED_RPM_DISTROS),cross-bin.$(d)) \ $(foreach d,$(SUPPORTED_DEB_DISTROS),cross-bin.$(d)) # # Rules for building dist-tarballs, rpm, and deb packages. # dist: $(DIST_TARGET) dist-git: $(Q)echo "Using git to create dist tarball $(TAR_VERSION) from $(BUILD_BUILDID)..."; \ tardir=cri-resource-manager-$(TAR_VERSION) && \ tarball=cri-resource-manager-$(TAR_VERSION).tar && \ git archive --format=tar --prefix=$$tardir/ HEAD > $$tarball && \ mkdir -p $$tardir && \ echo $(BUILD_VERSION) > $$tardir/version && \ echo $(BUILD_BUILDID) > $$tardir/buildid && \ $(TAR) -uf $$tarball $$tardir && \ rm -f $$tarball.* && \ $(GZIP) $$tarball && \ rm -fr $$tardir dist-cwd: $(Q)echo "Using tar to create dist tarball $(TAR_VERSION) from $$(pwd)..."; \ tardir=cri-resource-manager-$(TAR_VERSION) && \ tarball=cri-resource-manager-$(TAR_VERSION).tar && \ $(TAR) $(DIST_EXCLUDE) $(DIST_TRANSFORM) -cvf - .
> $$tarball && \ mkdir -p $$tardir && \ echo $(BUILD_VERSION) > $$tardir/version && \ echo $(BUILD_BUILDID) > $$tardir/buildid && \ $(TAR_UPDATE) $$tarball $$tardir && \ rm -f $$tarball.* && \ $(GZIP) $$tarball && \ rm -fr $$tardir vendored-dist: dist $(Q)echo "Creating vendored dist tarball $(TAR_VERSION)..."; \ tardir=cri-resource-manager-$(TAR_VERSION) && \ tarball=cri-resource-manager-$(TAR_VERSION).tar && \ cp $$tarball$(GZEXT) vendored-$$tarball$(GZEXT) && \ $(GZIP_DC) vendored-$$tarball$(GZEXT) | tar -xf - && \ go mod vendor -v && \ mkdir -p $$tardir && \ mv vendor $$tardir && \ rm -f vendored-$$tarball* && \ $(TAR) -cf vendored-$$tarball $$tardir && \ $(GZIP) vendored-$$tarball && \ rm -fr $$tardir binary-dist: $(Q)tarball=$(OUTPUT)cri-resource-manager-$(TAR_VERSION).$$(uname -m).tar; \ echo "Creating binary dist tarball $$tarball..."; \ tardir=binary-dist; \ rm -fr $$tarball* $$tardir && \ $(MAKE) DESTDIR=$$tardir \ BUILD_DIRS=cri-resmgr \ PREFIX=/opt/intel \ DEFAULTDIR=/etc/default \ UNITDIR=$(SYSCONFDIR)/systemd/system install install-minimal-docs && \ $(MAKE) DESTDIR=$$tardir/opt/intel/ install-licenses && \ $(TAR) -C $$tardir -cf $$tarball . && \ $(GZIP) $$tarball && \ rm -fr $$tardir spec: clean-spec $(SPEC_FILES) %.spec: $(Q)echo "Generating RPM spec file $@..."; \ cp $@.in $@ && \ sed -E -i -e "s/__VERSION__/$(RPM_VERSION)/g" \ -e "s/__TARVERSION__/$(TAR_VERSION)/g" \ -e "s/__BUILDID__/$(BUILD_BUILDID)/g" $@ clean-spec: $(Q)rm -f $(SPEC_FILES) cross-rpm.%: docker/cross-build/% clean-spec spec dist $(Q)distro=$(patsubst cross-rpm.%,%,$@); \ builddir=$(BUILD_DIR)/docker/$$distro; \ outdir=$(PACKAGES_DIR)/$$distro; \ echo "Docker cross-building $$distro packages..."; \ mkdir -p $(PACKAGES_DIR)/$$distro && \ rm -fr $$builddir && mkdir -p $$builddir/{input,build} && \ cp cri-resource-manager-$(TAR_VERSION).tar$(GZEXT) $$builddir/input && \ cp packaging/rpm/cri-resource-manager.spec $$builddir/input && \ $(DOCKER) run --rm $(DOCKER_OPTIONS) --user $$USER \ --env USER_NAME="$(USER_NAME)" --env USER_EMAIL=$(USER_EMAIL) \ -v $$(pwd)/$$builddir:/build \ -v $$(pwd)/$$outdir:/output \ -v "`go env GOMODCACHE`:/home/$$USER/go/pkg/mod" \ $$distro-build /bin/bash -c '$(DOCKER_RPM_BUILD)' && \ rm -fr $$builddir && \ install -D -m644 $$outdir/cri-resource-manager-$(RPM_VERSION)-0.x86_64.rpm $(PACKAGES_DIR)/release-assets/cri-resource-manager-$(RPM_VERSION)-0.$$distro.x86_64.rpm src.rpm source-rpm: spec dist mkdir -p ~/rpmbuild/{SOURCES,SPECS} && \ cp packaging/rpm/cri-resource-manager.spec ~/rpmbuild/SPECS && \ cp cri-resource-manager-$(TAR_VERSION).tar$(GZEXT) ~/rpmbuild/SOURCES && \ rpmbuild -bs ~/rpmbuild/SPECS/cri-resource-manager.spec rpm: source-rpm rpmbuild -bb ~/rpmbuild/SPECS/cri-resource-manager.spec debian/%: packaging/deb.in/% $(Q)echo "Generating debian packaging file $@..."; \ tardir=cri-resource-manager-$(TAR_VERSION) && \ tarball=cri-resource-manager-$(TAR_VERSION).tar && \ mkdir -p debian; \ cp $< $@ && \ sed -E -i -e "s/__PACKAGE__/cri-resource-manager/g" \ -e "s/__TARBALL__/$$tarball/g" \ -e "s/__VERSION__/$(DEB_VERSION)/g" \ -e "s/__AUTHOR__/$(USER_NAME)/g" \ -e "s/__EMAIL__/$(USER_EMAIL)/g" \ -e "s/__DATE__/$(BUILD_DATE)/g" $@ clean-deb: $(Q)rm -fr debian cross-deb.%: docker/cross-build/% \ clean-deb debian/changelog debian/control debian/rules debian/compat dist $(Q)distro=$(patsubst cross-deb.%,%,$@); \ echo "Docker cross-building $$distro packages..."; \ builddir=$(BUILD_DIR)/docker/$$distro; \ outdir=$(PACKAGES_DIR)/$$distro; \ mkdir -p $(PACKAGES_DIR)/$$distro && \ 
rm -fr $$builddir && mkdir -p $$builddir/{input,build} && \ cp cri-resource-manager-$(TAR_VERSION).tar$(GZEXT) $$builddir/input && \ cp -r debian $$builddir/input && \ $(DOCKER) run --rm $(DOCKER_OPTIONS) --user $$USER \ --env USER_NAME="$(USER_NAME)" --env USER_EMAIL=$(USER_EMAIL) \ -v $$(pwd)/$$builddir:/build \ -v $$(pwd)/$$outdir:/output \ -v "`go env GOMODCACHE`:/home/$$USER/go/pkg/mod" \ $$distro-build /bin/bash -c '$(DOCKER_DEB_BUILD)' && \ rm -fr $$builddir && \ install -D -m644 $$outdir/cri-resource-manager_$(DEB_VERSION)_amd64.deb $(PACKAGES_DIR)/release-assets/cri-resource-manager_$(DEB_VERSION)_$${distro}_amd64.deb deb: debian/changelog debian/control debian/rules debian/compat dist dpkg-buildpackage -uc cross-bin.%: docker/cross-build/% dist $(Q)distro=$(patsubst cross-bin.%,%,$@); \ echo "Docker cross-building $$distro binaries..."; \ builddir=$(BUILD_DIR)/docker/$$distro; \ outdir=$(BINARIES_DIR)/$$distro; \ mkdir -p $(BINARIES_DIR)/$$distro && \ rm -fr $$builddir && mkdir -p $$builddir/{input,build} && \ cp cri-resource-manager-$(TAR_VERSION).tar$(GZEXT) $$builddir/input && \ $(DOCKER) run --rm $(DOCKER_OPTIONS) --user $$USER \ --env USER_NAME="$(USER_NAME)" --env USER_EMAIL=$(USER_EMAIL) \ -v $$(pwd)/$$builddir:/build \ -v $$(pwd)/$$outdir:/output \ -v "`go env GOMODCACHE`:/home/$$USER/go/pkg/mod" \ $$distro-build /bin/bash -c '$(DOCKER_BIN_BUILD)' && \ rm -fr $$builddir cross-tar cross-tarball: dist docker/cross-build/fedora $(Q)distro=tarball; \ builddir=$(BUILD_DIR)/docker/$$distro; \ outdir=$(PACKAGES_DIR)/$$distro; \ echo "Docker cross-building $$distro packages..."; \ mkdir -p $$outdir && \ rm -fr $$builddir && mkdir -p $$builddir/{input,build} && \ cp cri-resource-manager-$(TAR_VERSION).tar$(GZEXT) $$builddir/input && \ $(DOCKER) run --rm $(DOCKER_OPTIONS) --user $$USER \ --env USER_NAME="$(USER_NAME)" --env USER_EMAIL=$(USER_EMAIL) \ -v $$(pwd)/$$builddir:/build \ -v $$(pwd)/$$outdir:/output \ -v "`go env GOMODCACHE`:/home/$$USER/go/pkg/mod" \ fedora-build /bin/bash -c '$(DOCKER_TAR_BUILD)' && \ rm -fr $$builddir && \ install -D -m644 -t $(PACKAGES_DIR)/release-assets $$outdir/cri-resource-manager-$(TAR_VERSION).x86_64.tar.gz # Build a docker image (for distro cross-building). docker/cross-build/%: dockerfiles/cross-build/Dockerfile.% $(Q)distro=$(patsubst docker/cross-build/%,%,$@) && \ echo "Building cross-build docker image for $$distro..." && \ img=$${distro}-build && $(DOCKER) rm $$distro-build || : && \ scripts/build/docker-build-image $$distro-build \ $(DOCKER_PULL) \ --build-arg GO_VERSION=$(GO_VERSION) \ --build-arg GOLICENSES_VERSION=$(GOLICENSES_VERSION) \ $(DOCKER_OPTIONS) # Rule for recompiling a changed protobuf. %.pb.go: %.proto $(Q)if [ -n "$(PROTOC)" -o ! -e "$@" ]; then \ echo "Generating go code ($@) for updated protobuf $<..."; \ $(PROTO_COMPILE) $<; \ else \ echo "WARNING: no protoc found, compiling with OUTDATED $@..."; \ fi # Rule for installing in-repo git hooks. install-git-hooks: $(Q)if [ -d .git -a ! -e .git-hooks.redirected ]; then \ echo -n "Redirecting git hooks to .githooks..."; \ git config core.hookspath .githooks && \ touch .git-hooks.redirected && \ echo "done."; \ fi # Rules for installing protoc and related utilities. 
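# A usage sketch for the protobuf rules (the exact protoc invocation is up to
# $(PROTO_COMPILE)): after editing pkg/agent/api/v1/api.proto, something like
#
#   make install-protoc-tools
#   make PROTOC=protoc pkg/agent/api/v1/api.pb.go
#
# should regenerate the go code; the %.pb.go pattern rule above only
# recompiles when PROTOC is set or the generated file is missing.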
install-protoc: $(Q)./scripts/hack/install-protobuf install-protoc-gen-go: $(Q)$(GO_INSTALL) google.golang.org/protobuf/cmd/protoc-gen-go@v1.28.0 install-protoc-gen-go-grpc: $(Q)$(GO_INSTALL) google.golang.org/grpc/cmd/protoc-gen-go-grpc@v1.2.0 install-protoc-tools: install-protoc install-protoc-gen-go install-protoc-gen-go-grpc # # go dependencies for our binaries (careful with that axe, Eugene...) # bin/cri-resmgr: $(wildcard cmd/cri-resmgr/*.go) $(UI_ASSETS) $(GEN_TARGETS) \ $(shell for dir in \ $(shell go list -f '{{ join .Deps "\n"}}' ./cmd/cri-resmgr/... | \ grep cri-resource-manager/pkg/ | \ sed 's#github.com/intel/cri-resource-manager/##g'); do \ find $$dir -name \*.go; \ done | sort | uniq) bin/cri-resmgr-agent: $(wildcard cmd/cri-resmgr-agent/*.go) \ $(shell for dir in \ $(shell go list -f '{{ join .Deps "\n"}}' ./cmd/cri-resmgr-agent/... | \ grep cri-resource-manager/pkg/ | \ sed 's#github.com/intel/cri-resource-manager/##g'); do \ find $$dir -name \*.go; \ done | sort | uniq) bin/cri-resmgr-webhook: $(wildcard cmd/cri-resmgr-webhook/*.go) \ $(shell for dir in \ $(shell go list -f '{{ join .Deps "\n"}}' ./cmd/cri-resmgr-webhook/... | \ grep cri-resource-manager/pkg/ | \ sed 's#github.com/intel/cri-resource-manager/##g'); do \ find $$dir -name \*.go; \ done | sort | uniq) # # rules to run go generators # clean-ui-assets: $(Q)echo "Cleaning up generated UI assets..."; \ for i in $(UI_ASSETS); do \ echo " - $$i"; \ rm -f $$i; \ done %_gendata.go:: $(Q)echo "Generating $@..."; \ cd $(dir $@) && \ $(GO_GEN) || exit 1 && \ cd - > /dev/null pkg/sysfs/sst_types%.go: pkg/sysfs/_sst_types%.go pkg/sysfs/gen_sst_types.sh $(Q)cd $(@D) && \ KERNEL_SRC_DIR=$(KERNEL_SRC_DIR) $(GO_GEN) # # API generation # # unconditionally generate all apis generate-apis: generate-resmgr-api # unconditionally generate (external) resmgr api generate-resmgr-api: $(Q)$(call generate-api,resmgr,$(RESMGR_API_VERSION)) # automatic update of generated code for resource-manager external api pkg/apis/resmgr/$(RESMGR_API_VERSION)/zz_generated.deepcopy.go: \ pkg/apis/resmgr/$(RESMGR_API_VERSION)/types.go $(Q)$(call generate-api,resmgr,$(RESMGR_API_VERSION)) # macro to generate code for api $(1), version $(2) generate-api = \ echo "Generating '$(1)' api, version $(2)..." && \ KUBERNETES_VERSION=$(KUBERNETES_VERSION) \ ./scripts/code-generator/generate-groups.sh all \ github.com/intel/cri-resource-manager/pkg/apis/$(1)/generated \ github.com/intel/cri-resource-manager/pkg/apis $(1):$(2) \ --output-base $(shell pwd)/generate && \ cp -r generate/github.com/intel/cri-resource-manager/pkg/apis/$(1) pkg/apis && \ rm -fr generate/github.com/intel/cri-resource-manager/pkg/apis/$(1) # # dependencies for UI assets baked in using vfsgendev (can't come up with a working pattern rule) # pkg/cri/resource-manager/visualizer/bubbles/assets_gendata.go:: \ $(wildcard pkg/cri/resource-manager/visualizer/bubbles/assets/*.html) \ $(wildcard pkg/cri/resource-manager/visualizer/bubbles/assets/js/*.js) \ $(wildcard pkg/cri/resource-manager/visualizer/bubbles/assets/css/*.css) # phony targets .PHONY: all build install clean test images images-push release-tests e2e-tests \ format vet cyclomatic-check lint golangci-lint \ cross-packages cross-rpm cross-deb # # Rules for documentation # vhtml: _work/venv/.stamp . _work/venv/bin/activate && \ make -C docs html && \ cp -r docs/_build . html: clean-html $(Q)BUILD_VERSION=$(BUILD_VERSION) \ $(SPHINXBUILD) -c docs .
"$(SITE_BUILDDIR)" $(SPHINXOPTS) cp docs/index.html "$(SITE_BUILDDIR)" for d in $$(find docs -name figures -type d); do \ mkdir -p $(SITE_BUILDDIR)/$$d && cp $$d/* $(SITE_BUILDDIR)/$$d; \ done serve-html: html $(Q)cd $(SITE_BUILDDIR) && python3 -m http.server 8081 clean-html: rm -rf $(SITE_BUILDDIR) site-build: .$(DOCKER_SITE_BUILDER_IMAGE).image.stamp $(Q)$(DOCKER_SITE_CMD) $(DOCKER_SITE_BUILDER_IMAGE) make html site-serve: .$(DOCKER_SITE_BUILDER_IMAGE).image.stamp $(Q)$(DOCKER_SITE_CMD) -it $(DOCKER_SITE_BUILDER_IMAGE) make serve-html .$(DOCKER_SITE_BUILDER_IMAGE).image.stamp: docs/Dockerfile docs/requirements.txt docker build -t $(DOCKER_SITE_BUILDER_IMAGE) docs touch $@ # Set up a Python3 environment with the necessary tools for document creation. _work/venv/.stamp: docs/requirements.txt rm -rf ${@D} python3 -m venv ${@D} . ${@D}/bin/activate && pip install -r $< touch $@ ================================================ FILE: README.md ================================================ # CRI Resource Manager for Kubernetes\* ## ⚠️ The project is no longer maintained ⚠️ The CRI Resource manager project is no longer maintained. No further updates, bug fixes or releases are planned. We recommend users migrate to [NRI Plugins](https://github.com/containers/nri-plugins), which provides similar functionality and is actively maintained. Thank you for being part of this journey! ### See our [Documentation][documentation] site for detailed documentation. [documentation]: https://intel.github.io/cri-resource-manager ================================================ FILE: SECURITY.md ================================================ # Security Policy Intel is committed to rapidly addressing security vulnerabilities affecting our customers and providing clear guidance on the solution, impact, severity and mitigation. ## Reporting a Vulnerability Please report any security vulnerabilities in this project utilizing the guidelines [here](https://www.intel.com/content/www/us/en/security-center/vulnerability-handling-guidelines.html). ================================================ FILE: cmd/cri-resmgr/cri-resource-manager.service.in ================================================ [Unit] Description=A CRI proxy with (hardware) resource aware container placement policies. Documentation=https://github.com/intel/cri-resource-manager Before=kubelet.service LogRateLimitIntervalSec=5 LogRateLimitBurst=100000 [Service] Type=simple EnvironmentFile=__DEFAULTDIR__/cri-resource-manager ExecStart=__BINDIR__/cri-resmgr $CONFIG_OPTIONS $POLICY_OPTIONS Restart=always [Install] WantedBy=multi-user.target ================================================ FILE: cmd/cri-resmgr/cri-resource-manager.sysconf ================================================ # Configuration options to pass to cri-resmgr when started via systemd. # Use a fallback file for configuration if/when we can't acquire one from the agent. CONFIG_OPTIONS="--fallback-config /etc/cri-resmgr/fallback.cfg" # Enable this for preventing the active policy to be changed during startup. #POLICY_OPTIONS="--disable-policy-switch" ================================================ FILE: cmd/cri-resmgr/fallback.cfg.sample ================================================ # # If you pass this file to cri-resmgr using the --fallback-config # command line option, it will be used if configuration cannot be # acquired from any other source (agent, or last configuration # stored in the cache). 
# # Switching Policies: # Recent versions of cri-resmgr will allow changing the active # policy during startup. If you want to prevent this from # happening you can pass the --disable-policy-switch option to # cri-resmgr on the command line. # # With the stock packaging you can control whether # startup-phase policy switching is allowed using the POLICY_OPTIONS # variable in the sysconf file. # # If switching policies is disabled, you can still reset the # active policy manually when cri-resmgr is not running. This # allows cri-resmgr to start up next with a new policy. You # do this by passing the --reset-policy command line option # to cri-resmgr. The full sequence of switching policies this # way is # - stop cri-resmgr (systemctl stop cri-resource-manager), # - reset the active policy (cri-resmgr --reset-policy), # - start cri-resmgr (systemctl start cri-resource-manager) # policy: Active: topology-aware ReservedResources: CPU: 750m logger: Debug: resource-manager,cache,resource-control dump: Config: off:.*,full:((Create)|(Remove)|(Run)|(Update)|(Start)|(Stop)).* ================================================ FILE: cmd/cri-resmgr/main.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package main import ( "flag" "fmt" "os" "strings" "syscall" "time" "github.com/intel/goresctrl/pkg/rdt" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy" "github.com/intel/cri-resource-manager/pkg/instrumentation" "github.com/intel/cri-resource-manager/pkg/config" logger "github.com/intel/cri-resource-manager/pkg/log" version "github.com/intel/cri-resource-manager/pkg/version" ) var log = logger.Default() func main() { rate := logger.Rate{Limit: logger.Every(1 * time.Minute)} logger.SetGrpcLogger("grpc", &rate) logger.SetStdLogger("stdlog") rdt.SetLogger(logger.Get("rdt")) printConfig := flag.Bool("print-config", false, "Print configuration and exit.") listPolicies := flag.Bool("list-policies", false, "List available policies.") flag.Parse() switch { case *printConfig: config.Print(nil) os.Exit(0) case *listPolicies: fmt.Printf("Available policies:\n") for _, available := range policy.AvailablePolicies() { fmt.Printf(" * %s: %s\n", available.Name, available.Description) } os.Exit(0) default: if args := flag.Args(); len(args) > 0 { switch args[0] { case "config-help", "help": config.Describe(args[1:]...)
os.Exit(0) default: log.Error("unknown command line arguments: %s", strings.Join(flag.Args(), ",")) flag.Usage() os.Exit(1) } } } logger.Flush() logger.SetupDebugToggleSignal(syscall.SIGUSR1) log.Info("cri-resmgr (version %s, build %s) starting...", version.Version, version.Build) if err := instrumentation.Start(); err != nil { log.Fatal("failed to set up instrumentation: %v", err) } defer instrumentation.Stop() m, err := resmgr.NewResourceManager() if err != nil { log.Fatal("failed to create resource manager instance: %v", err) } if err := m.Start(); err != nil { log.Fatal("failed to start resource manager: %v", err) } for { time.Sleep(15 * time.Second) } } ================================================ FILE: cmd/cri-resmgr-agent/Dockerfile ================================================ ARG GO_VERSION=1.24 FROM golang:${GO_VERSION}-bullseye as builder ARG GOLICENSES_VERSION WORKDIR /go/build # Fetch go dependencies in a separate layer for caching RUN go install github.com/google/go-licenses@${GOLICENSES_VERSION} COPY go.mod go.sum ./ COPY pkg/topology/ pkg/topology/ RUN go mod download -x # Build agent and agent-probe, fully statically linked binary COPY . . RUN CGO_ENABLED=0 make build-static BUILD_DIRS="cri-resmgr-agent cri-resmgr-agent-probe" && \ install -D /go/build/bin/* -t /install_root/bin # Save licenses RUN make install-licenses BUILD_DIRS="cri-resmgr-agent cri-resmgr-agent-probe" DESTDIR=/install_root FROM scratch as final COPY --from=builder /install_root / ENTRYPOINT ["/bin/cri-resmgr-agent"] ================================================ FILE: cmd/cri-resmgr-agent/agent-deployment.yaml ================================================ apiVersion: v1 kind: ServiceAccount metadata: name: cri-resmgr-agent namespace: kube-system --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: name: cri-resmgr-agent rules: - apiGroups: - "" - criresmgr.intel.com resources: - nodes - configmaps - adjustments - labels - annotations verbs: - get - patch - update - watch --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: name: cri-resmgr-agent roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole name: cri-resmgr-agent subjects: - kind: ServiceAccount name: cri-resmgr-agent namespace: kube-system --- apiVersion: apps/v1 kind: DaemonSet metadata: labels: app: cri-resmgr-agent name: cri-resmgr-agent namespace: kube-system spec: selector: matchLabels: app: cri-resmgr-agent template: metadata: labels: app: cri-resmgr-agent spec: serviceAccount: cri-resmgr-agent containers: - name: cri-resmgr-agent env: - name: NODE_NAME valueFrom: fieldRef: fieldPath: spec.nodeName image: IMAGE_PLACEHOLDER imagePullPolicy: Always # for testing securityContext: allowPrivilegeEscalation: false capabilities: drop: ["ALL"] readOnlyRootFilesystem: true volumeMounts: - name: resmgrsockets mountPath: /var/run/cri-resmgr resources: limits: cpu: 1 memory: 512Mi livenessProbe: exec: command: ["/bin/cri-resmgr-agent-probe"] initialDelaySeconds: 5 periodSeconds: 30 # # Notes: This is NOT a readiness probe for the agent itself. # # We (mis)use this readiness probe to propagate information # back to the control plane about any failure on the node to # activate the last updated configuration. Since success or # failure is reflected by whether the agent's pod on the node # is marked Ready, any error in configuration should now be a # watchable condition, at least indirectly. 
One can get more # details about the specifics of any configuration errors by # watching the readiness of the agent's pod and fetching its log # messages if it ever becomes not ready. # readinessProbe: exec: command: ["/bin/cri-resmgr-agent-probe", "-query", "config-status"] initialDelaySeconds: 5 periodSeconds: 30 volumes: - name: resmgrsockets hostPath: path: /var/run/cri-resmgr ================================================ FILE: cmd/cri-resmgr-agent/main.go ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package main import ( "flag" "github.com/intel/cri-resource-manager/pkg/agent" "github.com/intel/cri-resource-manager/pkg/log" "github.com/intel/cri-resource-manager/pkg/version" ) func main() { // Disable buffering and make sure that all messages have been emitted at // program exit log.Flush() defer log.Flush() flag.Parse() a, err := agent.NewResourceManagerAgent() if err != nil { log.Fatal("failed to create resource manager agent instance: %v", err) } log.Info("cri-resmgr agent (version %s, build %s) starting...", version.Version, version.Build) if err := a.Run(); err != nil { log.Fatal("%v", err) } } ================================================ FILE: cmd/cri-resmgr-agent-probe/main.go ================================================ /* Copyright 2020 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package main import ( "context" "flag" "fmt" "net" "time" "google.golang.org/grpc" agent_v1 "github.com/intel/cri-resource-manager/pkg/agent/api/v1" v1 "github.com/intel/cri-resource-manager/pkg/agent/api/v1" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/sockets" "github.com/intel/cri-resource-manager/pkg/log" ) func main() { socket := flag.String("agent-socket", sockets.ResourceManagerAgent, "Unix domain socket where agent is serving") query := flag.String("query", "", fmt.Sprintf("query to send, use %q to query status of last config push to resmgr", v1.ConfigStatus)) // Disable logger buffering and make sure that everything has been flushed // when program exits log.Flush() defer log.Flush() flag.Parse() // Try to connect to agent dialOpts := []grpc.DialOption{ grpc.WithInsecure(), grpc.WithDialer(func(sock string, timeout time.Duration) (net.Conn, error) { return net.Dial("unix", sock) }), } conn, err := grpc.Dial(*socket, dialOpts...)
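// Note: grpc.WithInsecure and grpc.WithDialer used above are deprecated in
// newer grpc-go releases. A rough equivalent (a sketch, assuming a grpc-go
// version that ships google.golang.org/grpc/credentials/insecure) would be:
//
//	conn, err := grpc.Dial(*socket,
//		grpc.WithTransportCredentials(insecure.NewCredentials()),
//		grpc.WithContextDialer(func(ctx context.Context, sock string) (net.Conn, error) {
//			return (&net.Dialer{}).DialContext(ctx, "unix", sock)
//		}))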
if err != nil { log.Fatal("failed to connect to agent: %v", err) } cli := agent_v1.NewAgentClient(conn) // Do health check ctx, cancel := context.WithTimeout(context.Background(), time.Second) defer cancel() rpl, err := cli.HealthCheck(ctx, &agent_v1.HealthCheckRequest{ Query: *query, }) if err != nil { log.Fatal("%v", err) } if rpl.Error != "" { log.Fatal("health check negative: %s", rpl.Error) } log.Info("Health check OK") } ================================================ FILE: cmd/cri-resmgr-webhook/Dockerfile ================================================ ARG GO_VERSION=1.24 FROM golang:${GO_VERSION}-bullseye as builder ARG GOLICENSES_VERSION WORKDIR /go/build # Fetch go dependencies in a separate layer for caching RUN go install github.com/google/go-licenses@${GOLICENSES_VERSION} COPY go.mod go.sum ./ COPY pkg/topology/ pkg/topology/ RUN go mod download -x # Build webhook, fully statically linked binary COPY . . RUN CGO_ENABLED=0 make build-static BUILD_DIRS=cri-resmgr-webhook && \ install -D /go/build/bin/* -t /install_root/bin # Save licenses RUN make install-licenses BUILD_DIRS=cri-resmgr-webhook DESTDIR=/install_root FROM scratch as final USER 65534:65534 COPY --from=builder /install_root / ENTRYPOINT ["/bin/cri-resmgr-webhook"] ================================================ FILE: cmd/cri-resmgr-webhook/handlers.go ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ package main import ( "encoding/json" "errors" "fmt" "io" "log" "net/http" admissionv1 "k8s.io/api/admission/v1" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/serializer" utilruntime "k8s.io/apimachinery/pkg/util/runtime" "sigs.k8s.io/yaml" ) type jsonPatch struct { Op string `json:"op"` Path string `json:"path"` Value interface{} `json:"value"` } type podResourceRequirements struct { InitContainers map[string]corev1.ResourceRequirements `json:"initContainers"` Containers map[string]corev1.ResourceRequirements `json:"containers"` } var scheme = runtime.NewScheme() var codecs = serializer.NewCodecFactory(scheme) // Module inatialization func init() { utilruntime.Must(corev1.AddToScheme(scheme)) utilruntime.Must(admissionv1.AddToScheme(scheme)) } // Helper for creating an AdmissionResponse with an error func errResponse(err error) *admissionv1.AdmissionResponse { return &admissionv1.AdmissionResponse{ Result: &metav1.Status{ Message: err.Error(), }, } } // Dump req/rsp in human-readable form func stringify(r interface{}) string { out, err := yaml.Marshal(r) if err != nil { return fmt.Sprintf("!!!!!\nUnable to stringify %T: %v\n!!!!!", r, err) } return string(out) } // Handle HTTP requests func handle(w http.ResponseWriter, r *http.Request) { var body []byte if r.Body != nil { if data, err := io.ReadAll(r.Body); err == nil { body = data } } // Check Content-Type contentType := r.Header.Get("Content-Type") if contentType != "application/json" { log.Printf("ERROR: incorrect Content-Type (received %s, expect application/json", contentType) return } // Deserialize AdmissionReview request and create an AdmissionReview response arReq := admissionv1.AdmissionReview{} arRsp := admissionv1.AdmissionReview{} deserializer := codecs.UniversalDeserializer() if _, _, err := deserializer.Decode(body, nil, &arReq); err != nil { log.Printf("ERROR: deserializing admission request failed: %v", err) arRsp.Response = errResponse(err) } else if arReq.Request == nil { log.Printf("REQUEST empty") arRsp.Response = errResponse(errors.New("Empty request")) } else { log.Printf("REQUEST:\n%s", stringify(&arReq)) if arReq.Request.Resource.Group != "" || arReq.Request.Resource.Version != "v1" { arRsp.Response = errResponse(fmt.Errorf("Unexpected resource group/version '%s/%s'", arReq.Request.Resource.Group, arReq.Request.Resource.Version)) } else { res := arReq.Request.Resource.Resource switch res { case "pods": arRsp.Kind = "AdmissionReview" arRsp.APIVersion = "admission.k8s.io/v1" arRsp.Response = mutatePodObject(&arReq.Request.Object) default: arRsp.Response = errResponse(fmt.Errorf("Unexpected resource %s", arReq.Request.Resource)) } } // Use the same UID in response that was used in the request arRsp.Response.UID = arReq.Request.UID } log.Printf("RESPONSE:\n%s", stringify(arRsp.Response)) respBytes, err := json.Marshal(arRsp) if err != nil { log.Printf("ERROR: json marshal failed: %v", err) } if _, err := w.Write(respBytes); err != nil { log.Printf("ERROR: failed to write HTTP response: %v", err) } } // Handle AdmissionReview requests for Pod objects func mutatePodObject(rawObj *runtime.RawExtension) *admissionv1.AdmissionResponse { pod := corev1.Pod{} deserializer := codecs.UniversalDeserializer() if _, _, err := deserializer.Decode(rawObj.Raw, nil, &pod); err != nil { log.Printf("ERROR: failed to deserialize Pod object: %v", err) return errResponse(err) } reviewResponse := admissionv1.AdmissionResponse{} 
reviewResponse.Allowed = true patches := []jsonPatch{} // Add a patch to add an empty annotations object if no annotations are found if pod.ObjectMeta.Annotations == nil { patches = append(patches, jsonPatch{Op: "add", Path: "/metadata/annotations", Value: map[string]string{}}) } patch, err := patchResourceAnnotation(&pod) if err != nil { return errResponse(err) } patches = append(patches, patch) reviewResponse.Patch, err = json.Marshal(patches) if err != nil { log.Printf("ERROR: failed to marshal Pod patch: %v", err) return errResponse(err) } patchType := admissionv1.PatchTypeJSONPatch reviewResponse.PatchType = &patchType return &reviewResponse } // Create a Pod (JSON) patch adding resource annotation func patchResourceAnnotation(pod *corev1.Pod) (jsonPatch, error) { patch := jsonPatch{Op: "add", Path: "/metadata/annotations/intel.com~1resources"} // Create annotation that includes all resources of all (init)containers resourceAnnotation := podResourceRequirements{InitContainers: map[string]corev1.ResourceRequirements{}, Containers: map[string]corev1.ResourceRequirements{}} for _, container := range pod.Spec.Containers { resourceAnnotation.Containers[container.Name] = container.Resources } for _, container := range pod.Spec.InitContainers { resourceAnnotation.InitContainers[container.Name] = container.Resources } resourceAnnotationBytes, err := json.Marshal(resourceAnnotation) if err != nil { log.Printf("ERROR: failed to marshal 'intel.com/resources' annotations: %v", err) return patch, err } // Patch Pod annotations to include the "resources" annotation patch.Value = string(resourceAnnotationBytes) return patch, nil } ================================================ FILE: cmd/cri-resmgr-webhook/main.go ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ package main import ( "flag" "log" ) // Parse command line func parseArgs() args { args := args{} flag.IntVar(&args.port, "port", 443, "Port on which to listen for connections") flag.StringVar(&args.certFile, "cert-file", "", "x509 certificate used for authenticating connections") flag.StringVar(&args.keyFile, "key-file", "", "Private x509 key matching --cert-file") flag.Parse() return args } func main() { args := parseArgs() if err := Run(args); err != nil { log.Fatal(err) } } ================================================ FILE: cmd/cri-resmgr-webhook/mutating-webhook-config.yaml ================================================ apiVersion: admissionregistration.k8s.io/v1 kind: MutatingWebhookConfiguration metadata: name: cri-resmgr webhooks: - name: cri-resmgr.intel.com sideEffects: None admissionReviewVersions: ["v1"] rules: - apiGroups: - "" apiVersions: - v1 operations: - CREATE - UPDATE resources: - pods clientConfig: service: namespace: cri-resmgr name: cri-resmgr-webhook caBundle: CA_BUNDLE_PLACEHOLDER ================================================ FILE: cmd/cri-resmgr-webhook/webhook-deployment.yaml ================================================ apiVersion: v1 kind: Namespace metadata: name: cri-resmgr labels: name: cri-resmgr --- apiVersion: apps/v1 kind: Deployment metadata: name: cri-resmgr-webhook namespace: cri-resmgr labels: app: cri-resmgr-webhook spec: replicas: 1 selector: matchLabels: app: cri-resmgr-webhook template: metadata: labels: app: cri-resmgr-webhook spec: containers: - name: cri-resmgr-webhook image: IMAGE_PLACEHOLDER # Convenience pull policy for development imagePullPolicy: Always # Mount the tls cert/key in the default location volumeMounts: - name: certs mountPath: /etc/cri-resmgr-webhook/certs.d/ readOnly: true args: - "-cert-file=/etc/cri-resmgr-webhook/certs.d/svc.crt" - "-key-file=/etc/cri-resmgr-webhook/certs.d/svc.key" - "-port=8443" securityContext: allowPrivilegeEscalation: false capabilities: drop: ["ALL"] readOnlyRootFilesystem: true runAsNonRoot: true resources: limits: cpu: 1 memory: 256Mi livenessProbe: httpGet: scheme: HTTPS port: 8443 httpHeaders: - name: "Content-Type" value: "application/json" initialDelaySeconds: 5 periodSeconds: 30 nodeSelector: node-role.kubernetes.io/control-plane: "" tolerations: - key: "node-role.kubernetes.io/control-plane" operator: "Equal" value: "" effect: "NoSchedule" volumes: # This example deployment uses k8s secrests to store TLS secrets # You need to manually generate the cert/key pair, and, the accompanying secret # Expected filenames are "svc.crt" and "svc.key" - name: certs secret: secretName: cri-resmgr-webhook-secret --- apiVersion: v1 kind: Service metadata: name: cri-resmgr-webhook namespace: cri-resmgr spec: selector: app: cri-resmgr-webhook ports: - port: 443 targetPort: 8443 protocol: TCP ================================================ FILE: cmd/cri-resmgr-webhook/webhook.go ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ package main import ( "crypto/tls" "fmt" "log" "net/http" ) type args struct { port int certFile string keyFile string } // Load server certificate and private key func loadTLS(certFile, keyFile string) *tls.Config { cert, err := tls.LoadX509KeyPair(certFile, keyFile) if err != nil { log.Fatalf("Failed to initialize TLS config: %v", err) } return &tls.Config{ Certificates: []tls.Certificate{cert}, } } // Run is the main entry point for the webhook server func Run(args args) error { // Attach handlers http.HandleFunc("/", handle) // Create and run HTTP server server := &http.Server{ Addr: fmt.Sprintf(":%d", args.port), TLSConfig: loadTLS(args.certFile, args.keyFile), } log.Printf("Listening on port %d", args.port) return server.ListenAndServeTLS("", "") } ================================================ FILE: demo/blockio/bb-scanner.yaml ================================================ # bb-scanner continuously calculates checksums of files found # under /scan. Output reveals added, deleted, renamed and modified # files together with timestamps. # # bb-scanner is configured as a low-priority activity: # 1. CPU usage is limited to 10 %. # 2. Disk/SSD bandwidth is limited by SlowReader configuration. # apiVersion: apps/v1 kind: DaemonSet metadata: name: bb-scanner labels: app: bb-scanner spec: selector: matchLabels: app: bb-scanner template: metadata: name: bb-scanner labels: app: bb-scanner annotations: blockioclass.cri-resource-manager.intel.com/pod: SlowReader spec: terminationGracePeriodSeconds: 1 containers: - image: busybox command: - sh - -c - while true; do find /scan -type f -print0 | xargs -0 md5sum | sort > curr.md5; date +%s >> /output/diffs.md5; diff -U1 prev.md5 curr.md5 >> /output/diffs.md5; cp curr.md5 /output/files.md5; mv curr.md5 prev.md5; done imagePullPolicy: IfNotPresent name: busybox resources: limits: cpu: 100m volumeMounts: - mountPath: /scan/usr-bin name: usr-bin readOnly: true - mountPath: /scan/usr-lib name: usr-lib readOnly: true - mountPath: /output name: output readOnly: false volumes: - name: usr-bin hostPath: path: /usr/bin type: DirectoryOrCreate - name: usr-lib hostPath: path: /usr/lib type: DirectoryOrCreate - name: output hostPath: path: /var/cache/bb-scanner type: DirectoryOrCreate restartPolicy: Always ================================================ FILE: demo/blockio/cri-resmgr-config.default.yaml ================================================ apiVersion: v1 kind: ConfigMap metadata: name: cri-resmgr-config.default namespace: kube-system data: policy: |+ Active: none logger: |+ Debug: blockio,cgroupblkio blockio: |+ Classes: SlowReader: - Devices: - /dev/vda ThrottleReadBps: 512k ================================================ FILE: demo/blockio/run.sh ================================================ #!/bin/bash DEMO_TITLE="CRI Resource Manager: Block I/O Demo" PV='pv -qL' SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")" LIB_DIR=$SCRIPT_DIR/../lib BIN_DIR=${bindir-$(realpath "$SCRIPT_DIR/../../bin")} OUTPUT_DIR=${outdir-$SCRIPT_DIR/output} COMMAND_OUTPUT_DIR=$OUTPUT_DIR/commands # shellcheck disable=SC1091 # shellcheck source=../lib/command.bash source "$LIB_DIR/command.bash" # shellcheck disable=SC1091 # shellcheck source=../lib/host.bash source "$LIB_DIR/host.bash" # shellcheck disable=SC1091 # shellcheck source=../lib/vm.bash source "$LIB_DIR/vm.bash" usage() { echo "$DEMO_TITLE" echo "Usage: [VAR=VALUE] ./run.sh MODE" echo " MODE: \"play\" plays the demo." echo " \"record\" plays and records the demo." 
echo " \"test\" runs fast, reports pass or fail." echo " VARs:" echo " vm: govm virtual machine name." echo " The default is \"crirm-demo-blockio\"." echo " speed: Demo play speed." echo " The default is 10 (keypresses per second)." echo " cleanup: 0: leave VM running. (\"play\" mode default)" echo " 1: delete VM (\"test\" mode default)" echo " 2: stop VM, but do not delete it." echo " outdir: Save output under given directory." echo " The default is \"${SCRIPT_DIR}/output\"." echo " binsrc: Where to get cri-resmgr to the VM." echo " \"github\": go get and build in VM (\"play\" mode default)." echo " \"local\": copy from source tree bin/ (\"test\" mode default)" echo " (set bindir=/path/to/cri-resmgr* to override bin/)" } error() { (echo ""; echo "error: $1" ) >&2 exit 1 } out() { if [ -n "$PV" ]; then speed=${speed-10} echo "$1" | $PV "$speed" else echo "$1" fi echo "" } record() { clear out "Recording this screencast..." host-command "asciinema rec -t \"$DEMO_TITLE\" crirm-demo-blockio.cast -c \"./run.sh play\"" } screen-create-vm() { speed=60 out "### Running the demo in VM \"$vm\"." host-create-vm "$vm" vm-networking if [ -z "$VM_IP" ]; then error "creating VM failed" fi } screen-install-k8s() { speed=60 out "### Installing Kubernetes to the VM." vm-install-cri vm-install-k8s } screen-install-cri-resmgr() { speed=60 out "### Installing CRI Resource Manager to VM." vm-install-cri-resmgr } screen-launch-cri-resmgr() { policy=${policy-none} speed=60 out "### Launching cri-resmgr." vm-command "(echo \"policy:\"; echo \" Active: $policy\") > cri-resmgr.fallback.cfg" vm-command "cri-resmgr -relay-socket /var/run/cri-resmgr/cri-resmgr.sock -runtime-socket /var/run/containerd/containerd.sock -fallback-config cri-resmgr.fallback.cfg >cri-resmgr.output.txt 2>&1 &" } screen-create-singlenode-cluster() { speed=60 out "### Setting up single-node Kubernetes cluster." speed=60 out "### CRI Resource Manager + containerd will act as the container runtime." vm-create-singlenode-cluster } screen-launch-cri-resmgr-agent() { speed=60 out "### Launching cri-resmgr-agent." speed=60 out "### The agent will make cri-resmgr configurable with ConfigMaps." vm-command "NODE_NAME=\$(hostname) cri-resmgr-agent -kubeconfig \$HOME/.kube/config >cri-resmgr-agent.output.txt 2>&1 &" } screen-measure-io-speed() { process=$1 measuretime=2 vm-command "echo 3 > /proc/sys/vm/drop_caches" out "### Measuring $process read speed -- twice." cmd="pid=\$(ps -A | awk \"/$process/{print \\\$1}\"); [ -n \"\$pid\" ] && { echo \$(grep read_bytes /proc/\$pid/io; sleep $measuretime; grep read_bytes /proc/\$pid/io) | awk \"{print \\\"$process read speed: \\\"(\\\$4-\\\$2)/$measuretime/1024\\\" kBps\\\"}\"; }" speed=360 outcolor=10 vm-command "$cmd" sleep 1 speed=360 outcolor=10 vm-command "$cmd" } demo-blockio() { out "### Let the show begin!" out "### Configuring cri-resmgr: introduce a SlowReader block I/O class." host-command "scp cri-resmgr-config.default.yaml $VM_SSH_USER@$VM_IP:" vm-command "cat cri-resmgr-config.default.yaml" out "### Note: SlowReaders can read from each of the listed devices up to $(vm-command-q "awk '/ThrottleRead/{print \$2}' < cri-resmgr-config.default.yaml")Bps." vm-command "kubectl apply -f cri-resmgr-config.default.yaml" out "### Our test workload, bb-scanner, is annotated as a SlowReader." host-command "scp bb-scanner.yaml $VM_SSH_USER@$VM_IP:" vm-command "grep -A1 annotations: bb-scanner.yaml" out "### Flushing caches and deploying bb-scanner." 
vm-command "echo 3 > /proc/sys/vm/drop_caches" vm-command "kubectl create -f bb-scanner.yaml" out "### Now bb-scanner is running md5sum to all mounted directories, non-stop." vm-wait-process --timeout 60 md5sum screen-measure-io-speed md5sum out "### Reconfiguring cri-resmgr: set SlowReader read speed to 2 MBps." out "### This applies to all pods and containers in this block I/O class," out "### both new and already running, like our bb-scanner." vm-command "sed -i 's/ThrottleReadBps:.*/ThrottleReadBps: 2Mi/' cri-resmgr-config.default.yaml" vm-command "cat cri-resmgr-config.default.yaml" vm-command "kubectl apply -f cri-resmgr-config.default.yaml" # Give some time for new config to become effective and process # I/O to accelerate. sleep 2; screen-measure-io-speed md5sum out "### Thanks for watching!" out "### Cleaning up: deleting bb-scanner." vm-command "kubectl delete daemonset bb-scanner" } # Validate parameters mode=$1 distro=${distro:="ubuntu-20.04"} cri=${cri:="containerd"} vm=${vm:="blockio-$distro-$cri"} echo "vm is here: \"$vm\"" host-set-vm-config "$vm" "$distro" "$cri" if [ "$mode" == "play" ]; then speed=${speed-10} cleanup=${cleanup-0} binsrc=${binsrc-github} elif [ "$mode" == "test" ]; then PV= cleanup=${cleanup-1} binsrc=${binsrc-local} elif [ "$mode" == "record" ]; then record else usage error "missing valid MODE" exit 1 fi # Prepare for test/demo mkdir -p "$OUTPUT_DIR" mkdir -p "$COMMAND_OUTPUT_DIR" rm -f "$COMMAND_OUTPUT_DIR"/0* ( echo x > "$OUTPUT_DIR"/x && rm -f "$OUTPUT_DIR"/x ) || { error "output directory outdir=\"$OUTPUT_DIR\" is not writable" } if [ "$binsrc" == "local" ]; then [ -f "${BIN_DIR}/cri-resmgr" ] || error "missing \"${BIN_DIR}/cri-resmgr\"" [ -f "${BIN_DIR}/cri-resmgr-agent" ] || error "missing \"${BIN_DIR}/cri-resmgr-agent\"" fi if [ -z "$VM_IP" ] || [ -z "$VM_SSH_USER" ] || [ -z "$VM_NAME" ]; then screen-create-vm fi if ! vm-command-q "dpkg -l | grep -q kubelet"; then screen-install-k8s fi if ! vm-command-q "[ -f /usr/bin/cri-resmgr ] || [ -f /usr/local/bin/cri-resmgr ]"; then screen-install-cri-resmgr fi # start cri-resmgr if not already running if ! vm-command-q "pidof cri-resmgr" >/dev/null; then screen-launch-cri-resmgr fi # create kubernetes cluster or wait that it is online if vm-command-q "[ ! -f /var/lib/kubelet/config.yaml ]"; then screen-create-singlenode-cluster else # wait for kube-apiserver to launch (may be down if the VM was just booted) vm-wait-process kube-apiserver fi # start cri-resmgr-agent if not already running if ! vm-command-q "pidof cri-resmgr-agent >/dev/null"; then screen-launch-cri-resmgr-agent fi # Run test/demo demo-blockio # Cleanup if [ "$cleanup" == "0" ]; then echo "The VM, Kubernetes and cri-resmgr are left running. 
Next steps:" vm-print-usage elif [ "$cleanup" == "1" ]; then host-stop-vm $vm host-delete-vm $vm elif [ "$cleanup" == "2" ]; then host-stop-vm $vm fi # Summarize results SUMMARY_FILE="$OUTPUT_DIR/summary.txt" echo -n "" > "$SUMMARY_FILE" || error "cannot write summary to \"$SUMMARY_FILE\"" first_speed="$(grep "^md5sum read speed:" "$COMMAND_OUTPUT_DIR"/0* | head -n 1 | awk '{print $4}')" last_speed="$(grep "^md5sum read speed:" "$COMMAND_OUTPUT_DIR"/0* | tail -n 1 | awk '{print $4}')" echo "First md5sum read speed (512 kBps throttling): $first_speed kBps" >> "$SUMMARY_FILE" echo "Last md5sum read speed (2 MBps throttling): $last_speed kBps" >> "$SUMMARY_FILE" # Declare verdict in test mode exit_status=0 if [ "$mode" == "test" ]; then min_first=100 max_first=600 min_last=1500 max_last=2500 [[ "$first_speed" -gt "$min_first" ]] || exit_status=1 [[ "$first_speed" -lt "$max_first" ]] || exit_status=1 [[ "$last_speed" -gt "$min_last" ]] || exit_status=1 [[ "$last_speed" -lt "$max_last" ]] || exit_status=1 if [ "$exit_status" == "1" ]; then echo "Error: speeds outside acceptable ranges ($min_first..$max_first kBps and $min_last..$max_last kBps)." >> "$SUMMARY_FILE" echo "Test verdict: FAIL" >> "$SUMMARY_FILE" else echo "Speeds within acceptable ranges ($min_first..$max_first kBps and $min_last..$max_last kBps)." >> "$SUMMARY_FILE" echo "Test verdict: PASS" >> "$SUMMARY_FILE" fi echo "" cat "$SUMMARY_FILE" fi exit $exit_status ================================================ FILE: demo/lib/command.bash ================================================ # Hooks for displaying and logging how shell commands (local and # remote) are executed, and handling their output and exit status. # # Example in a Bash script, run-on-mytargethost function: # command-start mytargethost "ls -la" # ssh mytargethost $COMMAND 2>&1 | command-handle-output # command-end ${PIPESTATUS[0]} # [ "$COMMAND_STATUS" == "0" ] || command-error "non-zero exit status" # # command-start and command-end set environment variables: # COMMAND, COMMAND_STATUS, COMMAND_OUTPUT export LC_NUMERIC=C # These exports force ssh-* to fail instead of prompting for a passphrase. 
export DISPLAY=bogus-none export SSH_ASKPASS=/bin/false SSH_KEY="${HOME}/.ssh/id_rsa" SSH_OPTS="-o StrictHostKeyChecking=No" SSH="ssh $SSH_OPTS" SCP="scp $SSH_OPTS" epochrealtime() { [ -n "$EPOCHREALTIME" ] && echo "$EPOCHREALTIME" || echo "$SECONDS" } COMMAND_COUNTER=0 command_init_time=$(epochrealtime) command-start() { # example: command-start vm prompt "mkdir $MYDIR" COMMAND_TARGET="$1" COMMAND_PROMPT="$2" COMMAND="$3" COMMAND_STATUS="" COMMAND_OUTPUT="" COMMAND_COUNTER=$(( COMMAND_COUNTER + 1 )) local command_start_time=$(epochrealtime) local time_since_start=$(echo "$command_start_time - $command_init_time" | bc) COMMAND_OUT_FILE="$COMMAND_OUTPUT_DIR/$(printf %04g $COMMAND_COUNTER)-$COMMAND_TARGET" echo "# start time: $time_since_start" > "$COMMAND_OUT_FILE" || { echo "cannot write command output to file \"$COMMAND_OUT_FILE\"" exit 1 } echo "# command: $COMMAND" >> "$COMMAND_OUT_FILE" echo -e -n "${COMMAND_PROMPT}" if [ -n "$PV" ]; then echo "$COMMAND" | $PV $speed else echo "$COMMAND" fi if [ -n "$outcolor" ]; then COMMAND_OUTSTART="\e[38;5;${outcolor}m" COMMAND_OUTEND="\e[0m" else COMMAND_OUTSTART="" COMMAND_OUTEND="" fi } command-handle-output() { # example: sh -c $command | command-handle-output tee "$COMMAND_OUT_FILE.tmp" | ( echo -e -n "$COMMAND_OUTSTART"; cat; echo -e -n "$COMMAND_OUTEND" ) cat "$COMMAND_OUT_FILE.tmp" >> "$COMMAND_OUT_FILE" if [ -n "$PV" ]; then echo | $PV $speed fi } command-runs-in-bg() { echo "(runs in background)" echo "" } command-end() { # example: command-end EXIT_STATUS COMMAND_STATUS=$1 local command_end_time=$(epochrealtime) local time_since_start=$(echo "$command_end_time - $command_init_time" | bc) ( echo "# exit status: $COMMAND_STATUS"; echo "# end time: $time_since_start" ) >> "$COMMAND_OUT_FILE" COMMAND_OUTPUT=$(<"$COMMAND_OUT_FILE.tmp") rm -f "$COMMAND_OUT_FILE.tmp" } command-error() { # script API # Usage: command-error MESSAGE # # Print executed command, observed output, exit status and MESSAGE. # Stop script execution. ( echo "command: $COMMAND"; echo "output: $COMMAND_OUTPUT"; echo "exit status: $COMMAND_STATUS"; echo "error: $1" ) >&2 command-exit-if-not-interactive } command-exit-if-not-interactive() { if [ -z "$INTERACTIVE_MODE" ] || [ "$INTERACTIVE_MODE" == "0" ]; then exit ${1:-1} fi } command-debug-log() { if [ "$(type -t -- debug-log)" = "function" ]; then debug-log "$@" return 0 else if [ -n "$OUTPUT_DIR" ] && [ -d "$OUTPUT_DIR" ]; then touch "$OUTPUT_DIR"/debug-log echo "$@" >> "$OUTPUT_DIR"/debug-log return 0 fi fi echo "$@" 1>&2 } ================================================ FILE: demo/lib/distro.bash ================================================ # shellcheck disable=SC2120 GO_URLDIR=https://golang.org/dl GO_VERSION=1.24.1 GOLANG_URL=$GO_URLDIR/go$GO_VERSION.linux-amd64.tar.gz CRICTL_VERSION=${CRICTL_VERSION:-"v1.25.0"} MINIKUBE_VERSION=${MINIKUBE_VERSION:-v1.27.0} ########################################################################### # # distro-agnostic interface # # To add a new distro, implement distro-specific versions of these # functions. You can omit implementing those which already resolve # to an existing function which works for the new distro. # # To add a new API function, add a new bridging resolution entry below.
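# For example (a sketch): to special-case golang installation on Fedora VMs,
# defining
#   fedora-install-golang() { distro-install-pkg golang; }
# should be enough; distro-install-golang() below resolves to it through the
# ${VM_DISTRO%%-*}-<apifn> candidate in distro-resolve-fn().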
# distro-image-url() { distro-resolve "$@"; } distro-ssh-user() { distro-resolve "$@"; } distro-pkg-type() { distro-resolve "$@"; } distro-install-repo-key() { distro-resolve "$@"; } distro-install-repo() { distro-resolve "$@"; } distro-refresh-pkg-db() { distro-resolve "$@"; } distro-install-pkg() { distro-resolve "$@"; } distro-install-pkg-local() { distro-resolve "$@"; } distro-remove-pkg() { distro-resolve "$@"; } distro-setup-proxies() { distro-resolve "$@"; } distro-setup-oneshot() { distro-resolve "$@"; } distro-install-utils() { distro-resolve "$@"; } distro-install-golang() { distro-resolve "$@"; } distro-install-runc() { distro-resolve "$@"; } distro-install-containerd() { distro-resolve "$@"; } distro-config-containerd() { distro-resolve "$@"; } distro-restart-containerd() { distro-resolve "$@"; } distro-install-crio() { distro-resolve "$@"; } distro-config-crio() { distro-resolve "$@"; } distro-restart-crio() { distro-resolve "$@"; } distro-install-crictl() { distro-resolve "$@"; } distro-install-cri-dockerd(){ distro-resolve "$@"; } distro-install-minikube() { distro-resolve "$@"; } distro-install-k8s() { distro-resolve "$@"; } distro-install-kernel-dev() { distro-resolve "$@"; } distro-k8s-cni() { distro-resolve "$@"; } distro-k8s-cni-subnet() { distro-resolve "$@"; } distro-set-kernel-cmdline() { distro-resolve "$@"; } distro-govm-env() { distro-resolve "$@"; } distro-bootstrap-commands() { distro-resolve "$@"; } distro-env-file-dir() { distro-resolve "$@"; } ########################################################################### # distro-specific function resolution distro-resolve() { local apifn="${FUNCNAME[1]}" fn prefn postfn # shellcheck disable=SC2086 { fn="$(distro-resolve-fn $apifn)" prefn="$(distro-resolve-fn $apifn-pre)" postfn="$(distro-resolve-fn $apifn-post)" command-debug-log "$VM_DISTRO/${FUNCNAME[1]}: pre: ${prefn:--}, fn: ${fn:--}, post: ${postfn:--}" } [ -n "$prefn" ] && { $prefn "$@" || return $?; } $fn "$@" || return $? [ -n "$postfn" ] && { $postfn "$@" || return $?; } return 0 } distro-resolve-fn() { # We try resolving distro-agnostic implementations by looping through # a list of candidate function names in decreasing order of precedence # and returning the first one found. The candidate list has # version-exact and unversioned distro-specific functions and a set of # fallbacks based on known distro, derivative, and package type relations. # # For normal functions the last fallback is 'distro-unresolved' which # prints and returns an error. For pre- and post-functions there is no # similar setup. IOW, unresolved normal distro functions fail while # unresolved pre- and post-functions get ignored (in distro-resolve). local apifn="$1" candidates fn case $apifn in distro-*) apifn="${apifn#distro-}";; *) error "internal error: can't resolve non-API function $apifn";; esac candidates="${VM_DISTRO/./_}-$apifn ${VM_DISTRO%%-*}-$apifn" case $VM_DISTRO in ubuntu*) candidates="$candidates debian-$apifn";; fedora*) candidates="$candidates rpm-$apifn";; *suse*) candidates="$candidates rpm-$apifn";; sles*) candidates="$candidates opensuse-$apifn rpm-$apifn";; esac case $apifn in *-pre|*-post) ;; *) candidates="$candidates default-$apifn distro-unresolved";; esac for fn in $candidates; do if [ "$(type -t -- "$fn")" = "function" ]; then echo "$fn" return 0 fi done } # distro-unresolved terminates failed API function resolution with an error.
distro-unresolved() { local apifn="${FUNCNAME[2]}" command-error "internal error: can't resolve \"$apifn\" for \"$VM_DISTRO\"" return 1 } ########################################################################### # # Ubuntu, Debian # ubuntu-18_04-image-url() { echo "https://cloud-images.ubuntu.com/bionic/current/bionic-server-cloudimg-amd64.img" } ubuntu-20_04-image-url() { echo "https://cloud-images.ubuntu.com/focal/current/focal-server-cloudimg-amd64.img" } ubuntu-22_04-image-url() { echo "https://cloud-images.ubuntu.com/releases/jammy/release/ubuntu-22.04-server-cloudimg-amd64.img" } ubuntu-24_04-image-url() { echo "https://cloud-images.ubuntu.com/releases/noble/release/ubuntu-24.04-server-cloudimg-amd64.img" } debian-11-image-url() { echo "https://cloud.debian.org/images/cloud/bullseye/latest/debian-11-generic-amd64.qcow2" } debian-12-image-url() { echo "https://cloud.debian.org/images/cloud/bookworm/latest/debian-12-generic-amd64.qcow2" } debian-sid-image-url() { echo "https://cloud.debian.org/images/cloud/sid/daily/latest/debian-sid-generic-amd64-daily.qcow2" } ubuntu-download-kernel() { # Usage: # ubuntu-download-kernel list # ubuntu-download-kernel VERSION # # List or download Ubuntu kernel team kernels. # # Example: # ubuntu-download-kernel list | grep 5.9 # ubuntu-download-kernel 5.9-rc8 # vm-command "dpkg -i kernels/linux*rc8*deb" # vm-reboot # vm-command "uname -a" local version=$1 [ -n "$version" ] || error "missing kernel version to install" if [ "$version" == "list" ]; then wget -q -O- https://kernel.ubuntu.com/~kernel-ppa/mainline/ | grep -E '^.*href="v[5-9]' | sed 's|^.*href="v\([0-9][^"]*\)/".*$|\1|g' return 0 fi vm-command "mkdir -p kernels; rm -f kernels/linux*$version*deb; for deb in \$(wget -q -O- https://kernel.ubuntu.com/~kernel-ppa/mainline/v$version/ | awk -F'\"' '/amd64.*deb/{print \$2}' | grep -v -E 'headers|lowlatency'); do ( cd kernels; wget -q https://kernel.ubuntu.com/~kernel-ppa/mainline/v$version/\$deb ); done; echo; echo 'Downloaded kernel packages:'; du -h kernels/*.deb" || command-error "downloading kernel $version failed" } ubuntu-ssh-user() { echo ubuntu } debian-ssh-user() { echo debian } ubuntu-apparmor-disable-runc() { vm-command "[ -f /etc/apparmor.d/runc ] && ln -s /etc/apparmor.d/runc /etc/apparmor.d/disable/ && apparmor_parser -R /etc/apparmor.d/runc" } ubuntu-config-containerd() { ubuntu-apparmor-disable-runc default-config-containerd } ubuntu-config-crio() { ubuntu-apparmor-disable-runc default-config-crio } debian-pkg-type() { echo deb } debian-install-repo-key() { local key # apt-key needs gnupg2, that might not be available by default vm-command "command -v gpg >/dev/null 2>&1" || { vm-command "apt-get update && apt-get install -y gnupg2" } for key in "$@"; do vm-command "curl -L -s $key | apt-key add -" || command-error "failed to install repo key $key" done } debian-install-repo() { if [ $# = 1 ]; then # shellcheck disable=SC2086,SC2048 set -- $* fi vm-command "echo $* > /etc/apt/sources.list.d/$3-$4.list && apt-get update" || command-error "failed to install apt repository $*" } debian-refresh-pkg-db() { vm-command "apt-get update" || command-error "failed to refresh apt package DB" } debian-install-pkg() { # dpkg configure may ask "The default action is to keep your # current version", for instance when a test has added # /etc/containerd/config.toml and then apt-get installs # containerd. 'yes ""' will continue with the default answer (N: # keep existing) in this case. Without 'yes' installation fails. 
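# (In effect the install below runs as: yes "" | apt-get install ..., so
# every interactive dpkg prompt receives an empty line, i.e. its default
# answer.)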
# Add apt-get option "--reinstall" if any environment variable # reinstall_=1 local pkg local opts="" for pkg in "$@"; do if [ "$(eval echo \$reinstall_$pkg)" == "1" ]; then opts="$opts --reinstall" break fi done vm-command "yes \"\" | DEBIAN_FRONTEND=noninteractive apt-get install $opts -y --allow-downgrades $*" || command-error "failed to install $*" } debian-remove-pkg() { vm-command "for pkg in $*; do dpkg -l \$pkg >& /dev/null && apt remove -y --purge \$pkg || :; done" || command-error "failed to remove package(s) $*" } debian-install-pkg-local() { local force="" if [ "$1" == "--force" ]; then force="--force-all" shift fi vm-command "dpkg -i $force $*" || command-error "failed to install local package(s)" } debian-install-golang() { debian-refresh-pkg-db debian-install-pkg golang git-core } debian-install-kernel-dev() { distro-refresh-pkg-db distro-install-pkg git-core build-essential linux-source bc kmod cpio flex libncurses5-dev libelf-dev libssl-dev dwarves bison vm-command "[ -d linux ] || git clone https://github.com/torvalds/linux" vm-command '[ -f linux/.config ] || cp -v /boot/config-$(uname -r) linux/.config' echo "Kernel ready for patching and configuring." echo "build: cd linux && make bindeb-pkg" echo "install: dpkg -i linux-*.deb" } debian-11-install-containerd-pre() { debian-install-repo-key https://download.docker.com/linux/debian/gpg debian-install-repo "deb https://download.docker.com/linux/debian bullseye stable" } debian-11-install-containerd() { vm-command-q "[ -f /usr/bin/containerd ]" || { distro-install-pkg containerd.io } } debian-sid-config-containerd-post() { vm-command "sed -e 's|bin_dir = \"/usr/lib/cni\"|bin_dir = \"/opt/cni/bin\"|g' -i /etc/containerd/config.toml" } debian-install-cri-dockerd-pre() { debian-refresh-pkg-db debian-install-pkg docker.io conntrack vm-command "addgroup $(vm-ssh-user) docker" distro-install-golang } debian-install-crio-pre() { debian-refresh-pkg-db debian-install-pkg libgpgme11 conmon runc containernetworking-plugins conntrack || true } debian-install-k8s() { local _k8s=$k8s debian-refresh-pkg-db debian-install-pkg gpg apt-transport-https curl if [[ -z "$k8s" ]] || [[ "$k8s" == "latest" ]]; then vm-command "curl -s https://api.github.com/repos/kubernetes/kubernetes/releases/latest | grep tag_name | sed -e 's/.*v\([0-9]\+\.[0-9]\+\).*/\1/g'" _k8s=$COMMAND_OUTPUT fi echo "installing Kubernetes v${_k8s}" vm-command "curl -fsSL https://pkgs.k8s.io/core:/stable:/v${_k8s}/deb/Release.key -o /tmp/Release.key" || \ command-error "failed to download Kubernetes v${_k8s} key" if vm-command "command -v apt-key >/dev/null"; then vm-command "sudo apt-key add /tmp/Release.key" vm-command "echo 'deb https://pkgs.k8s.io/core:/stable:/v${_k8s}/deb/ /' > /etc/apt/sources.list.d/kubernetes.list && apt update" || \ command-error "failed to add Kubernetes v${_k8s} repo" else vm-command "sudo gpg --dearmor --batch --yes -o /etc/apt/keyrings/kubernetes-apt-keyring.gpg /tmp/Release.key" vm-command "echo 'deb [signed-by=/etc/apt/keyrings/kubernetes-apt-keyring.gpg] https://pkgs.k8s.io/core:/stable:/v${_k8s}/deb/ /' > /etc/apt/sources.list.d/kubernetes.list && apt update" || \ command-error "failed to add Kubernetes v${_k8s} repo" fi debian-install-pkg "kubeadm" "kubelet" "kubectl" } debian-set-kernel-cmdline() { local e2e_defaults="$*" vm-command "echo 'GRUB_CMDLINE_LINUX_DEFAULT=\"\${GRUB_CMDLINE_LINUX_DEFAULT} ${e2e_defaults}\"' > /etc/default/grub.d/60-e2e-defaults.cfg" || { command-error "writing new command line parameters failed" } vm-command 
"update-grub" || { command-error "updating grub failed" } } debian-env-file-dir() { echo "/etc/default" } debian-sid-govm-env() { echo "DISABLE_VGA=N" } ########################################################################### # # Generic Fedora # YUM_INSTALL="yum install --disableplugin=fastestmirror -y" YUM_REMOVE="yum remove --disableplugin=fastestmirror -y" fedora-image-url() { fedora-40-image-url } fedora-40-image-url() { echo "https://mirrors.xtom.de/fedora/releases/40/Cloud/x86_64/images/Fedora-Cloud-Base-Generic.x86_64-40-1.14.qcow2" } fedora-ssh-user() { echo fedora } fedora-install-utils() { distro-install-pkg /usr/bin/pidof } fedora-install-repo() { distro-install-pkg dnf-plugins-core vm-command "dnf config-manager --add-repo $*" || command-error "failed to install DNF repository $*" } fedora-install-pkg() { local pkg local do_reinstall=0 for pkg in "$@"; do if [ "$(eval echo \$reinstall_$pkg)" == "1" ]; then do_reinstall=1 break fi done vm-command "dnf install -y $*" || command-error "failed to install $*" # When requesting reinstallation, detect which packages were # already installed and reinstall those. # (Unlike apt and zypper, dnf offers no option for reinstalling # existing and installing new packages on the same run.) if [ "$do_reinstall" == "1" ]; then local reinstall_pkgs reinstall_pkgs=$(awk -F '[ -]' -v ORS=" " '/Package .* already installed/{print $2}' <<< "$COMMAND_OUTPUT") if [ -n "$reinstall_pkgs" ]; then vm-command "dnf reinstall -y $reinstall_pkgs" fi fi } fedora-remove-pkg() { vm-command "dnf remove -y $*" || command-error "failed to remove package(s) $*" } fedora-install-pkg-local() { local force="" if [ "$1" == "--force" ]; then force="--nodeps --force" shift fi vm-command "rpm -Uvh $force $*" || command-error "failed to install local package(s)" } fedora-install-kernel-dev() { fedora-install-pkg fedpkg fedora-packager rpmdevtools ncurses-devel pesign grubby git-core vm-command "(set -x -e echo root >> /etc/pesign/users echo $(vm-ssh-user) >> /etc/pesign/users /usr/libexec/pesign/pesign-authorize fedpkg clone -a kernel cd kernel git fetch git switch ${VM_DISTRO/edora-/} # example: git switch f40 in fedora-40 sed -i 's/# define buildid .local/%define buildid .e2e/g' kernel.spec )" || { echo "installing kernel development environment failed" return 1 } echo "Kernel ready for patching and configuring." 
echo "build: cd kernel && dnf builddep -y kernel.spec && fedpkg local" echo "install: cd kernel/x86_64 && dnf install -y --nogpgcheck kernel-{core-,modules-,}[5-9]*.e2e.fc*.x86_64.rpm" } fedora-install-golang() { fedora-install-pkg wget tar gzip git-core from-tarball-install-golang } fedora-install-crio-version() { distro-install-pkg runc conmon vm-command "ln -sf /usr/lib64/libdevmapper.so.1.02 /usr/lib64/libdevmapper.so.1.02.1" || true if [ -z "$crio_src" ]; then vm-command "dnf -y module enable cri-o:${crio_version:-$1}" fi } fedora-install-containernetworking-plugins() { distro-install-pkg containernetworking-plugins vm-command "[ -x /opt/cni/bin/loopback ] || { mkdir -p /opt/cni/bin; mount --bind /usr/libexec/cni /opt/cni/bin; }" vm-command "grep /usr/libexec/cni /etc/fstab || echo /usr/libexec/cni /opt/cni/bin none defaults,bind,nofail 0 0 >> /etc/fstab" } fedora-install-cri-dockerd-pre() { distro-install-pkg docker git-core conntrack vm-command "systemctl enable docker --now; usermod --append --groups docker $(vm-ssh-user)" distro-install-golang } fedora-install-crio-pre() { fedora-install-crio-version 1.21 fedora-install-containernetworking-plugins } fedora-install-crio() { if [ -n "$crio_src" ]; then default-install-crio else distro-install-pkg cri-o vm-command "systemctl enable --now crio" || command-error "failed to enable cri-o" fi } fedora-install-containerd-pre() { distro-install-repo https://download.docker.com/linux/fedora/docker-ce.repo fedora-install-containernetworking-plugins } fedora-install-containerd-post() { vm-command "systemctl enable containerd" } fedora-install-k8s() { _k8s=$k8s if [[ -z "$_k8s" ]] || [[ "$_k8s" == "latest" ]]; then vm-command "curl -s https://api.github.com/repos/kubernetes/kubernetes/releases/latest | grep tag_name | sed -e 's/.*v\([0-9]\+\.[0-9]\+\).*/\1/g'" _k8s=$COMMAND_OUTPUT fi local repo="/etc/yum.repos.d/kubernetes.repo" cat < /etc/sudoers.d/10-norequiretty setenforce 0 sed -E -i 's/^SELINUX=.*$/SELINUX=permissive/' /etc/selinux/config echo PATH='\$PATH:/usr/local/bin:/usr/local/sbin' > /etc/profile.d/usr-local-path.sh EOF if [[ "${cgroups:-}" != "v2" ]]; then cat <> /etc/default/grub" || { command-error "writing new command line parameters failed" } vm-command "grub2-mkconfig -o /boot/grub2/grub.cfg" || { command-error "updating grub failed" } } ########################################################################### # # OpenSUSE and SLES # ZYPPER="zypper --non-interactive --no-gpg-checks" sles-image-url() { echo "/DOWNLOAD-MANUALLY-TO-HOME/vms/images/SLES15-SP3-JeOS.x86_64-15.3-OpenStack-Cloud-GM.qcow2" } sles-ssh-user() { echo "sles" } sles-install-utils() { local sles_registered=0 local sles_version="" vm-command "SUSEConnect -s" || { command-error "cannot run SUSEConnect" } # Parse registration status and SLES version. if [ "$(jq '.[] | select(.identifier == "SLES") | .status' <<< $COMMAND_OUTPUT)" == '"Registered"' ]; then sles_registered=1 fi sles_version="$(jq -r '.[] | select(.identifier == "SLES") | .version' <<< $COMMAND_OUTPUT)" if [ -z "$sles_version" ]; then command-error "cannot read SLES version information from SUSEConnect -s output" fi # Try automatic registration if registration code is provided. if [ "$sles_registered" == 0 ] && [ -n "$VM_SLES_REGCODE" ]; then vm-command "SUSEConnect -r $VM_SLES_REGCODE" || { echo "ERROR:" echo "ERROR: Registering to SUSE Customer Center failed." echo "ERROR: - Verify VM_SLES_REGCODE and try again." 
echo "ERROR: - Unset VM_SLES_REGCODE to skip registration (use unsupported repos)." echo "ERROR:" exit 1 } sles_registered=1 fi # Add correct repo, depending on registration status. if [ "$sles_registered" == 0 ]; then echo "WARNING:" echo "WARNING: Unregistered SUSE Linux Enterprise Server." echo "WARNING: VM_SLES_REGCODE is not set, automatic registration skipped." echo "WARNING: Fallback to use OpenSUSE OSS repository." echo "WARNING:" sleep "${warning_delay:-0}" vm-command-q "$ZYPPER lr openSUSE-Oss >/dev/null" || { distro-install-repo "http://download.opensuse.org/distribution/leap/${sles_version}/repo/oss/" openSUSE-Oss } else vm-command-q "$ZYPPER lr | grep -q SUSE-PackageHub" || { vm-command "SUSEConnect -p PackageHub/${sles_version}/x86_64" } fi distro-install-pkg sysvinit-tools psmisc } opensuse-image-url() { opensuse-15_6-image-url } opensuse-15_6-image-url() { echo "https://download.opensuse.org/pub/opensuse/distribution/leap/15.6/appliances/openSUSE-Leap-15.6-Minimal-VM.x86_64-Cloud.qcow2" } opensuse-tumbleweed-image-url() { echo "https://ftp.uni-erlangen.de/opensuse/tumbleweed/appliances/openSUSE-MicroOS.x86_64-ContainerHost-OpenStack-Cloud.qcow2" } opensuse-install-utils() { distro-install-pkg psmisc sysvinit-tools } opensuse-ssh-user() { echo "opensuse" } opensuse-pkg-type() { echo "rpm" } opensuse-set-kernel-cmdline() { local e2e_defaults="$*" vm-command "mkdir -p /etc/default; touch /etc/default/grub; sed -i '/e2e:opensuse-set-kernel-cmdline/d' /etc/default/grub" vm-command "echo 'GRUB_CMDLINE_LINUX_DEFAULT=\"\${GRUB_CMDLINE_LINUX_DEFAULT} ${e2e_defaults}\" # by e2e:opensuse-set-kernel-cmdline' >> /etc/default/grub" || { command-error "writing new command line parameters failed" } vm-command "grub2-mkconfig -o /boot/grub2/grub.cfg" || { command-error "updating grub failed" } } opensuse-setup-oneshot() { # Remove bad version of containerd if it is already installed, # otherwise valid version of the package will not be installed. vm-command "rpm -q containerd && ( zypper info containerd | awk '/Repository/{print $3}' | grep -v Virtualization ) && echo Removing wrong containerd version && zypper --non-interactive rm containerd" } opensuse-install-repo() { opensuse-wait-for-zypper vm-command "$ZYPPER addrepo $* && $ZYPPER refresh" || command-error "failed to add zypper repository $*" } opensuse-refresh-pkg-db() { opensuse-wait-for-zypper vm-command "$ZYPPER refresh" || command-error "failed to refresh zypper package DB" } opensuse-install-pkg() { opensuse-wait-for-zypper # Add zypper option "--force" if environment variable reinstall_=1 local pkg local opts="" for pkg in "$@"; do if [ "$(eval echo \$reinstall_$pkg)" == "1" ]; then opts="$opts --force" break fi done # In OpenSUSE 15.2 zypper exits with status 106 if already installed, # in 15.3 the exit status is 0. Do not consider "already installed" # as an error. vm-command "$ZYPPER install $opts $*" || [ "$COMMAND_STATUS" == "106" ] || command-error "failed to install $*" } opensuse-install-pkg-local() { opensuse-wait-for-zypper local force="" if [ "$1" == "--force" ]; then force="--nodeps --force" shift fi vm-command "rpm -Uvh $force $*" || command-error "failed to install local package(s)" } opensuse-remove-pkg() { vm-command 'for i in $*; do rpm -q --quiet $i || continue; $ZYPPER remove $i || exit 1; done' || command-error "failed to remove package(s) $*" } opensuse-install-golang() { distro-install-pkg wget tar gzip git-core from-tarball-install-golang } opensuse-wait-for-zypper() { vm-run-until --timeout 5 '( ! 
pgrep zypper >/dev/null ) || ( pkill -9 zypper; sleep 1; exit 1 )' || error "Failed to stop zypper running in the background" } opensuse-install-k8s() { vm-command "( lsmod | grep -q br_netfilter ) || { echo br_netfilter > /etc/modules-load.d/50-br_netfilter.conf; modprobe br_netfilter; }" vm-command "echo 1 > /proc/sys/net/ipv4/ip_forward" vm-command "zypper ls" if ! grep -q snappy <<< "$COMMAND_OUTPUT"; then distro-install-repo "http://download.opensuse.org/repositories/system:/snappy/openSUSE_Leap_15.6 snappy" distro-refresh-pkg-db fi distro-install-pkg "snapd apparmor-profiles socat ebtables conntrackd iptables ethtool cni-plugins" distro-install-crictl vm-command "mkdir -p /opt/cni && ln -fs /usr/lib/cni/ -T /opt/cni/bin" vm-command "systemctl enable --now snapd" vm-command "snap wait system seed.loaded" for kubepart in kubelet kubectl kubeadm; do local snapcmd=install local k8sverparam if vm-command-q "snap info $kubepart | grep -q tracking"; then # $kubepart is already installed, either refresh or reinstall it. if [ "$(eval echo \$reinstall_$kubepart)" == "1" ]; then # Reinstalling $kubepart requested. # snap has no option for direct reinstalling, # so the package needs to be removed first. vm-command "snap remove $kubepart" snapcmd=install else snapcmd=refresh fi fi # Specify snap channel if user has requested a specific k8s version. if [[ "$k8s" == *.*.* ]]; then echo "WARNING: cannot snap install k8s=X.Y.Z, installing latest X.Y" k8sverparam="--channel ${k8s%.*}/edge" elif [[ "$k8s" == *.* ]]; then k8sverparam="--channel ${k8s}/edge" elif [[ -z "$k8s" ]]; then k8sverparam="" else error "invalid k8s version ${k8s}, expected k8s=X.Y" fi vm-command "snap $snapcmd $k8sverparam $kubepart --classic" done # Manage kubelet with systemd rather than snap vm-command "snap stop kubelet" cat < /etc/profile.d/linux_git.sh" } opensuse-bootstrap-commands-pre() { cat <> /etc/modules-load.d/k8s.conf || : modprobe nf-tables-bridge && echo nf-tables-bridge >> /etc/modules-load.d/k8s.conf || : modprobe br_netfilter && echo br_netfilter >> /etc/modules-load.d/k8s.conf || : touch /etc/sysctl.d/k8s.conf echo "net.bridge.bridge-nf-call-ip6tables = 1" >> /etc/sysctl.d/k8s.conf echo "net.bridge.bridge-nf-call-iptables = 1" >> /etc/sysctl.d/k8s.conf echo "net.ipv4.ip_forward = 1" >> /etc/sysctl.d/k8s.conf # rp_filter (partially) mitigates DDOS attacks with spoofed IP addresses # by dropping packages with non-routable (unanswerable) source addresses. # However, rp_filter > 0 breaks cilium networking. Make sure it's disabled. echo "net.ipv4.conf.*.rp_filter = 0" >> /etc/sysctl.d/k8s.conf /sbin/sysctl -p /etc/sysctl.d/k8s.conf || : EOF } default-setup-proxies() { # Notes: # We blindly assume that upper- vs. lower-case env vars are identical. 
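# (That is, http_proxy and HTTP_PROXY are assumed to hold the same value,
# so only the lower-case variants are inspected here.)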
# shellcheck disable=SC2154 if [ -z "$http_proxy$https_proxy$ftp_proxy$no_proxy" ]; then return 0 fi if vm-command-q "grep -q \"http_proxy=$http_proxy\" /etc/profile.d/proxy.sh && \ grep -q \"https_proxy=$https_proxy\" /etc/profile.d/proxy.sh && \ grep -q \"ftp_proxy=$ftp_proxy\" /etc/profile.d/proxy.sh && \ grep -q \"no_proxy=$no_proxy\" /etc/profile.d/proxy.sh" 2>/dev/null; then # No changes in proxy configuration return 0 fi local file scope="" append="--append" hn ext_no_proxy hn="$(vm-command-q hostname)" local master_node_ip_comma="" if [ -n "$k8smaster" ]; then local master_user_ip master_user_ip="$(vm-ssh-user-ip $k8smaster)" master_node_ip_comma=${master_user_ip/*@}, fi ext_no_proxy="$master_node_ip_comma$VM_IP,10.0.0.0/8,$CNI_SUBNET,$hn,.svc,.internal,192.168.0.0/16" for file in /etc/environment /etc/profile.d/proxy.sh; do cat < /etc/containerd/config.toml" fi vm-sed-file /etc/containerd/config.toml 's/^.*disabled_plugins *= *.*$/disabled_plugins = []/' if vm-command-q "containerd config dump | grep -v -q SystemdCgroup"; then vm-command "containerd config dump > /etc/containerd/config.toml" fi vm-sed-file /etc/containerd/config.toml 's/SystemdCgroup = false/SystemdCgroup = true/g' } default-restart-containerd() { vm-command "systemctl daemon-reload && systemctl restart containerd" || command-error "failed to restart containerd systemd service" } default-install-crio() { [ -n "$crio_src" ] || error "crio install error: crio_src is not set" [ -x "$crio_src/bin/crio" ] || error "crio install error: file not found $crio_src/bin/crio" for f in crio crio-status pinns; do vm-put-file "$crio_src/bin/$f" "/usr/bin/$f" done cat < /etc/systemd/system/crio.service.d/path.conf; systemctl daemon-reload" } default-config-crio() { vm-command "mkdir -p /etc/containers" echo '{"default": [{"type":"insecureAcceptAnything"}]}' | vm-pipe-to-file /etc/containers/policy.json cat </dev/null && rm go.tgz" && \ vm-command "echo 'PATH=/usr/local/go/bin:\$PATH' > /etc/profile.d/go.sh" && \ vm-command "echo \* installed \$(go version)" } } create-ext4-var-lib-containerd() { local dir="/var/lib/containerd" file="/loop-ext4.dsk" dev echo "Creating loopback-mounted ext4 $dir..." if ! dev="$(vm-command-q "losetup -f")" || [ -z "$dev" ]; then command-error "failed to find unused loopback device" fi vm-command "dd if=/dev/zero of=$file bs=$((1024*1000)) count=$((1000*5))" || command-error "failed to create file for ext4 loopback mount" vm-command "losetup $dev $file" || command-error "failed to attach $file to $dev" vm-command "mkfs.ext4 $dev" || command-error "failed to create ext4 filesystem on $dev ($file)" if vm-command "[ -d $dir ]"; then vm-command "mv $dir $dir.orig" || command-error "failed to rename original $dir to $dir.orig" fi vm-command "mkdir -p $dir" || command-error "failed to create $dir" cat <\e[0m "} HOST_LIB_DIR="$(dirname "${BASH_SOURCE[0]}")" HOST_PROJECT_DIR="$(dirname "$(dirname "$(realpath "$HOST_LIB_DIR")")")" HOST_VM_IMAGE_DIR=~/vms/images HOST_VM_DATA_DIR_TEMPLATE="~/vms/data/\${VM_NAME}" if [ -z "$HOST_GORESCTRL_DIR" ]; then HOST_GORESCTRL_DIR="$(realpath "$HOST_PROJECT_DIR/../goresctrl")" fi GOVM=${GOVM-govm} host-command() { command-start "host" "$HOST_PROMPT" "$1" bash -c "$COMMAND" 2>&1 | command-handle-output command-end ${PIPESTATUS[0]} return $COMMAND_STATUS } host-require-govm() { command -v "$GOVM" >/dev/null || error "cannot run govm \"$GOVM\". Check PATH or set GOVM=/path/to/govm." } host-require-cmd() { command -v "$1" >/dev/null || error "cannot run \"$1\". 
Check dependencies." } host-get-vm-config() { if [ -z "$1" ]; then error "can't get VM configuration, name not set" fi VM_NAME="$1" HOST_VM_DATA_DIR="$(eval "echo $HOST_VM_DATA_DIR_TEMPLATE")" VM_DATA_CONFIG="$HOST_VM_DATA_DIR/vm-config" if ! [ -f "$VM_DATA_CONFIG" ]; then return 1 fi source "$VM_DATA_CONFIG" if [ -z "$VM_NAME" ] || [ -z "$VM_DISTRO" ] || [ -z "$VM_CRI" ] || [ -z "$VM_SSH_USER" ]; then return 1 fi VM_COMPOSE_YAML="$HOST_VM_DATA_DIR/govm-compose.yaml" } host-set-vm-config() { if [ -z "$1" ]; then error "can't configure VM, name not set" fi if [ -z "$2" ]; then error "can't configure VM, distro not set" fi if [ -z "$3" ]; then error "can't configure VM, CRI runtime not set" fi VM_NAME="$1" VM_DISTRO="$2" VM_CRI="$3" VM_SSH_USER="$(vm-ssh-user)" HOST_VM_DATA_DIR="$(eval "echo $HOST_VM_DATA_DIR_TEMPLATE")" mkdir -p "$HOST_VM_DATA_DIR" VM_COMPOSE_YAML="$HOST_VM_DATA_DIR/govm-compose.yaml" VM_DATA_CONFIG="$HOST_VM_DATA_DIR/vm-config" cat > "$VM_DATA_CONFIG" < "$VM_COMPOSE_YAML" host-command "${GOVM} compose -f \"$VM_COMPOSE_YAML\"" echo "# VM base image : $VM_IMAGE" echo "# VM govm yaml : $VM_COMPOSE_YAML" } sleep 1 VM_CONTAINER_ID=$(${GOVM} ls | awk "/$VM_NAME/{print \$1}") # Verify Qemu version. Refuse to run if Qemu < 5.0. # Use "docker run IMAGE" instead of "docker exec CONTAINER", # because the container may have already failed. VM_CONTAINER_IMAGE=$(docker inspect $VM_CONTAINER_ID | jq '.[0].Image' -r | awk -F: '{print $2}') echo "# VM name : $VM_NAME" echo "# VM Linux distro: $VM_DISTRO" echo "# VM CRI : $VM_CRI" echo "# VM Docker image: $VM_CONTAINER_IMAGE" echo "# VM Docker cntnr: $VM_CONTAINER_ID" if [ -n "$VM_CONTAINER_IMAGE" ]; then VM_CONTAINER_QEMU_VERSION=$(docker run --rm --entrypoint=/usr/bin/qemu-system-x86_64 $VM_CONTAINER_IMAGE -version | awk '/QEMU emulator version/{print $4}') fi if [ -n "$VM_CONTAINER_QEMU_VERSION" ]; then if [[ "$VM_CONTAINER_QEMU_VERSION" > "5" ]]; then echo "# VM Qemu version: $VM_CONTAINER_QEMU_VERSION" else if [[ "$QEMU_CPUMEM" =~ ",dies=" ]]; then error "Too old Qemu version \"$VM_CONTAINER_QEMU_VERSION\". Topology with dies > 1 requires Qemu >= 5.0" else echo "# (Your Qemu does not support dies > 1, consider updating for full topology support)" fi fi else echo "Warning: cannot verify Qemu version on govm image. In case of failure, check it is >= 5.0" >&2 fi echo "# VM Qemu output : docker logs $VM_CONTAINER_ID" echo "# VM Qemu monitor: docker exec -it $VM_CONTAINER_ID nc local:/data/monitor" VM_MONITOR="docker exec -i $VM_CONTAINER_ID nc local:/data/monitor" host-wait-vm-ssh-server host-wait-cloud-init } get-ssh-timeout() { echo $((`date +%s` + $1)) } host-wait-vm-ssh-server() { timeout=`get-ssh-timeout 120` while [ "${1#-}" != "$1" ] && [ -n "$1" ]; do case "$1" in --timeout) timeout=`get-ssh-timeout $2` shift; shift ;; *) invalid="${invalid}${invalid:+,}\"$1\"" shift ;; esac done if [ -n "$invalid" ]; then error "invalid options: $invalid" return 1 fi if [ -z "$VM_IP" ]; then VM_IP=$(${GOVM} ls | awk "/$VM_NAME/{print \$4}") while [ "x$VM_IP" == "x" ]; do host-command "${GOVM} start \"$VM_NAME\"" sleep 5 VM_IP=$(${GOVM} ls | awk "/$VM_NAME/{print \$4}") done fi echo "# VM SSH server : ssh $VM_SSH_USER@$VM_IP" if [ -d "$HOME/vms/data/$VM_NAME" ]; then SSH_OPTS="$SSH_OPTS -o ControlMaster=auto -o ControlPath=$HOME/vms/data/$VM_NAME/ssh -o ControlPersist=30" SSH="${SSH%% *} $SSH_OPTS" SCP="${SCP%% *} $SSH_OPTS" export SSH SSH_OPTS SCP fi ssh-keygen -f "$HOME/.ssh/known_hosts" -R "$VM_IP" >/dev/null 2>&1 print_info=1 while ! 
$SSH ${VM_SSH_USER}@${VM_IP} -o ConnectTimeout=2 true 2>/dev/null; do CURR_TIME=`date +%s` if [ $CURR_TIME -gt $timeout ]; then error "timeout" fi if [ "$print_info" == 1 ]; then echo -n "Waiting for VM SSH server to respond..." print_info=0 fi sleep 2 echo -n "." done echo "" } host-wait-cloud-init() { retries=60 retries_left=$retries while true; do $SSH -o ConnectTimeout=2 ${VM_SSH_USER}@${VM_IP} sudo cloud-init status --wait 2>/dev/null [ "$?" -eq 0 -o "$?" -eq 2 ] && break if [ "$retries" == "$retries_left" ]; then echo -n "Waiting for VM cloud-init to finish..." fi sleep 2 echo -n "." retries_left=$(( $retries_left - 1 )) if [ "$retries_left" == "0" ]; then error "timeout" fi done [ "$retries" == "$retries_left" ] || echo "" } host-stop-vm() { #VM_NAME=$1 host-require-govm host-command "${GOVM} stop $VM_NAME" || { command-error "stopping govm \"$VM_NAME\" failed" } } host-delete-vm() { #VM_NAME=$1 host-require-govm host-command "${GOVM} delete $VM_NAME" || { command-error "deleting govm \"$VM_NAME\" failed" } } host-is-encrypted-ssh-key() { ssh-keygen -y -f "$1" < /dev/null >& /dev/null if [ $? != 0 ]; then return 0 else return 1 fi } host-mount-vm() { # Usage: host-mount-vm # # Mount VM / to VM data directory on host. # host-get-vm-config NAME must be run first. local mountpoint="${HOST_VM_DATA_DIR}/sshfs" local vm_sftp_server="" local vm_sftp_server_candidates=(/usr/lib/openssh/sftp-server /usr/libexec/sftp-server) local vm_sftp_server_candidate command -v sshfs >/dev/null || { error "host-mount-vm: missing sshfs" } if mount | grep "${mountpoint}"; then echo "host-mount-vm: already mounted" return 0 fi for vm_sftp_server_candidate in "${vm_sftp_server_candidates[@]}"; do if vm-command-q "command -v ${vm_sftp_server_candidate} >/dev/null"; then vm_sftp_server="${vm_sftp_server_candidate}" break fi done if [ -z "${vm_sftp_server}" ]; then error "cannot find sftp-server from vm" fi mkdir -p "${mountpoint}" sshfs "${VM_SSH_USER}@${VM_IP}:/" "${mountpoint}" -o sftp_server="/usr/bin/sudo ${vm_sftp_server}" $SSH_OPTS || { error "sshfs mount failed" } echo "host-mount-vm: mounted ${VM_NAME}:/ to ${mountpoint}" } ================================================ FILE: demo/lib/numactlH2numajson.py ================================================ #!/usr/bin/env python3 """numactlH2numajson - convert numactl -H output to numajson Example: numactl -H | numactlH2numajson """ import json import math import re import sys QEMU_DEFAULT_DIST_OTHER = 20 QEMU_DEFAULT_DIST_SELF = 10 def error(msg, exit_status=1): sys.stderr.write("numactlH2numajson: %s\n" % (msg,)) if not exit_status is None: sys.exit(1) def round_size(size, size_unit, non_zero_numbers=3): if size_unit == "kB": size_mb = size / 1024 elif size_unit == "MB": size_mb = size elif size_unit == "GB": size_mb = size * 1024 elif size_unit == "TB": size_mb = size * 1024 * 1024 else: raise Exception("unsupported size unit: %r" % (size_unit,)) if size_mb == 0: return "0G" size_mul = 10**int(math.log10(size_mb)) rounded = round(size_mb * 10**(non_zero_numbers-1) / size_mul) * size_mul / (10**(non_zero_numbers-1)) if size_mul < 1000: return "%.0fM" % (rounded,) else: return "%.0fG" % (rounded/1000) def add_dists_to_numalist(numalist, dists): """Add/replace distance information in numalist with node distances in dists. dists[i][j] = distance from node i to node j. 
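    For example, with two nodes, a matrix dists = [[10, 20], [20, 10]]
    says that each node is at distance 10 from itself and 20 from the other.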
dists can be a matrix or a dict: {sourcenode: {destnode: dist}}""" dist_matrix = [] node = -1 node_group = {} # {node: group_index_in_numalist} group_nodes = {} # {group_index_in_numalist: set_of_nodes} for groupindex, numaspec in enumerate(numalist): group_nodes[groupindex] = set() nodecount = int(numaspec.get("nodes", 1)) for _ in range(nodecount): node += 1 group_nodes[groupindex].add(node) node_group[node] = groupindex lastnode = node if isinstance(dists, list): # dists is a dist matrix. dist_matrix = dists else: # dists is a dict. create dist_matrix from it. for sourcenode in range(lastnode + 1): dist_matrix.append([]) for destnode in range(lastnode + 1): if sourcenode in dists and destnode in dists[sourcenode]: d = dists[sourcenode][destnode] elif sourcenode != destnode: d = QEMU_DEFAULT_DIST_OTHER else: d = QEMU_DEFAULT_DIST_SELF dist_matrix[-1].append(d) dist_freq = {} # {distance: number-of-appearances} try: for sourcenode in range(lastnode + 1): for destnode in range(lastnode + 1): if sourcenode != destnode: d = dist_matrix[sourcenode][destnode] dist_freq[d] = dist_freq.get(d, 0) + 1 except IndexError: raise ValueError("invalid dists matrix dimensions, %sx%s expected" % (lastnode + 1, lastnode + 1)) # Read the most common distance from the matrix, ignore distance-to-self. if len(dist_freq) > 0: default_dist = max([(v, k) for k, v in dist_freq.items()])[1] else: default_dist = QEMU_DEFAULT_DIST_SELF # don't care: there's only one node # Try filling symmetric distances with the default dist. # There may be asymmetry or node grouping that making this impossible. # In those cases sym_dist_errors > 0. sym_dist_errors = 0 group_node_dist = {} # {group_index: {othernode: dist}} for sourcenode in range(lastnode + 1): sourcegroup = node_group[sourcenode] if not sourcegroup in group_node_dist: group_node_dist[sourcegroup] = {} for destnode in range(lastnode + 1): destgroup = node_group[destnode] if sourcenode == destnode: continue elif dist_matrix[sourcenode][destnode] == default_dist: continue elif dist_matrix[sourcenode][destnode] != dist_matrix[destnode][sourcenode]: # There is asymmetry. sym_dist_errors += 1 continue for othernode in [n for n in group_nodes[sourcegroup] if n != sourcenode and n != destnode]: if (dist_matrix[othernode][destnode] != dist_matrix[sourcenode][destnode] or dist_matrix[othernode][destnode] != dist_matrix[destnode][sourcenode]): # Different nodes in the same group have different distances. sym_dist_errors += 1 group_node_dist[sourcegroup][destnode] = dist_matrix[sourcenode][destnode] # Clear existing distance definitions from numalist. for numaspec in numalist: if "dist" in numaspec: del numaspec["dist"] if "dist-all" in numaspec: del numaspec["dist-all"] if "node-dist" in numaspec: del numaspec["node-dist"] # Now we are ready to add distance information. if sym_dist_errors == 0 and len(str(group_node_dist)) < len(str(dist_matrix)): # Add info using "dist" and "node-dist", that is symmetrical distances. # This time it is more compact representation than a matrix. for groupindex, numaspec in enumerate(numalist): if group_node_dist[groupindex] != {}: # if all nodes mentioned in node-dist are in earlier groups, # there is no need to inject this definition, because it has been # covered by distance symmetry. 
nodes_with_dists = set(group_node_dist[groupindex].keys()) for earlier_group in range(groupindex): nodes_with_dists -= group_nodes[earlier_group] # there are new distance definitions, include all if len(nodes_with_dists) > 0: numaspec["node-dist"] = group_node_dist[groupindex] if default_dist != QEMU_DEFAULT_DIST_OTHER: numalist[0]["dist"] = default_dist elif len(numalist) > 1: # Add distances as a matrix. numalist[-1]["dist-all"] = dist_matrix else: # There is no need for distance information in the numalist, # as there is only one node. pass def numactlH2numajson(input_line_iter): numalist = [] dist_matrix = [] re_node_cpus = re.compile('^node (?P[0-9]+) cpus:( (?P([0-9]+\s?)*))?') re_node_size = re.compile('^node (?P[0-9]+) size:( (?P[0-9]+) (?P[a-zA-Z]+))?') re_node_distances = re.compile('^\s*(?P[0-9]+):(?P(\s*[0-9]+)*)') for line in input_line_iter: m = re_node_cpus.match(line) if m: m_dict = m.groupdict() node = int(m_dict["node"]) if m_dict["cpus"] is None: cpus = [] else: cpus = [int(cpu) for cpu in m.groupdict()["cpus"].strip().split()] continue m = re_node_size.match(line) if m: m_dict = m.groupdict() if int(m_dict["node"]) != node: raise Exception("expected node %s size, got %r" % (node, line)) size_unit = m_dict["size_unit"] mem = round_size(int(m_dict["size"]), size_unit) if (len(numalist) == 0 or numalist[-1]["cpu"] != len(cpus) or numalist[-1]["mem"] != mem): # found a node that is different from the previous numalist.append({"cpu": len(cpus), "mem": mem, "nodes": 1}) else: # found a node that looks the same as the previous numalist[-1]["nodes"] += 1 nodecount = node + 1 continue m = re_node_distances.match(line) if m: m_dict = m.groupdict() dist_matrix.append([int(d) for d in m_dict['dists'].strip().split()]) # filter out unnecessary "nodes": 1 from the list: for d in numalist: if d["nodes"] == 1: del d["nodes"] # parse distances add_dists_to_numalist(numalist, dist_matrix) return numalist def self_test(): input_output = { """available: 5 nodes (0-4) node 0 cpus: 0 node 0 size: 1007 MB node 0 free: 784 MB node 1 cpus: 1 node 1 size: 1007 MB node 1 free: 262 MB node 2 cpus: 2 3 node 2 size: 1951 MB node 2 free: 1081 MB node 3 cpus: 4 5 6 7 node 3 size: 4030 MB node 3 free: 693 MB node 4 cpus: node 4 size: 8039 MB node 4 free: 8029 MB node distances: node 0 1 2 3 4 0: 10 22 22 22 88 1: 22 10 22 22 88 2: 22 22 10 22 88 3: 22 22 22 10 88 4: 88 88 88 88 10 """: [{'cpu': 1, 'mem': '1G', 'nodes': 2, 'node-dist': {4: 88}, 'dist': 22}, {'cpu': 2, 'mem': '2G', 'node-dist': {4: 88}}, {'cpu': 4, 'mem': '4G', 'node-dist': {4: 88}}, {'cpu': 0, 'mem': '8G'}], """available: 2 nodes (0-1) node 0 cpus: 0 1 2 3 node 0 size: 3966 MB node 0 free: 1649 MB node 1 cpus: 4 5 6 7 node 1 size: 4006 MB node 1 free: 983 MB node distances: node 0 1 0: 10 20 1: 20 10 """: [{'cpu': 4, 'mem': '4G', 'nodes': 2}], """available: 4 nodes (0-3) node 0 cpus: 0 1 2 3 node 0 size: 3966 MB node 0 free: 1649 MB node 1 cpus: 4 5 6 7 node 1 size: 4006 MB node 1 free: 983 MB node 1 cpus: 8 9 10 11 node 1 size: 4006 MB node 1 free: 983 MB node 1 cpus: 12 13 14 15 node 1 size: 4006 MB node 1 free: 983 MB node distances: node 0 1 2 3 0: 10 55 55 55 1: 55 10 55 55 2: 55 55 10 55 3: 55 55 55 10 """: [{'cpu': 4, 'mem': '4G', 'nodes': 4, 'dist': 55}], """available: 1 nodes (0) node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 node 0 size: 128000 MB node 0 free: 80000 MB node distances: node 0 0: 10 """: [{'cpu': 20, 'mem': '128G'}], """available: 5 nodes (0-4) node 0 cpus: 0 node 0 size: 4007 MB node 0 free: 784 
MB node 1 cpus: 1 node 1 size: 1007 MB node 1 free: 262 MB node 2 cpus: 2 3 node 2 size: 1951 MB node 2 free: 1081 MB node 3 cpus: 4 5 6 7 node 3 size: 4030 MB node 3 free: 693 MB node 4 cpus: node 4 size: 8039 MB node 4 free: 8029 MB node distances: node 0 1 2 3 4 0: 10 22 33 44 55 1: 22 10 22 22 22 2: 33 22 10 22 22 3: 44 22 22 10 22 4: 55 22 22 22 10 """: [{'cpu': 1, 'mem': '4G', 'node-dist': {2: 33, 3: 44, 4: 55}, 'dist': 22}, {'cpu': 1, 'mem': '1G'}, {'cpu': 2, 'mem': '2G'}, {'cpu': 4, 'mem': '4G'}, {'cpu': 0, 'mem': '8G'}] } for input_string in input_output.keys(): observed = numactlH2numajson(input_string.splitlines()) expected = input_output[input_string] if observed != expected: raise Exception("self-test: observed/expected mismatch on numanodes\n%s\n\nobserved: %r\nexpected: %r" % (input_string, observed, expected)) add_dists_to_numalist([], []) return 0 if __name__ == "__main__": if len(sys.argv) > 1 and sys.argv[1] == "test": sys.exit(self_test()) try: numalist = numactlH2numajson(sys.stdin) except Exception as e: raise error(str(e)) print(json.dumps(numalist)) ================================================ FILE: demo/lib/topology.py ================================================ #!/usr/bin/env python3 """topology.py - topology utility Usage: topology.py [options] command Options: -t TOPOLOGY_DUMP load topology_dump from TOPOLOGY_DUMP file instead of the "topology_dump" environment variable or local host. -r RES_ALLOWED load res_allowed from RES_ALLOWED file instead of the "res_allowed" environment variable or local host. -o OUTPUT_FORMAT "json" or "text". The default is "text". Commands: help print help cpus view CPU topology from topology_dump. cpus_allowed [PROCESS...] view how matching PROCESSes are allowed to use CPUs. (Uses RES_ALLOWED like res_allowed below.) res view CPU and memory topology from topology_dump. res_allowed [PROCESS...] view how matching PROCESSes are allowed to use CPUs and memory in CPU/mem topology tree. If the RES_ALLOWED file or the res_allowed environment variable are not defined, "pgrep -f PROCESS" is used to match processes. bash_topology_dump print a Bash command that creates topology_dump. bash_res_allowed PROCESS [PROCESS...] print a Bash command that creates res_allowed dump that contains Cpus_allowed and Mems_allowed masks of processes matching "pgrep -f PROCESS". 
Examples: Print local host CPU topology $ topology.py cpus Print how processes with pod0..2 in their names are allowed to use CPUs $ topology.py res_allowed pod0 pod1 pod2 Print remote host CPU topology $ topology_dump="$(ssh remotehost "$(topology.py bash_topology_dump)")" topology.py cpus Watch how pod0..2 are allowed to CPUS on remote host, read topology only once $ export topology_dump="$(ssh remotehost "$(topology.py bash_topology_dump)")" $ watch 'res_allowed=$(ssh remotehost "$(topology.py bash_res_allowed pod0 pod1 pod2)") topology.py res_allowed' """ import getopt import json import os import re import subprocess import sys _bash_topology_dump = """for cpu in /sys/devices/system/cpu/cpu[0-9]*; do cpu_id=${cpu#/sys/devices/system/cpu/cpu}; echo "cpu p:$(< ${cpu}/topology/physical_package_id) d:$(< ${cpu}/topology/die_id) n:$(basename ${cpu}/node* | sed 's:node::g') c:$(< ${cpu}/topology/core_id) t:$(< ${cpu}/topology/thread_siblings) cpu:${cpu_id}" ; done; for node in /sys/devices/system/node/node[0-9]*; do node_id=${node#/sys/devices/system/node/node}; echo "dist n:$node_id d:$(< $node/distance)"; echo "mem n:$node_id s:$(awk '/MemTotal/{print $4/1024}' < $node/meminfo)"; done""" _bash_res_allowed = r"""for process in '%s'; do for pid in $(pgrep -f "$process"); do proc_pid_cmdline=$(< /proc/$pid/cmdline) || continue; proc_pid_status=$(< /proc/$pid/status) || continue; name=$(echo "$proc_pid_cmdline" | tr '\0 ' '\n' | grep -E "^$process" | head -n 1); [ -n "$name" ] && [ "$pid" != "$$" ] && [ "$pid" != "$PPID" ] && echo "${name}/${pid} $(awk '/Cpus_allowed:/{c=$2}/Mems_allowed:/{m=$2}END{print "c:"c" m:"m}' <<< "$proc_pid_status")"; done 2>/dev/null; done""" def error(msg, exit_status=1): """Print error message and exit.""" if not msg is None: sys.stderr.write('topology.py: %s\n' % (msg,)) if not exit_status is None: sys.exit(exit_status) def warning(msg): """Print warning.""" sys.stderr.write('topology.py warning: %s\n' % (msg,)) def output_tree(tree): """Print tree to output in OUTPUT_FORMAT""" if opt_output_format == "json": sys.stdout.write(json.dumps(tree)) else: sys.stdout.write(str_tree(tree) + "\n") sys.stdout.flush() def add_tree(root, branch, value_dict): """Add key-value pairs in value_dict to given branch in the tree starting from root. If the branch does not exist in the tree, it will be created. Example: add_tree(tree, ("package0", "die1", "node3", "core7", "thread0", "cpu15"), {"GHz", 4.2}) """ node = root for b in branch: if b in node: node = node[b] else: node[b] = {} node = node[b] node.update(value_dict) def _str_node(root, lines, branch): """Format node names in tree to lines ([[line1col1, line1col2], ...]).""" for key in sorted(root.keys()): branch.append(key) if root[key]: _str_node(root[key], lines, branch) else: # Add those column texts to the new line which does not have the same value # as previous non-empty text in the same column. 
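            # For example, if a previous line already shows
            # ["package0", "die0", "node0", ...] and the current branch is
            # ("package0", "die0", "node1", ...), the repeated "package0" and
            # "die0" columns are emitted as "" and only "node1" onwards is kept.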
new_line = [] new_col_txt_added = False for col, txt in enumerate(branch): if new_col_txt_added: prev_col_txt = "" else: for prev_line in lines[::-1]: if len(prev_line) > col and prev_line[col] != "": prev_col_txt = prev_line[col] break else: prev_col_txt = "" if txt != prev_col_txt: new_line.append(txt) new_col_txt_added = True else: new_line.append("") lines.append(new_line) branch.pop() def str_tree(root): """Format tree to string.""" lines = [] _str_node(root, lines, []) col_max_len = {} # {column-index: max-string-length} max_col = -1 for line in lines: for col, txt in enumerate(line): if col > max_col: max_col = col if len(txt) > col_max_len.get(col, -1): col_max_len[col] = len(txt) str_lines = [] for line in lines: line_cols = len(line) new_str_fmt = "" for col, txt in enumerate(line): new_str_fmt += "%-" + str(col_max_len[col] + 1) + "s" str_lines.append(new_str_fmt % tuple(line)) return "\n".join(str_lines) def bash_output(cmd): """Return standard output of executing cmd in Bash.""" p = subprocess.Popen(["bash", "-c", cmd], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) out, err = p.communicate() return out.decode("utf-8") def get_local_topology_dump(): """Return topology_dump from local system.""" return bash_output(_bash_topology_dump) def get_local_res_allowed_dump(processes): """Return res_allowed from local system.""" return bash_output(_bash_res_allowed % ("' '".join(processes),)) def dump_to_topology(dump, show_mem=True): """Parse topology_dump, return topology data structures.""" # Output data structures: tree = {} # {"package0": {"die1": {"node1": ...}}} cpu_branch = {} # {cpu_id: (package_name, die_name, node_name, core_name, thread_name, cpu_name)} node_branch = {} # {node_id: (package_name, die_name, node_name)} mem_branch = {} # {node_id: (package_name, ...)} # Example input line to be parsed: # cpu line: # "cpu p:0 d:1 n:3 c:2 t:00003000 cpu:13" # mem line: # "mem n:4: s:8063.83" re_cpu_line = re.compile('cpu p:(?P[0-9]+) d:(?P[0-9]*) n:(?P[0-9]+) c:(?P[0-9]+) t:(?P[0-9a-f,]+) cpu:(?P[0-9]+)') re_mem_line = re.compile('mem n:(?P[0-9]+) s:(?P[0-9.]+)') re_dist_line = re.compile('dist n:(?P[0-9]+) d:(?P([0-9 ]+))') numeric_cpu_lines = [] numeric_mem_lines = [] numeric_dist_lines = [] for line in dump.splitlines(): m = re_cpu_line.match(line) if m: mdict = m.groupdict() package = int(mdict["package"]) try: die = int(mdict["die"]) except ValueError: die = 0 # handle kernels that do not provide topology/die_id node = int(mdict["node"]) core = int(mdict["core"]) thread_siblings = eval("0x" + mdict["thread_siblings"].replace(",", "")) cpu_id = int(mdict["cpu_id"]) # Calculate thread id. # Let the lowest CPU bit owner in thread_siblings be thread 0, next thread 1 and so on. thread = -1 bit = 1 << cpu_id while bit: if thread_siblings & bit: thread += 1 bit >>= 1 numeric_cpu_lines.append((package, die, node, core, thread, cpu_id)) continue m = re_mem_line.match(line) if m: mdict = m.groupdict() numeric_mem_lines.append((int(mdict["node"]), float(mdict["size"]))) continue m = re_dist_line.match(line) if m: mdict = m.groupdict() numeric_dist_lines.append((int(mdict["node"]), tuple([int(n) for n in mdict["dist"].strip().split()]))) numeric_mem_lines.sort() # make sure memory sizes are from node 0, 1, ... 
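    # likewise, keep distance vectors in node id order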
numeric_dist_lines.sort() # Build tree on CPUs max_package_len = max(len(str(nl[0])) for nl in numeric_cpu_lines) max_die_len = max(len(str(nl[1])) for nl in numeric_cpu_lines) max_node_len = max(len(str(nl[2])) for nl in numeric_cpu_lines) max_core_len = max(len(str(nl[3])) for nl in numeric_cpu_lines) max_thread_len = max(len(str(nl[4])) for nl in numeric_cpu_lines) max_cpu_id_len = max(len(str(nl[5])) for nl in numeric_cpu_lines) for (package, die, node, core, thread, cpu_id) in numeric_cpu_lines: branch = ("package" + str(package).zfill(max_package_len), "die" + str(die).zfill(max_die_len), "node" + str(node).zfill(max_node_len), "core" + str(core).zfill(max_core_len), "thread" + str(thread).zfill(max_thread_len), "cpu" + str(cpu_id).zfill(max_cpu_id_len)) add_tree(tree, branch, {}) cpu_branch[cpu_id] = branch node_branch[node] = branch[:3] if show_mem: # Add node memory information to the tree for node, distvec in numeric_dist_lines: mem_node_name = "node" + str(node).zfill(max_node_len) node_mem_size = str(int(round((numeric_mem_lines[node][1]/1024)))) + "G" dists = sorted(distvec) if node in node_branch: # This node has CPU(s) as it has been added to the tree already in CPU lines. # Add memory branch to the tree under the existing node branch. branch = node_branch[node] + ( "mem", mem_node_name, node_mem_size) elif (dists[0] == 10 # sane distance-to-self and (len(dists) < 3 or dists[1] < dists[2]) # there is a node closer than others and distvec.index(dists[1]) in node_branch): # that node is already in the tree # This means that the node has the same memory controller as this node. # Add memory branch from this node under the existing node. node_same_ctrl = distvec.index(dists[1]) branch = node_branch[node_same_ctrl] + ( "mem", mem_node_name, node_mem_size) node_branch[node] = branch[:3] else: # Suitable memory controller not found, create completely separate branch. 
branch = ("packagex", "mem", "node" + str(node).zfill(max_node_len), "mem", mem_node_name, node_mem_size) node_branch[node] = branch[:3] add_tree(tree, branch, {}) mem_branch[node] = branch return {"tree": tree, "cpu_branch": cpu_branch, "node_branch": node_branch, "mem_branch": mem_branch} def dump_to_res_allowed(res_allowed_dump): """Parse res_allowed data, return allowed cpu and mem bitmasks in a data structure.""" # Output data structure: owner_mask = {} # {owner_string: {"cpu": bitmask_int, "mem": bitmask_int}} # Example input line to be parsed: # "pod2 c:040c0000,00000000 m:00000000,00000300" re_owner_mask = re.compile(r'(?P[^ ]+)\s+c:(?P[0-9a-f,]+)\s+m:(?P[0-9a-f,]+)') for line in res_allowed_dump.splitlines(): if not line: continue try: mdict = re_owner_mask.match(line).groupdict() except: warning("cannot parse res_allowed line %r" % (line,)) continue owner_mask[mdict["owner"]] = { "cpu": eval("0x" + mdict["cpumask"].replace(",", "")), "mem": eval("0x" + mdict["memmask"].replace(",", "")) } return owner_mask def get_topology(show_mem=True): """Return topology data structure.""" # Priority: use file, environment variable or read from local system if opt_topology_dump: topology_dump = opt_topology_dump else: topology_dump = os.getenv("topology_dump", None) if topology_dump is None: topology_dump = get_local_topology_dump() return dump_to_topology(topology_dump, show_mem=show_mem) def get_res_allowed(processes): """Return res_allowed data structure.""" # Priority: use file, environment variable or read from local system if opt_res_allowed_dump: res_allowed_dump = opt_res_allowed_dump else: res_allowed_dump = os.getenv("res_allowed", None) if res_allowed_dump is None: res_allowed_dump = get_local_res_allowed_dump(processes) return dump_to_res_allowed(res_allowed_dump) def report_res(show_mem=True): """Print topology tree.""" topology = get_topology(show_mem=show_mem) output_tree(topology["tree"]) def report_res_allowed(processes, show_mem=True): """Print topology tree with allowed processes as leaf nodes.""" topology = get_topology(show_mem=show_mem) tree = topology["tree"] cpu_branch = topology["cpu_branch"] mem_branch = topology["mem_branch"] node_branch = topology["node_branch"] max_cpu = max(cpu_branch.keys()) max_node = max(node_branch.keys()) res_allowed = get_res_allowed(processes) # add found owners to tree as children of cpus for owner, masks in sorted(res_allowed.items()): cpumask = masks["cpu"] memmask = masks["mem"] for cpu in range(max_cpu + 1): if cpumask & (1 << cpu): add_tree(tree, cpu_branch[cpu], {owner: {}}) if show_mem: for node in range(max_node + 1): if memmask & (1 << node): add_tree(tree, mem_branch[node], {owner: {}}) output_tree(tree) if __name__ == "__main__": opt_topology_dump = None opt_res_allowed_dump = None opt_output_format = "text" try: options, commands = getopt.gnu_getopt( sys.argv[1:], 'ht:r:o:', ['help', '--topology-dump-file=', '--res-allowed-file=']) except getopt.GetoptError as e: error(str(e)) for opt, arg in options: if opt in ["-h", "--help"]: print(__doc__) error(None, exit_status=0) elif opt in ["-t", "--topology-file"]: try: opt_topology_dump = open(arg).read() except IOError as e: error("cannot read topology dump from file %r: %s" % (arg, e)) elif opt in ["-r", "--res-allowed-file"]: try: opt_res_allowed_dump = open(arg).read() except IOError as e: error("cannot read res_allowed dump from file %r: %s" % (arg, e)) elif opt in ["-o"]: if arg in ["json", "text"]: opt_output_format = arg else: error("invalid output format %r") if not 
commands: error("missing command, see --help") elif commands[0] == "help": print(__doc__) error(None, exit_status=0) elif commands[0] == "cpus": report_res(show_mem=False) elif commands[0] == "cpus_allowed": report_res_allowed(commands[1:], show_mem=False) elif commands[0] == "res": report_res(show_mem=True) elif commands[0] == "res_allowed": report_res_allowed(commands[1:]) elif commands[0] == "bash_topology_dump": print(_bash_topology_dump) elif commands[0] == "bash_res_allowed": print(_bash_res_allowed % ("' '".join(commands[1:]),)) else: error('invalid command %r' % (commands[0],)) ================================================ FILE: demo/lib/topology2qemuopts.py ================================================ #!/usr/bin/env python3 """topology2qemuopts - convert NUMA node list from JSON to Qemu options NUMA node group definitions: "mem" mem (RAM) size on each NUMA node in this group. The default is "0G". "nvmem" nvmem (non-volatile RAM) size on each NUMA node in this group. The default is "0G". "dimm" "": the default, memory is there without pc-dimm defined. "plugged": start with cold plugged pc-dimm. "unplugged": start with free slot for hot plug. Add the dimm in Qemu monitor at runtime: device_add pc-dimm,id=dimmX,memdev=memX,node=X or device_add nvdimm,id=nvdimmX,memdev=nvmemX,node=X "cores" number of CPU cores on each NUMA node in this group. The default is 0. "threads" number of threads on each CPU core. The default is 2. "nodes" number of NUMA nodes on each die. The default is 1. "dies" number of dies on each package. The default is 1. "packages" number of packages. The default is 1. NUMA node distances are defined with following keys: "dist-all": [[from0to0, from0to1, ...], [from1to0, from1to1, ...], ...] distances from every node to all nodes. The order is the same as in to numactl -H "node distances:" output. "node-dist": {"node": dist, ...} symmetrical distances from nodes in this group to other nodes. Distances that apply to all NUMA groups if defined in any: "dist-same-die": N the default distance between NUMA nodes on the same die. "dist-same-package": N the default distance between NUMA nodes on the same package. "dist-other-package": N the default distance between NUMA nodes in other packages. Note that the distance from a node to itself is always 10. The default distance to a node on the same die is 11, and to other nodes on the same and different packages is 21. Example: Each of the first two NUMA groups in the list contains two NUMA nodes. Each node in the first group includes two CPU cores and 2G RAM, while nodes in the second group two CPU cores and 1G RAM. The only NUMA node defined in the third group has 8G of NVRAM, and no CPU. Every NUMA group with CPU cores adds a package (a socket) to the configuration, or many identical packages if "packages" > 1. This example creates a two-socket system, four CPU cores per package. Note that CPU cores are divided symmetrically to packages, meaning that every NUMA group with CPU cores should contain the same number of cores. 
$ ( cat << EOF [ { "mem": "2G", "cores": 2, "nodes": 2 }, { "mem": "1G", "cores": 2, "nodes": 2 }, { "nvmem": "8G", "node-dist": {"0": 88, "1": 88, "2": 88, "3": 88, "4": 66, "5": 66, "7": 66, "8": 66} } ] EOF ) | python3 topology2qemuopts.py """
import sys import json
DEFAULT_DIST = 21 DEFAULT_DIST_SAME_PACKAGE = 21 DEFAULT_DIST_SAME_DIE = 11 DEFAULT_DIST_SAME_NODE = 10
def error(msg, exitstatus=1): sys.stderr.write("topology2qemuopts: %s\n" % (msg,)) if exitstatus is not None: sys.exit(exitstatus)
def siadd(s1, s2): if s1.lower().endswith("g") and s2.lower().endswith("g"): return str(int(s1[:-1]) + int(s2[:-1])) + "G" raise ValueError('supports only sizes in gigabytes, example: 2G')
def sisub(s1, s2): if s1.lower().endswith("g") and s2.lower().endswith("g"): return str(int(s1[:-1]) - int(s2[:-1])) + "G" raise ValueError('supports only sizes in gigabytes, example: 2G')
def validate(numalist): if not isinstance(numalist, list): raise ValueError('expected list containing dicts, got %s' % (type(numalist).__name__)) valid_keys = set(("mem", "nvmem", "dimm", "cores", "threads", "nodes", "dies", "packages", "node-dist", "dist-all", "dist-other-package", "dist-same-package", "dist-same-die")) int_range_keys = {'cores': ('>= 0', lambda v: v >= 0), 'threads': ('> 0', lambda v: v > 0), 'nodes': ('> 0', lambda v: v > 0), 'dies': ('> 0', lambda v: v > 0), 'packages': ('> 0', lambda v: v > 0)} for numalistindex, numaspec in enumerate(numalist): for key in numaspec: if not key in valid_keys: raise ValueError('invalid name %r in node %r' % (key, numaspec)) if key in ["mem", "nvmem"]: val = numaspec.get(key) if val == "0": continue errmsg = 'invalid %s in node %r, expected string like "2G"' % (key, numaspec) if not isinstance(val, str): raise ValueError(errmsg) try: siadd(val, "0G") except ValueError: raise ValueError(errmsg) if key in int_range_keys: try: val = int(numaspec[key]) if not int_range_keys[key][1](val): raise Exception() except: raise ValueError('invalid %s in node %r, expected integer %s' % (key, numaspec, int_range_keys[key][0])) if 'threads' in numaspec and int(numaspec.get('cores', 0)) == 0: raise ValueError('threads set to %s but "cores" is 0 in node %r' % (numaspec["threads"], numaspec))
def dists(numalist): dist_dict = {} # Return value: {sourcenode: {destnode: dist}}, fully defined for all nodes sourcenode = -1 lastsocket = -1 dist_same_die = DEFAULT_DIST_SAME_DIE dist_same_package = DEFAULT_DIST_SAME_PACKAGE dist_other_package = DEFAULT_DIST # numalist "dist-other-package", if defined node_package_die = {} # topology {node: (package, die)} dist_matrix = None # numalist "dist-all", if defined node_node_dist = {} # numalist {sourcenode: {destnode: dist}}, if defined for sourcenode lastnode_in_group = -1 for groupindex, numaspec in enumerate(numalist): nodecount = int(numaspec.get("nodes", 1)) corecount = int(numaspec.get("cores", 0)) diecount = int(numaspec.get("dies", 1)) packagecount = int(numaspec.get("packages", 1)) first_node_in_group = sourcenode + 1 for package in range(packagecount): if nodecount > 0: lastsocket += 1 for die in range(diecount): for node in range(nodecount): sourcenode += 1 dist_dict[sourcenode] = {} node_package_die[sourcenode] = (lastsocket, die) lastnode_in_group = sourcenode + 1 if "dist-other-package" in numaspec: dist_other_package = numaspec["dist-other-package"] if "dist-same-die" in numaspec: dist_same_die = numaspec["dist-same-die"] if "dist-same-package" in numaspec: dist_same_package = numaspec["dist-same-package"] if "dist-all" in numaspec: dist_matrix = numaspec["dist-all"] if "node-dist" in numaspec: for n in range(first_node_in_group, lastnode_in_group): node_node_dist[n] = {int(nodename): value for nodename, value in numaspec["node-dist"].items()} if lastnode_in_group < 0: raise ValueError('no NUMA nodes found') lastnode = lastnode_in_group - 1 if dist_matrix is not None: # Fill the dist_dict directly from dist_matrix. # It must cover all distances. if len(dist_matrix) != lastnode + 1: raise ValueError("wrong dimensions in dist-all: %s rows seen, %s expected" % (len(dist_matrix), lastnode + 1)) for sourcenode, row in enumerate(dist_matrix): if len(row) != lastnode + 1: raise ValueError("wrong dimensions in dist-all on row %s: %s distances seen, %s expected" % (sourcenode + 1, len(row), lastnode + 1)) for destnode, source_dest_dist in enumerate(row): dist_dict[sourcenode][destnode] = source_dest_dist else: for sourcenode in range(lastnode + 1): for destnode in range(lastnode + 1): if sourcenode == destnode: dist_dict[sourcenode][destnode] = DEFAULT_DIST_SAME_NODE elif sourcenode in node_node_dist and destnode in node_node_dist[sourcenode]: # User specified explicit node-to-node distance dist_dict[sourcenode][destnode] = node_node_dist[sourcenode][destnode] dist_dict[destnode][sourcenode] = node_node_dist[sourcenode][destnode] elif not destnode in dist_dict[sourcenode]: # Set distance based on topology if node_package_die[sourcenode] == node_package_die[destnode]: dist_dict[sourcenode][destnode] = dist_same_die elif node_package_die[sourcenode][0] == node_package_die[destnode][0]: dist_dict[sourcenode][destnode] = dist_same_package else: dist_dict[sourcenode][destnode] = dist_other_package return dist_dict
def qemuopts(numalist): machineparam = "-machine pc" numaparams = [] objectparams = [] deviceparams = [] lastnode = -1 lastcpu = -1 lastdie = -1 lastsocket = -1 lastmem = -1 lastnvmem = -1 totalmem = "0G" totalnvmem = "0G" unpluggedmem = "0G" pluggedmem = "0G" memslots = 0 groupnodes = {} # groupnodes[NUMALISTINDEX] = (NODEID, ...) validate(numalist) # Read cpu counts, and "mem" and "nvmem" sizes for all nodes.
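# The nested loops below walk packages -> dies -> nodes within each NUMA
# group, assign ascending node/cpu/memory ids, and emit the matching
# "-object", "-numa" and "-device" Qemu options for each node.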
threadcount = -1 for numalistindex, numaspec in enumerate(numalist): nodecount = int(numaspec.get("nodes", 1)) groupnodes[numalistindex] = tuple(range(lastnode + 1, lastnode + 1 + nodecount)) corecount = int(numaspec.get("cores", 0)) if corecount > 0: if threadcount < 0: # threads per cpu, set only once based on the first cpu-ful numa node threadcount = int(numaspec.get("threads", 2)) threads_set_node = numaspec else: # threadcount already set, only check that there is no mismatch if (numaspec.get("threads", None) is not None and threadcount != int(numaspec.get("threads"))): raise ValueError('all CPUs must have the same number of threads, ' 'but %r had %s threads (the default) which contradicts %r' % (threads_set_node, threadcount, numaspec)) cpucount = int(numaspec.get("cores", 0)) * threadcount # logical cpus per numa node (cores * threads) diecount = int(numaspec.get("dies", 1)) packagecount = int(numaspec.get("packages", 1)) memsize = numaspec.get("mem", "0") memdimm = numaspec.get("dimm", "") if memsize != "0": memcount = 1 else: memcount = 0 nvmemsize = numaspec.get("nvmem", "0") if nvmemsize != "0": nvmemcount = 1 else: nvmemcount = 0 for package in range(packagecount): if nodecount > 0 and cpucount > 0: lastsocket += 1 for die in range(diecount): if nodecount > 0 and cpucount > 0: lastdie += 1 for node in range(nodecount): lastnode += 1 currentnumaparams = [] for mem in range(memcount): lastmem += 1 if memdimm == "": objectparams.append("-object memory-backend-ram,size=%s,id=membuiltin_%s_node_%s" % (memsize, lastmem, lastnode)) currentnumaparams.append("-numa node,nodeid=%s,memdev=membuiltin_%s_node_%s" % (lastnode, lastmem, lastnode)) elif memdimm == "plugged": objectparams.append("-object memory-backend-ram,size=%s,id=memdimm_%s_node_%s" % (memsize, lastmem, lastnode)) currentnumaparams.append("-numa node,nodeid=%s" % (lastnode,)) deviceparams.append("-device pc-dimm,node=%s,id=dimm%s,memdev=memdimm_%s_node_%s" % (lastnode, lastmem, lastmem, lastnode)) pluggedmem = siadd(pluggedmem, memsize) memslots += 1 elif memdimm == "unplugged": objectparams.append("-object memory-backend-ram,size=%s,id=memdimm_%s_node_%s" % (memsize, lastmem, lastnode)) currentnumaparams.append("-numa node,nodeid=%s" % (lastnode,)) unpluggedmem = siadd(unpluggedmem, memsize) memslots += 1 else: raise ValueError("unsupported dimm %r, expected 'plugged' or 'unplugged'" % (memdimm,)) totalmem = siadd(totalmem, memsize) for nvmem in range(nvmemcount): lastnvmem += 1 lastmem += 1 if lastnvmem == 0: machineparam += ",nvdimm=on" # Don't use file-backed nvdimms because the file would # need to be accessible from the govm VM # container. Everything is ram-backed on host for now. 
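# For example (illustrative values only), a "plugged" nvdimm on node 4
# with nvmemsize 8G and lastmem 2 produces the option pair:
#   -object memory-backend-ram,size=8G,id=memnvdimm_2_node_4
#   -device nvdimm,node=4,id=nvdimm2,memdev=memnvdimm_2_node_4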
if memdimm == "": objectparams.append("-object memory-backend-ram,size=%s,id=memnvbuiltin_%s_node_%s" % (nvmemsize, lastmem, lastnode)) currentnumaparams.append("-numa node,nodeid=%s,memdev=memnvbuiltin_%s_node_%s" % (lastnode, lastmem, lastnode)) elif memdimm == "plugged": objectparams.append("-object memory-backend-ram,size=%s,id=memnvdimm_%s_node_%s" % (nvmemsize, lastmem, lastnode)) currentnumaparams.append("-numa node,nodeid=%s" % (lastnode,)) deviceparams.append("-device nvdimm,node=%s,id=nvdimm%s,memdev=memnvdimm_%s_node_%s" % (lastnode, lastmem, lastmem, lastnode)) pluggedmem = siadd(pluggedmem, nvmemsize) memslots += 1 elif memdimm == "unplugged": objectparams.append("-object memory-backend-ram,size=%s,id=memnvdimm_%s_node_%s" % (nvmemsize, lastmem, lastnode)) currentnumaparams.append("-numa node,nodeid=%s" % (lastnode,)) unpluggedmem = siadd(unpluggedmem, nvmemsize) memslots += 1 else: raise ValueError("unsupported dimm %r, expected 'plugged' or 'unplugged'" % (memdimm,)) totalnvmem = siadd(totalnvmem, nvmemsize) if cpucount > 0: if not currentnumaparams: currentnumaparams.append("-numa node,nodeid=%s" % (lastnode,)) currentnumaparams[-1] = currentnumaparams[-1] + (",cpus=%s-%s" % (lastcpu + 1, lastcpu + cpucount)) lastcpu += cpucount numaparams.extend(currentnumaparams) node_node_dist = dists(numalist) for sourcenode in sorted(node_node_dist.keys()): for destnode in sorted(node_node_dist[sourcenode].keys()): if sourcenode == destnode: continue numaparams.append("-numa dist,src=%s,dst=%s,val=%s" % ( sourcenode, destnode, node_node_dist[sourcenode][destnode])) if lastcpu == -1: raise ValueError('no CPUs found, make sure at least one NUMA node has "cores" > 0') if (lastdie + 1) // (lastsocket + 1) > 1: diesparam = ",dies=%s" % ((lastdie + 1) // (lastsocket + 1),) else: # Don't give dies parameter unless it is absolutely necessary # because it requires Qemu >= 5.0. 
diesparam = "" cpuparam = "-smp cpus=%s,threads=%s%s,sockets=%s" % (lastcpu + 1, threadcount, diesparam, lastsocket + 1) maxmem = siadd(totalmem, totalnvmem) startmem = sisub(sisub(maxmem, unpluggedmem), pluggedmem) memparam = "-m size=%s,slots=%s,maxmem=%s" % (startmem, memslots, maxmem) if startmem.startswith("0"): if pluggedmem.startswith("0"): raise ValueError('no memory in any NUMA node') raise ValueError("no initial memory in any NUMA node - cannot boot with hotpluggable memory") return (machineparam + " " + cpuparam + " " + memparam + " " + " ".join(numaparams) + " " + " ".join(deviceparams) + " " + " ".join(objectparams) ) def main(input_file): try: numalist = json.loads(input_file.read()) except Exception as e: error("error reading JSON: %s" % (e,)) try: print(qemuopts(numalist)) except Exception as e: error("error converting JSON to Qemu opts: %s" % (e,)) if __name__ == "__main__": if len(sys.argv) > 1: if sys.argv[1] in ["-h", "--help"]: print(__doc__) sys.exit(0) else: input_file = open(sys.argv[1]) else: input_file = sys.stdin main(input_file) ================================================ FILE: demo/lib/vm.bash ================================================ # shellcheck disable=SC1091 # shellcheck source=command.bash source "$(dirname "${BASH_SOURCE[0]}")/command.bash" # shellcheck disable=SC1091 # shellcheck source=distro.bash source "$(dirname "${BASH_SOURCE[0]}")/distro.bash" VM_PROMPT=${VM_PROMPT-"\e[38;5;11mroot@vm>\e[0m "} vm-compose-govm-template() { (echo " vms: - name: ${VM_NAME} image: ${VM_IMAGE} cloud: true ContainerEnvVars: - KVM_CPU_OPTS=${VM_QEMU_CPUMEM:=-machine pc -smp cpus=4 -m 8G} - EXTRA_QEMU_OPTS=-monitor unix:/data/monitor,server,nowait ${VM_QEMU_EXTRA} - USE_NET_BRIDGES=${USE_NET_BRIDGES:-0} $(for govm_env in $(distro-govm-env); do echo " - ${govm_env}"; done) user-data: | #!/bin/bash set -e " (if [ -n "$VM_EXTRA_BOOTSTRAP_COMMANDS" ]; then # shellcheck disable=SC2001 sed 's/^/ /g' <<< "${VM_EXTRA_BOOTSTRAP_COMMANDS}" fi # shellcheck disable=SC2001 sed 's/^/ /g' <<< "$(distro-bootstrap-commands)")) | grep -E -v '^ *$' } vm-bootstrap() { distro-bootstrap-commands | vm-pipe-to-file "./e2e-bootstrap.sh" vm-command "sh ./e2e-bootstrap.sh" host-wait-vm-ssh-server --timeout 600 } vm-image-url() { distro-image-url } vm-ssh-user() { if [ -n "$VM_SSH_USER" ]; then echo "$VM_SSH_USER" else distro-ssh-user fi } vm-is-govm() { # script API local name="${1:-$VM_NAME}" # Usage: vm-is-govm [name] # # Check if the given name (or $VM_NAME if omitted) corresponds to # a govm-managed virtual machine. Returns 0 if it does. Returns 1 # if it does not. Returns 2 if govm is not installed. if ! type -f govm >& /dev/null; then return 2 fi if [ -z "$name" ]; then return 1 fi if govm ls | cut -d ' ' -f 2 | grep -q "^$name$"; then return 0 fi return 1 } vm-check-env() { # If VM IP address is already defined, govm is not needed. if [ -n "$VM_IP" ]; then if [ "x$(vm-command-q "whoami")" != "xroot" ]; then echo "ERROR:" echo "ERROR: environment check failed:" echo "ERROR: cannot run commands (with sudo) when connecting" echo "ERROR: $SSH $VM_SSH_USER@$VM_IP" echo "ERROR:" return 1 fi return 0 fi # Check that VM created/managed with govm in this environment. type -p govm >& /dev/null || { echo "ERROR:" echo "ERROR: environment check failed:" echo "ERROR: govm binary not found." 
echo "ERROR:" echo "ERROR: You can install it using the following commands:" echo "ERROR:" echo "ERROR: git clone https://github.com/govm-project/govm" echo "ERROR: cd govm" echo "ERROR: go build -o govm" echo "ERROR: cp -v govm \$GOPATH/bin" echo "ERROR: docker build . -t govm/govm:latest" echo "ERROR: cd .." echo "ERROR:" return 1 } docker inspect govm/govm >& /dev/null || { echo "ERROR:" echo "ERROR: environment check failed:" echo "ERROR: govm/govm docker image not present (but govm needs it)." echo "ERROR:" echo "ERROR: You can install it using the following commands:" echo "ERROR:" echo "ERROR: git clone https://github.com/govm-project/govm" echo "ERROR: cd govm" echo "ERROR: docker build . -t govm/govm:latest" echo "ERROR: cd .." echo "ERROR:" return 1 } if [ ! -e "$SSH_KEY".pub ]; then echo "ERROR:" echo "ERROR: environment check failed:" echo "ERROR: $SSH_KEY.pub SSH public key not found (but govm needs it)." echo "ERROR:" echo "ERROR: You can generate it using the following command:" echo "ERROR:" echo "ERROR: ssh-keygen" echo "ERROR:" return 1 fi if [ -n "$SSH_AUTH_SOCK" ] && [ -e "$SSH_AUTH_SOCK" ]; then if ! ssh-add -l | grep -q "$(ssh-keygen -l -f "$SSH_KEY" < /dev/null 2>/dev/null | awk '{print $2}')"; then if ! ssh-add "$SSH_KEY" < /dev/null; then echo "ERROR:" echo "ERROR: environment setup failed:" echo "ERROR: Failed to load $SSH_KEY SSH key to agent." echo "ERROR:" echo "ERROR: Please make sure an SSH agent is running, then" echo "ERROR: try loading the key using the following command:" echo "ERROR:" echo "ERROR: ssh-add $SSH_KEY" echo "ERROR:" return 1 fi fi else if host-is-encrypted-ssh-key "$SSH_KEY"; then echo "ERROR:" echo "ERROR: environment setup failed:" echo "ERROR: $SSH_KEY SSH key is encrypted, but agent is not running." echo "ERROR:" echo "ERROR: Please make sure an SSH agent is running, then" echo "ERROR: try loading the key using the following command:" echo "ERROR:" echo "ERROR: ssh-add $SSH_KEY" echo "ERROR:" return 1 fi fi } vm-check-running-binary() { local bin_file="$1" local bin_name bin_name="$(basename "$bin_file")" pid_of_bin="$(vm-command-q "pidof $bin_name")" if [ -f "$bin_file" ] && [ -n "$pid_of_bin" ] && [ "$(vm-command-q "md5sum < /proc/$pid_of_bin/exe")" != "$(md5sum < "$bin_file")" ]; then echo "WARNING:" echo "WARNING: Running $bin_name binary is different from" echo "WARNING: $bin_file" echo "WARNING: Consider restarting with reinstall_${bin_name//-/_}=1." echo "WARNING:" sleep "${warning_delay:-0}" return 1 fi return 0 } vm-check-source-files-changed() { local bin_change local src_change local src_dir="$1" local bin_file="$2" bin_change=$(stat --format "%Z" "$bin_file") src_change=$(find "$src_dir" -name '*.go' -type f -print0 | xargs -0 stat --format "%Z" | sort -n | tail -n 1) if [[ "$src_change" > "$bin_change" ]]; then echo "WARNING:" echo "WARNING: Source files changed, outdated binaries in" echo "WARNING: $(dirname "$bin_file")/" echo "WARNING:" sleep "${warning_delay:-0}" fi } vm-command() { # script API # Usage: vm-command COMMAND # # Execute COMMAND on virtual machine as root. # Returns the exit status of the execution. # Environment variable COMMAND_OUTPUT contains what COMMAND printed # in standard output and error. 
# # Examples: # vm-command "kubectl get pods" # vm-command "whoami | grep myuser" || command-error "user is not myuser" command-start "vm" "$VM_PROMPT" "$1" if [ "$2" == "bg" ]; then ( $SSH "${VM_SSH_USER}@${VM_IP}" sudo bash -l <<<"$COMMAND" 2>&1 | command-handle-output ; command-end "${PIPESTATUS[0]}" ) & command-runs-in-bg else $SSH "${VM_SSH_USER}@${VM_IP}" sudo bash -l <<<"$COMMAND" 2>&1 | command-handle-output ; command-end "${PIPESTATUS[0]}" fi return "$COMMAND_STATUS" } vm-command-q() { $SSH "${VM_SSH_USER}@${VM_IP}" sudo bash -l <<<"$1" } vm-ssh-user-ip() { # Usage: vm-ssh-user-ip NODE # # Print canonical USER@HOST for NODE. NODE can be a govm vm name # or already of the form: USER@HOST. local NODE="$1" local node_ssh_user="" local node_ssh_ip="" if [[ "$NODE" == *"@"* ]]; then node_ssh_ip=${NODE/*@} node_ssh_user=${NODE%@*} else node_ssh_ip=$(${GOVM} ls | awk "/$NODE/{print \$4}") node_ssh_user=$( host-get-vm-config $NODE && echo $VM_SSH_USER ) fi if [ -z "$node_ssh_ip" ]; then error "cannot find IP address for NODE=$NODE" fi if [ -z "$node_ssh_user" ]; then error "cannot find ssh user for NODE=$NODE" fi echo "${node_ssh_user}@${node_ssh_ip}" } vm-join() { # Usage: vm-join MASTER_NODE # # Join vm to the cluster whose master node is MASTER_NODE." # MASTER_NODE is a name of a govm virtual machine, or # "USER@HOST" that can be logged into using ssh. local MASTER_NODE="$1" local master_user_ip local k8s_join_cmd k8s_join_cmd="$(vm-join-cmd "$MASTER_NODE")" vm-command "$k8s_join_cmd" || { command-error "joining to the cluster master ($MASTER_NODE) failed" } # Enable using kubectl on the worker vm by # copying k8s admin configuration on it. master_user_ip="$(vm-ssh-user-ip $MASTER_NODE)" ssh "$master_user_ip" "sudo cat /etc/kubernetes/admin.conf" | vm-pipe-to-file "/root/.kube/config" } vm-join-cmd() { # Usage: vm-join-cmd MASTER_NODE # # Print a join command to join VM to existing cluster MASTER_NODE. # MASTER_NODE is a name of a govm virtual machine (exists in "govm ls") # or USERNAME@IP. local MASTER_NODE="$1" local master_user_ip local k8s_join_cmd="" master_user_ip="$(vm-ssh-user-ip $MASTER_NODE)" local ssh_get_join_cmd="ssh $master_user_ip sudo kubeadm token create --print-join-command" k8s_join_cmd="$( $ssh_get_join_cmd )" if [[ "$k8s_join_cmd" != *" join "* ]]; then error "failed to get kubeadm join command: $k8s_join_cmd" fi echo $k8s_join_cmd } vm-mem-hotplug() { # script API # Usage: vm-mem-hotplug MEMORY # # Hotplug currently unplugged MEMORY to VM. # Find unplugged memory with "vm-mem-hw | grep unplugged". # # Examples: # vm-mem-hotplug mem2 local memmatch memline memid memdimm memnode memdriver memmatch=$1 if [ -z "$memmatch" ]; then error "missing MEMORY" return 1 fi memline="$(vm-mem-hw | grep unplugged | grep "$memmatch")" if [ -z "$memline" ]; then error "unplugged memory matching '$memmatch' not found" return 1 fi memid="$(awk '{print $1}' <<< "$memline")" memid=${memid#mem} memid=${memid%[: ]*} memdimm="$(awk '{print $2}' <<< "$memline")" memnode="$(awk '{print $4}' <<< "$memline")" memnode=${memnode#node} if [ "$memdimm" == "nvdimm" ]; then memdriver="nvdimm" else memdriver="pc-dimm" fi vm-monitor "device_add ${memdriver},id=${memdimm}${memid},memdev=mem${memdimm}_${memid}_node_${memnode},node=${memnode}" } vm-mem-hotremove() { # script API # Usage: vm-mem-hotremove MEMORY # # Hotremove currently plugged MEMORY from VM. # Find plugged memory with "vm-mem-hw | grep ' plugged'". 
# # Examples: # vm-mem-hotremove mem2 local memmatch memline memid memdimm memnode memdriver memmatch=$1 if [ -z "$memmatch" ]; then error "missing MEMORY" return 1 fi memline="$(vm-mem-hw | grep ' plugged' | grep "$memmatch")" if [ -z "$memline" ]; then error "plugged memory matching '$memmatch' not found" return 1 fi memid="$(awk '{print $1}' <<< "$memline")" memid=${memid#mem} memid=${memid%[: ]*} memdimm="$(awk '{print $2}' <<< "$memline")" vm-monitor "device_del ${memdimm}${memid}" }
vm-mem-hw() { # script API # Usage: vm-mem-hw # # List VM memory hardware with current status. # See also: vm-mem-hotplug, vm-mem-hotremove vm-monitor "$(echo info memdev; echo info memory-devices)" | awk ' /memdev: /{ split($2,a,"_"); state[a[2]]="plugged "; } /memory backend: membuiltin/{ split($3,a,"_"); backend=1; type[a[2]]="ram "; state[a[2]]="builtin "; node[a[2]]=a[4]; } /memory backend: memnvbuiltin/{ split($3,a,"_"); backend=1; type[a[2]]="nvram "; state[a[2]]="builtin "; node[a[2]]=a[4]; } /memory backend: memnvdimm/{ split($3,a,"_"); backend=1; type[a[2]]="nvdimm "; state[a[2]]="unplugged"; node[a[2]]=a[4]; } /memory backend: memdimm/{ split($3,a,"_"); backend=1; type[a[2]]="dimm "; state[a[2]]="unplugged"; node[a[2]]=a[4]; } /size: /{sz=$2/1024/1024; if (backend==1) {size[a[2]]=sz;backend=0;}} END{ for (m in node) print "mem"m": "type[m]" "state[m]" node"node[m]" size="size[m]"M"; }' }
vm-monitor() { # script API # Usage: vm-monitor COMMAND # # Execute COMMAND on Qemu monitor. # # Example: VM monitor help: # vm-monitor "help" | less # # Example: print memdev objects and plugged in memory devices: # vm-monitor "info memdev" # vm-monitor "info memory-devices" # # Example: hot plug a NVDIMM to NUMA node 1 when launched with topology # topology='[{"cores":2,"mem":"2G"},{"nvmem":"4G","dimm":"unplugged"}]': # vm-monitor "device_add pc-dimm,id=nvdimm0,memdev=nvmem0,node=1" [ -n "$VM_MONITOR" ] || error "VM is not running" eval "$VM_MONITOR" <<< "$1" | sed 's/\r//g' if [ "${PIPESTATUS[0]}" != "0" ]; then error "sending command to Qemu monitor failed" fi echo "" }
vm-wait-process() { # script API # Usage: vm-wait-process [--timeout TIMEOUT] [--pidfile PIDFILE] PROCESS # # Wait for a PROCESS (string) to appear in process list (pidof output). # If pidfile parameter is given, we also check that the process has that file open. # The default TIMEOUT is 30 seconds. local process timeout pidfile invalid timeout=30 while [ "${1#-}" != "$1" ] && [ -n "$1" ]; do case "$1" in --timeout) timeout="$2" shift 2 ;; --pidfile) pidfile="$2" shift 2 ;; *) invalid="${invalid}${invalid:+,}\"$1\"" shift ;; esac done if [ -n "$invalid" ]; then error "invalid options: $invalid" return 1 fi process="$1" vm-run-until --timeout "$timeout" "pidof \"$process\" > /dev/null" || error "timeout while waiting $process" # As we first wait for the process, and then wait for the pidfile (if enabled) # we might wait longer than expected. Accept that anomaly atm. if [ ! -z "$pidfile" ]; then vm-run-until --timeout $timeout "[ ! -z \"\$(fuser $pidfile 2>/dev/null)\" ]" || error "timeout while waiting $pidfile" vm-run-until --timeout $timeout "[ \$(fuser $pidfile 2>/dev/null) -eq \$(pidof $process) ]" || error "timeout while waiting $process and $pidfile" fi }
vm-run-until() { # script API # Usage: vm-run-until [--timeout TIMEOUT] CMD # # Keep running CMD (string) until it exits successfully. # The default TIMEOUT is 30 seconds.
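    #
    # Example (as used elsewhere in this library):
    #   vm-run-until --timeout 30 "kubectl get sa default > /dev/null"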
local cmd timeout invalid timeout=30 while [ "${1#-}" != "$1" ] && [ -n "$1" ]; do case "$1" in --timeout) timeout="$2" shift; shift ;; *) invalid="${invalid}${invalid:+,}\"$1\"" shift ;; esac done if [ -n "$invalid" ]; then error "invalid options: $invalid" return 1 fi cmd="$1" if ! vm-command-q "retry=$timeout; until $cmd; do retry=\$(( \$retry - 1 )); [ \"\$retry\" == \"0\" ] && exit 1; sleep 1; done"; then error "waiting for command \"$cmd\" to exit successfully timed out after $timeout s" fi } vm-write-file() { local vm_path_file="$1" local file_content_b64 file_content_b64="$(base64 <<<"$2")" vm-command-q "mkdir -p $(dirname "$vm_path_file"); echo -n \"$file_content_b64\" | base64 -d > \"$vm_path_file\"" } vm-put-file() { # script API # Usage: vm-put-file [--cleanup] [--append] SRC-HOST-FILE DST-VM-FILE # # Copy SRC-HOST-FILE to DST-VM-FILE on the VM, removing # SRC-HOST-FILE if called with the --cleanup flag, and # appending instead of copying if the --append flag is # specified. # # Example: # src=$(mktemp) && \ # echo 'Ahoy, Matey...' > $src && \ # vm-put-file --cleanup $src /etc/motd local cleanup append invalid while [ "${1#-}" != "$1" ] && [ -n "$1" ]; do case "$1" in --cleanup) cleanup=1 shift ;; --append) append=1 shift ;; *) invalid="${invalid}${invalid:+,}\"$1\"" shift ;; esac done if [ -n "$cleanup" ] && [ -n "$1" ]; then # shellcheck disable=SC2064 trap "rm -f \"$1\"" RETURN EXIT fi if [ -n "$invalid" ]; then error "invalid options: $invalid" return 1 fi [ "$(dirname "$2")" == "." ] || vm-command-q "[ -d \"$(dirname "$2")\" ]" || vm-command "mkdir -p \"$(dirname "$2")\"" || command-error "cannot create vm-put-file destination directory to VM" host-command "$SCP \"$1\" ${VM_SSH_USER}@${VM_IP}:\"vm-put-file.${1##*/}\"" || command-error "failed to copy file to VM" if [ -z "$append" ]; then vm-command "mv \"vm-put-file.${1##*/}\" \"$2\"" || command-error "failed to rename file" else vm-command "touch \"$2\" && cat \"vm-put-file.${1##*/}\" >> \"$2\" && rm -f \"vm-put-file.${1##*/}\"" || command-error "failed to append file" fi } vm-put-pkg() { # script API # Usage: vm-put-pkg [--force] HOST-FILE... # # Copies HOST-FILEs from host to vm and installs them. # # Examples: # vm-put-pkg /tmp/kernel.rpm /tmp/myutil.rpm local host_pkg local vm_pkgs="" local force="" if [ "$1" == "--force" ]; then force="--force " shift fi for host_pkg in "$@"; do local vm_pkg="pkgs/$(basename "$host_pkg")" vm-command-q "mkdir -p $(dirname "$vm_pkg")" vm-put-file "$host_pkg" "$vm_pkg" vm_pkgs="$vm_pkgs $vm_pkg" done distro-install-pkg-local $force "$vm_pkgs" } vm-put-docker-image() { # script API # Usage: vm-put-docker-image IMAGE # # Exports IMAGE from docker images on the host, and # imports it in the "k8s.io" namespace (visible # for kubernetes containers) on the vm. # # Works with containerd only. # # Examples: # vm-put-docker-image busybox:latest local image_name="$1" local image_file_on_vm="images/${image_name//:/__}" vm-command-q "mkdir -p $(dirname "$image_file_on_vm")" docker save "$image_name" | vm-pipe-to-file "$image_file_on_vm" || error "failed to save and pipe image '$image_name'" vm-cri-import-image "$image_name" "$image_file_on_vm" } vm-pipe-to-file() { # script API # Usage: vm-pipe-to-file [--append] DST-VM-FILE # # Reads stdin and writes the content to DST-VM-FILE, creating any # intermediate directories necessary. # # Example: # echo 'Ahoy, Matey...' 
| vm-pipe-to-file /etc/motd local tmp append tmp="$(mktemp vm-pipe-to-file.XXXXXX)" if [ "$1" = "--append" ]; then append="--append" shift fi cat > "$tmp" vm-put-file --cleanup $append "$tmp" "$1" } vm-sed-file() { # script API # Usage: vm-sed-file PATH-IN-VM SED-EXTENDED-REGEXP-COMMANDS # # Edits the given file in place with the given extended regexp # sed commands. # # Example: # vm-sed-file /etc/motd 's/Matey/Guybrush Threepwood/' local file="$1" cmd shift for cmd in "$@"; do vm-command "sed -E -i \"$cmd\" $file" || command-error "failed to edit $file with sed" done } vm-set-kernel-cmdline() { # script API # Usage: vm-set-kernel-cmdline E2E-DEFAULTS # # Adds/replaces E2E-DEFAULTS to kernel command line" # # Example: # vm-set-kernel-cmdline nr_cpus=4 # vm-reboot # vm-command "cat /proc/cmdline" # launch cri-resmgr distro-set-kernel-cmdline "$@" } vm-reboot() { # script API # Usage: vm-reboot # # Reboots the virtual machine and waits that the ssh server starts # responding again. vm-command "reboot" sleep 10 if ! host-wait-vm-ssh-server; then vm-monitor system_reset host-wait-vm-ssh-server fi } vm-setup-proxies() { distro-setup-proxies } vm-networking() { vm-command-q "touch /etc/hosts; grep -q \$(hostname) /etc/hosts" || { vm-command "echo \"$VM_IP \$(hostname)\" >>/etc/hosts" } vm-setup-proxies } vm-install-cri-resmgr() { prefix=/usr/local # shellcheck disable=SC2154 if [ "$binsrc" == "github" ]; then vm-install-golang vm-install-pkg make vm-command "go get -d -v github.com/intel/cri-resource-manager" CRI_RESMGR_SOURCE_DIR=$(awk '/package.*cri-resource-manager/{print $NF}' <<< "$COMMAND_OUTPUT") vm-command "cd $CRI_RESMGR_SOURCE_DIR && make install && cd -" elif [ "${binsrc#packages/}" != "$binsrc" ]; then suf=$(vm-pkg-type) vm-command "rm -f *.$suf" local pkg_count # shellcheck disable=SC2010,SC2126 pkg_count="$(ls "$HOST_PROJECT_DIR/$binsrc"/cri-resource-manager*."$suf" | grep -v dbg | wc -l)" if [ "$pkg_count" == "0" ]; then error "installing from $binsrc failed: cannot find cri-resource-manager_*.$suf from $HOST_PROJECT_DIR/$binsrc" elif [[ "$pkg_count" -gt 1 ]]; then error "installing from $binsrc failed: expected exactly one cri-resource-manager*.$suf in $HOST_PROJECT_DIR/$binsrc, found $pkg_count alternatives." 
fi vm-command "mkdir -p /etc/cri-resmgr && touch /etc/cri-resmgr/fallback.cfg" host-command "$SCP $HOST_PROJECT_DIR/$binsrc/*.$suf $VM_SSH_USER@$VM_IP:/tmp" || { command-error "copying *.$suf to vm failed, run \"make cross-$suf\" first" } vm-install-pkg "/tmp/cri-resource-manager*.$suf" || { command-error "installing packages failed" } vm-command "systemctl daemon-reload" elif [ -z "$binsrc" ] || [ "$binsrc" == "local" ]; then vm-put-file "$BIN_DIR/cri-resmgr" "$prefix/bin/cri-resmgr" vm-put-file "$BIN_DIR/cri-resmgr-agent" "$prefix/bin/cri-resmgr-agent" sed -E -e "s:__DEFAULTDIR__:$(distro-env-file-dir):g" \ -E -e "s:__BINDIR__:$prefix/bin:g" < "$HOST_PROJECT_DIR/cmd/cri-resmgr/cri-resource-manager.service.in" | vm-pipe-to-file /usr/lib/systemd/system/cri-resource-manager.service cat < "$bin_change" ]]; then echo "WARNING:" echo "WARNING: Source files changed - installing possibly outdated binaries from" echo "WARNING: $BIN_DIR/" echo "WARNING:" sleep "${warning_delay:-0}" fi vm-put-file "$BIN_DIR/cri-resmgr-agent" "$prefix/bin/cri-resmgr-agent" } vm-cri-import-image() { local image_name="$1" local image_tar="$2" case "$VM_CRI" in containerd) vm-command "ctr -n k8s.io images import '$image_tar'" || command-error "failed to import \"$image_tar\" on VM" ;; *) error "vm-cri-import-image unsupported container runtime: \"$VM_CRI\"" esac } vm-install-cri-resmgr-webhook() { local service=cri-resmgr-webhook local namespace=cri-resmgr vm-command-q "\ kubectl delete secret -n ${namespace} cri-resmgr-webhook-secret 2>/dev/null; \ kubectl delete csr ${service}.${namespace} 2>/dev/null; \ kubectl delete -f webhook/mutating-webhook-config.yaml 2>/dev/null; \ kubectl delete -f webhook/webhook-deployment.yaml 2>/dev/null; \ " local webhook_image_info webhook_image_id webhook_image_repotag webhook_image_tar webhook_image_info="$(docker images --filter=reference=cri-resmgr-webhook --format '{{.ID}} {{.Repository}}:{{.Tag}} (created {{.CreatedSince}}, {{.CreatedAt}})' | head -n 1)" if [ -z "$webhook_image_info" ]; then error "cannot find cri-resmgr-webhook image on host, run \"make images\" and check \"docker images --filter=reference=cri-resmgr-webhook\"" fi echo "installing webhook to VM from image: $webhook_image_info" sleep 2 webhook_image_id="$(awk '{print $1}' <<< "$webhook_image_info")" webhook_image_repotag="$(awk '{print $2}' <<< "$webhook_image_info")" webhook_image_tar="$(realpath "$OUTPUT_DIR/webhook-image-$webhook_image_id.tar")" # It is better to export (save) the image with image_repotag rather than image_id # because otherwise manifest.json RepoTags will be null and containerd will # remove the image immediately after impoting it as part of garbage collection. 
docker image save "$webhook_image_repotag" > "$webhook_image_tar" vm-put-file "$webhook_image_tar" "webhook/$(basename "$webhook_image_tar")" || { command-error "copying webhook image to VM failed" } vm-cri-import-image cri-resmgr-webhook "webhook/$(basename "$webhook_image_tar")" # Create a self-signed certificate with SANs vm-command "openssl req -x509 -newkey rsa:2048 -sha256 -days 365 -nodes -keyout webhook/server-key.pem -out webhook/server-crt.pem -subj '/CN=${service}.${namespace}.svc' -addext 'subjectAltName=DNS:${service},DNS:${service}.${namespace},DNS:${service}.${namespace}.svc'" || command-error "creating self-signed certificate failed, requires openssl >= 1.1.1" # Allow webhook to run on node tainted by cmk=true sed -e "s|IMAGE_PLACEHOLDER|$webhook_image_repotag|" \ -e 's|^\(\s*\)tolerations:$|\1tolerations:\n\1 - {"key": "cmk", "operator": "Equal", "value": "true", "effect": "NoSchedule"}|g' \ -e 's/imagePullPolicy: Always/imagePullPolicy: Never/' \ < "${HOST_PROJECT_DIR}/cmd/cri-resmgr-webhook/webhook-deployment.yaml" \ | vm-pipe-to-file webhook/webhook-deployment.yaml # Create secret that contains svc.crt and svc.key for webhook deployment local server_crt_b64 server_key_b64 server_crt_b64="$(vm-command-q "cat webhook/server-crt.pem" | base64 -w 0)" server_key_b64="$(vm-command-q "cat webhook/server-key.pem" | base64 -w 0)" cat </dev/null" || { error "required command '$util' missing on VM, fix/implement $distro-install-utils()" } done }
vm-install-golang() { distro-install-golang }
vm-install-runc() { local host_runc="$runc_src/runc" local vm_runc="/usr/sbin/runc" if [ -n "$runc_src" ]; then # Check if runc is already installed on VM. # If it is, replace existing binary with local build. vm-command 'command -v runc' if [ -n "$COMMAND_OUTPUT" ] && [ "x$COMMAND_STATUS" == "x0" ]; then vm_runc="$COMMAND_OUTPUT" fi vm-put-file "$host_runc" "$vm_runc" else distro-install-runc fi }
vm-install-cri() { local vm_cri_dir="/usr/bin" distro-install-"$VM_CRI" distro-config-"$VM_CRI" if [ "$VM_CRI" == "containerd" ]; then if [ -n "$containerd_src" ]; then vm-command "systemctl stop containerd" vm-command 'command -v containerd' if [ -n "$COMMAND_OUTPUT" ] && [ "x$COMMAND_STATUS" == "x0" ]; then vm_cri_dir="${COMMAND_OUTPUT%/*}" fi for f in ctr containerd containerd-stress containerd-shim containerd-shim-runc-v1 containerd-shim-runc-v2; do vm-put-file "$containerd_src/bin/$f" "$vm_cri_dir/$f" done vm-command "mkdir -p /etc/containerd; containerd config default | sed -e 's/SystemdCgroup = false/SystemdCgroup = true/g' > /etc/containerd/config.toml" vm-command "systemctl enable --now containerd" fi elif [ "$VM_CRI" == "crio" ]; then if [ -n "$crio_src" ]; then vm-command "systemctl stop crio" vm-command 'command -v crio' if [ -n "$COMMAND_OUTPUT" ] && [ "x$COMMAND_STATUS" == "x0" ]; then vm_cri_dir="${COMMAND_OUTPUT%/*}" fi for f in crio crio-status pinns; do vm-put-file "$crio_src/bin/$f" "$vm_cri_dir/$f" done vm-command "systemctl enable --now crio" fi fi }
vm-install-containernetworking() { vm-install-golang vm-command "GO111MODULE=off go get -d github.com/containernetworking/plugins" CNI_PLUGINS_SOURCE_DIR="$(awk '/package.*plugins/{print $NF}' <<< "$COMMAND_OUTPUT")" [ -n "$CNI_PLUGINS_SOURCE_DIR" ] || { command-error "downloading containernetworking plugins failed" } vm-command "pushd \"$CNI_PLUGINS_SOURCE_DIR\" && ./build_linux.sh && mkdir -p /opt/cni && cp -rv bin /opt/cni && popd" || { command-error "building and installing containernetworking plugins failed" } vm-command "rm -rf /etc/cni/net.d
&& mkdir -p /etc/cni/net.d && cat > /etc/cni/net.d/10-bridge.conf < /etc/cni/net.d/20-portmap.conf < /etc/cni/net.d/99-loopback.conf < \"\$HOME/.config/dlv/config.yml.d/00-substitute-path\"" } vm-install-glibc() { # script API # Usage: vm-install-glibc [VERSION] # # If glibc_src=/host/path/to/glibc is set, install a glibc that is # built and installed on host using configure --prefix $glibc_src. # If glibc_src is not set, download, build and install a glibc on vm. # In both cases glibc is installed to /opt/glibc/VERSION on vm. # # vm-set-glibc wraps selected binaries to use an installed glibc. # # Example: install a glibc from host and use it with two binaries. # glibc_src=/host/glibc/install/prefix vm-install-glibc host-2.34 # vm-set-glibc host-2.34 /usr/bin/containerd /usr/local/bin/cri-resmgr # # Example: download, build and install glibc 2.32 on vm: # vm-install-glibc 2.32 # vm-set-glibc 2.32 /usr/bin/containerd /usr/local/bin/cri-resmgr local glibc_ver="${1:-host}" local vm_glibc_dir="/opt/glibc/${glibc_ver}" if [ -n "$glibc_src" ] && [ -d "$glibc_src" ]; then vm-command "mkdir -p $vm_glibc_dir" ( cd "$glibc_src" && tar cz . ) | vm-pipe-to-file "$vm_glibc_dir/glibc-$glibc_ver.tar.gz" || error "failed to package glibc from '$glibc_src'" vm-command "cd $vm_glibc_dir && tar xf glibc-$glibc_ver.tar.gz && rm -f glibc-$glibc_ver.tar.gz" || command-error "failed to extract glibc-$glibc_ver.tar.gz" return 0 fi if [[ "$glibc_ver" == "host"* ]]; then error "vm-install-glibc: invalid glibc_src='$glibc_src' when installing glibc from host" fi local vm_glibc_src="$vm_glibc_dir/src/glibc-${glibc_ver}" local vm_glibc_build="$vm_glibc_dir/src/build" local vm_glibc_install="$vm_glibc_dir" vm-install-pkg make bison flex gcc vm-command "mkdir -p $vm_glibc_src; cd $vm_glibc_src; curl -L --remote-name-all https://ftp.gnu.org/gnu/glibc/glibc-${glibc_ver}.tar.gz" || command-error "failed to download glibc" vm-command "mkdir -p $vm_glibc_src; cd $vm_glibc_src/..; tar xzf $vm_glibc_src/glibc-${glibc_ver}.tar.gz" || command-error "failed to extract glibc" vm-command "mkdir -p $vm_glibc_build; cd $vm_glibc_build && $vm_glibc_src/configure --prefix=$vm_glibc_install" || command-error "failed to configure glibc" vm-command "cd $vm_glibc_build && make -j 4 >make.output.txt 2>&1 || ( tail make.output.txt; exit 1 )" || command-error "failed to build glibc, see $vm_glibc_build/make.output.txt" vm-command "cd $vm_glibc_build && make install" || command-error "failed to install glibc" } vm-set-glibc() { # script API # Usage: vm-set-glibc VERSION BIN [BIN...] # # Wrap binaries to use glibc VERSION. # # Note glibc VERSION must be installed first. # See vm-install-glibc. 
local glibc_ver="$1" local vm_glibc_dir="/opt/glibc/${glibc_ver}" local vm_glibc_install="$vm_glibc_dir" local vm_glibc_ld="$vm_glibc_install/lib/ld-linux-x86-64.so.2" shift if [ -z "$glibc_ver" ]; then error "vm-set-glibc: missing glibc version to switch to" fi vm-command "[ -x $vm_glibc_ld ]" || command-error "cannot find loader $vm_glibc_ld" local vm_bin for vm_bin in "$@"; do vm-command "[ -x $vm_bin ]" || command-error "cannot find binary to be wrapped: $vm_bin" vm-command "( [ \"\$(dd bs=1 count=3 skip=1 if=$vm_bin)\" == \"ELF\" ] && mv $vm_bin ${vm_bin}.bin ) || [ -f $vm_bin.bin ]" || command-error "failed to rename binary" vm-pipe-to-file "$vm_bin" < \"\$HOME/.config/dlv/config.yml.d/01-$(basename "$host_src_dir")\"" vm-dlv-update-config }
vm-dlv-update-config() { vm-command "( echo 'substitute-path:'; cat \$HOME/.config/dlv/config.yml.d/* ) > \$HOME/.config/dlv/config.yml" }
vm-install-k8s() { distro-install-k8s distro-restart-$VM_CRI }
vm-install-minikube() { vm-install-containernetworking distro-install-cri-dockerd distro-install-minikube }
vm-create-minikube-cluster() { vm-command "sysctl fs.protected_regular=0; minikube start --driver=none --alsologtostderr=true" }
vm-create-singlenode-cluster() { if ! [ "$(type -t vm-install-cni-$(distro-k8s-cni))" == "function" ]; then error "invalid CNI: $(distro-k8s-cni)" fi vm-create-cluster vm-command "kubectl taint nodes --all node-role.kubernetes.io/control-plane-" vm-command "kubectl taint nodes --all node-role.kubernetes.io/master-" vm-install-cni-"$(distro-k8s-cni)" if ! vm-command "kubectl wait --for=condition=Ready node/\$(hostname) --timeout=240s"; then command-error "kubectl waiting for node readiness timed out" fi vm-run-until --timeout 30 "kubectl get sa default > /dev/null" || error "serviceaccount 'default' not found" }
vm-create-cluster() { vm-command "kubeadm init --pod-network-cidr=$CNI_SUBNET --cri-socket ${k8scri_sock}" if ! grep -q "initialized successfully" <<< "$COMMAND_OUTPUT"; then command-error "kubeadm init failed" fi user="$(vm-ssh-user)" vm-command "mkdir -p ~$user/.kube" vm-command "cp /etc/kubernetes/admin.conf ~$user/.kube/config" vm-command "chown -R $user:$user ~$user/.kube" vm-command "mkdir -p ~root/.kube" vm-command "cp /etc/kubernetes/admin.conf ~root/.kube/config" }
vm-destroy-cluster() { user="$(vm-ssh-user)" vm-command "yes | kubeadm reset; rm -f ~$user/.kube/config ~root/.kube/config /etc/kubernetes" }
vm-install-cni-bridge() { vm-command "rm -rf /etc/cni/net.d/* && mkdir -p /etc/cni/net.d && cat > /etc/cni/net.d/10-bridge.conf <
================================================ FILE: docs/_templates/layout.html ================================================
GitHub Pages {{ versions_menu_this_version }}
[HTML markup stripped in extraction; the template renders a versions menu with a "{{ _('Versions') }}" list and an "all releases" link]
{% endif %} {% endblock %} ================================================ FILE: docs/conf.py ================================================ # Configuration file for the Sphinx documentation builder. # # This file only contains a selection of the most common options. For a full # list see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html # -- Path setup -------------------------------------------------------------- # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # # import os # import sys # sys.path.insert(0, os.path.abspath('.')) from docutils import nodes from os.path import isdir, isfile, join, basename, dirname from os import makedirs, getenv from shutil import copyfile from subprocess import run, STDOUT # -- Project information ----------------------------------------------------- project = 'CRI Resource Manager' copyright = '2020, various' author = 'various' master_doc = 'docs/index' ############################################################################## # # This section determines the behavior of links to local items in .md files. # # if useGitHubURL == True: # # links to local files and directories will be turned into github URLs # using either the baseBranch defined here or using the commit SHA. # # if useGitHubURL == False: # # local files will be moved to the website directory structure when built # local directories will still be links to github URLs # # if built with GitHub workflows: # # the GitHub URLs will use the commit SHA (GITHUB_SHA environment variable # is defined by GitHub workflows) to link to the specific commit. # ############################################################################## baseBranch = "master" useGitHubURL = True commitSHA = getenv('GITHUB_SHA') githubServerURL = getenv('GITHUB_SERVER_URL') githubRepository = getenv('GITHUB_REPOSITORY') if githubServerURL and githubRepository: githubBaseURL = join(githubServerURL, githubRepository) else: githubBaseURL = "https://github.com/intel/cri-resource-manager/" githubFileURL = join(githubBaseURL, "blob/") githubDirURL = join(githubBaseURL, "tree/") if commitSHA: githubFileURL = join(githubFileURL, commitSHA) githubDirURL = join(githubDirURL, commitSHA) else: githubFileURL = join(githubFileURL, baseBranch) githubDirURL = join(githubDirURL, baseBranch) # Version displayed in the upper left corner of the site ref = getenv('GITHUB_REF', default="") if ref == "refs/heads/master": version = "devel" elif ref.startswith("refs/heads/release-"): # For release branches just show the latest tag name buildVersion = getenv("BUILD_VERSION", default="unknown") version = buildVersion.split('-')[0] elif ref.startswith("refs/tags/"): version = ref[len("refs/tags/"):] else: version = getenv("BUILD_VERSION", default="unknown") release = getenv("BUILD_VERSION", default="unknown") # Versions to show in the version menu if getenv('VERSIONS_MENU'): html_context = { 'versions_menu': True, 'versions_menu_this_version': getenv('VERSIONS_MENU_THIS_VERSION', version)} # -- General configuration --------------------------------------------------- # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. 
extensions = ['myst_parser', 'sphinx_markdown_tables'] myst_enable_extensions = ['substitution'] source_suffix = {'.rst': 'restructuredtext', '.md': 'markdown'}
# Substitution variables
def module_version(module, version): version=version.split('-', 1)[0] if module == 'github.com/intel/goresctrl': version = '.'.join(version.split('.')[0:2]) + '.0' return version
def gomod_versions(modules): versions = {} gocmd = run(['go', 'list', '-m', '-f', '{{.GoVersion}}'], check=True, capture_output=True, universal_newlines=True) versions['golang'] = gocmd.stdout.strip() for m in modules: gocmd = run(['go', 'list', '-m', '-f', '{{.Version}}', '%s' % m], check=True, capture_output=True, universal_newlines=True) versions[m] = module_version(m, gocmd.stdout.strip()) return versions
mod_versions = gomod_versions(['github.com/intel/goresctrl']) myst_substitutions = { 'golang_version': mod_versions['golang'], 'goresctrl_version': mod_versions['github.com/intel/goresctrl'] } myst_heading_anchors = 3
# Add any paths that contain templates here, relative to this directory. templates_path = ['_templates']
# List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. exclude_patterns = ['_build', '.github', '_work', 'generate', 'README.md', 'SECURITY.md', 'docs/releases']
# -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # html_theme = 'sphinx_rtd_theme' html_theme_options = { 'display_version': True, }
# Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". #html_static_path = ['_static']
def setup(app): app.connect('doctree-resolved',fixLocalMDAnchors) app.connect('missing-reference',fixRSTLinkInMD)
############################################################################### # # This section defines callbacks that make markdown specific tweaks to # either: # # 1. Fix something that recommonmark does wrong. # 2. Provide support for .md files that are written as READMEs in a GitHub # repo. # # Only use these changes if using the extension ``recommonmark``. # ###############################################################################
def isHTTPLink(uri): return uri.startswith('http://') or uri.startswith('https://')
def isMDFileLink(uri): return uri.endswith('.md') or '.md#' in uri
def isRSTFileLink(uri): return uri.endswith('.rst')
# Callback registered with 'missing-reference'.
def fixRSTLinkInMD(app, env, node, contnode): refTarget = node.get('reftarget') if isHTTPLink(refTarget): return if isRSTFileLink(refTarget) and not isHTTPLink(refTarget): # This occurs when a .rst file is referenced from a .md file # Currently unable to check if file exists as no file # context is provided and links are relative. # # Example: [Application examples](examples/readme.rst) # contnode['refuri'] = contnode['refuri'].replace('.rst','.html') contnode['internal'] = "True" return contnode elif refTarget.startswith("/"): # This occurs when a file is referenced for download from an .md file. # Construct a list of them and short-circuit the warning. The files # are moved later (need file location context). To avoid warnings when # .md files make these links absolute, this only marks them as fixed # if it can verify that they exist. # # Example: [Makefile](/Makefile) # filePath = refTarget.lstrip("/") if isfile(filePath) or isdir(filePath): return contnode
def normalizePath(docPath,uriPath): if uriPath == "": return uriPath if "#" in uriPath: # Strip out anchors uriPath = uriPath.split("#")[0] if uriPath.startswith("/"): # It's an absolute path return uriPath.lstrip("/") #path to file from project directory else: # It's a relative path docDir = dirname(docPath) return join(docDir,uriPath) #path to file from referencing file
# Callback registered with 'doctree-resolved'.
def fixLocalMDAnchors(app, doctree, docname): for node in doctree.traverse(nodes.reference): uri = node.get('refuri') if uri is None: print("fixLocalMDAnchors: skipping anchor with no URI at node: ", node) continue if isHTTPLink(uri): continue filePath = normalizePath(docname,uri) if isfile(filePath): # Only do this if the file exists. # # TODO: Pop a warning if the file doesn't exist. # if isMDFileLink(uri) and not isHTTPLink(uri): # Make sure .md file links that weren't caught are converted. # These occur when creating an explicit link to an .md file # from an .rst file. By default these are not validated by Sphinx # or recommonmark. Only toctree references are validated. recommonmark # also fails to convert links to local Markdown files that include # anchors. This fixes that as well. # # Only include this code if .md files are being converted to html # # Example: `Google Cloud Engine `__ # [configuration options](autotest.md#configuration-options) # node['refuri'] = node['refuri'].replace('.md','.html') else: # Handle the case where markdown is referencing local files in the repo # # Example: [Makefile](/Makefile) # if useGitHubURL: # Replace references to local files with links to the GitHub repo # newURI = join(githubFileURL, filePath) print("new url: ", newURI) node['refuri']=newURI else: # If there are links to local files other than .md (.rst files are caught # when warnings are fired), move the files into the Sphinx project, so # they can be accessed. newFileDir = join(app.outdir,dirname(filePath)) # where to move the file in Sphinx output. newFilePath = join(app.outdir,filePath) newURI = uri # if the path is relative no need to change it. if uri.startswith("/"): # It's an absolute path. Need to make it relative. uri = uri.lstrip("/") docDirDepth = len(docname.split("/")) - 1 newURI = "../"*docDirDepth + uri if not isdir(newFileDir): makedirs(newFileDir) copyfile(filePath,newFilePath) node['refuri'] = newURI elif "#" not in uri: # ignore anchors # turn links to directories into links to the repo if isdir(filePath): newURI = join(githubDirURL, filePath) node['refuri']=newURI
================================================ FILE: docs/contributing.md ================================================

# Contributing

Please use the GitHub\* infrastructure for contributing to CRI Resource Manager. Use [pull requests](https://github.com/intel/cri-resource-manager/pulls) to contribute code, bug fixes, or if you want to discuss your ideas in terms of code. Open [issues](https://github.com/intel/cri-resource-manager/issues) to report bugs, request new features, or if you want to discuss any other topics related to CRI Resource Manager or orchestration resource management in general.
================================================ FILE: docs/demos/blockio.md ================================================

# Block I/O Demo

This demo creates a virtual machine for a single-node Kubernetes\* cluster where container runtime features are extended by `cri-resmgr`. In this setup, `cri-resmgr` is configured with block I/O parameters that throttle the I/O bandwidth of a container that constantly scans system file checksums.

## Prerequisites

Install:
- `docker`
- `govm`

## Run the demo

```
./run.sh play
```

The demo does not delete the virtual machine so that you can experiment with it. You can log in to the virtual machine:

```
$ govm ssh crirm-demo-blockio
```

## Clean up - and run the demo from scratch

In order to run the demo from scratch again, delete the virtual machine:

```
$ govm delete crirm-demo-blockio
```

================================================ FILE: docs/demos/index.rst ================================================

Demos
#####

.. toctree::
   :maxdepth: 1

   blockio.md

================================================ FILE: docs/developers-guide/architecture.md ================================================

# Architecture

## Overview

CRI Resource Manager (CRI-RM) is a pluggable add-on for controlling how much and which resources are assigned to containers in a Kubernetes\* cluster. It's an add-on because you install it in addition to the normal selection of your components. It's pluggable since you inject it on the signaling path between two existing components, with the rest of the cluster unaware of its presence. CRI-RM plugs in between kubelet and CRI, the Kubernetes node agent and the container runtime implementation. CRI-RM intercepts CRI protocol requests from the kubelet, acting as a non-transparent proxy towards the runtime. Proxying by CRI-RM is non-transparent in nature because it usually alters intercepted protocol messages before forwarding them.

CRI-RM keeps track of the states of all containers running on a Kubernetes node. Whenever it intercepts a CRI request that results in changes to the resource allocation of any container (container creation, deletion, or resource assignment update request), CRI-RM runs one of its built-in policy algorithms. This policy makes a decision about how the assignment of resources should be updated and, eventually, the intercepted request is modified according to this decision. The policy can make changes to any container in the system, not just the one associated with the intercepted CRI request. Therefore it does not operate directly on CRI requests. Instead, CRI-RM's internal state tracking cache provides an abstraction for modifying containers and the policy uses this abstraction for recording its decisions.

In addition to policies, CRI-RM has a number of built-in resource controllers. These are used to put policy decisions—in practice pending changes made to containers by a policy—into effect. A special in-band CRI controller is used to control all resources that are controllable via the CRI runtime. This controller handles the practical details of updating the intercepted CRI request and generating any additional unsolicited update requests for other existing containers updated by the policy decision. Additional out-of-band controllers exist to exercise control over resources that the current CRI runtimes are unable to handle.
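To make this decide-then-enforce flow more concrete, the sketch below shows one possible shape of it in Go. All type and function names here are invented for illustration only; they are not CRI-RM's actual internal APIs, which live under `/pkg/cri/resource-manager/`.

```go
package sketch

// Illustrative only: hypothetical names, not CRI-RM's real interfaces.
type Container struct{ ID string }

// Cache tracks container state and remembers which containers have
// pending, not-yet-enforced changes.
type Cache interface {
	PendingContainers() []*Container
}

// Policy records resource decisions for the container (and any other
// affected containers) in the cache.
type Policy interface {
	AllocateResources(cache Cache, c *Container) error
}

// Controller puts pending changes in its own domain into effect.
type Controller interface {
	Enforce(c *Container) error
}

// interceptCreate mimics the request pipeline: decide first, then let
// every controller enforce pending changes on every touched container.
func interceptCreate(cache Cache, pol Policy, ctrls []Controller, c *Container) error {
	if err := pol.AllocateResources(cache, c); err != nil {
		return err
	}
	for _, changed := range cache.PendingContainers() {
		for _, ctrl := range ctrls {
			if err := ctrl.Enforce(changed); err != nil {
				return err
			}
		}
	}
	return nil
}
```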
To tell which containers need to be handed off to various controllers for updating, CRI-RM uses the internal state tracking cache's ability to tell which containers have pending unenforced changes and to which controllers' domain these changes belong.

The CRI controller currently handles CPU and memory resources, including huge pages. The level of control covers per-container CPU sets, CFS parametrization, memory limits, OOM score adjustment, and pinning to memory controllers. The two existing out-of-band controllers, Intel® Resource Director Technology (Intel® RDT) and Block I/O, handle last-level cache and memory bandwidth allocation, and the arbitration of Block I/O bandwidth, respectively.

Many of the details of how CRI-RM operates are configurable. These include, for instance, which policy is active within CRI-RM, configuration of the resource assignment algorithm for the active policy, and configuration for the various resource controllers. Although CRI-RM can be configured using a configuration file present on the node running CRI-RM, the preferred way to configure all CRI-RM instances in a cluster is to use Kubernetes ConfigMaps and the CRI-RM Node Agent.

## Components

### [Node Agent](/pkg/agent/)

The node agent is a component external to CRI-RM itself. All interactions by CRI-RM with the Kubernetes Control Plane go through the node agent, which performs any direct interactions on behalf of CRI-RM.

The node agent communicates with CRI-RM using two gRPC interfaces. The [config interface](/pkg/cri/resource-manager/config/api/v1/) is used to:

- push updated external configuration data to CRI-RM
- push adjustments to container resource assignments to CRI-RM

The [cluster interface](/pkg/agent/api/v1/) implements the necessary low-level plumbing for the agent interface CRI-RM internally exposes for its policies and other components. This interface in turn implements the following (see the sketch after this section):

- updating resource capacity of the node
- getting, setting, or removing labels on the node
- getting, setting, or removing annotations on the node
- getting, setting, or removing taints on the node

The config interface is defined and has its gRPC server running in CRI-RM. The agent acts as a gRPC client for this interface. The low-level cluster interface is defined and has its gRPC server running in the agent, with the [convenience layer](/pkg/cri/resource-manager/agent) defined in CRI-RM. CRI-RM acts as a gRPC client for the low-level plumbing interface.

Additionally, the stock node agent that comes with CRI-RM implements schemes for:

- configuration management for all CRI-RM instances
- management of dynamic adjustments to container resource assignments
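As a rough illustration, the cluster interface operations listed above could be rendered as a Go interface along the following lines. The type and method names here are hypothetical; the real gRPC definitions live under [/pkg/agent/api/v1/](/pkg/agent/api/v1/).

```go
package sketch

// Taint mirrors the key/value/effect triple of a Kubernetes node taint.
type Taint struct {
	Key, Value, Effect string
}

// ClusterInterface is a hypothetical rendering of the low-level plumbing
// operations listed above, not the actual generated gRPC API.
type ClusterInterface interface {
	UpdateNodeCapacity(capacity map[string]string) error
	SetLabels(labels map[string]string) error
	RemoveLabels(keys []string) error
	SetAnnotations(annotations map[string]string) error
	RemoveAnnotations(keys []string) error
	SetTaints(taints []Taint) error
	RemoveTaints(taints []Taint) error
}
```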

### [Resource Manager](/pkg/cri/resource-manager/)

CRI-RM implements a request processing pipeline and an event processing pipeline. The request processing pipeline takes care of proxying CRI requests and responses between CRI clients and the CRI runtime. The event processing pipeline processes a set of other events that are not directly related to or the result of CRI requests. These events are typically internally generated within CRI-RM. They can be the result of changes in the state of some containers or the utilization of a shared system resource, which potentially could warrant an attempt to rebalance the distribution of resources among containers to bring the system closer to an optimal state. Some events can also be generated by policies.

The Resource Manager component of CRI-RM implements the basic control flow of both of these processing pipelines. It passes control to all the necessary sub-components of CRI-RM at the various phases of processing a request or an event. Additionally, it serializes the processing of these, making sure there is at most one (intercepted) request or event being processed at any point in time.

The high-level control flow of the request processing pipeline is as follows:

A. If the request does not need policying, let it bypass the processing pipeline; hand it off for logging, then relay it to the server and the corresponding response back to the client.

B. If the request needs to be intercepted for policying, do the following:

1. Lock the processing pipeline serialization lock.
2. Look up/create cache objects (pod/container) for the request.
3. If the request has no resource allocation consequences, do proxying (step 6).
4. Otherwise, invoke the policy layer for resource allocation:
   - Pass it on to the configured active policy, which will
   - Allocate resources for the container.
   - Update the assignments for the container in the cache.
   - Update any other containers affected by the allocation in the cache.
5. Invoke the controller layer for post-policy processing, which will:
   - Collect controllers with pending changes in their domain of control
   - for each invoke the post-policy processing function corresponding to the request.
   - Clear pending markers for the controllers.
6. Proxy the request:
   - Relay the request to the server.
   - Send update requests for any additional affected containers.
   - Update the cache if/as necessary based on the response.
   - Relay the response back to the client.
7. Release the processing pipeline serialization lock.

The high-level control flow of the event processing pipeline is one of the following, based on the event type:

- For policy-specific events:
  1. Engage the processing pipeline lock.
  2. Call policy event handler.
  3. Invoke the controller layer for post-policy processing (same as step 5 for requests).
  4. Release the pipeline lock.
- For metrics events:
  1. Perform collection/processing/correlation.
  2. Engage the processing pipeline lock.
  3. Update cache objects as/if necessary.
  4. Request rebalancing as/if necessary.
  5. Release pipeline lock.
- For rebalance events:
  1. Engage the processing pipeline lock.
  2. Invoke policy layer for rebalancing.
  3. Invoke the controller layer for post-policy processing (same as step 5 for requests).
  4. Release the pipeline lock.

### [Cache](/pkg/cri/resource-manager/cache/)

The cache is a shared internal storage location within CRI-RM.
It tracks the runtime state of pods and containers known to CRI-RM, as well as the state of CRI-RM itself, including the active configuration and the state of the active policy. The cache is saved to permanent storage in the filesystem and is used to restore the runtime state of CRI-RM across restarts. The cache provides functions for querying and updating the state of pods and containers. This is the mechanism used by the active policy to make resource assignment decisions. The policy simply updates the state of the affected containers in the cache according to the decisions. The cache's ability to associate containers with resource domains and track changes to them is used to enforce policy decisions. The generic controller layer first queries which containers have pending changes, then invokes each controller for each such container. The controllers use the querying functions provided by the cache to decide if anything in their resource/control domain needs to be changed and then act accordingly. Access to the cache needs to be serialized. However, this serialization is not provided by the cache itself. Instead, the cache assumes that its callers ensure proper protection is in place against concurrent read-write access. The request and event processing pipelines in the resource manager use a lock to serialize request and event processing and, consequently, access to the cache. If a policy needs to do processing unsolicited by the resource manager, in other words processing other than handling the internal policy backend API calls from the resource manager, then it should inject a policy event into the resource manager's event loop. This causes a callback from the resource manager to the policy's event handler with the injected event as an argument and with the cache properly locked. ### [Generic Policy Layer](/pkg/cri/resource-manager/policy/policy.go) The generic policy layer defines the abstract interface the rest of CRI-RM uses to interact with policy implementations and takes care of the details of activating and dispatching calls through to the configured active policy. ### [Generic Resource Controller Layer](/pkg/cri/resource-manager/control/control.go) The generic resource controller layer defines the abstract interface the rest of CRI-RM uses to interact with resource controller implementations and takes care of the details of dispatching calls to the controller implementations for post-policy enforcement of decisions. ### [Metrics Collector](/pkg/metrics/) The metrics collector gathers a set of runtime metrics about the containers running on the node. CRI-RM can be configured to periodically evaluate this collected data to determine how optimal the current assignment of container resources is and to attempt a rebalancing/reallocation if it is deemed both possible and necessary. ### [Policy Implementations](/pkg/cri/resource-manager/policy/builtin/) #### [None](/pkg/cri/resource-manager/policy/builtin/none/) An empty policy that makes no policy decisions. It is included merely for the sake of completeness, analogous to the none policy of the CPU Manager in kubelet. #### [Static Pools](/pkg/cri/resource-manager/policy/builtin/static-pools/) A backward-compatible reimplementation of [CMK](https://github.com/intel/CPU-Manager-for-Kubernetes) for CRI-RM with a few extra features. #### [Static](/pkg/cri/resource-manager/policy/builtin/static/) Part of the code from the static policy of the CPU Manager in kubelet, brutally hacked to work within CRI-RM.
Serves merely as a proof-of-concept that the current policies of kubelet can be implemented in CRI-RM. #### [Static Plus](/pkg/cri/resource-manager/policy/builtin/static-plus/) A fairly simplistic policy similar in spirit to the static policy of the CPU Manager in kubelet, with a few extra features. #### [Topology Aware](/pkg/cri/resource-manager/policy/builtin/topology-aware/) A topology-aware policy capable of handling multiple tiers/types of memory, typically a DRAM/PMEM combination configured in 2-layer memory mode. ### [Resource Controller Implementations](/pkg/cri/resource-manager/control/) #### [Intel RDT](/pkg/cri/resource-manager/control/rdt/) A resource controller implementation responsible for the practical details of associating a container with Intel RDT classes. This class effectively determines how much last-level cache and memory bandwidth will be available for the container. This controller uses the resctrl pseudo-filesystem of the Linux kernel for control. #### [Block I/O](/pkg/cri/resource-manager/control/blockio/) A resource controller implementation responsible for the practical details of associating a container with a Block I/O class. This class effectively determines how much Block I/O bandwidth will be available for the container. This controller uses the blkio cgroup controller and the cgroupfs pseudo-filesystem of the Linux kernel for control. #### [CRI](/pkg/cri/resource-manager/control/cri/) A resource controller responsible for modifying intercepted CRI container creation requests and creating CRI container resource update requests, according to the changes the active policy makes to containers. ================================================ FILE: docs/developers-guide/cri-test.md ================================================ # CRI Validation [This test](/test/critest) runs [`critest`](https://github.com/kubernetes-sigs/cri-tools/blob/master/docs/validation.md) from [cri-tools](https://github.com/kubernetes-sigs/cri-tools/) to make sure that various `cri-resmgr` configurations do not break CRI runtime conformance. ## Prerequisites Install: - `docker` - `govm` ## Run the test ``` cd test/critest ./run.sh test ``` ================================================ FILE: docs/developers-guide/e2e-test.md ================================================ # End-to-End tests ## Prerequisites Install: - `docker` - `govm` v0.95 In case of errors in building `govm` with `go get`, or creating a virtual machine (`Error when creating the new VM: repository name must be canonical`), these are the workarounds: ``` git clone https://github.com/govm-project/govm -b 0.95 && cd govm && go install && docker build . -t govm/govm:latest ``` ## Usage Run policy tests: ``` [VAR=VALUE...] ./run_tests.sh policies ``` Run tests only for a certain policy or topology, or run only a selected test: ``` [VAR=VALUE...] ./run_tests.sh policies[/POLICY[/TOPOLOGY[/testNN-*]]] ``` Run custom tests: ``` [VAR=VALUE...] ./run.sh MODE ``` Get help on the available `VAR=VALUE`'s with `./run.sh help`. `run_tests.sh` calls `run.sh` in order to execute the selected tests. Therefore the same `VAR=VALUE` definitions apply to both scripts. ## Test phases In the *setup phase* `run.sh` creates a virtual machine unless it already exists. Once it is running, the tests create a single-node cluster and launch `cri-resmgr` on it, unless they are already running. In the *test phase* `run.sh` runs a test script, or gives a prompt (`run.sh> `) asking a user to run test script commands in the `interactive` mode.
*Test scripts* are `bash` scripts that can use helper functions for running commands and observing the status of the virtual machine and the software running on it. In the *tear down phase* `run.sh` copies logs from the virtual machine and finally stops or deletes the virtual machine, if that is wanted. ## Test modes - `test` mode runs fast and reports `Test verdict: PASS` or `FAIL`. The exit status is zero if and only if the test passed. - `play` mode runs the same phases and scripts as the `test` mode, but slower. This is good for following and demonstrating what is happening. - `interactive` mode runs the setup and tear down phases, but instead of executing a test script it gives an interactive prompt. Print the help to see cleanup, execution speed, and other options for all modes. ## Running from scratch and quick rerun in existing virtual machine The test will use the `govm`-managed virtual machine named in the `vm` environment variable. The default is `crirm-test-e2e`. If a virtual machine with that name exists, the test will be run on it. Otherwise the test will create a virtual machine with that name from scratch. You can delete a virtual machine with `govm delete NAME`. If you want to rerun the test many times, possibly with different test inputs or against different versions of `cri-resmgr`, either use the `play` mode or set `cleanup=0` in order to keep the virtual machine after each run. Then tests will run in the same single-node cluster, and the test script will only delete running pods before launching new ones. ## Testing locally built cri-resmgr and cri-resmgr from github If you make changes to the `cri-resmgr` sources and rebuild it, you can force the test script to reinstall the newly built `cri-resmgr` to an existing virtual machine before rerunning the test: ``` cri-resource-manager$ make cri-resource-manager$ cd test/e2e e2e$ reinstall_cri_resmgr=1 speed=1000 ./run.sh play ``` You can also let the test script build `cri-resmgr` from the github master branch. This takes place inside the virtual machine, so your local git sources will not be affected: ``` e2e$ reinstall_cri_resmgr=1 binsrc=github ./run.sh play ``` ## Custom tests You can run a custom test script in a virtual machine that runs a single-node Kubernetes\* cluster. Example: ``` $ cat > myscript.sh << EOF # create two pods, each requesting two CPUs CPU=2 n=2 create guaranteed # create four pods, no resource requests n=4 create besteffort # show pods kubectl get pods # check that the first two pods are not allowed to use the same CPUs verify 'cpus["pod0c0"].isdisjoint(cpus["pod1c0"])' EOF $ ./run.sh test myscript.sh ``` ## Custom topologies If you change the NUMA node topology of an existing virtual machine, you must delete the virtual machine first. Otherwise the `topology` variable is ignored and the test will run in the existing NUMA configuration. The `topology` variable is a JSON array of objects. Each object defines one or more NUMA nodes. Keys in objects: ``` "mem" mem (RAM) size on each NUMA node in this group. The default is "0G". "nvmem" nvmem (non-volatile RAM) size on each NUMA node in this group. The default is "0G". "cores" number of CPU cores on each NUMA node in this group. The default is 0. "threads" number of threads on each CPU core. The default is 2. "nodes" number of NUMA nodes on each die. The default is 1. "dies" number of dies on each package. The default is 1. "packages" number of packages. The default is 1. ``` Example: Run the test in a VM with two NUMA nodes.
There are 4 CPUs (two cores, two threads per core by default) and 4G of RAM in each node: ``` e2e$ govm delete my2x4 ; vm=my2x4 topology='[{"mem":"4G","cores":2,"nodes":2}]' ./run.sh play ``` Run the test in a VM with 32 CPUs in total: two packages (sockets) in the system, each containing two dies, each die containing two NUMA nodes, each node containing two CPU cores, and each core containing two threads; plus a NUMA node with 16G of non-volatile memory (NVRAM) but no CPUs: ``` e2e$ vm=mynvram topology='[{"mem":"4G","cores":2,"nodes":2,"dies":2,"packages":2},{"nvmem":"16G"}]' ./run.sh play ``` ## Test output All test output is saved under the directory in the environment variable `outdir`. The default is `./output`. Executed commands with their output, exit status, and timestamps are saved under the `output/commands` directory. You can find Qemu output in the Docker\* logs. For instance, the output of the most recent Qemu launched by `govm`: ``` $ docker logs $(docker ps | awk '/govm/{print $1; exit}') ``` ## Manual testing and debugging The interactive mode helps with developing and debugging scripts: ``` $ ./run.sh interactive ... run.sh> CPU=2 n=2 create guaranteed ``` You can get help on the functions available in test scripts with `./run.sh help script`, or with `help` and `help FUNCTION` when in the interactive mode. If a test has stopped at a failing `verify`, you can inspect the `cri-resmgr` cache and allowed OS resources in Python\* after the test run: ``` $ PYTHONPATH= python3 >>> from pyexec_state import * >>> pp(allowed) # allowed OS resources >>> pp(pods["pod0"]) # pod entry in cache >>> pp(containers["pod0c0"]) # container entry in cache ``` If you want to get the interactive prompt in the middle of a test run wherever a `verify` or `create` fails, you can set an `on_FUNC_fail` hook on either or both of them. Example: ``` $ on_verify_fail=interactive ./run.sh myscript.sh ``` ================================================ FILE: docs/developers-guide/index.rst ================================================ Developer's Guide ################# .. toctree:: :maxdepth: 1 architecture.md policy-writers-guide.md testing.rst ================================================ FILE: docs/developers-guide/policy-writers-guide.md ================================================ # Policy Writer's Guide ***WORK IN PROGRESS*** ================================================ FILE: docs/developers-guide/testing.rst ================================================ Testing ####### .. toctree:: :maxdepth: 1 unit-test.md cri-test.md e2e-test.md ================================================ FILE: docs/developers-guide/unit-test.md ================================================ # Unit tests Run unit tests with ``` make test ``` ================================================ FILE: docs/index.html ================================================ ================================================ FILE: docs/index.rst ================================================ .. CRI Resource Manager documentation master file Welcome to CRI Resource Manager's documentation! ================================================ ..
toctree:: :maxdepth: 2 :caption: Contents: introduction.md quick-start.md installation.md setup.md policy/index.rst node-agent.md webhook.md developers-guide/index.rst migration-to-NRI.md demos/index.rst reference/index.md contributing.md security.md Project GitHub repository ================================================ FILE: docs/installation.md ================================================ # Installation ## Installing from packages You can install CRI Resource Manager from `deb` or `rpm` packages for supported distros. - [download](https://github.com/intel/cri-resource-manager/releases/latest) packages - install them: - for rpm packages: `sudo rpm -Uvh <package-file>` - for deb packages: `sudo dpkg -i <package-file>` ## Installing from sources Although not recommended, you can install CRI Resource Manager from sources: - get the sources: `git clone https://github.com/intel/cri-resource-manager` - build and install: `cd cri-resource-manager; make build && sudo make install` You will need at least `git`, {{ '`golang '+ '{}'.format(golang_version) + '`' }} or newer, `GNU make`, `bash`, `find`, `sed`, `head`, `date`, and `install` to be able to build and install from sources. ## Building packages for the distro of your host You can build packages for the `$distro` of your host by executing the following command: ``` make packages ``` If the `$version` of your `$distro` is supported, this will leave the resulting packages in `packages/$distro-$version`. Building packages this way requires `docker`, but it does not require you to install the full set of build dependencies of CRI Resource Manager on your host. If you want to build packages without docker, you can use either `make rpm` or `make deb`, depending on which supported distro you are running. Building this way requires all the build dependencies to be installed on your host. You can check which `$distro`s and `$version`s are supported by running ``` ls dockerfiles/cross-build ``` If you see a `Dockerfile.$distro-$version` matching your host then your distro is supported. ## Building packages for another distro You can cross-build packages of the native `$type` for a particular `$version` of a `$distro` by running the following command: ``` make cross-$type.$distro-$version ``` Similarly to `make packages`, this will build packages using a `Docker\*` container. However, instead of building for your host, it will build them for the specified distro. For instance, `make cross-deb.ubuntu-18.04` will build `deb` packages for `Ubuntu\* 18.04`. ## Post-install configuration The provided packages install `systemd` service files and a sample configuration. The easiest way to get up and running is to rename the sample configuration and start CRI Resource Manager using systemd. You can do this using the following commands: ``` mv /etc/cri-resmgr/fallback.cfg.sample /etc/cri-resmgr/fallback.cfg systemctl start cri-resource-manager ``` If you want, you can set CRI Resource Manager to automatically start when your system boots with this command: ``` systemctl enable cri-resource-manager ``` The provided packages also install a file for managing the default options passed to CRI Resource Manager upon startup. You can change these by editing this file and then restarting CRI Resource Manager, like this: ``` # On Debian\*-based systems edit the defaults like this: ${EDITOR:-vi} /etc/default/cri-resource-manager # On rpm-based systems edit the defaults like this: ${EDITOR:-vi} /etc/sysconfig/cri-resource-manager # Restart the service.
systemctl restart cri-resource-manager ``` ================================================ FILE: docs/introduction.md ================================================ # Introduction CRI Resource Manager is a Container Runtime Interface Proxy. It sits between clients and the actual Container Runtime implementation (containerd, cri-o), relaying requests and responses back and forth. The main purpose of the proxy is to apply hardware-aware resource allocation policies to the containers running in the system. Policies are applied by either modifying a request before forwarding it or by performing extra actions related to the request during its processing and proxying. There are several policies available, each with a different set of goals in mind and implementing different hardware allocation strategies. The details of whether and how a CRI request is altered or if extra actions are performed depend on which policy is active in CRI Resource Manager and how that policy is configured. The current goal for the CRI Resource Manager is to prototype and experiment with new Kubernetes\* container placement policies. The existing policies are written with this in mind and the intended setup is for the Resource Manager to only act as a proxy for the Kubernetes Node Agent, kubelet. ================================================ FILE: docs/migration-to-NRI.md ================================================ # Migrating from CRI-RM to NRI ## Prerequisites - An up-and-running CRI Resource Manager - One of the two supported policies in use: balloons or topology-aware. - For other policies a little bit more work is required and the policies need to be 'ported'. This can be done by just following the example of how balloons or topology-aware were converted. ## Steps for an initial/basic migration test ### Containerd Replace containerd with version 1.7 or newer (the NRI server is not supported in older versions). Replace kubelet's --container-runtime-endpoint=/var/run/cri-resmgr/cri-resmgr.sock with --container-runtime-endpoint=/var/run/containerd/containerd.sock Replacing the runtime endpoint on a node that was set up using Kubeadm: ```console # Get the Kubelet args systemctl cat kubelet <- Look for: EnvironmentFile=/.../kubeadm-flags.env vim /.../kubeadm-flags.env KUBELET_KUBEADM_ARGS="--container-runtime-endpoint=unix:///var/run/containerd/containerd.sock --pod-infra-container-image=registry.k8s.io/pause:3.9" vim /etc/sysconfig/kubelet KUBELET_EXTRA_ARGS= --container-runtime-endpoint=/var/run/containerd/containerd.sock <- Remember this as well systemctl restart kubelet ``` Edit the containerd config file and look for the section [plugins."io.containerd.nri.v1.nri"] and replace "disable = true" with "disable = false": ```console vim /etc/containerd/config.toml ``` ```toml [plugins."io.containerd.nri.v1.nri"] disable = false disable_connections = false plugin_config_path = "/etc/nri/conf.d" plugin_path = "/opt/nri/plugins" plugin_registration_timeout = "5s" plugin_request_timeout = "2s" socket_path = "/var/run/nri/nri.sock" ``` ```console systemctl restart containerd ``` ### CRI-O Ensure that CRI-O version 1.26.2 or newer is used.
Replace kubelet's --container-runtime-endpoint=/var/run/cri-resmgr/cri-resmgr.sock with --container-runtime-endpoint=/var/run/crio/crio.sock Replacing the runtime endpoint on a node that was set up using Kubeadm: ```console # Get the Kubelet args systemctl cat kubelet <- Look for: EnvironmentFile=/.../kubeadm-flags.env vim /.../kubeadm-flags.env KUBELET_KUBEADM_ARGS="--container-runtime-endpoint=unix:///var/run/crio/crio.sock --pod-infra-container-image=registry.k8s.io/pause:3.9" vim /etc/sysconfig/kubelet KUBELET_EXTRA_ARGS= --container-runtime-endpoint=/var/run/crio/crio.sock <- Remember this as well systemctl restart kubelet ``` Enable NRI: ```console CRIO_CONF=/etc/crio/crio.conf cp $CRIO_CONF $CRIO_CONF.orig crio --enable-nri config > $CRIO_CONF systemctl restart crio ``` ### Build the NRI policies ```console git clone https://github.com/containers/nri-plugins.git cd nri-plugins make # Build the images, specify your image repo to easily push the image later. make images IMAGE_REPO=my-repo IMAGE_VERSION=my-tag ``` ### Create required CRDs ```console kubectl apply -f deployment/base/crds/noderesourcetopology_crd.yaml ``` ### Import the image of the NRI plugin you want to run Containerd ```console ctr -n k8s.io images import build/images/nri-resmgr-topology-aware-image-*.tar ``` CRI-O See the section [below](#steps-for-a-more-real-life-migration-using-self-hosted-image-repository) for instructions on how to push the images to a registry, then pull from there. ### Deploy the plugin ```console kubectl apply -f build/images/nri-resmgr-topology-aware-deployment.yaml ``` ### Deploy a test pod ```console kubectl run mypod --image busybox -- sleep inf kubectl exec mypod -- grep allowed_list: /proc/self/status ``` ### See the resources assigned to the pod ```console kubectl exec $pod -c $container -- grep allowed_list: /proc/self/status # Output should look similar to the output of CRI-RM ``` ## Steps for a more real-life migration using self-hosted image repository - Same steps as above for enabling NRI with Containerd/CRI-O and building the images. - Push the built images to your repository: ```console # Replace my-repo and my-tag with the IMAGE_REPO and IMAGE_VERSION you specified when building the images with make images docker push my-repo:my-tag ``` - Remember to change the image name & pull policy in the plugin's .yaml file to match your registry and image, e.g.: ```console vim build/images/nri-resmgr-topology-aware-deployment.yaml ``` - Then deploy the plugin similarly to the earlier step. ## Migrating existing configuration - The ConfigMap used by the ported policies/infra has a different name/naming scheme than the original one used in CRI-RM, e.g.: - configMapName: ```diff - configmap-name: cri-resmgr-config + configmap-name: nri-resource-policy-config ``` - The details of grouping nodes by labeling to share configuration: ```diff - cri-resource-manager.intel.com/group: $GROUP_NAME + resource-policy.nri.io/group: $GROUP_NAME ``` ## Migrating existing workloads - The annotations one can use to customize how a policy treats a workload use slightly different keys than the original ones in CRI-RM. The collective 'key namespace' for policy- and resource-manager-specific annotations has been changed from cri-resource-manager.intel.com to resource-policy.nri.io. - For instance, an explicit type annotation for the balloons policy, which used to be: ```yaml ... metadata: annotations: balloon.balloons.cri-resource-manager.intel.com/container.$CONTAINER_NAME: $BALLOON_TYPE ...
``` - Should now be: ```yaml ... metadata: annotations: balloon.balloons.resource-policy.nri.io/container.$CONTAINER_NAME: $BALLOON_TYPE ... ``` - Similarly, a workload opt-out annotation from exclusive CPU allocation for the topology-aware policy, which used to be: ```yaml ... metadata: annotations: prefer-shared-cpus.cri-resource-manager.intel.com/container.$CONTAINER_NAME: "true" ... ``` - Should now be: ```yaml ... metadata: annotations: prefer-shared-cpus.resource-policy.nri.io/container.$CONTAINER_NAME: "true" ... ``` - Similar changes are needed for any cri-resmgr-specific annotation that uses the same semantic scoping for its key syntax. All of the annotations: | Was | Is now | | --------------------------------------------------- | ------------------------------------------- | | cri-resource-manager.intel.com/affinity | resource-policy.nri.io/affinity | | cri-resource-manager.intel.com/anti-affinity | resource-policy.nri.io/anti-affinity | | cri-resource-manager.intel.com/prefer-isolated-cpus | resource-policy.nri.io/prefer-isolated-cpus | | cri-resource-manager.intel.com/prefer-shared-cpus | resource-policy.nri.io/prefer-shared-cpus | | cri-resource-manager.intel.com/cold-start | resource-policy.nri.io/cold-start | | cri-resource-manager.intel.com/memory-type | resource-policy.nri.io/memory-type | | prefer-isolated-cpus.cri-resource-manager.intel.com | prefer-isolated-cpus.resource-policy.nri.io | | prefer-shared-cpus.cri-resource-manager.intel.com | prefer-shared-cpus.resource-policy.nri.io | | memory-type.cri-resource-manager.intel.com | memory-type.resource-policy.nri.io | | cold-start.cri-resource-manager.intel.com | cold-start.resource-policy.nri.io | | prefer-reserved-cpus.cri-resource-manager.intel.com | prefer-reserved-cpus.resource-policy.nri.io | | rdtclass.cri-resource-manager.intel.com | rdtclass.resource-policy.nri.io | | blockioclass.cri-resource-manager.intel.com | blockioclass.resource-policy.nri.io | | toptierlimit.cri-resource-manager.intel.com | toptierlimit.resource-policy.nri.io | | topologyhints.cri-resource-manager.intel.com | topologyhints.resource-policy.nri.io | | balloon.balloons.cri-resource-manager.intel.com | balloon.balloons.resource-policy.nri.io | ================================================ FILE: docs/node-agent.md ================================================ # Node Agent CRI Resource Manager can be configured dynamically using the CRI Resource Manager Node Agent and Kubernetes\* ConfigMaps. ## Running as a DaemonSet The agent can be built using the [provided Dockerfile](/cmd/cri-resmgr-agent/Dockerfile). It can be deployed as a `DaemonSet` in the cluster using the [provided deployment file](/cmd/cri-resmgr-agent/agent-deployment.yaml). When using the provided or a similar deployment, the agent uses a readiness probe to propagate the status of the last configuration update back to the control plane. If the configuration could not be taken into use for any reason, the agent's probe will fail, which eventually marks the agent as not being `Ready`. In this case, more details about the failure should be present among the latest messages logged by the agent or the probe itself. If the reason for the failure is a configuration error, then once the error is fixed, the agent should eventually become `Ready` again. ## Running as a Host Service To run the agent manually or as a `systemd` service, set the environment variable `NODE_NAME` to the name of the cluster node the agent is running on. If necessary, pass it the credentials for accessing the cluster using the `-kubeconfig <file>` command line option. ## ConfigMap to Node Mapping Conventions The agent monitors two ConfigMaps for the node, a primary node-specific one and a secondary group-specific or default one, depending on whether the node belongs to a configuration group. The node-specific ConfigMap always takes precedence over the others. The names of these ConfigMaps are: 1. `cri-resmgr-config.node.$NODE_NAME`: primary, node-specific configuration 2. `cri-resmgr-config.group.$GROUP_NAME`: secondary, group-specific node configuration 3. `cri-resmgr-config.default`: secondary, default node configuration You can assign a node to a configuration group by setting the `cri-resource-manager.intel.com/group` label on the node to the name of the configuration group. You can remove a node from its group by deleting the node group label.
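For example, the following illustrative node manifest fragment assigns a node to a hypothetical configuration group named `hi-perf`, which makes the agent use `cri-resmgr-config.group.hi-perf` as the secondary ConfigMap for that node:

```yaml
apiVersion: v1
kind: Node
metadata:
  name: worker-1                                    # hypothetical node name
  labels:
    cri-resource-manager.intel.com/group: hi-perf   # configuration group
```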
There is a [sample ConfigMap spec](/sample-configs/cri-resmgr-configmap.example.yaml) that contains a node-specific, a group-specific, and a default ConfigMap example. See [any available policy-specific documentation](policy/index.rst) for more information on the policy configurations. ================================================ FILE: docs/policy/balloons.md ================================================ # Balloons Policy ## Overview The balloons policy implements workload placement into "balloons" that are disjoint CPU pools. Balloons can be inflated and deflated, that is, CPUs can be added and removed, based on the CPU resource requests of containers. Balloons can be static, or they can be created and destroyed dynamically. CPUs in balloons can be configured, for example, by setting min and max frequencies on CPU cores and uncore. ## How It Works 1. The user configures balloon types from which the policy instantiates balloons. 2. A balloon has a set of CPUs and a set of containers that run on the CPUs. 3. Every container is assigned to exactly one balloon. A container is allowed to use all CPUs of its balloon and no other CPUs. 4. Every logical CPU belongs to at most one balloon. There can be CPUs that do not belong to any balloon. 5. The number of CPUs in a balloon can change during the lifetime of the balloon. If a balloon inflates, that is, CPUs are added to it, all containers in the balloon are allowed to use more CPUs. If a balloon deflates, the opposite is true. 6. When a new container is created on a Kubernetes node, the policy first decides the type of the balloon that will run the container. The decision is based on annotations of the pod, or on the namespace if annotations are not given. 7. Next, the policy decides which balloon of the decided type will run the container. The options are: - an existing balloon that already has enough CPUs to run its current and new containers - an existing balloon that can be inflated to fit its current and new containers - a new balloon. 8. When a CPU is added to a balloon or removed from it, the CPU is reconfigured based on the balloon's CPU class attributes, or the idle CPU class attributes. ## Deployment ### Install cri-resmgr Deploy cri-resmgr on each node as you would for any other policy. See [installation](../installation.md) for more details. ## Configuration The balloons policy is configured using the yaml-based configuration system of CRI-RM. See [setup and usage](../setup.md#setting-up-cri-resource-manager) for more details on managing the configuration. ### Parameters Balloons policy parameters: - `PinCPU` controls pinning a container to the CPUs of its balloon.
The default is `true`: the container cannot use other CPUs. - `PinMemory` controls pinning a container to the memories that are closest to the CPUs of its balloon. The default is `true`: allow using memory only from the closest NUMA nodes. Warning: this may cause the kernel to kill workloads due to an out-of-memory error when the closest NUMA nodes do not have enough memory. In this situation consider switching this option to `false`. - `IdleCPUClass` specifies the CPU class of those CPUs that do not belong to any balloon. - `ReservedPoolNamespaces` is a list of namespaces (wildcards allowed) that are assigned to the special reserved balloon, that is, they will run on reserved CPUs. This always includes the `kube-system` namespace. - `AllocatorTopologyBalancing` affects selecting CPUs for new balloons. If `true`, new balloons are created using CPUs on the NUMA node/die/package with the most free CPUs, that is, balloons are spread across the hardware topology. This makes it easier to inflate balloons within the same NUMA node/die/package and reduces interference between workloads in balloons when the system is not fully loaded. The default is `false`: pack new balloons tightly into the same NUMA nodes/dies/packages. This helps keep large portions of the hardware idle so they can enter deep power-saving states. - `PreferSpreadOnPhysicalCores` prefers allocating logical CPUs (possibly hyperthreads) for a balloon from separate physical CPU cores. This prevents workloads in the balloon from interfering with themselves, as they do not compete for the resources of the same CPU cores. On the other hand, it allows more interference between workloads in different balloons. The default is `false`: balloons are packed tightly onto a minimum number of physical CPU cores. The value set here is the default for all balloon types, but it can be overridden with the balloon-type-specific setting of the same name. - `BalloonTypes` is a list of balloon type definitions. Each type can be configured with the following parameters: - `Name` of the balloon type. This is used in pod annotations to assign containers to balloons of this type. - `Namespaces` is a list of namespaces (wildcards allowed) whose pods should be assigned to this balloon type, unless overridden by pod annotations. - `MinBalloons` is the minimum number of balloons of this type that is always present, even if the balloons would not have any containers. The default is 0: if a balloon has no containers, it can be destroyed. - `MaxBalloons` is the maximum number of balloons of this type that is allowed to co-exist. The default is 0: creating new balloons is not limited by the number of existing balloons. - `MaxCPUs` specifies the maximum number of CPUs in any balloon of this type. Balloons will not be inflated larger than this. 0 means unlimited. - `MinCPUs` specifies the minimum number of CPUs in any balloon of this type. When a balloon is created or deflated, it will always have at least this many CPUs, even if containers in the balloon request fewer. - `CpuClass` specifies the name of the CPU class according to which the CPUs of balloons are configured. - `PreferSpreadingPods`: if `true`, containers of the same pod should be spread to different balloons of this type. The default is `false`: prefer placing containers of the same pod in the same balloon(s). - `PreferPerNamespaceBalloon`: if `true`, containers in the same namespace will be placed in the same balloon(s). On the other hand, containers in different namespaces are preferably placed in different balloons.
The default is `false`: the namespace has no effect on choosing the balloon of this type. - `PreferNewBalloons`: if `true`, prefer creating new balloons over placing containers into existing balloons. This results in preferring exclusive CPUs, as long as there are enough free CPUs. The default is `false`: prefer filling and inflating existing balloons over creating new ones. - `ShareIdleCPUsInSame`: whenever the number or sizes of balloons change, idle CPUs (that do not belong to any balloon) are reshared as extra CPUs to the workloads in balloons with this option. The value sets the locality of the allowed extra CPUs that will be common to these workloads. - `system`: workloads are allowed to use idle CPUs available anywhere in the system. - `package`: ...allowed to use idle CPUs in the same package(s) (sockets) as the balloon. - `die`: ...in the same die(s) as the balloon. - `numa`: ...in the same NUMA node(s) as the balloon. - `core`: ...allowed to use idle CPU threads in the same cores as the balloon. - `PreferSpreadOnPhysicalCores` overrides the policy-level option of the same name in the scope of this balloon type. - `AllocatorPriority` (0: High, 1: Normal, 2: Low, 3: None). CPU allocator parameter, used when creating new or resizing existing balloons. If there are balloon types with pre-created balloons (`MinBalloons` > 0), balloons of the type with the highest `AllocatorPriority` are created first. Related configuration parameters: - `policy.ReservedResources.CPU` specifies the (number of) CPUs in the special `reserved` balloon. By default all containers in the `kube-system` namespace are assigned to the reserved balloon. - `cpu.classes` defines CPU classes and their parameters (such as `minFreq`, `maxFreq`, `uncoreMinFreq` and `uncoreMaxFreq`). ### Example Example configuration that runs all pods in balloons of 1-4 CPUs. ```yaml policy: Active: balloons ReservedResources: CPU: 1 balloons: PinCPU: true PinMemory: true IdleCPUClass: lowpower BalloonTypes: - Name: "quad" MinCPUs: 1 MaxCPUs: 4 CpuClass: dynamic Namespaces: - "*" cpu: classes: lowpower: minFreq: 800 maxFreq: 800 dynamic: minFreq: 800 maxFreq: 3600 turbo: minFreq: 3000 maxFreq: 3600 uncoreMinFreq: 2000 uncoreMaxFreq: 2400 ``` See the [sample configmap](/sample-configs/balloons-policy.cfg) for a complete example. ## Assigning a Container to a Balloon The balloon type of a container can be defined in pod annotations. In the example below, the first annotation sets the balloon type (`BT`) of a single container (`CONTAINER_NAME`). The last two annotations set the default balloon type for all containers in the pod. ```yaml balloon.balloons.cri-resource-manager.intel.com/container.CONTAINER_NAME: BT balloon.balloons.cri-resource-manager.intel.com/pod: BT balloon.balloons.cri-resource-manager.intel.com: BT ``` If a pod has no annotations, its namespace is matched to the `Namespaces` of the balloon types. The first matching balloon type is used. If the namespace does not match, the container is assigned to the special `default` balloon, which means reserved CPUs unless `MinCPUs` or `MaxCPUs` of the `default` balloon type are explicitly defined in the `BalloonTypes` configuration. ## Metrics and Debugging In order to enable more verbose logging and metrics exporting from the balloons policy, enable instrumentation and policy debugging in the CRI-RM global config: ```yaml instrumentation: # The balloons policy exports containers running in each balloon, # and cpusets of balloons.
Accessible in command line: # curl --silent http://localhost:8891/metrics HTTPEndpoint: :8891 PrometheusExport: true logger: Debug: policy ``` ================================================ FILE: docs/policy/blockio.md ================================================ # Block IO ## Overview The Block IO controller provides means to control: - block device IO scheduling priority (weight) - throttling IO bandwidth - throttling the number of IO operations. CRI Resource Manager applies block IO controller parameters to pods via the [cgroups block io controller](https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v1/blkio-controller.html). ## Configuration See the [sample blockio configuration](/sample-configs/blockio.cfg). ## Demo See the [Block IO demo](../demos/blockio.md) ================================================ FILE: docs/policy/container-affinity.md ================================================ # Container Affinity and Anti-Affinity ## Introduction Some policies allow the user to give hints about how particular containers should be *co-located* within a node. In particular these hints express whether containers should be located *'close'* to each other or *'far away'* from each other, in a hardware topology sense. Since these hints are always interpreted by a particular *policy implementation*, the exact definitions of 'close' and 'far' are also somewhat *policy-specific*. However, as a general rule of thumb, containers running - on CPUs within the *same NUMA nodes* are considered *'close'* to each other, - on CPUs within *different NUMA nodes* in the *same socket* are *'farther'*, and - on CPUs within *different sockets* are *'far'* from each other. These hints are expressed by `container affinity annotations` on the Pod. There are two types of affinities: - `affinity` (or `positive affinity`): causes affected containers to *pull* each other closer - `anti-affinity` (or `negative affinity`): causes affected containers to *push* each other further away Policies try to place a container - close to those containers it has affinity towards - far from those containers it has anti-affinity towards. ## Affinity Annotation Syntax *Affinities* are defined as the `cri-resource-manager.intel.com/affinity` annotation. *Anti-affinities* are defined as the `cri-resource-manager.intel.com/anti-affinity` annotation. They are specified in the `metadata` section of the `Pod YAML`, under `annotations` as a dictionary, with each dictionary key being the name of the *container* within the Pod to which the annotation belongs. ```yaml metadata: annotations: cri-resource-manager.intel.com/affinity: | container1: - scope: key: key-ref operator: op values: - value1 ... - valueN match: key: key-ref operator: op values: - value1 ... - valueN weight: w ``` An anti-affinity is defined similarly but using `cri-resource-manager.intel.com/anti-affinity` as the annotation key. ```yaml metadata: annotations: cri-resource-manager.intel.com/anti-affinity: | container1: - scope: key: key-ref operator: op values: - value1 ... - valueN match: key: key-ref operator: op values: - value1 ...
- valueN weight: w ``` ## Affinity Semantics An affinity consists of three parts: - `scope expression`: defines which containers this affinity is evaluated against - `match expression`: defines the containers (within the scope) to which the affinity applies - `weight`: defines how *strong* a pull or a push the affinity causes *Affinities* are also sometimes referred to as *positive affinities* while *anti-affinities* are referred to as *negative affinities*. The reason for this is that the only difference between the two is that affinities have a *positive weight* while anti-affinities have a *negative weight*. The *scope* of an affinity defines the *bounding set of containers* the affinity can apply to. The affinity *expression* is evaluated against the containers *in scope* and it *selects the containers* the affinity really has an effect on. The *weight* specifies whether the effect is a *pull* or a *push*. *Positive* weights cause a *pull* while *negative* weights cause a *push*. Additionally, the *weight* specifies *how strong* the push or the pull is. This is useful in situations where the policy needs to make some compromises because an optimal placement is not possible. The weight then also acts as a way to specify preferences, or priorities, between the various compromises: the heavier the weight, the stronger the pull or push, and the larger the probability that it will be honored, if this is possible at all. The scope can be omitted from an affinity, in which case it implies *Pod scope*, in other words the scope of all containers that belong to the same Pod as the container for which the affinity is defined. The weight can also be omitted, in which case it defaults to -1 for anti-affinities and +1 for affinities. Weights are currently limited to the range [-1000,1000]. Both the affinity scope and the match select containers, and they share an identical syntax: both are *expressions*. An expression consists of three parts: - key: specifies what *metadata* to pick from a container for evaluation - operation (op): specifies what *logical operation* the expression evaluates - values: a set of *strings* to evaluate the value of the key against The supported keys are: - for pods: - `name` - `namespace` - `qosclass` - `labels/` - `id` - `uid` - for containers: - `pod/` - `name` - `namespace` - `qosclass` - `labels/` - `tags/` - `id` Essentially an expression defines a logical operation of the form (key op values). Evaluating this expression takes the value of the key and evaluates the operation against the given values, yielding a boolean true/false result.
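As a simple illustration (the namespace value `monitoring` is hypothetical), an expression that evaluates to true for all containers of pods in the `monitoring` namespace would look like this, using the `Equals` operation described below:

```yaml
key: namespace
operator: Equals
values:
  - monitoring
```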
Currently the following operations are supported: - `Equals`: equality, true if the *value of key* equals the single item in *values* - `NotEqual`: inequality, true if the *value of key* is not equal to the single item in *values* - `In`: membership, true if the *value of key* equals any item among *values* - `NotIn`: negated membership, true if the *value of key* is not equal to any item among *values* - `Exists`: true if the given *key* exists with any value - `NotExists`: true if the given *key* does not exist - `AlwaysTrue`: always evaluates to true, can be used to denote node-global scope (all containers) - `Matches`: true if the *value of key* matches the globbing pattern in values - `MatchesNot`: true if the *value of key* does not match the globbing pattern in values - `MatchesAny`: true if the *value of key* matches any of the globbing patterns in values - `MatchesNone`: true if the *value of key* does not match any of the globbing patterns in values The effective affinity between containers C_1 and C_2, A(C_1, C_2), is the sum of the weights of all pairwise in-scope matching affinities W(C_1, C_2). To put it another way, evaluating an affinity for a container C_1 is done by first using the scope (expression) to determine which containers are in the scope of the affinity. Then, for each in-scope container C_2 for which the match expression evaluates to true, the weight of the affinity is added to the effective affinity A(C_1, C_2). Note that currently (for the topology-aware policy) this evaluation is asymmetric: A(C_1, C_2) and A(C_2, C_1) can and will be different unless the affinity annotations are crafted to prevent this (by making them fully symmetric). Moreover, A(C_1, C_2) is calculated and taken into consideration during resource allocation for C_1, while A(C_2, C_1) is calculated and taken into account during resource allocation for C_2. This might be changed in a future version. Currently affinity expressions lack support for boolean operators (and, or, not). Sometimes this limitation can be overcome by using joint keys, especially with matching operators. The joint key syntax allows joining the values of several keys with a separator into a single value. A joint key can be specified in a simple or full format: - simple: a plain colon-separated list of subkeys; this is equivalent to the full format with `:` as both the subkey separator and the joining separator - full: a format in which the subkey separator and the joining separator are given explicitly as a prefix, followed by the separated list of subkeys A joint key evaluates to the values of all of its subkeys joined by the joining separator. A non-existent subkey evaluates to the empty string. For instance, the joint key `:pod/qosclass:pod/name:name` evaluates to the pod QoS class, the pod name, and the container name joined by colons. For existence operators, a joint key is considered to exist if any of its subkeys exists. ## Examples Put the container `peter` close to the container `sheep` but far away from the container `wolf`. ```yaml metadata: annotations: cri-resource-manager.intel.com/affinity: | peter: - match: key: name operator: Equals values: - sheep weight: 5 cri-resource-manager.intel.com/anti-affinity: | peter: - match: key: name operator: Equals values: - wolf weight: 5 ```
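As a further, illustrative sketch combining a joint key with a globbing operator (the pod name pattern `sheep*` is hypothetical), the following annotation pulls `peter` towards any Guaranteed-class container of a pod whose name starts with `sheep`:

```yaml
metadata:
  annotations:
    cri-resource-manager.intel.com/affinity: |
      peter:
      - match:
          key: :pod/qosclass:pod/name:name
          operator: Matches
          values:
            - "Guaranteed:sheep*:*"
        weight: 5
```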
## Shorthand Notation There is an alternative shorthand syntax for what is considered to be the most common case: defining affinities between containers within the same pod. With this notation one needs to give just the names of the containers, like in the example below. ```yaml annotations: cri-resource-manager.intel.com/affinity: | container3: [ container1 ] cri-resource-manager.intel.com/anti-affinity: | container3: [ container2 ] container4: [ container2, container3 ] ``` This shorthand notation defines: - `container3` having - affinity (weight 1) to `container1` - `anti-affinity` (weight -1) to `container2` - `container4` having - `anti-affinity` (weight -1) to `container2`, and `container3` The equivalent annotation in full syntax would be: ```yaml metadata: annotations: cri-resource-manager.intel.com/affinity: |+ container3: - match: key: labels/io.kubernetes.container.name operator: In values: - container1 cri-resource-manager.intel.com/anti-affinity: |+ container3: - match: key: labels/io.kubernetes.container.name operator: In values: - container2 container4: - match: key: labels/io.kubernetes.container.name operator: In values: - container2 - container3 ``` ================================================ FILE: docs/policy/cpu-allocator.md ================================================ # CPU Allocator CRI Resource Manager has a separate CPU allocator component that helps policies make educated allocations of CPU cores for workloads. Currently all policies except [static-pools](static-pools.md) utilize the built-in CPU allocator. See the policy-specific documentation for more details. ## Topology Based Allocation The CPU allocator tries to optimize the allocation of CPUs in terms of the hardware topology. More specifically, it aims at packing all CPUs of one request "near" each other in order to minimize memory latencies between CPUs. ## CPU Prioritization The CPU allocator also does automatic CPU prioritization by detecting CPU features and their configuration parameters. Currently, CRI Resource Manager supports CPU priority detection based on the `intel_pstate` scaling driver in the Linux CPUFreq subsystem and Intel Speed Select Technology (SST). CPUs are divided into three priority classes, i.e. *high*, *normal* and *low*. Policies utilizing the CPU allocator may choose to prefer a certain priority class for certain types of workloads. For example, prefer (and preserve) high priority CPUs for high priority workloads. ### Intel Speed Select Technology (SST) CRI Resource Manager supports detection of all Intel Speed Select Technology (SST) features, i.e. Speed Select Technology Performance Profile (SST-PP), Base Frequency (SST-BF), Turbo Frequency (SST-TF) and Core Power (SST-CP). CPU prioritization is based on detection of the currently active SST features and their parameterization: 1. If SST-TF has been enabled, all CPUs prioritized by SST-TF are flagged as high priority. 1. If SST-CP is enabled but SST-TF is disabled, the CPU allocator examines the active Classes of Service (CLOSes) and their parameters. CPUs associated with the highest priority CLOS will be flagged as high priority, CPUs associated with the lowest priority CLOS will be flagged as low priority, and a possible "middle priority" CLOS as normal priority. 1. If SST-BF has been enabled and SST-TF and SST-CP are inactive, all BF high priority cores (having a higher guaranteed base frequency) will be flagged as high priority. ### Linux CPUFreq CPUFreq based prioritization only takes effect if Intel Speed Select Technology (SST) is disabled (or not supported). CRI-RM divides CPU cores into priority classes based on two parameters: - base frequency - EPP (Energy-Performance Preference) CPU cores with a high base frequency (relative to the other cores in the system) will be flagged as high priority.
Low base frequency will map to low priority, correspondingly. CPU cores with a high EPP priority (relative to the other cores in the system) will be marked as high priority cores. ================================================ FILE: docs/policy/dynamic-pools.md ================================================ # Dynamic-Pools Policy ## Overview The dynamic-pools policy places workloads into different dynamic-pools. Each dynamic-pool contains several CPUs and can be resized dynamically by a specific algorithm. The main idea of the algorithm is that, provided the CPUs in each dynamic-pool can satisfy the requests of the pods in that pool, CPUs are allocated based on the CPU utilization of the workload. The dynamic-pools policy tries to keep CPU utilization balanced. CPUs in dynamic-pools can be configured, for example, by setting min and max frequencies on CPU cores and uncore. ## How It Works 1. The user configures the dynamic-pool types from which the policy instantiates dynamic-pools. In addition to the dynamic-pools configured by the user, there is also a built-in dynamic-pool, the shared pool. 2. A dynamic-pool has a set of CPUs and a set of containers running on the CPUs. 3. Every container is assigned to a dynamic-pool. The dynamic-pools policy allows a container to use all CPUs of its pool and no other CPUs. 4. Each logical CPU belongs to exactly one dynamic-pool. There cannot be CPUs that do not belong to any dynamic-pool. 5. The number of CPUs in a dynamic-pool can change. If CPUs are added to a dynamic-pool, then all containers in the dynamic-pool can use more CPUs. The opposite is true if CPUs are removed. 6. As CPUs are added to or removed from a dynamic-pool, the CPUs are reconfigured according to the dynamic-pool's CPU class attributes or the idle CPU class attributes. 7. Updating the number of CPUs in dynamic-pools: - The dynamic-pools policy needs to update the number of CPUs in dynamic-pools when starting the policy, creating pods, deleting pods, updating configurations, and at regular intervals. - The number of CPUs in the dynamic-pools is determined by the requests of the containers and the CPU utilization in the dynamic-pools. - The number of CPUs allocated to each dynamic-pool is the sum of the requests of the containers in the dynamic-pool and the CPUs allocated based on the CPU utilization of the workload. 8. When a new container is created on a Kubernetes node, the policy first decides the type of the dynamic-pool that will run the container. The decision is based on the annotations of the pod, or on the namespace if annotations are not given. ## Deployment ### Install cri-resmgr Deploy cri-resmgr on each node as you would for any other policy. See [installation](https://intel.github.io/cri-resource-manager/stable/docs/installation.html) for more details. ## Configuration The dynamic-pools policy is configured using the yaml-based configuration system of CRI-RM. See [setup and usage](https://intel.github.io/cri-resource-manager/stable/docs/setup.html#setting-up-cri-resource-manager) for more details on managing the configuration. ### Parameters Dynamic-pools policy parameters: * `PinCPU` controls pinning a container to the CPUs of its dynamic-pool. The default is `true`: the container cannot use other CPUs. * `PinMemory` controls pinning a container to the memories that are closest to the CPUs of its dynamic-pool. The default is `true`: allow using memory only from the closest NUMA nodes.
Warning: this may cause the kernel to kill workloads due to an out-of-memory error when the closest NUMA nodes do not have enough memory. In this situation consider switching this option to `false`. * `ReservedPoolNamespaces` is a list of namespaces (wildcards allowed) that are assigned to the special reserved dynamic-pool, that is, they will run on reserved CPUs. This always includes the `kube-system` namespace. * `DynamicPoolTypes` is a list of dynamic-pool type definitions. Each type can be configured with the following parameters: - `Name` of the dynamic-pool type. This is used in pod annotations to assign containers to dynamic-pools of this type. - `Namespaces` is a list of namespaces (wildcards allowed) whose pods should be assigned to this dynamic-pool type, unless overridden by pod annotations. - `CpuClass` specifies the name of the CPU class according to which the CPUs of dynamic-pools are configured. - `AllocatorPriority` (0: High, 1: Normal, 2: Low, 3: None). CPU allocator parameter, used when creating new or resizing existing dynamic-pools. Related configuration parameters: * `policy.ReservedResources.CPU` specifies the (number of) CPUs in the special `reserved` dynamic-pool. By default all containers in the `kube-system` namespace are assigned to the reserved dynamic-pool. * `policy.AvailableResources.CPU` specifies the CPUs that can be used by the policy, including `policy.ReservedResources.CPU`. * `cpu.classes` defines CPU classes and their parameters (such as `minFreq`, `maxFreq`, `uncoreMinFreq` and `uncoreMaxFreq`). ### Example ```yaml cpu: classes: pool1-cpuclass: minFreq: 1500000 maxFreq: 2000000 pool2-cpuclass: minFreq: 2000000 maxFreq: 2500000 policy: Active: dynamic-pools ReservedResources: CPU: cpuset:0 dynamic-pools: PinCPU: true PinMemory: true DynamicPoolTypes: - Name: "pool1" Namespaces: - "pool1" CpuClass: "pool1-cpuclass" - Name: "pool2" Namespaces: - "pool2" CpuClass: "pool2-cpuclass" ``` ### Update Dynamic-Pools at Regular Intervals The dynamic-pools policy can update the CPU allocation at regular intervals, based on the CPU utilization of the workload in each pool. Use the `--rebalance-interval` option to set the interval. ### Assigning a Container to a Dynamic-pool The dynamic-pool type of a container can be defined in pod annotations. In the example below, the first annotation sets the dynamic-pool type (`DPT`) of a single container (`CONTAINER_NAME`). The last two annotations set the default dynamic-pool type for all containers in the pod. ```yaml dynamic-pool.dynamic-pools.cri-resource-manager.intel.com/container.CONTAINER_NAME: DPT dynamic-pool.dynamic-pools.cri-resource-manager.intel.com/pod: DPT dynamic-pool.dynamic-pools.cri-resource-manager.intel.com: DPT ``` If a pod has no annotations, its namespace is matched to the `Namespaces` of the dynamic-pool types. The first matching dynamic-pool type is used. If the namespace does not match, the container is assigned to the `shared` dynamic-pool. ## Metrics and Debugging In order to enable more verbose logging and metrics exporting from the dynamic-pools policy, enable instrumentation and policy debugging in the CRI-RM global config: ```yaml instrumentation: # The dynamic-pools policy exports containers running in each dynamic-pool, # and cpusets of dynamic-pools. Accessible in command line: # curl --silent http://localhost:8891/metrics HTTPEndpoint: :8891 PrometheusExport: true logger: Debug: policy ``` Use the `--metrics-interval` option to set the interval for updating metrics data.
================================================
FILE: docs/policy/index.rst
================================================
Policies
########

.. toctree::
   :maxdepth: 1

   topology-aware.md
   static-pools.md
   balloons.md
   podpools.md
   container-affinity.md
   blockio.md
   rdt.md
   cpu-allocator.md
   dynamic-pools.md

================================================
FILE: docs/policy/podpools.md
================================================
# Podpools Policy

## Overview

The podpools policy implements pod-level workload placement. It assigns all containers of a pod to the same CPU/memory pool. The number of CPUs in a pool is configurable by the user.

## Deployment

### Install cri-resmgr

Deploy cri-resmgr on each node as you would for any other policy. See [installation](../installation.md) for more details.

## Configuration

The policy is configured using the yaml-based configuration system of CRI-RM. See [setup and usage](../setup.md#setting-up-cri-resource-manager) for more details on managing the configuration.

At minimum, you need to specify the active policy in the configuration, and define at least one pod pool. For example, the following configuration dedicates 95% of the non-reserved CPUs on the node to `dualcpu` pools. Every pool instance (`dualcpu[0]`, `dualcpu[1]`, ...) contains two exclusive CPUs and has a capacity (`MaxPods`) of one pod. The CPUs are used only by containers of pods assigned to the pool. The remaining CPUs will be used for running pods that are not `dualcpu` or `kube-system` pods.

```yaml
policy:
  Active: podpools
  ReservedResources:
    CPU: 1
  podpools:
    Pools:
      - Name: dualcpu
        CPU: 2
        MaxPods: 1
        Instances: 95%
```

Note that the configuration above allocates two exclusive CPUs for each pod assigned to the pool. To align with kube-scheduler resource accounting, the requested CPUs of all containers in such pods must sum up to CPU/MaxPods, that is, 2000m CPU in this case.

See the [sample configmap](/sample-configs/podpools-policy.cfg) for a complete example.

### Debugging

In order to enable more verbose logging for the podpools policy, enable policy debug from the CRI-RM global config:

```yaml
logger:
  Debug: policy
```

## Running Pods in Podpools

The podpools policy assigns a pod to a pod pool instance if the pod has the annotation

```yaml
pool.podpools.cri-resource-manager.intel.com: POOLNAME
```

The following Pod runs in a `dualcpu` pool. This example assumes that `dualcpu` pools include two CPUs per pod, as in the above configuration example. Therefore, the containers in the yaml request 2000m CPU in total.

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: podpools-test
  annotations:
    pool.podpools.cri-resource-manager.intel.com: dualcpu
spec:
  containers:
    - name: testcont0
      image: busybox
      command:
        - "sh"
        - "-c"
        - "while :; do grep _allowed_list /proc/self/status; sleep 5; done"
      resources:
        requests:
          cpu: 1200m
    - name: testcont1
      image: busybox
      command:
        - "sh"
        - "-c"
        - "while :; do grep _allowed_list /proc/self/status; sleep 5; done"
      resources:
        requests:
          cpu: 800m
```

If a pod is not annotated to run in any specific pod pool and it is not a `kube-system` pod, it will run on shared CPUs. Shared CPUs include the left-over CPUs after creating user-defined pools. If all CPUs were allocated to other pools, reserved CPUs will be used as shared, too.

================================================
FILE: docs/policy/rdt.md
================================================
# RDT (Intel® Resource Director Technology)

## Background

Intel® RDT provides capabilities for cache and memory allocation and monitoring.
On Linux, the functionality is exposed to user space via the [resctrl](https://docs.kernel.org/x86/resctrl.html) filesystem.

Cache and memory allocation in RDT is handled by using resource control groups. Resource allocation is specified on the group level and each task (process/thread) is assigned to one group. In the context of CRI Resource Manager, we use the term 'RDT class' instead of 'resource control group'.

CRI Resource Manager supports all available RDT technologies, i.e. L2 and L3 Cache Allocation (CAT) with Code and Data Prioritization (CDP) and Memory Bandwidth Allocation (MBA), plus Cache Monitoring (CMT) and Memory Bandwidth Monitoring (MBM).

## Overview

RDT configuration in CRI-RM is class-based. Each container gets assigned to an RDT class. In turn, all processes of the container will be assigned to the RDT Class of Service (CLOS) (under `/sys/fs/resctrl`) corresponding to the RDT class. CRI-RM will configure the CLOSes according to its configuration at startup or whenever the configuration changes.

CRI-RM maintains a direct mapping between Pod QoS classes and RDT classes. If RDT is enabled, CRI-RM tries to assign containers into an RDT class with a name matching their Pod QoS class. This default behavior can be overridden with pod annotations.

## Class Assignment

By default, containers get an RDT class with the same name as their Pod QoS class (Guaranteed, Burstable or BestEffort). If the RDT class is missing, the container will be assigned to the system root class.

The default behavior can be overridden with pod annotations:

- `rdtclass.cri-resource-manager.intel.com/pod: <class-name>` specifies a pod-level default that will be used for all containers of a pod
- `rdtclass.cri-resource-manager.intel.com/container.<container-name>: <class-name>` specifies a container-specific assignment, taking preference over a possible pod-level annotation (above)

With pod annotations it is possible to specify RDT classes other than Guaranteed, Burstable or BestEffort.

The default assignment could also be overridden by a policy, but currently none of the builtin policies do that.

## Configuration

### Operating Modes

The RDT controller supports three operating modes, controlled by the `rdt.options.mode` configuration option.

- Disabled: The RDT controller is effectively disabled; containers will not be assigned to classes and no monitoring groups will be created. Upon activation of this mode, all CRI-RM specific control and monitoring groups are removed from the resctrl filesystem.
- Discovery: The RDT controller detects existing non-CRI-RM specific classes from the resctrl filesystem and uses these. The configuration of the discovered classes is considered read-only and will not be altered. Upon activation of this mode, all CRI-RM specific control groups are removed from the resctrl filesystem.
- Full: Full operating mode. The controller manages the configuration of the resctrl filesystem according to the RDT class definitions in the CRI-RM configuration. This is the default operating mode.

### RDT Classes

The RDT class configuration in CRI-RM is a two-level hierarchy consisting of partitions and classes. It specifies a set of partitions, each having a set of classes.

#### Partitions

Partitions represent a logical grouping of the underlying classes, each partition specifying a portion of the available resources (L2/L3/MB) which will be shared by the classes under it. Partitions guarantee non-overlapping exclusive cache allocation, i.e. no overlap on the cache ways between partitions is allowed. However, by the nature of the technology, MB allocations are not exclusive.
Thus, it is possible to assign all partitions 100% of memory bandwidth, for example.

#### Classes

Classes represent the actual RDT classes containers are assigned to. In contrast to partitions, cache allocation between classes under a specific partition may overlap (and they usually do).

The set of RDT classes can be freely specified, but it should be ensured that classes corresponding to the Pod QoS classes are specified. Also, the maximum number of classes (CLOSes) supported by the underlying hardware must not be exceeded.

### Example

Below is a config snippet that would allocate 60% of the L3 cache lines exclusively to the Guaranteed class. The remaining 40% of L3 is for Burstable and BestEffort, with BestEffort getting only 50% of that share. The Guaranteed class gets full memory bandwidth whereas the other classes are throttled to 50%.

```yaml
rdt:
  # Common options
  options:
    # One of Full, Discovery or Disabled
    mode: Full
    # Set to true to disable creation of monitoring groups
    monitoringDisabled: false
    l3:
      # Make this false if L3 CAT must be available
      optional: true
    mb:
      # Make this false if MBA must be available
      optional: true

  # Configuration of classes
  partitions:
    exclusive:
      # Allocate 60% of all L3 cache to the "exclusive" partition
      l3Allocation: "60%"
      mbAllocation: ["100%"]
      classes:
        Guaranteed:
          # Allocate all of the partition's cache lines to "Guaranteed"
          l3Allocation: "100%"
    shared:
      # Allocate 40% of all L3 cache to the "shared" partition
      # These will NOT overlap with the cache lines allocated for "exclusive" partition
      l3Allocation: "40%"
      mbAllocation: ["50%"]
      classes:
        Burstable:
          # Allow "Burstable" to use all cache lines of the "shared" partition
          l3Allocation: "100%"
        BestEffort:
          # Allow "BestEffort" to use only half of the L3 cache
          # lines of the "shared" partition.
          # These will overlap with those used by "Burstable"
          l3Allocation: "50%"
```

The configuration also supports far more fine-grained control, e.g. per cache-ID configuration (i.e. different sockets having different allocation) and Code and Data Prioritization (CDP) allowing different cache allocation for code and data paths. If the hardware details are known, raw bitmasks or bit numbers ("0x1f" or 0-4) can be used instead of percentages in order to configure cache allocations exactly as required. For a detailed description of the RDT configuration format with examples, see the {{ '[goresctrl library documentation](https://github.com/intel/goresctrl/blob/{}/doc/rdt.md)'.format(goresctrl_version) }}

See `rdt` in the [example ConfigMap spec](/sample-configs/cri-resmgr-configmap.example.yaml) for another example configuration.

### Dynamic Configuration

RDT supports dynamic configuration, i.e. the resctrl filesystem is reconfigured whenever a configuration update is received, e.g. via the [Node Agent](../node-agent.md). However, the configuration update is rejected if it is incompatible with the set of currently running containers - e.g. the new config is missing a class that a running container has been assigned to.
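To tie this together with the Class Assignment section above, a Pod could request a specific RDT class roughly as sketched below. The pod and container names are hypothetical; the class names come from the example configuration above:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: rdt-test   # hypothetical name
  annotations:
    # pod-level default RDT class for all containers
    rdtclass.cri-resource-manager.intel.com/pod: Burstable
    # container-specific override for container "cont1"
    rdtclass.cri-resource-manager.intel.com/container.cont1: Guaranteed
spec:
  containers:
    - name: cont1
      image: busybox
      command: ["sh", "-c", "sleep inf"]
```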
================================================
FILE: docs/policy/static-pools.md
================================================
# Static-Pools (STP) Policy

## Overview

The `static-pools` (STP) builtin policy was inspired by [CMK (CPU Manager for Kubernetes)][cmk]. It is an example policy demonstrating the capabilities of `cri-resource-manager` and is not considered production ready. Basically, the STP policy aims to replicate the functionality of the `cmk isolate` command of CMK. It also has compatibility features to function as a drop-in replacement in order to allow easier testing and prototyping.

Features:

- arbitrary number of configurable CPU list pools
- dynamic configuration updates via the [node agent](../node-agent.md)

Please see the documentation of [CMK][cmk] for a more detailed description of the terminology and functionality.

CMK compatibility features:

- supports the same environment variables as the original CMK, except for:
  - `CMK_LOCK_TIMEOUT` and `CMK_PROC_FS`: configuration variables that are not applicable in the cri-resmgr context
  - `CMK_LOG_LEVEL`: not implemented yet
  - `CMK_NUM_CORES`: not needed in cri-resmgr as we take this value directly from the container resource request
- supports the existing configuration directory format of CMK for retrieving the pool configuration
- parses the container command/args in an attempt to retrieve command line options of `cmk isolate`
- supports generating the CMK-specific node label and taint (off by default)

## Deployment

### Install cri-resmgr

Deploy cri-resmgr on each node as you would for any other policy. See [installation](../installation.md) for more details.

### Deploy Node Agent

The CRI-RM node agent is required in order to communicate with the Kubernetes control plane. In particular, the STP policy needs this capability for updating the extended resource (that represents exclusive cores) as well as managing the legacy CMK node annotation and taint. In addition, the node agent enables dynamic configuration updates.

See [node agent](../node-agent.md) for detailed instructions for set-up and usage.

### Deploy Admission Webhook

You need to run and enable the cri-resmgr mutating admission webhook which creates pod annotations consumed by CRI-RM. This is required so that the STP policy is able to inspect the extended resources (in this case, exclusive CPU cores) requested by containers. See the [webhook](../webhook.md) documentation for instructions on how to set it up.

## Configuration

The policy is configured using the yaml-based configuration system of CRI-RM. See [setup and usage](../setup.md#setting-up-cri-resource-manager) for more details on managing the configuration.

At minimum, you need to specify the active policy in the configuration. Policy-specific options control the pool configuration and the legacy node label and taint.

```yaml
policy:
  Active: static-pools
  static-pools:
    # Set to true to create CMK node label
    #LabelNode: false
    # Set to true to create CMK node taint
    #TaintNode: false
    ...
```

See the [sample configmap](/sample-configs/cri-resmgr-configmap.example.yaml) for a complete example containing all available configuration options.

If dynamic configuration via the [node agent](../node-agent.md) is in use, the policy options, including the pools configuration, may be altered at runtime. **NOTE**: the active policy (`policy.Active`) cannot be changed at runtime. In order to change the active policy, cri-resmgr needs to be restarted.

### Pools Configuration

There are three possible sources of the pools configuration, in decreasing priority order:

1. CRI-RM global config
1. stand-alone static-pools config file
1. CMK directory tree

The configuration is fully evaluated whenever a re-configuration event is received (e.g. from the node agent). Thus, a valid pools config appearing in the CRI-RM global config will take precedence over a directory tree based config that was previously active. Similarly, removing the pools config from the CRI-RM global config will make a local config (file or directory tree) effective.
**NOTE:** cri-resmgr does not have any utility for generating a pool configuration. Thus, you need to either write one manually, or run the `cmk init` command (of the original CMK) in order to create a legacy configuration directory structure.

#### Global Config

Configuration from the global CRI-RM config takes the highest precedence, if specified (under `policy.static-pools.pools`). A referential example:

```yaml
policy:
  static-pools:
    pools:
      exclusive:
        exclusive: true
        cpuLists:
        ...
      shared:
        cpuLists:
        ...
      infra:
        cpuLists:
        ...
```

#### Stand-alone YAML File

The path to a stand-alone configuration file can be specified with the `policy.static-pools.ConfFilePath` option (empty by default) in the CRI-RM global config:

```yaml
policy:
  static-pools:
    ConfFilePath: "/path/to/conf.yaml"
```

The format of the configuration file is similar to the pools config used in the global CRI-RM config. You can also see the [example config file](/sample-configs/static-pools-policy.conf.example) for a starting point.

#### CMK Directory Tree

The STP policy also supports the configuration directory format of the original CMK. It reads the configuration from a location specified by the `policy.static-pools.ConfFileDir` field (`/etc/cmk` by default) in the CRI-RM global config:

```yaml
policy:
  static-pools:
    ConfFileDir: "/etc/cmk"
```

### Debugging

In order to enable more verbose logging for the STP policy, set the `LOGGER_DEBUG=static-pools` environment variable or enable debug from the CRI-RM global config:

```yaml
logger:
  Debug: static-pools
```

## Running Workloads

The preferred way to specify the pod configuration is through environment variables. However, exclusive cores must be reserved by making a request of the `cmk.intel.com/exclusive-cores` extended resource. The name of the extended resource has a `cmk` prefix in order to provide backwards compatibility with the original CMK.

### Pod Configuration Using Env Variables

The following environment variables are recognized:

| Name            | Description                                                  |
| --------------- | ------------------------------------------------------------ |
| STP_NO_AFFINITY | Do not set cpu affinity. The workload is responsible for reading the `CMK_CPUS_ASSIGNED` environment variable and setting the affinity itself. |
| STP_POOL        | Name of the pool to run in. |
| STP_SOCKET_ID   | Socket where cores should be allocated. Set to -1 to accept any socket. |

An example Pod spec for running a workload in the `exclusive` pool with one core reserved from socket id 0:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: stp-test
spec:
  containers:
    - name: stp-test
      image: busybox
      env:
        - name: STP_POOL
          value: "exclusive"
        - name: STP_SOCKET_ID
          value: "0"
      command:
        - "sh"
        - "-c"
        - "while :; do echo ASSIGNED: $CMK_CPUS_ASSIGNED; sleep 1; done"
      resources:
        requests:
          cmk.intel.com/exclusive-cores: "1"
        limits:
          cmk.intel.com/exclusive-cores: "1"
```

### Backwards Compatibility for `cmk isolate`

The STP policy parses the container command/args in an attempt to retrieve the Pod configuration (from `cmk isolate` options). This is done to provide backwards compatibility with existing CMK workload specs. It manipulates the container command and args so that `cmk isolate` and all its arguments are removed. In the example below, the STP policy will run `sh -c "sleep 10000"` in the `infra` pool.
```yaml
apiVersion: v1
kind: Pod
metadata:
  name: cmk-test
spec:
  containers:
    - name: cmk-test
      image: busybox
      command:
        - "sh"
        - "-c"
      args:
        - "/opt/bin/cmk isolate --conf-dir=/etc/cmk --pool=infra sleep 10000"
```

[cmk]: https://github.com/intel/CPU-Manager-for-Kubernetes

================================================
FILE: docs/policy/topology-aware.md
================================================
# Topology-Aware Policy

## Background

On server-grade hardware the CPU cores, I/O devices and other peripherals form a rather complex network together with the memory controllers, the I/O bus hierarchy and the CPU interconnect. When a combination of these resources is allocated to a single workload, the performance of that workload can vary greatly, depending on how efficiently data is transferred between them or, in other words, on how well the resources are aligned.

There are a number of inherent architectural hardware properties that, unless properly taken into account, can cause resource misalignment and workload performance degradation. There are a multitude of CPU cores available to run workloads. There are a multitude of memory controllers these workloads can use to store and retrieve data from main memory. There are a multitude of I/O devices attached to a number of I/O buses the same workloads can access. The CPU cores can be divided into a number of groups, with each group having different access latency and bandwidth to each memory controller and I/O device.

If a workload is not assigned to run with a properly aligned set of CPU, memory and devices, it will not be able to achieve optimal performance. Given the idiosyncrasies of hardware, allocating a properly aligned set of resources for optimal workload performance requires identifying and understanding the multiple dimensions of access latency locality present in hardware or, in other words, hardware topology awareness.

## Overview

The `topology-aware` policy automatically builds a tree of pools based on the detected hardware topology. Each pool has a set of CPUs and memory zones assigned as its resources. Resource allocation for workloads happens by first picking the pool which is considered the best fit for the resource requirements of the workload and then assigning CPU and memory from this pool.

The pool nodes at various depths from bottom to top represent the NUMA nodes, dies, sockets, and finally the whole of the system at the root node. Leaf NUMA nodes are assigned the memory behind their controllers / zones and the CPU cores with the smallest distance / access penalty to this memory. If the machine has multiple types of memory separately visible to both the kernel and user space, for instance both DRAM and [PMEM](https://www.intel.com/content/www/us/en/products/memory-storage/optane-dc-persistent-memory.html), each zone of a special memory type is assigned to the closest NUMA node pool.

Each non-leaf pool node in the tree is assigned the union of the resources of its children. So in practice, die nodes end up containing all the CPU cores and memory zones in the corresponding die, socket nodes end up containing the CPU cores and memory zones in the corresponding socket's dies, and the root node ends up containing all CPU cores and memory zones in all sockets.

With this setup, each pool in the tree has a topologically aligned set of CPU and memory resources. The amount of available resources gradually increases in the tree from bottom to top, while the strictness of alignment is gradually relaxed.
In other words, as one moves from bottom to top in the tree, it gets gradually easier to fit in a workload, but the price paid for this is a gradually increasing maximum potential cost or penalty for memory access and data transfer between CPU cores.

Another property of this setup is that the resource sets of sibling pools at the same depth in the tree are disjoint, while the resource sets of descendant pools along the same path in the tree partially overlap, with the intersection decreasing as the distance between pools increases. This makes it easy to isolate workloads from each other. As long as workloads are assigned to pools which have no common ancestor other than the root, the resources of these workloads should be as well isolated from each other as possible on the given hardware.

With such an arrangement, this policy should handle topology-aware alignment of resources without any special or extra configuration. When allocating resources, the policy

- filters out all pools with insufficient free capacity
- runs a scoring algorithm for the remaining ones
- picks the one with the best score
- assigns resources to the workload from there

Although the details of the scoring algorithm are subject to change as the implementation evolves, its basic principles are roughly

- prefer pools lower in the tree, IOW stricter alignment and lower latency
- prefer idle pools over busy ones, IOW more remaining free capacity and fewer workloads
- prefer pools with better overall device alignment

## Features

The `topology-aware` policy has the following features:

- topologically aligned allocation of CPU and memory
  * assign CPU and memory to workloads with the tightest available alignment
- aligned allocation of devices
  * pick pool for workload based on locality of devices already assigned
- shared allocation of CPU cores
  * assign workload to shared subset of pool CPUs
- exclusive allocation of CPU cores
  * dynamically slice off CPU cores from shared subset and assign to workload
- mixed allocation of CPU cores
  * assign both exclusive and shared CPU cores to workload
- discovering and using kernel-isolated CPU cores (['isolcpus'](https://www.kernel.org/doc/html/latest/admin-guide/kernel-parameters.html#cpu-lists))
  * use kernel-isolated CPU cores for exclusively assigned CPU cores
- exposing assigned resources to workloads
- notifying workloads about changes in resource assignment
- dynamic relaxation of memory alignment to prevent OOM
  * dynamically widen workload memory set to avoid pool/workload OOM
- multi-tier memory allocation
  * assign workloads to memory zones of their preferred type
  * the policy knows about three kinds of memory:
    - DRAM is regular system main memory
    - PMEM is large-capacity memory, such as [Intel® Optane™ memory](https://www.intel.com/content/www/us/en/products/memory-storage/optane-dc-persistent-memory.html)
    - [HBM](https://en.wikipedia.org/wiki/High_Bandwidth_Memory) is high speed memory, typically found on some special-purpose computing systems
- cold start
  * pin workload exclusively to PMEM for an initial warm-up period
- dynamic page demotion
  * forcibly migrate read-only and idle container memory pages to PMEM

## Activating the Policy

You can activate the `topology-aware` policy by using the following configuration fragment in the configuration for `cri-resmgr`:

```yaml
policy:
  Active: topology-aware
  ReservedResources:
    CPU: 750m
```

## Configuring the Policy

The policy has a number of configuration options which affect its default behavior.
These options can be supplied as part of the [dynamic configuration](../setup.md#using-cri-resource-manager-agent-and-a-configmap) received via the [`node agent`](../node-agent.md), or in a fallback or forced [configuration file](../setup.md#using-a-local-configuration-from-a-file).

These configuration options are

- `PinCPU`
  * whether to pin workloads to assigned pool CPU sets
- `PinMemory`
  * whether to pin workloads to assigned pool memory zones
- `PreferIsolatedCPUs`
  * whether isolated CPUs are preferred by default for workloads that are eligible for exclusive CPU allocation
- `PreferSharedCPUs`
  * whether shared allocation is preferred by default for workloads that would otherwise be eligible for exclusive CPU allocation
- `ReservedPoolNamespaces`
  * list of extra namespaces (or glob patterns) that will be allocated to reserved CPUs
- `ColocatePods`
  * whether to try to allocate containers in a pod to the same or nearby topology pools
- `ColocateNamespaces`
  * whether to try to allocate containers in a namespace to the same or nearby topology pools
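As a concrete sketch, a configuration fragment setting several of these options could look like the following. The values are purely illustrative, not recommendations, and the namespace glob is hypothetical:

```yaml
policy:
  Active: topology-aware
  ReservedResources:
    CPU: 750m
  topology-aware:
    PinCPU: true
    PinMemory: true
    PreferIsolatedCPUs: true
    PreferSharedCPUs: false
    ReservedPoolNamespaces: ["monitoring-*"]   # hypothetical namespace glob
    ColocatePods: true
```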
## Policy CPU Allocation Preferences

There are a number of workload properties this policy actively checks to decide if the workload could potentially benefit from extra resource allocation optimizations. Unless configured differently, containers fulfilling certain corresponding criteria are considered eligible for these optimizations. This will be reflected in the assigned resources whenever that is possible at the time the container's creation / resource allocation request hits the policy.

The set of these extra optimizations consists of

- assignment of `kube-reserved` CPUs
- assignment of exclusively allocated CPU cores
- usage of kernel-isolated CPU cores (for exclusive allocation)

The policy uses a combination of the QoS class and the resource requirements of the container to decide if any of these extra allocation preferences should be applied. Containers are divided into five groups, with each group having a slightly different set of criteria for eligibility.

- `kube-system` group
  * all containers in the `kube-system` namespace
- `low-priority` group
  * containers in the `BestEffort` or `Burstable` QoS class
- `sub-core` group
  * Guaranteed QoS class containers with `CPU request < 1 CPU`
- `mixed` group
  * Guaranteed QoS class containers with `1 <= CPU request < 2`
- `multi-core` group
  * Guaranteed QoS class containers with `CPU request >= 2`

The eligibility rules for extra optimization are slightly different among these groups.

- `kube-system`
  * not eligible for extra optimizations
  * eligible to run on `kube-reserved` CPU cores
  * always run on shared CPU cores
- `low-priority`
  * not eligible for extra optimizations
  * always run on shared CPU cores
- `sub-core`
  * not eligible for extra optimizations
  * always run on shared CPU cores
- `mixed`
  * by default eligible for exclusive and isolated allocation
  * not eligible for either if `PreferSharedCPUs` is set to true
  * not eligible for either if annotated to opt out from exclusive allocation
  * not eligible for isolated allocation if annotated to opt out
- `multi-core`
  * CPU request fractional (`(CPU request % 1000 milli-CPU) != 0`):
    - by default not eligible for extra optimizations
    - eligible for exclusive and isolated allocation if annotated to opt in
  * CPU request not fractional:
    - by default eligible for exclusive allocation
    - by default not eligible for isolated allocation
    - not eligible for exclusive allocation if annotated to opt out
    - eligible for isolated allocation if annotated to opt in

Eligibility for kube-reserved CPU core allocation should always be possible to honor. If this is not the case, it is probably due to an incorrect configuration which underdeclares `ReservedResources`. In that case, ordinary shared CPU cores will be used instead of kube-reserved ones.

Eligibility for exclusive CPU allocation should always be possible to honor. Eligibility for isolated core allocation is only honored if there are enough isolated cores available to fulfill the exclusive part of the container's CPU request with isolated cores alone. Otherwise, ordinary CPUs will be allocated by slicing them off for exclusive usage from the shared subset of CPU cores in the container's assigned pool.

Containers in the kube-system group are pinned to share all kube-reserved CPU cores. Containers in the low-priority or sub-core groups, and containers which are only eligible for shared CPU core allocation in the mixed and multi-core groups, are all pinned to run on the shared subset of CPU cores in the container's assigned pool. This shared subset can and usually does change dynamically as exclusive CPU cores are allocated and released in the pool.

## Container CPU Allocation Preference Annotations

Containers can be annotated to diverge from the default CPU allocation preferences the policy would otherwise apply to them. These Pod annotations can be given with both per-pod and per-container resolution. If both of these exist for a container, the container-specific one takes precedence.

### Shared, Exclusive, and Isolated CPU Preference

A container can opt in to or opt out from shared CPU allocation using the following Pod annotation.

```yaml
metadata:
  annotations:
    # opt in container C1 to shared CPU core allocation
    prefer-shared-cpus.cri-resource-manager.intel.com/container.C1: "true"
    # opt in the whole pod to shared CPU core allocation
    prefer-shared-cpus.cri-resource-manager.intel.com/pod: "true"
    # selectively opt out container C2 from shared CPU core allocation
    prefer-shared-cpus.cri-resource-manager.intel.com/container.C2: "false"
```

Opting in to exclusive allocation happens by opting out from shared allocation, and opting out from exclusive allocation happens by opting in to shared allocation.

A container can opt in to or opt out from isolated exclusive CPU core allocation using the following Pod annotation.
```yaml
metadata:
  annotations:
    # opt in container C1 to isolated exclusive CPU core allocation
    prefer-isolated-cpus.cri-resource-manager.intel.com/container.C1: "true"
    # opt in the whole pod to isolated exclusive CPU core allocation
    prefer-isolated-cpus.cri-resource-manager.intel.com/pod: "true"
    # selectively opt out container C2 from isolated exclusive CPU core allocation
    prefer-isolated-cpus.cri-resource-manager.intel.com/container.C2: "false"
```

These Pod annotations have no effect on containers which are not eligible for exclusive allocation.

### Implicit Hardware Topology Hints

`CRI Resource Manager` automatically generates HW `Topology Hints` for devices assigned to a container, prior to handing the container off to the active policy for resource allocation. The `topology-aware` policy is hint-aware and normally takes topology hints into account when picking the best pool to allocate resources from. Hints indicate optimal `HW locality` for device access and they can significantly alter which pool gets picked for a container.

Since device topology hints are implicitly generated, there are cases where one would like the policy to disregard them altogether. For instance, when a local volume is used by a container but not in any performance critical manner.

Containers can be annotated to opt out from and selectively opt in to hint-aware pool selection using the following Pod annotations.

```yaml
metadata:
  annotations:
    # only disregard hints for container C1
    topologyhints.cri-resource-manager.intel.com/container.C1: "false"
    # disregard hints for all containers by default
    topologyhints.cri-resource-manager.intel.com/pod: "false"
    # but take hints into account for container C2
    topologyhints.cri-resource-manager.intel.com/container.C2: "true"
```

Topology hint generation is globally enabled by default. Therefore, using the Pod annotation as an opt-in only has an effect when the whole pod is annotated to opt out from hint-aware pool selection.

### Implicit Topological Co-location for Pods and Namespaces

The `ColocatePods` and `ColocateNamespaces` configuration options control whether the policy will try to co-locate, that is allocate topologically close, containers within the same Pod or K8s namespace. Both of these options are false by default.

Setting them to true is a shorthand for adding to each container an affinity of weight 10 for all other containers in the same pod or namespace.

Containers with user-defined affinities are never extended with either of these co-location affinities. However, such containers can still have affinity effects on other containers that do get extended with co-location. Therefore, mixing user-defined affinities with implicit co-location requires both careful consideration and a thorough understanding of affinity evaluation, or it should be avoided altogether.

## Cold Start

The `topology-aware` policy supports "cold start" functionality. When cold start is enabled and the workload is allocated to a topology node with both DRAM and PMEM memory, the initial memory controller is only the PMEM controller. The DRAM controller is added to the workload only after the cold start timeout has expired. The effect of this is that large, allocated but initially unused memory areas do not need to be migrated to PMEM, because they were allocated there to begin with.
Cold start is configured like this in the pod metadata:

```yaml
metadata:
  annotations:
    memory-type.cri-resource-manager.intel.com/container.container1: dram,pmem
    cold-start.cri-resource-manager.intel.com/container.container1: |
      duration: 60s
```

Alternatively, you can use the following deprecated Pod annotation syntax to achieve the same, but support for this syntax is subject to be dropped in a future release:

```yaml
metadata:
  annotations:
    cri-resource-manager.intel.com/memory-type: |
      container1: dram,pmem
    cri-resource-manager.intel.com/cold-start: |
      container1:
        duration: 60s
```

In the above example, `container1` would initially be granted only the PMEM memory controller, but after 60 seconds the DRAM controller would be added to the container's memset.

## Dynamic Page Demotion

The `topology-aware` policy also supports dynamic page demotion. With dynamic demotion enabled, rarely-used pages are periodically moved from DRAM to PMEM for those workloads which are assigned to use both DRAM and PMEM memory types. The configuration for this feature is done using three configuration keys: `DirtyBitScanPeriod`, `PageMovePeriod`, and `PageMoveCount`. All of these parameters need to be set to non-zero values in order for dynamic page demotion to get enabled. See this configuration file fragment as an example:

```yaml
policy:
  Active: topology-aware
  topology-aware:
    DirtyBitScanPeriod: 10s
    PageMovePeriod: 2s
    PageMoveCount: 1000
```

In this setup, every pid in every container in every non-system pod fulfilling the memory type requirements would have its page ranges scanned for non-accessed pages every ten seconds. The results of the scan would be fed to a page-moving loop, which would attempt to move 1000 pages every two seconds from DRAM to PMEM.

## Container memory requests and limits

Due to inaccuracies in how `cri-resmgr` calculates memory requests for pods in the `Burstable` QoS class, you should either use `Limit` for setting the amount of memory for containers in `Burstable` pods, or run the [resource-annotating webhook](../webhook.md) to provide `cri-resmgr` with an exact copy of the resource requirements from the Pod Spec as an extra Pod annotation.

## Reserved pool namespaces

Users can mark certain namespaces to have a reserved CPU allocation. Containers belonging to such namespaces will only run on CPUs set aside according to the global CPU reservation, as configured by the `ReservedResources` configuration option in the policy section.

The `ReservedPoolNamespaces` option is a list of namespace globs that will be allocated to the reserved CPU class. For example:

```yaml
policy:
  Active: topology-aware
  topology-aware:
    ReservedPoolNamespaces: ["my-pool","reserved-*"]
```

In this setup, all the workloads in the `my-pool` namespace and in namespaces starting with the `reserved-` string are allocated to the reserved CPU class. The workloads in `kube-system` are automatically assigned to the reserved CPU class, so there is no need to mention `kube-system` in this list.

## Reserved CPU annotations

Users can mark certain pods and containers to have a reserved CPU allocation by using annotations. Containers having such an annotation will only run on CPUs set aside according to the global CPU reservation, as configured by the `ReservedResources` configuration option in the policy section.
For example:

```yaml
metadata:
  annotations:
    prefer-reserved-cpus.cri-resource-manager.intel.com/pod: "true"
    prefer-reserved-cpus.cri-resource-manager.intel.com/container.special: "false"
```

================================================
FILE: docs/quick-start.md
================================================
# Quick-start

The following describes the minimum number of steps to get started with CRI Resource Manager.

## Pre-requisites

- containerd container runtime installed and running
- kubelet installed on your nodes

## Setup CRI-Resmgr

First, install and set up cri-resource-manager.

### Install package

#### Fedora\* and SUSE\*

```
CRIRM_VERSION=`curl -s "https://api.github.com/repos/intel/cri-resource-manager/releases/latest" | \
  jq .tag_name | tr -d '"v'`
source /etc/os-release
[ "$ID" = "sles" ] && export ID=opensuse-leap
sudo rpm -Uvh https://github.com/intel/cri-resource-manager/releases/download/v${CRIRM_VERSION}/cri-resource-manager-${CRIRM_VERSION}-0.${ID}-${VERSION_ID}.x86_64.rpm
```

#### Ubuntu\* and Debian\*

```
CRIRM_VERSION=`curl -s "https://api.github.com/repos/intel/cri-resource-manager/releases/latest" | \
  jq .tag_name | tr -d '"v'`
source /etc/os-release
pkg=cri-resource-manager_${CRIRM_VERSION}_${ID}-${VERSION_ID}_amd64.deb
curl -LO https://github.com/intel/cri-resource-manager/releases/download/v${CRIRM_VERSION}/${pkg}
sudo dpkg -i ${pkg}
rm ${pkg}
```

### Setup and verify

Create a configuration and start cri-resource-manager:

```
sudo cp /etc/cri-resmgr/fallback.cfg.sample /etc/cri-resmgr/fallback.cfg
sudo systemctl enable cri-resource-manager && sudo systemctl start cri-resource-manager
```

Check that cri-resource-manager is running:

```
systemctl status cri-resource-manager
```

## Kubelet setup

Next, you need to configure kubelet to use cri-resource-manager as its container runtime endpoint.

### Existing cluster

When integrating into an existing cluster, you need to change kubelet to use cri-resmgr instead of the existing container runtime (containerd is assumed here).

#### Fedora and SUSE

```
sudo sed '/KUBELET_EXTRA_ARGS/ s!$! --container-runtime-endpoint=/var/run/cri-resmgr/cri-resmgr.sock!' -i /etc/sysconfig/kubelet
sudo systemctl restart kubelet
```

#### Ubuntu and Debian

```
sudo sed '/KUBELET_EXTRA_ARGS/ s!$! --container-runtime-endpoint=/var/run/cri-resmgr/cri-resmgr.sock!' -i /etc/default/kubelet
sudo systemctl restart kubelet
```

### New Cluster

When in the process of setting up a new cluster, you simply point kubelet to the cri-resmgr CRI socket at cluster node setup time. Here's an example with kubeadm:

```
kubeadm join --cri-socket /var/run/cri-resmgr/cri-resmgr.sock \
  ...
```

## What Next

Congratulations, you now have cri-resource-manager running on your system and policing container resource allocations.
Next, you may want to see:

- [Installation](installation.md) for more installation options and detailed installation instructions
- [Setup](setup.md) for details on setup and usage
- [Node Agent](node-agent.md) for setting up cri-resmgr-agent for dynamic configuration and more
- [Webhook](webhook.md) for setting up our resource-annotating webhook
- [Support for Kata Containers\*](setup.md#kata-containers) for setting up CRI-RM with Kata Containers

================================================
FILE: docs/reference/agent-command-line-reference.md
================================================
# CRI-Resmgr-Agent Command-line Reference

***WORK IN PROGRESS***

================================================
FILE: docs/reference/configuration-reference.md
================================================
# Configuration Reference

## Configuration file

***WORK IN PROGRESS***

### `policy`

**Active** specifies the active policy.

```yaml
policy:
  Active: static
```

**AvailableResources** specifies the available hardware resources.

**ReservedResources** specifies the hardware resources reserved for system and kube tasks. Currently, only CPU resources are supported. CPUs may be specified as a cpuset or as a numerical value, similar to Kubernetes resource quantities. Not all policies use these configuration settings. See the policy-specific documentation for details.

```yaml
policy:
  AvailableResources:
    cpu: cpuset:0-63
  ReservedResources:
    cpu: cpuset:0-3
    # Alternative ways to specify CPUs:
    #cpu: 4
    #cpu: 4000m
```

### `policy.static`

**RelaxedIsolation** controls whether isolated CPUs are preferred for Guaranteed Pods.

```yaml
policy:
  static:
    RelaxedIsolation: true
```

### `policy.static-plus`

### `policy.topology-aware`

### `policy.static-pools`

### `policy.eda`

### `control`

### `control.blockio`

### `control.rdt`

### `blockio`

### `rdt`

### `instrumentation`

### `log`

### `dump`

================================================
FILE: docs/reference/index.rst
================================================
Reference
#########

.. toctree::
   :maxdepth: 1

   resmgr-command-line-reference.md
   agent-command-line-reference.md
   configuration-reference.md

================================================
FILE: docs/reference/resmgr-command-line-reference.md
================================================
# CRI-Resmgr Command-line Reference

***WORK IN PROGRESS***

================================================
FILE: docs/releases/conf.py
================================================
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
import os
# sys.path.insert(0, os.path.abspath('.'))

# -- Project information -----------------------------------------------------

project = 'CRI Resource Manager'
copyright = '2020, various'
author = 'various'

# Versions to show in the version menu
version = "all releases"

if os.getenv('VERSIONS_MENU'):
    html_context = {
        'versions_menu': True,
        'versions_menu_this_version': version}

# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'myst_parser',
    'sphinx_markdown_tables'
]

source_suffix = {
    '.rst': 'restructuredtext',
    '.md': 'markdown'
}

# Add any paths that contain templates here, relative to this directory.
templates_path = ['../_templates']

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []

# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'

html_theme_options = {
    'display_version': True,
}

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
#html_static_path = ['_static']

# Callbacks for recommonmark
def setup(app):
    app.connect('missing-reference', ignoreMissingRefs)

def ignoreMissingRefs(app, env, node, contnode):
    return contnode

================================================
FILE: docs/releases/index.md
================================================
# Releases

For up-to-date user documentation, see the [documentation site](/cri-resource-manager).

## Documentation for Released Versions
================================================
FILE: docs/requirements.txt
================================================
sphinx==5.3.0
sphinx_rtd_theme
myst-parser==0.18.1
sphinx-markdown-tables
Pygments==2.15.1

================================================
FILE: docs/security.md
================================================
# Reporting a Potential Security Vulnerability

Please visit [intel.com/security](https://intel.com/security) to report security issues.

================================================
FILE: docs/setup.md
================================================
# Setup and Usage

If you want to give CRI Resource Manager a try, here is the list of things you need to do, assuming you already have a Kubernetes\* cluster up and running, using either `containerd` or `cri-o` as the runtime.

0. [Install](installation.md) CRI Resource Manager.
1. Set up kubelet to use CRI Resource Manager as the runtime.
2. Set up CRI Resource Manager to use the runtime with a policy.

For kubelet you do this by altering its command line options like this:

```
kubelet --container-runtime=remote \
    --container-runtime-endpoint=unix:///var/run/cri-resmgr/cri-resmgr.sock
```

For CRI Resource Manager, you need to provide a configuration file, and also a socket path if you don't use `containerd` or you run it with a different socket path.

```
# for containerd with default socket path
cri-resmgr --force-config <config-file> --runtime-socket unix:///var/run/containerd/containerd.sock

# for cri-o
cri-resmgr --force-config <config-file> --runtime-socket unix:///var/run/crio/crio.sock
```

The choice of policy to use along with any potential parameters specific to that policy are taken from the configuration file. You can take a look at the [sample configurations](/sample-configs) for some minimal/trivial examples. For instance, you can use [sample-configs/topology-aware-policy.cfg](/sample-configs/topology-aware-policy.cfg) as `<config-file>` to activate the topology aware policy with memory tiering support.

**NOTE**: Currently, the available policies are a work in progress.

## Setting up kubelet to use CRI Resource Manager as the runtime

To let CRI Resource Manager act as a proxy between kubelet and the CRI runtime, you need to configure kubelet to connect to CRI Resource Manager instead of the runtime. You do this by passing extra command line options to kubelet as shown below:

```
kubelet --container-runtime=remote \
    --container-runtime-endpoint=unix:///var/run/cri-resmgr/cri-resmgr.sock
```

## Setting up CRI Resource Manager

Setting up CRI Resource Manager involves pointing it to your runtime and providing it with a configuration. Pointing to the runtime is done using the `--runtime-socket <socket>` option and, optionally, the `--image-socket <socket>` option. For providing a configuration, there are two options:

1. use a local configuration YAML file
2. use the [CRI Resource Manager Node Agent][agent] and a `ConfigMap`

The former is easier to set up and it is also the preferred way to run CRI Resource Manager for development, and in some cases testing. Setting up the latter is a bit more involved, but it allows you to

- manage policy configuration for your cluster as a single source, and
- dynamically update that configuration

### Using a local configuration from a file

This is the easiest way to run CRI Resource Manager for development or testing. You can do it with the following command:

```
cri-resmgr --force-config <config-file> --runtime-socket <socket>
```

When started this way, CRI Resource Manager reads its configuration from the given file.
It does not fetch external configuration from the node agent and also disables the config interface for receiving configuration updates.

### Using CRI Resource Manager Agent and a ConfigMap

This setup requires an extra component, the [CRI Resource Manager Node Agent][agent], to monitor and fetch configuration from the ConfigMap and pass it on to CRI Resource Manager. By default, CRI Resource Manager automatically tries to use the agent to acquire configuration, unless you override this by forcing a static local configuration using the `--force-config <config-file>` option. When using the agent, it is also possible to provide an initial fallback configuration using the `--fallback-config <config-file>` option. This file is used before the very first configuration is successfully acquired from the agent.

Whenever a new configuration is acquired from the agent and successfully taken into use, this configuration is stored in the cache and becomes the default configuration to take into use the next time CRI Resource Manager is restarted (unless the `--force-config` option is used that time). While CRI Resource Manager is shut down, any cached configuration can be cleared from the cache using the `--reset-config` command line option.

See the [Node Agent][agent] documentation for how to set up and configure the agent.

### Changing the active policy

Currently, CRI Resource Manager disables changing the active policy using the [agent][agent]. That is, once the active policy is recorded in the cache, any configuration received through the agent that requests a different policy is rejected. This limitation will be removed in a future version of CRI Resource Manager.

However, by default CRI Resource Manager allows you to change policies during its startup phase. If you want to disable this, you can pass the command line option `--disable-policy-switch` to CRI Resource Manager.

If you run CRI Resource Manager with policy switching disabled, you can still switch policies by clearing any policy-specific data stored in the cache while CRI Resource Manager is shut down. You can do this by using the command line option `--reset-policy`. The whole sequence of switching policies this way is

- stop cri-resmgr (`systemctl stop cri-resource-manager`)
- reset policy data (`cri-resmgr --reset-policy`)
- change policy (`$EDITOR /etc/cri-resource-manager/fallback.cfg`)
- start cri-resmgr (`systemctl start cri-resource-manager`)

### Container adjustments

When the [agent][agent] is in use, it is also possible to `adjust` container `resource assignments` externally, using dedicated `Adjustment` `Custom Resources` in the `adjustments.criresmgr.intel.com` group. You can use the [provided schema](/pkg/apis/resmgr/v1alpha1/adjustment-schema.yaml) to define the `Adjustment` resource. Then you can copy and modify the [sample adjustment CR](/sample-configs/external-adjustment.yaml) as a starting point to test some overrides.

An `Adjustment` consists of the following:

- `scope`:
  - the nodes and containers to which the adjustment applies
- adjustment data:
  - updated native/compute resources (`cpu`/`memory` `requests` and `limits`)
  - updated `RDT` and/or `Block I/O` class
  - updated top tier (practically now DRAM) memory limit

All adjustment data is optional. An adjustment can choose to set any or all of them as necessary.
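As a rough sketch, an `Adjustment` following this structure could look like the example below. Note that everything here beyond the API group and the concepts listed above is an assumption: the field names, the match operator, and all values are illustrative only, so consult the [provided schema](/pkg/apis/resmgr/v1alpha1/adjustment-schema.yaml) and the [sample adjustment CR](/sample-configs/external-adjustment.yaml) for the authoritative layout:

```yaml
# Illustrative sketch only - field names are not taken from the real schema.
apiVersion: criresmgr.intel.com/v1alpha1
kind: Adjustment
metadata:
  name: adjustment-example         # hypothetical name
  namespace: kube-system
spec:
  scope:
    - nodes: ["worker-*"]          # node names, trailing wildcard allowed
      containers:                  # container match expression, as in affinity scopes
        key: pod/name
        operator: Matches
        values: ["my-pod-*"]
  resources:                       # updated compute resources
    requests:
      cpu: 750m
      memory: 500Mi
  rdtClass: Guaranteed             # updated RDT class
  blockioClass: HighPrio           # updated Block I/O class
  toptierLimit: 1Gi                # updated top tier memory limit
```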
The current handling of an adjustment update updates the resource assignments of containers, marks all existing containers as having pending changes in all controller domains, and then triggers a rebalancing in the active policy. This causes all containers to be updated.

The scope defines to which containers on what nodes the adjustment applies. Nodes are currently matched/picked by name, but a trailing wildcard (`*`) is allowed and matches all nodes with the given prefix in their names. Containers are matched by expressions. These are exactly the same as the expressions for defining [affinity scopes](policy/container-affinity.md). A single adjustment can specify multiple node/container match pairs.

An adjustment applies to all containers in its scope. If an adjustment/update results in conflicts for some container, that is, at least one container is in the scope of multiple adjustments, the adjustment is rejected and the whole update is ignored.

#### Commands for declaring, creating, deleting, and examining adjustments

You can declare the custom resource for adjustments with this command:

```
kubectl apply -f pkg/apis/resmgr/v1alpha1/adjustment-schema.yaml
```

You can then add adjustments with a command like this:

```
kubectl apply -f sample-configs/external-adjustment.yaml
```

You can list existing adjustments with the following command. Use the correct `-n namespace` option according to the namespace you use for the agent, for the configuration, and in your adjustment specifications.

```
kubectl get adjustments.criresmgr.intel.com -n kube-system
```

You can examine the contents of a single adjustment with these commands:

```
kubectl describe adjustments external-adjustment -n kube-system
kubectl get adjustments.criresmgr.intel.com/<adjustment-name> -n kube-system -oyaml
```

Or you can examine the contents of all adjustments using this command:

```
kubectl get adjustments.criresmgr.intel.com -n kube-system -oyaml
```

Finally, you can delete an adjustment with commands like these:

```
kubectl delete -f sample-configs/external-adjustment.yaml
kubectl delete adjustments.criresmgr.intel.com/<adjustment-name> -n kube-system
```

The status of adjustment updates is propagated back to the `Adjustment` `Custom Resources`, more specifically into their `Status` fields. With the help of `jq`, you can easily examine the status of external adjustments using a command like this:

```
kli@r640-1:~> kubectl get -n kube-system adjustments.criresmgr.intel.com -ojson | jq '.items[].status'
{
  "nodes": {
    "r640-1": {
      "errors": {}
    }
  }
}
{
  "nodes": {
    "r640-1": {
      "errors": {}
    }
  }
}
```

The above response is what you get for adjustments applied without conflicts or errors. You can see here that node *r640-1* is in the scope of both of your existing adjustments, and that both applied without errors. If your adjustments resulted in errors, the output will look something like this:

```
klitkey1@r640-1:~> kubectl get -n kube-system adjustments.criresmgr.intel.com -ojson | jq '.items[].status'
{
  "nodes": {
    "r640-1": {
      "errors": {
        "b71a93523e58cb4ba0310aa225b2e2a329cef895ca4b96fcd9d12b375337ea35": "cache: conflicting adjustments for my-pod-r640-1:my-container: adjustment-1,adjustment-2"
      }
    }
  }
}
{
  "nodes": {
    "r640-1": {
      "errors": {
        "b71a93523e58cb4ba0310aa225b2e2a329cef895ca4b96fcd9d12b375337ea35": "cache: conflicting adjustments for my-pod-r640-1:my-container: adjustment-1,adjustment-2"
      }
    }
  }
}
```

In the sample above, you can see that on node *r640-1* the container with ID *b71a93523e58cb4ba0310aa225b2e2a329cef895ca4b96fcd9d12b375337ea35*, or *my-container* of *my-pod-r640-1*, had a conflict. Moreover, you can see that the reason for the conflict is that the container is in the scope of both *adjustment-1* and *adjustment-2*.
You can now fix those adjustments to resolve/remove the conflict, then reapply the adjustments, and finally verify that the conflicts are gone.

```
kli@r640-1:~> $EDITOR adjustment-1.yaml adjustment-2.yaml
kli@r640-1:~> kubectl apply -f adjustment-1.yaml && kubectl apply -f adjustment-2.yaml && sleep 2
kli@r640-1:~> kubectl get -n kube-system adjustments.criresmgr.intel.com -ojson | jq '.items[].status'
{
  "nodes": {
    "r640-1": {
      "errors": {}
    }
  }
}
{
  "nodes": {
    "r640-1": {
      "errors": {}
    }
  }
}
```

## Using CRI Resource Manager as a message dumper

You can use CRI Resource Manager to simply inspect all proxied CRI requests and responses without applying any policy. Run CRI Resource Manager with the provided [sample configuration](/sample-configs/cri-full-message-dump.cfg) to do this.

## Kata Containers

[Kata Containers](https://katacontainers.io/) is an open source container runtime, building lightweight virtual machines that seamlessly plug into the containers ecosystem.

In order to enable Kata Containers in a Kubernetes-CRI-RM stack, both Kubernetes and the Container Runtime need to be aware of the new runtime environment:

* The Container Runtime can only be CRI-O or containerd, and needs to have the Kata runtimes enabled in their configuration files.
* Kubernetes must be made aware of the CRI-O/containerd runtimes via a "RuntimeClass" [resource](https://kubernetes.io/docs/concepts/containers/runtime-class/), as sketched below.
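For the second point, a minimal `RuntimeClass` resource could look like the following sketch. The `handler` value is an assumption here; it must match the runtime name actually configured in CRI-O/containerd:

```yaml
apiVersion: node.k8s.io/v1
kind: RuntimeClass
metadata:
  name: kata
handler: kata   # must match the runtime name configured in CRI-O/containerd
```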
### Reference

If you have a pre-existing Kubernetes cluster, follow this [document](https://github.com/kata-containers/packaging/blob/master/kata-deploy/README.md#kubernetes-quick-start) for an easy deployment.

Starting from scratch:

* [Kata installation guide](https://github.com/kata-containers/kata-containers/tree/2.0-dev/docs/install#manual-installation)
* [Kata Containers + CRI-O](https://github.com/kata-containers/documentation/blob/master/how-to/run-kata-with-k8s.md)
* [Kata Containers + containerd](https://github.com/kata-containers/documentation/blob/master/how-to/containerd-kata.md)
* [Kubernetes Runtime Class](https://kubernetes.io/docs/concepts/containers/runtime-class/)
* [Cgroup and Kata containers](https://github.com/kata-containers/kata-containers/blob/stable-2.0.0/docs/design/host-cgroups.md)

## Running with Untested Runtimes

CRI Resource Manager is tested with `containerd` and `CRI-O`. If any other runtime is detected during startup, `cri-resmgr` will refuse to start. This default behavior can be changed using the `--allow-untested-runtimes` command line option.

## Logging and debugging

You can control logging with the klog command line options or by setting the corresponding environment variables. You can get the name of the environment variable for a command line option by prepending the `LOGGER_` prefix to the capitalized option name without any leading dashes. For instance, setting the environment variable `LOGGER_SKIP_HEADERS=true` has the same effect as using the `-skip_headers` command line option. Additionally, the `LOGGER_DEBUG` environment variable controls debug logs. These are globally disabled by default. You can turn on full debugging by setting `LOGGER_DEBUG='*'`.

When using environment variables, be careful about which configuration you pass to CRI Resource Manager using a file or ConfigMap. The environment is treated as the default configuration, but a file or a ConfigMap has higher precedence. If something is configured in both, the environment is only in effect until the configuration is applied. However, in such a case, if you later push an updated configuration to CRI Resource Manager with the overlapping settings removed, the original ones from the environment take effect again.

For debug logs, the settings from the configuration are applied in addition to any settings in the environment. That said, if you turn something on in the environment but off in the configuration, it will eventually be turned off.
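As an example of this naming scheme, the one-off invocation below is a sketch, assuming `cri-resmgr` is on `PATH`; with the systemd service the same variables would typically go into the environment/sysconfig file shipped with the packaging instead.

```bash
# LOGGER_DEBUG takes '*' for everything or a list of debug sources;
# the source names used here are illustrative.
LOGGER_DEBUG='resource-manager,cache' LOGGER_SKIP_HEADERS=true cri-resmgr
```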
[agent]: node-agent.md


================================================
FILE: docs/webhook.md
================================================
# Webhook

By default, CRI Resource Manager does not see the original container *resource requirements* specified in the *Pod Spec*. It tries to calculate these for `cpu` and `memory` *compute resources* using the related parameters present in the CRI container creation request. The resulting estimates are normally accurate for `cpu`, and also for `memory` `limits`. However, it is not possible to use these parameters to estimate `memory` `requests` or any *extended resources*.

If you want to make sure that CRI Resource Manager uses the original *Pod Spec* *resource requirements*, you need to duplicate these as *annotations* on the Pod. This is necessary if you plan on using or writing a policy which needs *extended resources*.

This process can be fully automated using the [CRI Resource Manager Annotating Webhook](/cmd/cri-resmgr-webhook). Once you have built the Docker\* image for it using the [provided Dockerfile](/cmd/cri-resmgr-webhook/Dockerfile) and published it, you can set up the webhook as follows:

- Fill in the `IMAGE_PLACEHOLDER` in [webhook-deployment.yaml](/cmd/cri-resmgr-webhook/webhook-deployment.yaml) to match the image.
- Create a `cri-resmgr-webhook-secret` that carries a key and a certificate to `cri-resmgr-webhook`. You can create a key, a self-signed certificate, and the secret that holds them with the following commands:

```bash
SVC=cri-resmgr-webhook
NS=cri-resmgr
openssl req -x509 -newkey rsa:2048 -sha256 -days 365 -nodes \
  -keyout cmd/cri-resmgr-webhook/server-key.pem \
  -out cmd/cri-resmgr-webhook/server-crt.pem \
  -subj "/CN=$SVC.$NS.svc" \
  -addext "subjectAltName=DNS:$SVC,DNS:$SVC.$NS,DNS:$SVC.$NS.svc"
cat >cmd/cri-resmgr-webhook/webhook-secret.yaml <<EOF
...
EOF
```
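Before wiring the webhook up, the generated key material can be sanity-checked; a quick sketch, assuming the `NS` value and secret name used above:

```bash
# The certificate should carry the service DNS names in its SANs...
openssl x509 -in cmd/cri-resmgr-webhook/server-crt.pem -noout -subject -ext subjectAltName
# ...and the secret should exist in the webhook's namespace.
kubectl get secret cri-resmgr-webhook-secret -n cri-resmgr
```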
================================================
FILE: elf/avx512.c
================================================
#include <linux/bpf.h>
#include <asm/fpu/types.h>

#define SEC(NAME) __attribute__((section(NAME), used))

#ifndef KERNEL_VERSION
#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c))
#endif

#define BUF_SIZE_MAP_NS 256

typedef struct bpf_map_def {
	unsigned int type;
	unsigned int key_size;
	unsigned int value_size;
	unsigned int max_entries;
	unsigned int map_flags;
	unsigned int pinning;
	char namespace[BUF_SIZE_MAP_NS];
} bpf_map_def;

/* BPF helper stubs, resolved to the corresponding helper ids by the loader. */
static int (*bpf_probe_read)(void *dst, u64 size, const void *unsafe_ptr) =
	(void *)BPF_FUNC_probe_read;
static u64 (*bpf_get_current_cgroup_id)(void) =
	(void *)BPF_FUNC_get_current_cgroup_id;
static u64 (*bpf_ktime_get_ns)(void) =
	(void *)BPF_FUNC_ktime_get_ns;
static int (*bpf_map_update_elem)(void *map, void *key, void *value, u64 flags) =
	(void *)BPF_FUNC_map_update_elem;
static void *(*bpf_map_lookup_elem)(void *map, void *key) =
	(void *)BPF_FUNC_map_lookup_elem;

/* All maps below are keyed by cgroup id. */
struct bpf_map_def SEC("maps/all_context_switch_count") all_context_switch_count_hash = {
	.type = BPF_MAP_TYPE_HASH,
	.key_size = sizeof(u64),
	.value_size = sizeof(u32),
	.max_entries = 1024,
};

struct bpf_map_def SEC("maps/avx_context_switch_count") avx_context_switch_count_hash = {
	.type = BPF_MAP_TYPE_PERCPU_HASH,
	.key_size = sizeof(u64),
	.value_size = sizeof(u32),
	.max_entries = 1024,
};

struct bpf_map_def SEC("maps/avx_timestamp") avx_timestamp_hash = {
	.type = BPF_MAP_TYPE_HASH,
	.key_size = sizeof(u64),
	.value_size = sizeof(u32),
	.max_entries = 1024,
};

struct bpf_map_def SEC("maps/last_update_ns") last_update_ns_hash = {
	.type = BPF_MAP_TYPE_HASH,
	.key_size = sizeof(u64),
	.value_size = sizeof(u64),
	.max_entries = 1024,
};

SEC("tracepoint/sched/sched_switch")
int tracepoint__sched_switch(void *args)
{
	u64 cgroup_id = bpf_get_current_cgroup_id();
	u32 *count, *found;
	u32 new_count = 1;

	found = bpf_map_lookup_elem(&avx_context_switch_count_hash, &cgroup_id);
	/* store sched_switch counts only for cgroups that have AVX activity */
	if (!found) {
		return 0;
	}

	count = bpf_map_lookup_elem(&all_context_switch_count_hash, &cgroup_id);
	if (count) {
		__sync_fetch_and_add(count, 1);
	} else {
		bpf_map_update_elem(&all_context_switch_count_hash, &cgroup_id, &new_count, BPF_ANY);
	}

	return 0;
}

struct x86_fpu_args {
	u64 pad;
	struct fpu *fpu;
	bool load_fpu;
	u64 xfeatures;
	u64 xcomp_bv;
};

SEC("tracepoint/x86_fpu/x86_fpu_regs_deactivated")
int tracepoint__x86_fpu_regs_deactivated(struct x86_fpu_args *args)
{
	u32 *counter;
	u32 ts;

	bpf_probe_read(&ts, sizeof(u32), (void *)&args->fpu->avx512_timestamp);
	/* a zero timestamp means the task has never used AVX-512 */
	if (ts == 0) {
		return 0;
	}

	u64 cgroup_id = bpf_get_current_cgroup_id();
	u32 ts_prev;
	u32 *tsp;

	tsp = bpf_map_lookup_elem(&avx_timestamp_hash, &cgroup_id);
	ts_prev = tsp ? *tsp : 0;
	/* count only switches where the AVX-512 timestamp has advanced */
	if (ts == ts_prev) {
		return 0;
	}
	bpf_map_update_elem(&avx_timestamp_hash, &cgroup_id, &ts, BPF_ANY);

	u32 count = 1;
	counter = bpf_map_lookup_elem(&avx_context_switch_count_hash, &cgroup_id);
	if (counter) {
		__sync_fetch_and_add(counter, 1);
	} else {
		bpf_map_update_elem(&avx_context_switch_count_hash, &cgroup_id, &count, BPF_ANY);
	}

	u64 last = bpf_ktime_get_ns();
	bpf_map_update_elem(&last_update_ns_hash, &cgroup_id, &last, BPF_ANY);

	return 0;
}

char _license[] SEC("license") = "GPL";

/* Notes about the Linux version:
 * We don't check LINUX_VERSION_CODE at build time. It's the user's
 * responsibility to provide new enough headers. Build failures may happen
 * due to too old kernel headers (currently, Linux >= 5.1 headers are needed).
 * Our dependency on the kernel ABI is the x86_fpu tracepoint parameters and
 * struct fpu.
 * The host kernel needs to run Linux >= 5.2 and the version is checked upon
 * eBPF loading. We store the minimum supported version in the SEC("version")
 * section. A maximum supported version is not checked, but the check may be
 * added later.
 */
unsigned int _version SEC("version") = KERNEL_VERSION(5, 2, 0);
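/* A quick host-side sketch of the two runtime requirements noted above
 * (on older systems tracefs may be mounted under /sys/kernel/debug/tracing
 * instead of /sys/kernel/tracing):
 *
 *   uname -r   # must report >= 5.2, matching the SEC("version") value
 *   ls /sys/kernel/tracing/events/x86_fpu/x86_fpu_regs_deactivated
 */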
================================================
FILE: go.mod
================================================
module github.com/intel/cri-resource-manager

go 1.24

require (
	contrib.go.opencensus.io/exporter/jaeger v0.2.1
	contrib.go.opencensus.io/exporter/prometheus v0.4.2
	github.com/cilium/ebpf v0.12.3
	github.com/evanphx/json-patch v5.7.0+incompatible
	github.com/google/go-cmp v0.6.0
	github.com/intel/cri-resource-manager/pkg/topology v0.0.0
	github.com/intel/goresctrl v0.5.0
	github.com/pkg/errors v0.9.1
	github.com/prometheus/client_golang v1.18.0
	github.com/prometheus/client_model v0.5.0
	github.com/prometheus/common v0.45.0
	github.com/shurcooL/vfsgen v0.0.0-20230704071429-0000e147ea92
	github.com/stretchr/testify v1.8.4
	go.opencensus.io v0.24.0
	golang.org/x/sys v0.31.0
	golang.org/x/time v0.5.0
	google.golang.org/grpc v1.60.1
	google.golang.org/protobuf v1.33.0
	k8s.io/api v0.29.0
	k8s.io/apimachinery v0.29.0
	k8s.io/client-go v0.29.0
	k8s.io/cri-api v0.29.0
	k8s.io/klog/v2 v2.110.1
	k8s.io/utils v0.0.0-20240102154912-e7106e64919e
	sigs.k8s.io/yaml v1.4.0
)

require (
	github.com/beorn7/perks v1.0.1 // indirect
	github.com/cespare/xxhash/v2 v2.2.0 // indirect
	github.com/davecgh/go-spew v1.1.1 // indirect
	github.com/emicklei/go-restful/v3 v3.11.0 // indirect
	github.com/go-kit/log v0.2.1 // indirect
	github.com/go-logfmt/logfmt v0.6.0 // indirect
	github.com/go-logr/logr v1.3.0 // indirect
	github.com/go-openapi/jsonpointer v0.19.6 // indirect
	github.com/go-openapi/jsonreference v0.20.2 // indirect
	github.com/go-openapi/swag v0.22.3 // indirect
	github.com/gogo/protobuf v1.3.2 // indirect
	github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
	github.com/golang/protobuf v1.5.3 // indirect
	github.com/google/gnostic-models v0.6.8 // indirect
	github.com/google/gofuzz v1.2.0 // indirect
	github.com/google/uuid v1.5.0 // indirect
	github.com/imdario/mergo v0.3.12 // indirect
	github.com/josharian/intern v1.0.0 // indirect
	github.com/json-iterator/go v1.1.12 // indirect
	github.com/mailru/easyjson v0.7.7 // indirect
	github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0 // indirect
	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
	github.com/modern-go/reflect2 v1.0.2 // indirect
	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
	github.com/pmezard/go-difflib v1.0.0 // indirect
	github.com/prometheus/procfs v0.12.0 // indirect
	github.com/prometheus/statsd_exporter v0.26.0 // indirect
	github.com/shurcooL/httpfs v0.0.0-20230704072500-f1e31cf0ba5c // indirect
	github.com/spf13/pflag v1.0.5 // indirect
	github.com/uber/jaeger-client-go v2.25.0+incompatible // indirect
	golang.org/x/exp v0.0.0-20240103183307-be819d1f06fc // indirect
	golang.org/x/net v0.38.0 // indirect
	golang.org/x/oauth2 v0.27.0 // indirect
	golang.org/x/sync v0.12.0 // indirect
	golang.org/x/term v0.30.0 // indirect
	golang.org/x/text v0.23.0 // indirect
	google.golang.org/api v0.155.0 // indirect
	google.golang.org/genproto/googleapis/rpc v0.0.0-20240102182953-50ed04b92917 // indirect
	gopkg.in/inf.v0 v0.9.1 // indirect
	gopkg.in/yaml.v2 v2.4.0 // indirect
	gopkg.in/yaml.v3 v3.0.1 // indirect
	k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00 // indirect
	sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
	sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect
)

replace (
	github.com/intel/cri-resource-manager/pkg/topology v0.0.0 => ./pkg/topology
	go.opentelemetry.io/contrib => go.opentelemetry.io/contrib v0.20.0
	go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc => go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.20.0
	go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp => go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.20.0
	go.opentelemetry.io/otel => go.opentelemetry.io/otel v0.20.0
	go.opentelemetry.io/otel/exporters/otlp => go.opentelemetry.io/otel/exporters/otlp v0.20.0
	go.opentelemetry.io/otel/metric => go.opentelemetry.io/otel/metric v0.20.0
	go.opentelemetry.io/otel/oteltest => go.opentelemetry.io/otel/oteltest v0.20.0
	go.opentelemetry.io/otel/sdk => go.opentelemetry.io/otel/sdk v0.20.0
	go.opentelemetry.io/otel/sdk/export/metric => go.opentelemetry.io/otel/sdk/export/metric v0.20.0
	go.opentelemetry.io/otel/sdk/metric => go.opentelemetry.io/otel/sdk/metric v0.20.0
	go.opentelemetry.io/otel/trace => go.opentelemetry.io/otel/trace v0.20.0
	k8s.io/api => k8s.io/api v0.29.0
	k8s.io/apimachinery => k8s.io/apimachinery v0.29.0
	k8s.io/apiserver => k8s.io/apiserver v0.29.0
	k8s.io/client-go => k8s.io/client-go v0.29.0
	k8s.io/component-base => k8s.io/component-base v0.29.0
	k8s.io/cri-api => k8s.io/cri-api v0.29.0
)

================================================
FILE: go.sum
================================================
cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
cloud.google.com/go v0.38.0/go.mod h1:990N+gfupTy94rShfmMCWGDn0LpTmnzTp2qbd1dvSRU=
cloud.google.com/go v0.44.1/go.mod h1:iSa0KzasP4Uvy3f1mN/7PiObzGgflwredwwASm/v6AU=
cloud.google.com/go v0.44.2/go.mod h1:60680Gw3Yr4ikxnPRS/oxxkBccT6SA1yMk63TGekxKY=
cloud.google.com/go v0.45.1/go.mod h1:RpBamKRgapWJb87xiFSdk4g1CME7QZg3uwTez+TSTjc=
cloud.google.com/go v0.46.3/go.mod h1:a6bKKbmY7er1mI7TEI4lsAkts/mkhTSZK8w33B4RAg0=
cloud.google.com/go v0.50.0/go.mod h1:r9sluTvynVuxRIOHXQEHMFffphuXHOMZMycpNR5e6To=
cloud.google.com/go v0.52.0/go.mod h1:pXajvRH/6o3+F9jDHZWQ5PbGhn+o8w9qiu/CffaVdO4=
cloud.google.com/go v0.53.0/go.mod h1:fp/UouUEsRkN6ryDKNW/Upv/JBKnv6WDthjR6+vze6M=
cloud.google.com/go v0.54.0/go.mod h1:1rq2OEkV3YMf6n/9ZvGWI3GWw0VoqH/1x2nd8Is/bPc=
cloud.google.com/go v0.56.0/go.mod h1:jr7tqZxxKOVYizybht9+26Z/gUq7tiRzu+ACVAMbKVk=
cloud.google.com/go v0.57.0/go.mod h1:oXiQ6Rzq3RAkkY7N6t3TcE6jE+CIBBbA36lwQ1JyzZs=
cloud.google.com/go v0.62.0/go.mod h1:jmCYTdRCQuc1PHIIJ/maLInMho30T/Y0M4hTdTShOYc=
cloud.google.com/go v0.65.0/go.mod h1:O5N8zS7uWy9vkA9vayVHs65eM1ubvY4h553ofrNHObY= cloud.google.com/go/bigquery v1.0.1/go.mod h1:i/xbL2UlR5RvWAURpBYZTtm/cXjCha9lbfbpx4poX+o= cloud.google.com/go/bigquery v1.3.0/go.mod h1:PjpwJnslEMmckchkHFfq+HTD2DmtT67aNFKH1/VBDHE= cloud.google.com/go/bigquery v1.4.0/go.mod h1:S8dzgnTigyfTmLBfrtrhyYhwRxG72rYxvftPBK2Dvzc= cloud.google.com/go/bigquery v1.5.0/go.mod h1:snEHRnqQbz117VIFhE8bmtwIDY80NLUZUMb4Nv6dBIg= cloud.google.com/go/bigquery v1.7.0/go.mod h1://okPTzCYNXSlb24MZs83e2Do+h+VXtc4gLoIoXIAPc= cloud.google.com/go/bigquery v1.8.0/go.mod h1:J5hqkt3O0uAFnINi6JXValWIb1v0goeZM77hZzJN/fQ= cloud.google.com/go/datastore v1.0.0/go.mod h1:LXYbyblFSglQ5pkeyhO+Qmw7ukd3C+pD7TKLgZqpHYE= cloud.google.com/go/datastore v1.1.0/go.mod h1:umbIZjpQpHh4hmRpGhH4tLFup+FVzqBi1b3c64qFpCk= cloud.google.com/go/pubsub v1.0.1/go.mod h1:R0Gpsv3s54REJCy4fxDixWD93lHJMoZTyQ2kNxGRt3I= cloud.google.com/go/pubsub v1.1.0/go.mod h1:EwwdRX2sKPjnvnqCa270oGRyludottCI76h+R3AArQw= cloud.google.com/go/pubsub v1.2.0/go.mod h1:jhfEVHT8odbXTkndysNHCcx0awwzvfOlguIAii9o8iA= cloud.google.com/go/pubsub v1.3.1/go.mod h1:i+ucay31+CNRpDW4Lu78I4xXG+O1r/MAHgjpRVR+TSU= cloud.google.com/go/storage v1.0.0/go.mod h1:IhtSnM/ZTZV8YYJWCY8RULGVqBDmpoyjwiyrjsg+URw= cloud.google.com/go/storage v1.5.0/go.mod h1:tpKbwo567HUNpVclU5sGELwQWBDZ8gh0ZeosJ0Rtdos= cloud.google.com/go/storage v1.6.0/go.mod h1:N7U0C8pVQ/+NIKOBQyamJIeKQKkZ+mxpohlUTyfDhBk= cloud.google.com/go/storage v1.8.0/go.mod h1:Wv1Oy7z6Yz3DshWRJFhqM/UCfaWIRTdp0RXyy7KQOVs= cloud.google.com/go/storage v1.10.0/go.mod h1:FLPqc6j+Ki4BU591ie1oL6qBQGu2Bl/tZ9ullr3+Kg0= contrib.go.opencensus.io/exporter/jaeger v0.2.1 h1:yGBYzYMewVL0yO9qqJv3Z5+IRhPdU7e9o/2oKpX4YvI= contrib.go.opencensus.io/exporter/jaeger v0.2.1/go.mod h1:Y8IsLgdxqh1QxYxPC5IgXVmBaeLUeQFfBeBi9PbeZd0= contrib.go.opencensus.io/exporter/prometheus v0.4.2 h1:sqfsYl5GIY/L570iT+l93ehxaWJs2/OwXtiWwew3oAg= contrib.go.opencensus.io/exporter/prometheus v0.4.2/go.mod h1:dvEHbiKmgvbr5pjaF9fpw1KeYcjrnC1J8B+JKjsZyRQ= dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= github.com/alecthomas/units v0.0.0-20190924025748-f65c72e2690d/go.mod h1:rBZYJk541a8SKzHPHnH3zbiI+7dagKZ0cgpgrD7Fyho= github.com/alecthomas/units v0.0.0-20211218093645-b94a6e3cc137/go.mod h1:OMCwj8VM1Kc9e19TLln2VL61YJF0x1XFtfdL4JdbSyE= github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= 
github.com/cespare/xxhash/v2 v2.1.1/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cespare/xxhash/v2 v2.1.2/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44= github.com/cespare/xxhash/v2 v2.2.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI= github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI= github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU= github.com/cilium/ebpf v0.12.3 h1:8ht6F9MquybnY97at+VDZb3eQQr8ev79RueWeVaEcG4= github.com/cilium/ebpf v0.12.3/go.mod h1:TctK1ivibvI3znr66ljgi4hqOT8EYQjz1KWBfb1UVgM= github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/emicklei/go-restful/v3 v3.11.0 h1:rAQeMHw1c7zTmncogyy8VvRZwtkmkZ4FxERmMY4rD+g= github.com/emicklei/go-restful/v3 v3.11.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= github.com/evanphx/json-patch v5.7.0+incompatible h1:vgGkfT/9f8zE6tvSCe74nfpAVDQ2tG6yudJd8LBksgI= github.com/evanphx/json-patch v5.7.0+incompatible/go.mod h1:50XU6AFN0ol/bzJsmQLiYLvXMP4fmwYFNcr97nuDLSk= github.com/frankban/quicktest v1.14.5 h1:dfYrrRyLtiqT9GyKXgdh+k4inNeTvmGbuSgZ3lx3GhA= github.com/frankban/quicktest v1.14.5/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0= github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU= github.com/go-gl/glfw/v3.3/glfw v0.0.0-20191125211704-12ad95a8df72/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= github.com/go-gl/glfw/v3.3/glfw v0.0.0-20200222043503-6f7a984d4dc4/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as= github.com/go-kit/kit v0.9.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as= github.com/go-kit/log v0.1.0/go.mod h1:zbhenjAZHb184qTLMA9ZjW7ThYL0H2mk7Q6pNt4vbaY= github.com/go-kit/log v0.2.0/go.mod h1:NwTd00d/i8cPZ3xOwwiv2PO5MOcx78fFErGNcVmBjv0= github.com/go-kit/log v0.2.1 h1:MRVx0/zhvdseW+Gza6N9rVzU/IVzaeE1SFI4raAhmBU= github.com/go-kit/log v0.2.1/go.mod h1:NwTd00d/i8cPZ3xOwwiv2PO5MOcx78fFErGNcVmBjv0= github.com/go-logfmt/logfmt v0.3.0/go.mod h1:Qt1PoO58o5twSAckw1HlFXLmHsOX5/0LbT9GBnD5lWE= github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V4qmtdjCk= github.com/go-logfmt/logfmt v0.5.0/go.mod h1:wCYkCAKZfumFQihp8CzCvQ3paCTfi41vtzG1KdI/P7A= github.com/go-logfmt/logfmt 
v0.5.1/go.mod h1:WYhtIu8zTZfxdn5+rREduYbwxfcBr/Vr6KEVveWlfTs= github.com/go-logfmt/logfmt v0.6.0 h1:wGYYu3uicYdqXVgoYbvnkrPVXkuLM1p1ifugDMEdRi4= github.com/go-logfmt/logfmt v0.6.0/go.mod h1:WYhtIu8zTZfxdn5+rREduYbwxfcBr/Vr6KEVveWlfTs= github.com/go-logr/logr v1.3.0 h1:2y3SDp0ZXuc6/cjLSZ+Q3ir+QB9T/iG5yYRXqsagWSY= github.com/go-logr/logr v1.3.0/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= github.com/go-openapi/jsonpointer v0.19.6 h1:eCs3fxoIi3Wh6vtgmLTOjdhSpiqphQ+DaPn38N2ZdrE= github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs= github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE= github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k= github.com/go-openapi/swag v0.22.3 h1:yMBqmnQ0gyZvEb/+KzuWZOXgllrXT4SADYbvDaXHv/g= github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY= github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI= github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572/go.mod h1:9Pwr4B2jHnOSGXyyzV8ROjYa2ojvAY6HCGYYfMoC3Ls= github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ= github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20191227052852-215e87163ea7/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE= github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= github.com/golang/mock v1.2.0/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= github.com/golang/mock v1.3.1/go.mod h1:sBzyDLLjw3U8JLTeZvSv8jJB+tU5PVekmnlKIyFUx0Y= github.com/golang/mock v1.4.0/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= github.com/golang/mock v1.4.1/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= github.com/golang/mock v1.4.3/go.mod h1:UOMv5ysSaYNkG+OFQykRIcU/QvvxJf3p21QfJ2Bt3cw= github.com/golang/mock v1.4.4/go.mod h1:l3mdAwkq5BuhzHwde/uurv3sEJeZMXNpwsxVWU71h+4= github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= github.com/golang/protobuf v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= github.com/golang/protobuf v1.3.4/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= github.com/golang/protobuf v1.3.5/go.mod h1:6O5/vntMXwX2lRkT1hjjk0nAC1IDOTvTlVgjlRvqsdk= github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8= github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod 
h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA= github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs= github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= github.com/golang/protobuf v1.4.1/go.mod h1:U8fpvMrcmy5pZrNK1lt4xCsGvpyWQ/VVv6QDs8UjoX8= github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= github.com/golang/protobuf v1.4.3/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ= github.com/google/gnostic-models v0.6.8 h1:yo/ABAfM5IMRsS1VnXjTBvUb61tFIHozhlYvRgGre9I= github.com/google/gnostic-models v0.6.8/go.mod h1:5n7qKqH0f5wFt+aWF8CW6pZLLNOfYuF5OpfBSENuI8U= github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.4.1/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.1/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.3/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.8/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/martian v2.1.0+incompatible/go.mod h1:9I4somxYTbIHy5NJKHRl3wXiIaQGbYVAs8BPL6v8lEs= github.com/google/martian/v3 v3.0.0/go.mod h1:y5Zk1BBys9G+gd6Jrk0W3cC1+ELVxBWuIGO+w/tUAp0= github.com/google/pprof v0.0.0-20181206194817-3ea8567a2e57/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= github.com/google/pprof v0.0.0-20190515194954-54271f7e092f/go.mod h1:zfwlbNMJ+OItoe0UupaVj+oy1omPYYDuagoSzA8v9mc= github.com/google/pprof v0.0.0-20191218002539-d4f498aebedc/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= github.com/google/pprof v0.0.0-20200212024743-f11f1df84d12/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= github.com/google/pprof v0.0.0-20200229191704-1ebb73c60ed3/go.mod 
h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= github.com/google/pprof v0.0.0-20200430221834-fc25d7d30c6d/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= github.com/google/pprof v0.0.0-20200708004538-1a94d8640e99/go.mod h1:ZgVRPoUq/hfqzAqh7sHMqb3I9Rq5C59dIz2SbBwJ4eM= github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 h1:K6RDEckDVWvDI9JAJYCmNdQXq6neHJOYx3V6jnqNEec= github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.5.0 h1:1p67kYwdtXjb0gL0BPiP1Av9wiZPo5A8z2cWkTZ+eyU= github.com/google/uuid v1.5.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/googleapis/gax-go/v2 v2.0.4/go.mod h1:0Wqv26UfaUD9n4G6kQubkQ+KchISgw+vpHVxEJEs9eg= github.com/googleapis/gax-go/v2 v2.0.5/go.mod h1:DWXyrwAJ9X0FpwwEdw+IPEYBICEFu5mhpdKc/us6bOk= github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= github.com/imdario/mergo v0.3.12 h1:b6R2BslTbIEToALKP7LxUvijTsNI9TAe80pLWN2g/HU= github.com/imdario/mergo v0.3.12/go.mod h1:jmQim1M+e3UYxmgPu/WyfjB3N3VflVyUjjjwH0dnCYA= github.com/intel/goresctrl v0.5.0 h1:kcDhjE3ZF/mNrJuRzLS3LY2Hp6atFaF1XVFBT7SVL2g= github.com/intel/goresctrl v0.5.0/go.mod h1:mIe63ggylWYr0cU/l8n11FAkesqfvuP3oktIsxvu0T0= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4= github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU= github.com/json-iterator/go v1.1.10/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= github.com/json-iterator/go v1.1.11/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/jstemmer/go-junit-report v0.0.0-20190106144839-af01ea7f8024/go.mod h1:6v2b51hI/fHJwM22ozAgKL4VKDeJcHhJFhtBdhmNjmU= github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/XSXhF0NWZEnDohbsk= github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w= github.com/julienschmidt/httprouter v1.3.0/go.mod h1:JR6WtHb+2LUe8TCKY3cZOxFyyO8IZAc4RVcycCCAKdM= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515/go.mod h1:+0opPa2QZZtGFBFZlji/RkVcI2GknAs/DXo4wKdlNEc= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.1 
h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0 h1:jWpvCLoY8Z/e3VKvlsiIGKtc+UG6U5vzxaoagmhXfyg= github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0/go.mod h1:QUyp042oQthUoa9bqDv0ER0wrtXnBruoNd7aNjkbP+k= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/onsi/ginkgo/v2 v2.13.0 h1:0jY9lJquiL8fcf3M4LAXN5aMlS/b2BV86HFFPCPMgE4= github.com/onsi/ginkgo/v2 v2.13.0/go.mod h1:TE309ZR8s5FsKKpuB1YAQYBzCaAfUgatB/xlT/ETL/o= github.com/onsi/gomega v1.29.0 h1:KIA/t2t5UBzoirT4H9tsML45GEbo3ouUnBHsCfD2tVg= github.com/onsi/gomega v1.29.0/go.mod h1:9sxs+SwGrKI0+PWe4Fxa9tFQQBG5xSsSbMXOI8PPpoQ= github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw= github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo= github.com/prometheus/client_golang v1.7.1/go.mod h1:PY5Wy2awLA44sXw4AOSfFBetzPP4j5+D6mVACh+pe2M= github.com/prometheus/client_golang v1.11.0/go.mod h1:Z6t4BnS23TR94PD6BsDNk8yVqroYurpAkEiz0P2BEV0= github.com/prometheus/client_golang v1.12.1/go.mod h1:3Z9XVyYiZYEO+YQWt3RD2R3jrbd179Rt297l4aS6nDY= github.com/prometheus/client_golang 
v1.12.2/go.mod h1:3Z9XVyYiZYEO+YQWt3RD2R3jrbd179Rt297l4aS6nDY= github.com/prometheus/client_golang v1.13.0/go.mod h1:vTeo+zgvILHsnnj/39Ou/1fPN5nJFOEMgftOUOmlvYQ= github.com/prometheus/client_golang v1.18.0 h1:HzFfmkOzH5Q8L8G+kSJKUx5dtG87sewO+FoDDqP5Tbk= github.com/prometheus/client_golang v1.18.0/go.mod h1:T+GXkCk5wSJyOqMIzVgvvjFDlkOQntgjkJWKrN5txjA= github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo= github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/client_model v0.2.0/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= github.com/prometheus/client_model v0.5.0 h1:VQw1hfvPvk3Uv6Qf29VrPF32JB6rtbgI6cYPYQjL0Qw= github.com/prometheus/client_model v0.5.0/go.mod h1:dTiFglRmd66nLR9Pv9f0mZi7B7fk5Pm3gvsjB5tr+kI= github.com/prometheus/common v0.4.1/go.mod h1:TNfzLD0ON7rHzMJeJkieUDPYmFC7Snx/y86RQel1bk4= github.com/prometheus/common v0.10.0/go.mod h1:Tlit/dnDKsSWFlCLTWaA1cyBgKHSMdTB80sz/V91rCo= github.com/prometheus/common v0.26.0/go.mod h1:M7rCNAaPfAosfx8veZJCuw84e35h3Cfd9VFqTh1DIvc= github.com/prometheus/common v0.32.1/go.mod h1:vu+V0TpY+O6vW9J44gczi3Ap/oXXR10b+M/gUGO4Hls= github.com/prometheus/common v0.35.0/go.mod h1:phzohg0JFMnBEFGxTDbfu3QyL5GI8gTQJFhYO5B3mfA= github.com/prometheus/common v0.37.0/go.mod h1:phzohg0JFMnBEFGxTDbfu3QyL5GI8gTQJFhYO5B3mfA= github.com/prometheus/common v0.45.0 h1:2BGz0eBc2hdMDLnO/8n0jeB3oPrt2D08CekT0lneoxM= github.com/prometheus/common v0.45.0/go.mod h1:YJmSTw9BoKxJplESWWxlbyttQR4uaEcGyv9MZjVOJsY= github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R/oaqEKCNdg8wHV1ftS6bRYblBhIjjI8uT2IGk= github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= github.com/prometheus/procfs v0.1.3/go.mod h1:lV6e/gmhEcM9IjHGsFOCxxuZ+z1YqCvr4OA4YeYWdaU= github.com/prometheus/procfs v0.6.0/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1xBZuNvfVA= github.com/prometheus/procfs v0.7.3/go.mod h1:cz+aTbrPOrUb4q7XlbU9ygM+/jj0fzG6c1xBZuNvfVA= github.com/prometheus/procfs v0.8.0/go.mod h1:z7EfXMXOkbkqb9IINtpCn86r/to3BnA0uaxHdg830/4= github.com/prometheus/procfs v0.12.0 h1:jluTpSng7V9hY0O2R9DzzJHYb2xULk9VTR1V1R/k6Bo= github.com/prometheus/procfs v0.12.0/go.mod h1:pcuDEFsWDnvcgNzo4EEweacyhjeA9Zk3cnaOZAZEfOo= github.com/prometheus/statsd_exporter v0.22.7/go.mod h1:N/TevpjkIh9ccs6nuzY3jQn9dFqnUakOjnEuMPJJJnI= github.com/prometheus/statsd_exporter v0.26.0 h1:SQl3M6suC6NWQYEzOvIv+EF6dAMYEqIuZy+o4H9F5Ig= github.com/prometheus/statsd_exporter v0.26.0/go.mod h1:GXFLADOmBTVDrHc7b04nX8ooq3azG61pnECNqT7O5DM= github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= github.com/shurcooL/httpfs v0.0.0-20230704072500-f1e31cf0ba5c h1:aqg5Vm5dwtvL+YgDpBcK1ITf3o96N/K7/wsRXQnUTEs= github.com/shurcooL/httpfs v0.0.0-20230704072500-f1e31cf0ba5c/go.mod h1:owqhoLW1qZoYLZzLnBw+QkPP9WZnjlSWihhxAJC1+/M= github.com/shurcooL/vfsgen v0.0.0-20230704071429-0000e147ea92 h1:OfRzdxCzDhp+rsKWXuOO2I/quKMJ/+TQwVbIP/gltZg= github.com/shurcooL/vfsgen v0.0.0-20230704071429-0000e147ea92/go.mod h1:7/OT02F6S6I7v6WXb+IjhMuZEYfH/RJ5RwEWnEo5BMg= github.com/sirupsen/logrus 
v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= github.com/sirupsen/logrus v1.6.0/go.mod h1:7uNnSEd1DgxDLC74fIahvMZmmYsHGZGEOFrfsX/uA88= github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/stvp/go-udp-testing v0.0.0-20201019212854-469649b16807/go.mod h1:7jxmlfBCDBXRzr0eAQJ48XC1hBu1np4CS5+cHEYfwpc= github.com/uber/jaeger-client-go v2.25.0+incompatible h1:IxcNZ7WRY1Y3G4poYlx24szfsn/3LvK9QHCq9oQw8+U= github.com/uber/jaeger-client-go v2.25.0+incompatible/go.mod h1:WVhlPFC8FDjOFMMWRy2pZqQJSXxYSwNYOkTr/Z6d3Kk= github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.1.32/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU= go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8= go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= go.opencensus.io v0.22.3/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= go.opencensus.io v0.22.4/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= go.opencensus.io v0.23.0/go.mod h1:XItmlyltB5F7CS4xOC1DcqMoFqwtC6OG2xF7mCv7P7E= go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0= go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo= golang.org/x/crypto v0.0.0-20180904163835-0709b304e793/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/exp 
v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= golang.org/x/exp v0.0.0-20190829153037-c13cbed26979/go.mod h1:86+5VVa7VpoJ4kLfm080zCjGlMRFzhUhsZKEZO7MGek= golang.org/x/exp v0.0.0-20191030013958-a1ab85dbe136/go.mod h1:JXzH8nQsPlswgeRAPE3MuO9GYsAcnJvJ4vnMwN/5qkY= golang.org/x/exp v0.0.0-20191129062945-2f5052295587/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= golang.org/x/exp v0.0.0-20191227195350-da58074b4299/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= golang.org/x/exp v0.0.0-20200119233911-0405dc783f0a/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= golang.org/x/exp v0.0.0-20200207192155-f17229e696bd/go.mod h1:J/WKrq2StrnmMY6+EHIKF9dgMWnmCNThgcyBT1FY9mM= golang.org/x/exp v0.0.0-20200224162631-6cc2880d07d6/go.mod h1:3jZMyOhIsHpP37uCMkUooju7aAi5cS1Q23tOzKc+0MU= golang.org/x/exp v0.0.0-20240103183307-be819d1f06fc h1:ao2WRsKSzW6KuUY9IWPwWahcHCgR0s52IfwutMfEbdM= golang.org/x/exp v0.0.0-20240103183307-be819d1f06fc/go.mod h1:iRJReGqOEeBhDZGkGbynYwcHlctCvnjTYIamk7uXpHI= golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= golang.org/x/lint v0.0.0-20190301231843-5614ed5bae6f/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/lint v0.0.0-20190409202823-959b441ac422/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/lint v0.0.0-20190909230951-414d861bb4ac/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/lint v0.0.0-20191125180803-fdd1cda4f05f/go.mod h1:5qLYkcX4OjUUV8bRuDixDT3tpyyb+LUpUlRWLxfhWrs= golang.org/x/lint v0.0.0-20200130185559-910be7a94367/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= golang.org/x/lint v0.0.0-20200302205851-738671d3881b/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= golang.org/x/mobile v0.0.0-20190312151609-d3739f865fa6/go.mod h1:z+o9i4GpDbdi3rU15maQ/Ox0txvL9dWGYEHz965HBQE= golang.org/x/mobile v0.0.0-20190719004257-d2bd2a29d028/go.mod h1:E/iHnbuqvinMTCcRqshq8CkpyQDoeVncDDYHnLhea+o= golang.org/x/mod v0.0.0-20190513183733-4bf6d317e70e/go.mod h1:mXi4GBBbnImb6dmsKGUJ2LatrhH/nqhxcFungHvyanc= golang.org/x/mod v0.1.0/go.mod h1:0QHyrYULN0/3qlju5TqG8bIK38QM8yzMo5ekMj3DlcY= golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= golang.org/x/mod v0.1.1-0.20191107180719-034126e5016b/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= 
golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190501004415-9ce7a6920f09/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190503192946-f4e77d36d62c/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190603091049-60506f45cf65/go.mod h1:HSz+uSET+XFnRR8LxR5pz3Of3rY3CfYBVs4xY44aLks= golang.org/x/net v0.0.0-20190613194153-d28f0bde5980/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190628185345-da137c7871d7/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190724013045-ca1201d0de80/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20191209160850-c0dbc17a3553/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200114155413-6afb5195e5aa/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200222125558-5a598a2470a0/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200301022130-244492dfa37a/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/net v0.0.0-20200501053045-e0ff5e5a1de5/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/net v0.0.0-20200506145744-7e3656a0809f/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/net v0.0.0-20200513185701-a91f0712d120/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/net v0.0.0-20200520182314-0ba52f642ac2/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/net v0.0.0-20200625001655-4c5254603344/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= golang.org/x/net v0.0.0-20200707034311-ab3426394381/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= golang.org/x/net v0.0.0-20210525063256-abc453219eb5/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.0.0-20220127200216-cd36cc0744dd/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= golang.org/x/net v0.0.0-20220225172249-27dd8689420f/go.mod h1:CfG3xpIq0wQ8r1q4Su4UZFWDARRcnwPjda9FqA0JpMk= golang.org/x/net v0.38.0 h1:vRMAPTMaeGqVhG5QyLJHqNDwecKTomGeqbnfZyKlBI8= golang.org/x/net v0.38.0/go.mod h1:ivrbrMbzFq5J41QOQh0siUuly180yBYtLp+CKbEaFx8= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod 
h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20191202225959-858c2ad4c8b6/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20210514164344-f6687ab2804c/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= golang.org/x/oauth2 v0.0.0-20220223155221-ee480838109b/go.mod h1:DAh4E804XQdzx2j+YRIaUnCqCV2RuMz24cGBJ5QYIrc= golang.org/x/oauth2 v0.27.0 h1:da9Vo7/tDv5RH/7nZDz1eMGS/q1Vv1N/7FCrBhI9I3M= golang.org/x/oauth2 v0.27.0/go.mod h1:onh5ek6nERTohokkhCD/y2cV4Do3fxFHFuAejCkRWT8= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190227155943-e225da77a7e6/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20200317015054-43a5402ce75a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20200625203802-6e8e738ad208/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20201207232520-09787c993a3a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220601150217-0de741cfad7f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.12.0 h1:MHc5BpPuC30uJk597Ri8TV3CNZcTLu6B6z4lJy+g6Jw= golang.org/x/sync v0.12.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190502145724-3ef323f4f1fd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190507160741-ecd444e8653b/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190606165138-5da285871e9c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190624142023-c5567b49c5d0/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190726091711-fc99dfbffb4e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191001151750-bb3f8db39f24/go.mod 
h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191204072324-ce4227a45e2e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191228213918-04cbcbbfeed8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200106162015-b016eb3dc98e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200113162924-86b910548bc1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200122134326-e047566fdf82/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200202164722-d101bd2416d5/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200212091648-12a6c2dcc1e4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200302150141-5c8b2ff67527/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200331124033-c3d80250170d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200501052902-10377860bb8e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200511232937-7e40ca221e25/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200515095857-1151b9dac4a9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200523222454-059865788121/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200625212154-ddb9806d33ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200803210538-64077c9b5642/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210124154548-22da62e12c0c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210603081109-ebe580a85c40/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220114195835-da31bd327af9/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220708085239-5a0f0661e09d/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.31.0 h1:ioabZlmFYtWhL+TRYpcnNlLwhyxaM9kWTDEmfnprqik= golang.org/x/sys v0.31.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.30.0 h1:PQ39fJZ+mfadBm0y5WlL4vlM7Sx1Hgf13sMIY2+QS9Y= golang.org/x/term v0.30.0/go.mod 
h1:NYYFdzHoI5wRh/h5tDMdMqCqPJZEuNqVR5xJLd/n67g= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.23.0 h1:D71I7dUrlY+VX0gQShAThNGHFxZ13dGLBHQLVl1mJlY= golang.org/x/text v0.23.0/go.mod h1:/BLNzu4aZCJ1+kcD0DNRotWKage4q2rGVAg4o22unh4= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.5.0 h1:o7cqy6amK/52YcAKIPlM3a+Fpj35zvRj2TP+e1xFSfk= golang.org/x/time v0.5.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190312151545-0bb0c0a6e846/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190312170243-e65039ee4138/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= golang.org/x/tools v0.0.0-20190425150028-36563e24a262/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= golang.org/x/tools v0.0.0-20190506145303-2d16b83fe98c/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= golang.org/x/tools v0.0.0-20190606124116-d0a3d012864b/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= golang.org/x/tools v0.0.0-20190621195816-6e04913cbbac/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= golang.org/x/tools v0.0.0-20190628153133-6cdbf07be9d0/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= golang.org/x/tools v0.0.0-20190816200558-6889da9d5479/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20190911174233-4f2ddba30aff/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191012152004-8de300cfc20a/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191113191852-77e3bb0ad9e7/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191115202509-3a792d9c32b2/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191125144606-a911d9008d1f/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191130070609-6e064ea0cf2d/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20191216173652-a0e659d51361/go.mod 
h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.0.0-20191227053925-7b8e75db28f4/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.0.0-20200117161641-43d50277825c/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.0.0-20200122220014-bf1340f18c4a/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.0.0-20200130002326-2f3ba24bd6e7/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.0.0-20200204074204-1cc6d1ef6c74/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.0.0-20200207183749-b753a1ba74fa/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.0.0-20200212150539-ea181f53ac56/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.0.0-20200224181240-023911ca70b2/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.0.0-20200227222343-706bc42d1f0d/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28= golang.org/x/tools v0.0.0-20200304193943-95d2e580d8eb/go.mod h1:o4KQGtdN14AW+yjsvvwRTJJuXz8XRtIHtEnmAXLyFUw= golang.org/x/tools v0.0.0-20200312045724-11d5b4c81c7d/go.mod h1:o4KQGtdN14AW+yjsvvwRTJJuXz8XRtIHtEnmAXLyFUw= golang.org/x/tools v0.0.0-20200331025713-a30bf2db82d4/go.mod h1:Sl4aGygMT6LrqrWclx+PTx3U+LnKx/seiNR+3G19Ar8= golang.org/x/tools v0.0.0-20200501065659-ab2804fb9c9d/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20200512131952-2bc93b1c0c88/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20200515010526-7d3b6ebf133d/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20200618134242-20370b0cb4b2/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20200729194436-6467de6f59a7/go.mod h1:njjCfa9FT2d7l9Bc6FUM5FLjQPp3cFF28FI3qnDFljA= golang.org/x/tools v0.0.0-20200804011535-6c149bb5ef0d/go.mod h1:njjCfa9FT2d7l9Bc6FUM5FLjQPp3cFF28FI3qnDFljA= golang.org/x/tools v0.0.0-20200825202427-b303f430e36d/go.mod h1:njjCfa9FT2d7l9Bc6FUM5FLjQPp3cFF28FI3qnDFljA= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d h1:vU5i/LfpvrRCpgM/VPfJLg5KjxD3E+hfT1SH+d9zLwg= golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE= google.golang.org/api v0.7.0/go.mod h1:WtwebWUNSVBH/HAw79HIFXZNqEvBhG+Ra+ax0hx3E3M= google.golang.org/api v0.8.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg= google.golang.org/api v0.9.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg= google.golang.org/api v0.13.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= google.golang.org/api v0.14.0/go.mod h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= google.golang.org/api v0.15.0/go.mod 
h1:iLdEw5Ide6rF15KTC1Kkl0iskquN2gFfn9o9XIsbkAI= google.golang.org/api v0.17.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= google.golang.org/api v0.18.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= google.golang.org/api v0.19.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= google.golang.org/api v0.20.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= google.golang.org/api v0.22.0/go.mod h1:BwFmGc8tA3vsd7r/7kR8DY7iEEGSU04BFxCo5jP/sfE= google.golang.org/api v0.24.0/go.mod h1:lIXQywCXRcnZPGlsd8NbLnOjtAoL6em04bJ9+z0MncE= google.golang.org/api v0.28.0/go.mod h1:lIXQywCXRcnZPGlsd8NbLnOjtAoL6em04bJ9+z0MncE= google.golang.org/api v0.29.0/go.mod h1:Lcubydp8VUV7KeIHD9z2Bys/sm/vGKnG1UHuDBSrHWM= google.golang.org/api v0.30.0/go.mod h1:QGmEvQ87FHZNiUVJkT14jQNYJ4ZJjdRF23ZXz5138Fc= google.golang.org/api v0.155.0 h1:vBmGhCYs0djJttDNynWo44zosHlPvHmA0XiN2zP2DtA= google.golang.org/api v0.155.0/go.mod h1:GI5qK5f40kCpHfPn6+YzGAByIKWv8ujFnmoWm7Igduk= google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.5.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= google.golang.org/appengine v1.6.1/go.mod h1:i06prIuMbXzDqacNJfV5OdTW448YApPu5ww/cMBSeb0= google.golang.org/appengine v1.6.5/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= google.golang.org/appengine v1.6.6/go.mod h1:8WjMMxjGQR8xUklV/ARdw2HLXBOI7O7uCIDZVag1xfc= google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= google.golang.org/genproto v0.0.0-20190307195333-5fe7a883aa19/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= google.golang.org/genproto v0.0.0-20190418145605-e7d98fc518a7/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= google.golang.org/genproto v0.0.0-20190425155659-357c62f0e4bb/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= google.golang.org/genproto v0.0.0-20190502173448-54afdca5d873/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE= google.golang.org/genproto v0.0.0-20190801165951-fa694d86fc64/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= google.golang.org/genproto v0.0.0-20190911173649-1774047e7e51/go.mod h1:IbNlFCBrqXvoKpeg0TB2l7cyZUmoaFKYIwrEpbDKLA8= google.golang.org/genproto v0.0.0-20191108220845-16a3f7862a1a/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= google.golang.org/genproto v0.0.0-20191115194625-c23dd37a84c9/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= google.golang.org/genproto v0.0.0-20191216164720-4f79533eabd1/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= google.golang.org/genproto v0.0.0-20191230161307-f3c370f40bfb/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= google.golang.org/genproto v0.0.0-20200115191322-ca5a22157cba/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= google.golang.org/genproto v0.0.0-20200122232147-0452cf42e150/go.mod h1:n3cpQtvxv34hfy77yVDNjmbRyujviMdxYliBSkLhpCc= google.golang.org/genproto v0.0.0-20200204135345-fa8e72b47b90/go.mod h1:GmwEX6Z4W5gMy59cAlVYjN9JhxgbQH6Gn+gFDQe2lzA= google.golang.org/genproto v0.0.0-20200212174721-66ed5ce911ce/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= google.golang.org/genproto v0.0.0-20200224152610-e50cd9704f63/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= google.golang.org/genproto 
v0.0.0-20200228133532-8c2c7df3a383/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= google.golang.org/genproto v0.0.0-20200305110556-506484158171/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= google.golang.org/genproto v0.0.0-20200312145019-da6875a35672/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= google.golang.org/genproto v0.0.0-20200331122359-1ee6d9798940/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= google.golang.org/genproto v0.0.0-20200430143042-b979b6f78d84/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= google.golang.org/genproto v0.0.0-20200511104702-f5ebc3bea380/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c= google.golang.org/genproto v0.0.0-20200515170657-fc4c6c6a6587/go.mod h1:YsZOwe1myG/8QRHRsmBRE1LrgQY60beZKjly0O1fX9U= google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo= google.golang.org/genproto v0.0.0-20200618031413-b414f8b61790/go.mod h1:jDfRM7FcilCzHH/e9qn6dsT145K34l5v+OpcnNgKAAA= google.golang.org/genproto v0.0.0-20200729003335-053ba62fc06f/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= google.golang.org/genproto v0.0.0-20200804131852-c06518451d9c/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= google.golang.org/genproto v0.0.0-20200825200019-8632dd797987/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no= google.golang.org/genproto/googleapis/rpc v0.0.0-20240102182953-50ed04b92917 h1:6G8oQ016D88m1xAKljMlBOOGWDZkes4kMhgGFlf8WcQ= google.golang.org/genproto/googleapis/rpc v0.0.0-20240102182953-50ed04b92917/go.mod h1:xtjpI3tXFPP051KaWnhvxkiubL/6dJ18vLVf7q2pTOU= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY= google.golang.org/grpc v1.26.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= google.golang.org/grpc v1.27.1/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk= google.golang.org/grpc v1.28.0/go.mod h1:rpkK4SK4GF4Ach/+MFLZUBavHOvF2JJB5uozKKal+60= google.golang.org/grpc v1.29.1/go.mod h1:itym6AZVZYACWQqET3MqgPpjcuV5QH3BxFS3IjizoKk= google.golang.org/grpc v1.30.0/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM8pak= google.golang.org/grpc v1.31.0/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM8pak= google.golang.org/grpc v1.33.2/go.mod h1:JMHMWHQWaTccqQQlmk3MJZS+GWXOdAesneDmEnv2fbc= google.golang.org/grpc v1.60.1 h1:26+wFr+cNqSGFcOXcabYC0lUVJVRa2Sb2ortSK7VrEU= google.golang.org/grpc v1.60.1/go.mod h1:OlCHIeLYqSSsLi6i49B5QGdzaMZK9+M7LXN2FKz4eGM= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE= google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo= google.golang.org/protobuf v1.22.0/go.mod 
h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGjtUeSXeh4= google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c= google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= google.golang.org/protobuf v1.28.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= google.golang.org/protobuf v1.28.1/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= google.golang.org/protobuf v1.33.0 h1:uNO2rsAINq/JlFpSdYEKIZ0uKD/R9cpdv0T+yoGwGmI= google.golang.org/protobuf v1.33.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= gopkg.in/alecthomas/kingpin.v2 v2.2.6/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.4/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.5/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.3.0/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190418001031-e561f6794a2a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= honnef.co/go/tools v0.0.1-2019.2.3/go.mod h1:a3bituU0lyd329TUQxRnasdCoJDkEUEAqEt0JzvZhAg= honnef.co/go/tools v0.0.1-2020.1.3/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k= honnef.co/go/tools v0.0.1-2020.1.4/go.mod h1:X/FiERA/W4tHapMX5mGpAtMSVEeEUOyHaw9vFzvIQ3k= k8s.io/api v0.29.0 h1:NiCdQMY1QOp1H8lfRyeEf8eOwV6+0xA6XEE44ohDX2A= k8s.io/api v0.29.0/go.mod 
h1:sdVmXoz2Bo/cb77Pxi71IPTSErEW32xa4aXwKH7gfBA= k8s.io/apimachinery v0.29.0 h1:+ACVktwyicPz0oc6MTMLwa2Pw3ouLAfAon1wPLtG48o= k8s.io/apimachinery v0.29.0/go.mod h1:eVBxQ/cwiJxH58eK/jd/vAk4mrxmVlnpBH5J2GbMeis= k8s.io/client-go v0.29.0 h1:KmlDtFcrdUzOYrBhXHgKw5ycWzc3ryPX5mQe0SkG3y8= k8s.io/client-go v0.29.0/go.mod h1:yLkXH4HKMAywcrD82KMSmfYg2DlE8mepPR4JGSo5n38= k8s.io/cri-api v0.29.0 h1:atenAqOltRsFqcCQlFFpDnl/R4aGfOELoNLTDJfd7t8= k8s.io/cri-api v0.29.0/go.mod h1:Rls2JoVwfC7kW3tndm7267kriuRukQ02qfht0PCRuIc= k8s.io/klog/v2 v2.110.1 h1:U/Af64HJf7FcwMcXyKm2RPM22WZzyR7OSpYj5tg3cL0= k8s.io/klog/v2 v2.110.1/go.mod h1:YGtd1984u+GgbuZ7e08/yBuAfKLSO0+uR1Fhi6ExXjo= k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00 h1:aVUu9fTY98ivBPKR9Y5w/AuzbMm96cd3YHRTU83I780= k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00/go.mod h1:AsvuZPBlUDVuCdzJ87iajxtXuR9oktsTctW/R9wwouA= k8s.io/utils v0.0.0-20240102154912-e7106e64919e h1:eQ/4ljkx21sObifjzXwlPKpdGLrCfRziVtos3ofG/sQ= k8s.io/utils v0.0.0-20240102154912-e7106e64919e/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= sigs.k8s.io/structured-merge-diff/v4 v4.4.1 h1:150L+0vs/8DA78h1u02ooW1/fFq/Lwr+sGiqlzvrtq4= sigs.k8s.io/structured-merge-diff/v4 v4.4.1/go.mod h1:N8hJocpFajUSSeSJ9bOZ77VzejKZaXsTtZo4/u7Io08= sigs.k8s.io/yaml v1.4.0 h1:Mk1wCc2gy/F0THH0TAp1QYyJNzRm2KCLy3o5ASXVI5E= sigs.k8s.io/yaml v1.4.0/go.mod h1:Ejl7/uTz7PSA4eKMyQCUTnhZYNmLIl+5c2lQPGR2BPY= ================================================ FILE: packaging/deb.in/changelog ================================================ __PACKAGE__ (__VERSION__) unstable; urgency=low * Release build of __PACKAGE__ __VERSION__ for debian/ubuntu. 
-- __AUTHOR__ <__EMAIL__> __DATE__ ================================================ FILE: packaging/deb.in/compat ================================================ 11 ================================================ FILE: packaging/deb.in/control ================================================ Source: __PACKAGE__ Maintainer: __AUTHOR__ <__EMAIL__> Package: __PACKAGE__ Architecture: any Description: A CRI Proxy for Hardware Resource Management ================================================ FILE: packaging/deb.in/rules ================================================ #!/usr/bin/make -f #-*- make -*- DISTRIBUTION = $(shell sed -n "s/^VERSION_CODENAME=//p" /etc/os-release) VERSION = __VERSION__ PACKAGEVERSION = $(VERSION) TARBALL = __TARBALL__ URL = http://github.com/intel/cri-resource-manager %: dh $@ override_dh_auto_clean: override_dh_auto_test: override_dh_auto_build: override_dh_auto_install: export PATH="$$PATH:$$(go env GOPATH)/bin"; \ make BUILD_DIRS=cri-resmgr install DESTDIR=debian/__PACKAGE__ make BUILD_DIRS=cri-resmgr install-licenses DESTDIR=debian/__PACKAGE__/usr/share/doc/__PACKAGE__ cp README.md docs/*.md cmd/*/*.sample \ debian/__PACKAGE__/usr/share/doc/__PACKAGE__ override_dh_gencontrol: dh_gencontrol -- -v$(PACKAGEVERSION) ================================================ FILE: packaging/rpm/cri-resource-manager.spec.in ================================================ Name: cri-resource-manager Version: __VERSION__ Release: 0 Summary: CRI Resource Manager, a CRI proxy with various in-node workload placement policies License: ASL 2.0 URL: https://github.com/intel/cri-resource-manager Source0: https://github.com/intel/cri-resource-manager/archive/cri-resource-manager-__TARVERSION__.tar.gz BuildRequires: coreutils, make, kernel-devel # Disable the building of debug package(s). %define debug_package %{nil} %description Kubernetes Container Runtime Interface proxy service with hardware resource aware workload placement policies. %prep %setup -q -n cri-resource-manager-__TARVERSION__ %build make build BUILD_DIRS=cri-resmgr make install-licenses BUILD_DIRS=cri-resmgr DESTDIR=. %install %make_install UNITDIR=%{_unitdir} SYSCONFDIR=%{_sysconfdir} BUILD_DIRS=cri-resmgr install -m 0700 -d %{?buildroot}%{_sharedstatedir}/cri-resmgr %files %defattr(-,root,root,-) %{_bindir}/* %{_sysconfdir}/sysconfig/* %{_unitdir}/* %dir %attr(0700,root,root) %{_sharedstatedir}/cri-resmgr %dir %attr(0700,root,root) %{_sysconfdir}/cri-resmgr %config(noreplace) %{_sysconfdir}/cri-resmgr/* %license licenses/cri-resmgr/* %doc README.md docs/*.md %doc cmd/*/*.sample ================================================ FILE: pkg/agent/agent.go ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ package agent import ( "fmt" "github.com/intel/cri-resource-manager/pkg/log" k8sclient "k8s.io/client-go/kubernetes" resmgrcs "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/clientset/versioned/typed/resmgr/v1alpha1" resmgr "github.com/intel/cri-resource-manager/pkg/apis/resmgr/v1alpha1" ) // configInterface provides access to the cri-resmgr configuration and the last configuration error type configInterface interface { getConfig() resmgrConfig getError() error } // resmgrConfig represents cri-resmgr configuration type resmgrConfig map[string]string // resmgrAdjustment represents external adjustments for the resource-manager type resmgrAdjustment map[string]*resmgr.Adjustment // resmgrStatus represents the status of an external adjustment update type resmgrStatus struct { request error errors map[string]string } // ResourceManagerAgent is the interface exposed for the CRI Resource Manager Config Agent type ResourceManagerAgent interface { Run() error } // agent implements ResourceManagerAgent type agent struct { log.Logger // Our logging interface cli *k8sclient.Clientset // K8s client extCli *resmgrcs.CriresmgrV1alpha1Client server agentServer // gRPC server listening for requests from cri-resource-manager watcher k8sWatcher // Watcher monitoring events in K8s cluster updater configUpdater // Client sending config updates to cri-resource-manager } // NewResourceManagerAgent creates a new instance of ResourceManagerAgent func NewResourceManagerAgent() (ResourceManagerAgent, error) { var err error a := &agent{ Logger: log.NewLogger("resource-manager-agent"), } if a.cli, a.extCli, err = a.getK8sClient(opts.kubeconfig); err != nil { return nil, agentError("failed to get k8s client: %v", err) } if a.watcher, err = newK8sWatcher(a.cli, a.extCli); err != nil { return nil, agentError("failed to initialize watcher instance: %v", err) } if a.server, err = newAgentServer(a.cli, a); err != nil { return nil, agentError("failed to initialize gRPC server: %v", err) } if a.updater, err = newConfigUpdater(); err != nil { return nil, agentError("failed to initialize config updater instance: %v", err) } return a, nil } // Run starts the agent's gRPC server, watcher, and config updater, then runs the main event loop. func (a *agent) Run() error { a.Info("starting CRI Resource Manager Agent") if err := a.server.Start(opts.agentSocket); err != nil { return agentError("failed to start gRPC server: %v", err) } if err := a.watcher.Start(); err != nil { return agentError("failed to start watcher: %v", err) } if err := a.updater.Start(); err != nil { return agentError("failed to start config updater: %v", err) } for { select { case config, ok := <-a.watcher.ConfigChan(): if ok { a.updater.UpdateConfig(&config) } case adjust, ok := <-a.watcher.AdjustmentChan(): if ok { a.updater.UpdateAdjustment(&adjust) } case status, ok := <-a.updater.StatusChan(): if ok { a.Info("got status %v", status) if err := a.watcher.UpdateStatus(status); err != nil { a.Error("failed to update adjustment node status: %v", err) } } } } } func (a *agent) getConfig() resmgrConfig { if a.watcher == nil { return nil } return a.watcher.GetConfig() } func (a *agent) getError() error { if a.updater == nil { return nil } return a.updater.GetError() } func agentError(format string, args ...interface{}) error { return fmt.Errorf(format, args...) } ================================================ FILE: pkg/agent/api/v1/api.go ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package v1 import ( "encoding/json" ) var _ json.Marshaler = &JsonPatch{} // MarshalJSON marshals JsonPatch to valid Json func (j *JsonPatch) MarshalJSON() ([]byte, error) { // Don't really encode anything. Op and Path are ascii strings and value // is assumed to be in marshaled format if len(j.Value) == 0 { return []byte(`{"op":"` + j.Op + `","path":"` + j.Path + `"}`), nil } return []byte(`{"op":"` + j.Op + `","path":"` + j.Path + `","value":` + j.Value + `}`), nil } ================================================ FILE: pkg/agent/api/v1/api.pb.go ================================================ // //Copyright 2019 Intel Corporation // //Licensed under the Apache License, Version 2.0 (the "License"); //you may not use this file except in compliance with the License. //You may obtain a copy of the License at // //http://www.apache.org/licenses/LICENSE-2.0 // //Unless required by applicable law or agreed to in writing, software //distributed under the License is distributed on an "AS IS" BASIS, //WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //See the License for the specific language governing permissions and //limitations under the License. // Code generated by protoc-gen-go. DO NOT EDIT. // versions: // protoc-gen-go v1.28.0 // protoc v3.20.1 // source: pkg/agent/api/v1/api.proto package v1 import ( protoreflect "google.golang.org/protobuf/reflect/protoreflect" protoimpl "google.golang.org/protobuf/runtime/protoimpl" reflect "reflect" sync "sync" ) const ( // Verify that this generated code is sufficiently up-to-date. _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) // Verify that runtime/protoimpl is sufficiently up-to-date. _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) ) type GetNodeRequest struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields } func (x *GetNodeRequest) Reset() { *x = GetNodeRequest{} if protoimpl.UnsafeEnabled { mi := &file_pkg_agent_api_v1_api_proto_msgTypes[0] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *GetNodeRequest) String() string { return protoimpl.X.MessageStringOf(x) } func (*GetNodeRequest) ProtoMessage() {} func (x *GetNodeRequest) ProtoReflect() protoreflect.Message { mi := &file_pkg_agent_api_v1_api_proto_msgTypes[0] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use GetNodeRequest.ProtoReflect.Descriptor instead. 
func (*GetNodeRequest) Descriptor() ([]byte, []int) { return file_pkg_agent_api_v1_api_proto_rawDescGZIP(), []int{0} } type GetNodeReply struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields Node string `protobuf:"bytes,1,opt,name=node,proto3" json:"node,omitempty"` } func (x *GetNodeReply) Reset() { *x = GetNodeReply{} if protoimpl.UnsafeEnabled { mi := &file_pkg_agent_api_v1_api_proto_msgTypes[1] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *GetNodeReply) String() string { return protoimpl.X.MessageStringOf(x) } func (*GetNodeReply) ProtoMessage() {} func (x *GetNodeReply) ProtoReflect() protoreflect.Message { mi := &file_pkg_agent_api_v1_api_proto_msgTypes[1] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use GetNodeReply.ProtoReflect.Descriptor instead. func (*GetNodeReply) Descriptor() ([]byte, []int) { return file_pkg_agent_api_v1_api_proto_rawDescGZIP(), []int{1} } func (x *GetNodeReply) GetNode() string { if x != nil { return x.Node } return "" } // JsonPatch holds one JSON patch type JsonPatch struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields Op string `protobuf:"bytes,1,opt,name=op,proto3" json:"op,omitempty"` Path string `protobuf:"bytes,2,opt,name=path,proto3" json:"path,omitempty"` Value string `protobuf:"bytes,3,opt,name=value,proto3" json:"value,omitempty"` } func (x *JsonPatch) Reset() { *x = JsonPatch{} if protoimpl.UnsafeEnabled { mi := &file_pkg_agent_api_v1_api_proto_msgTypes[2] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *JsonPatch) String() string { return protoimpl.X.MessageStringOf(x) } func (*JsonPatch) ProtoMessage() {} func (x *JsonPatch) ProtoReflect() protoreflect.Message { mi := &file_pkg_agent_api_v1_api_proto_msgTypes[2] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use JsonPatch.ProtoReflect.Descriptor instead.
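// A minimal usage sketch (hypothetical, not part of the generated API) of the
// custom JsonPatch.MarshalJSON defined in pkg/agent/api/v1/api.go; the patch
// path and value below are illustrative assumptions, and Value must already be
// valid marshaled JSON as MarshalJSON's comment requires:
//
//	p := &JsonPatch{Op: "replace", Path: "/metadata/labels/hypothetical-label", Value: `"v1"`}
//	out, err := json.Marshal(p) // dispatches to (*JsonPatch).MarshalJSON
//	// on success: out == []byte(`{"op":"replace","path":"/metadata/labels/hypothetical-label","value":"v1"}`)
//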
func (*JsonPatch) Descriptor() ([]byte, []int) { return file_pkg_agent_api_v1_api_proto_rawDescGZIP(), []int{2} } func (x *JsonPatch) GetOp() string { if x != nil { return x.Op } return "" } func (x *JsonPatch) GetPath() string { if x != nil { return x.Path } return "" } func (x *JsonPatch) GetValue() string { if x != nil { return x.Value } return "" } type PatchNodeRequest struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields // List of JSON patches to apply on the node Patches []*JsonPatch `protobuf:"bytes,1,rep,name=patches,proto3" json:"patches,omitempty"` } func (x *PatchNodeRequest) Reset() { *x = PatchNodeRequest{} if protoimpl.UnsafeEnabled { mi := &file_pkg_agent_api_v1_api_proto_msgTypes[3] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *PatchNodeRequest) String() string { return protoimpl.X.MessageStringOf(x) } func (*PatchNodeRequest) ProtoMessage() {} func (x *PatchNodeRequest) ProtoReflect() protoreflect.Message { mi := &file_pkg_agent_api_v1_api_proto_msgTypes[3] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use PatchNodeRequest.ProtoReflect.Descriptor instead. func (*PatchNodeRequest) Descriptor() ([]byte, []int) { return file_pkg_agent_api_v1_api_proto_rawDescGZIP(), []int{3} } func (x *PatchNodeRequest) GetPatches() []*JsonPatch { if x != nil { return x.Patches } return nil } type PatchNodeReply struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields } func (x *PatchNodeReply) Reset() { *x = PatchNodeReply{} if protoimpl.UnsafeEnabled { mi := &file_pkg_agent_api_v1_api_proto_msgTypes[4] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *PatchNodeReply) String() string { return protoimpl.X.MessageStringOf(x) } func (*PatchNodeReply) ProtoMessage() {} func (x *PatchNodeReply) ProtoReflect() protoreflect.Message { mi := &file_pkg_agent_api_v1_api_proto_msgTypes[4] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use PatchNodeReply.ProtoReflect.Descriptor instead. 
func (*PatchNodeReply) Descriptor() ([]byte, []int) { return file_pkg_agent_api_v1_api_proto_rawDescGZIP(), []int{4} } type UpdateNodeCapacityRequest struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields // Name-value map of status.capacity to update Capacities map[string]string `protobuf:"bytes,1,rep,name=capacities,proto3" json:"capacities,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` } func (x *UpdateNodeCapacityRequest) Reset() { *x = UpdateNodeCapacityRequest{} if protoimpl.UnsafeEnabled { mi := &file_pkg_agent_api_v1_api_proto_msgTypes[5] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *UpdateNodeCapacityRequest) String() string { return protoimpl.X.MessageStringOf(x) } func (*UpdateNodeCapacityRequest) ProtoMessage() {} func (x *UpdateNodeCapacityRequest) ProtoReflect() protoreflect.Message { mi := &file_pkg_agent_api_v1_api_proto_msgTypes[5] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use UpdateNodeCapacityRequest.ProtoReflect.Descriptor instead. func (*UpdateNodeCapacityRequest) Descriptor() ([]byte, []int) { return file_pkg_agent_api_v1_api_proto_rawDescGZIP(), []int{5} } func (x *UpdateNodeCapacityRequest) GetCapacities() map[string]string { if x != nil { return x.Capacities } return nil } type UpdateNodeCapacityReply struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields } func (x *UpdateNodeCapacityReply) Reset() { *x = UpdateNodeCapacityReply{} if protoimpl.UnsafeEnabled { mi := &file_pkg_agent_api_v1_api_proto_msgTypes[6] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *UpdateNodeCapacityReply) String() string { return protoimpl.X.MessageStringOf(x) } func (*UpdateNodeCapacityReply) ProtoMessage() {} func (x *UpdateNodeCapacityReply) ProtoReflect() protoreflect.Message { mi := &file_pkg_agent_api_v1_api_proto_msgTypes[6] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use UpdateNodeCapacityReply.ProtoReflect.Descriptor instead. func (*UpdateNodeCapacityReply) Descriptor() ([]byte, []int) { return file_pkg_agent_api_v1_api_proto_rawDescGZIP(), []int{6} } type HealthCheckRequest struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields Query string `protobuf:"bytes,1,opt,name=query,proto3" json:"query,omitempty"` } func (x *HealthCheckRequest) Reset() { *x = HealthCheckRequest{} if protoimpl.UnsafeEnabled { mi := &file_pkg_agent_api_v1_api_proto_msgTypes[7] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *HealthCheckRequest) String() string { return protoimpl.X.MessageStringOf(x) } func (*HealthCheckRequest) ProtoMessage() {} func (x *HealthCheckRequest) ProtoReflect() protoreflect.Message { mi := &file_pkg_agent_api_v1_api_proto_msgTypes[7] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use HealthCheckRequest.ProtoReflect.Descriptor instead. 
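// A hypothetical sketch of an UpdateNodeCapacityRequest advertising an
// extended resource in the node's status.capacity; the resource name and
// quantity here are illustrative assumptions only:
//
//	req := &UpdateNodeCapacityRequest{
//		Capacities: map[string]string{"example.com/hypothetical-resource": "4"},
//	}
//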
func (*HealthCheckRequest) Descriptor() ([]byte, []int) { return file_pkg_agent_api_v1_api_proto_rawDescGZIP(), []int{7} } func (x *HealthCheckRequest) GetQuery() string { if x != nil { return x.Query } return "" } type HealthCheckReply struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields Error string `protobuf:"bytes,1,opt,name=error,proto3" json:"error,omitempty"` } func (x *HealthCheckReply) Reset() { *x = HealthCheckReply{} if protoimpl.UnsafeEnabled { mi := &file_pkg_agent_api_v1_api_proto_msgTypes[8] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *HealthCheckReply) String() string { return protoimpl.X.MessageStringOf(x) } func (*HealthCheckReply) ProtoMessage() {} func (x *HealthCheckReply) ProtoReflect() protoreflect.Message { mi := &file_pkg_agent_api_v1_api_proto_msgTypes[8] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use HealthCheckReply.ProtoReflect.Descriptor instead. func (*HealthCheckReply) Descriptor() ([]byte, []int) { return file_pkg_agent_api_v1_api_proto_rawDescGZIP(), []int{8} } func (x *HealthCheckReply) GetError() string { if x != nil { return x.Error } return "" } var File_pkg_agent_api_v1_api_proto protoreflect.FileDescriptor var file_pkg_agent_api_v1_api_proto_rawDesc = []byte{ 0x0a, 0x1a, 0x70, 0x6b, 0x67, 0x2f, 0x61, 0x67, 0x65, 0x6e, 0x74, 0x2f, 0x61, 0x70, 0x69, 0x2f, 0x76, 0x31, 0x2f, 0x61, 0x70, 0x69, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x02, 0x76, 0x31, 0x22, 0x10, 0x0a, 0x0e, 0x47, 0x65, 0x74, 0x4e, 0x6f, 0x64, 0x65, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x22, 0x22, 0x0a, 0x0c, 0x47, 0x65, 0x74, 0x4e, 0x6f, 0x64, 0x65, 0x52, 0x65, 0x70, 0x6c, 0x79, 0x12, 0x12, 0x0a, 0x04, 0x6e, 0x6f, 0x64, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x6e, 0x6f, 0x64, 0x65, 0x22, 0x45, 0x0a, 0x09, 0x4a, 0x73, 0x6f, 0x6e, 0x50, 0x61, 0x74, 0x63, 0x68, 0x12, 0x0e, 0x0a, 0x02, 0x6f, 0x70, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x02, 0x6f, 0x70, 0x12, 0x12, 0x0a, 0x04, 0x70, 0x61, 0x74, 0x68, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x04, 0x70, 0x61, 0x74, 0x68, 0x12, 0x14, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x03, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x22, 0x3b, 0x0a, 0x10, 0x50, 0x61, 0x74, 0x63, 0x68, 0x4e, 0x6f, 0x64, 0x65, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x27, 0x0a, 0x07, 0x70, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x0d, 0x2e, 0x76, 0x31, 0x2e, 0x4a, 0x73, 0x6f, 0x6e, 0x50, 0x61, 0x74, 0x63, 0x68, 0x52, 0x07, 0x70, 0x61, 0x74, 0x63, 0x68, 0x65, 0x73, 0x22, 0x10, 0x0a, 0x0e, 0x50, 0x61, 0x74, 0x63, 0x68, 0x4e, 0x6f, 0x64, 0x65, 0x52, 0x65, 0x70, 0x6c, 0x79, 0x22, 0xa9, 0x01, 0x0a, 0x19, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x4e, 0x6f, 0x64, 0x65, 0x43, 0x61, 0x70, 0x61, 0x63, 0x69, 0x74, 0x79, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x4d, 0x0a, 0x0a, 0x63, 0x61, 0x70, 0x61, 0x63, 0x69, 0x74, 0x69, 0x65, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x2d, 0x2e, 0x76, 0x31, 0x2e, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x4e, 0x6f, 0x64, 0x65, 0x43, 0x61, 0x70, 0x61, 0x63, 0x69, 0x74, 0x79, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x2e, 0x43, 0x61, 0x70, 0x61, 0x63, 0x69, 0x74, 0x69, 0x65, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x52, 0x0a, 0x63, 0x61, 0x70, 0x61, 0x63, 0x69, 0x74, 0x69, 0x65, 0x73, 
0x1a, 0x3d, 0x0a, 0x0f, 0x43, 0x61, 0x70, 0x61, 0x63, 0x69, 0x74, 0x69, 0x65, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x12, 0x10, 0x0a, 0x03, 0x6b, 0x65, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x6b, 0x65, 0x79, 0x12, 0x14, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x02, 0x38, 0x01, 0x22, 0x19, 0x0a, 0x17, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x4e, 0x6f, 0x64, 0x65, 0x43, 0x61, 0x70, 0x61, 0x63, 0x69, 0x74, 0x79, 0x52, 0x65, 0x70, 0x6c, 0x79, 0x22, 0x2a, 0x0a, 0x12, 0x48, 0x65, 0x61, 0x6c, 0x74, 0x68, 0x43, 0x68, 0x65, 0x63, 0x6b, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x14, 0x0a, 0x05, 0x71, 0x75, 0x65, 0x72, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x71, 0x75, 0x65, 0x72, 0x79, 0x22, 0x28, 0x0a, 0x10, 0x48, 0x65, 0x61, 0x6c, 0x74, 0x68, 0x43, 0x68, 0x65, 0x63, 0x6b, 0x52, 0x65, 0x70, 0x6c, 0x79, 0x12, 0x14, 0x0a, 0x05, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x32, 0x86, 0x02, 0x0a, 0x05, 0x41, 0x67, 0x65, 0x6e, 0x74, 0x12, 0x31, 0x0a, 0x07, 0x47, 0x65, 0x74, 0x4e, 0x6f, 0x64, 0x65, 0x12, 0x12, 0x2e, 0x76, 0x31, 0x2e, 0x47, 0x65, 0x74, 0x4e, 0x6f, 0x64, 0x65, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x10, 0x2e, 0x76, 0x31, 0x2e, 0x47, 0x65, 0x74, 0x4e, 0x6f, 0x64, 0x65, 0x52, 0x65, 0x70, 0x6c, 0x79, 0x22, 0x00, 0x12, 0x37, 0x0a, 0x09, 0x50, 0x61, 0x74, 0x63, 0x68, 0x4e, 0x6f, 0x64, 0x65, 0x12, 0x14, 0x2e, 0x76, 0x31, 0x2e, 0x50, 0x61, 0x74, 0x63, 0x68, 0x4e, 0x6f, 0x64, 0x65, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x12, 0x2e, 0x76, 0x31, 0x2e, 0x50, 0x61, 0x74, 0x63, 0x68, 0x4e, 0x6f, 0x64, 0x65, 0x52, 0x65, 0x70, 0x6c, 0x79, 0x22, 0x00, 0x12, 0x52, 0x0a, 0x12, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x4e, 0x6f, 0x64, 0x65, 0x43, 0x61, 0x70, 0x61, 0x63, 0x69, 0x74, 0x79, 0x12, 0x1d, 0x2e, 0x76, 0x31, 0x2e, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x4e, 0x6f, 0x64, 0x65, 0x43, 0x61, 0x70, 0x61, 0x63, 0x69, 0x74, 0x79, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x1b, 0x2e, 0x76, 0x31, 0x2e, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x4e, 0x6f, 0x64, 0x65, 0x43, 0x61, 0x70, 0x61, 0x63, 0x69, 0x74, 0x79, 0x52, 0x65, 0x70, 0x6c, 0x79, 0x22, 0x00, 0x12, 0x3d, 0x0a, 0x0b, 0x48, 0x65, 0x61, 0x6c, 0x74, 0x68, 0x43, 0x68, 0x65, 0x63, 0x6b, 0x12, 0x16, 0x2e, 0x76, 0x31, 0x2e, 0x48, 0x65, 0x61, 0x6c, 0x74, 0x68, 0x43, 0x68, 0x65, 0x63, 0x6b, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x14, 0x2e, 0x76, 0x31, 0x2e, 0x48, 0x65, 0x61, 0x6c, 0x74, 0x68, 0x43, 0x68, 0x65, 0x63, 0x6b, 0x52, 0x65, 0x70, 0x6c, 0x79, 0x22, 0x00, 0x42, 0x07, 0x5a, 0x05, 0x2e, 0x2e, 0x2f, 0x76, 0x31, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( file_pkg_agent_api_v1_api_proto_rawDescOnce sync.Once file_pkg_agent_api_v1_api_proto_rawDescData = file_pkg_agent_api_v1_api_proto_rawDesc ) func file_pkg_agent_api_v1_api_proto_rawDescGZIP() []byte { file_pkg_agent_api_v1_api_proto_rawDescOnce.Do(func() { file_pkg_agent_api_v1_api_proto_rawDescData = protoimpl.X.CompressGZIP(file_pkg_agent_api_v1_api_proto_rawDescData) }) return file_pkg_agent_api_v1_api_proto_rawDescData } var file_pkg_agent_api_v1_api_proto_msgTypes = make([]protoimpl.MessageInfo, 10) var file_pkg_agent_api_v1_api_proto_goTypes = []interface{}{ (*GetNodeRequest)(nil), // 0: v1.GetNodeRequest (*GetNodeReply)(nil), // 1: v1.GetNodeReply (*JsonPatch)(nil), // 2: v1.JsonPatch (*PatchNodeRequest)(nil), // 3: v1.PatchNodeRequest (*PatchNodeReply)(nil), // 4: 
v1.PatchNodeReply (*UpdateNodeCapacityRequest)(nil), // 5: v1.UpdateNodeCapacityRequest (*UpdateNodeCapacityReply)(nil), // 6: v1.UpdateNodeCapacityReply (*HealthCheckRequest)(nil), // 7: v1.HealthCheckRequest (*HealthCheckReply)(nil), // 8: v1.HealthCheckReply nil, // 9: v1.UpdateNodeCapacityRequest.CapacitiesEntry } var file_pkg_agent_api_v1_api_proto_depIdxs = []int32{ 2, // 0: v1.PatchNodeRequest.patches:type_name -> v1.JsonPatch 9, // 1: v1.UpdateNodeCapacityRequest.capacities:type_name -> v1.UpdateNodeCapacityRequest.CapacitiesEntry 0, // 2: v1.Agent.GetNode:input_type -> v1.GetNodeRequest 3, // 3: v1.Agent.PatchNode:input_type -> v1.PatchNodeRequest 5, // 4: v1.Agent.UpdateNodeCapacity:input_type -> v1.UpdateNodeCapacityRequest 7, // 5: v1.Agent.HealthCheck:input_type -> v1.HealthCheckRequest 1, // 6: v1.Agent.GetNode:output_type -> v1.GetNodeReply 4, // 7: v1.Agent.PatchNode:output_type -> v1.PatchNodeReply 6, // 8: v1.Agent.UpdateNodeCapacity:output_type -> v1.UpdateNodeCapacityReply 8, // 9: v1.Agent.HealthCheck:output_type -> v1.HealthCheckReply 6, // [6:10] is the sub-list for method output_type 2, // [2:6] is the sub-list for method input_type 2, // [2:2] is the sub-list for extension type_name 2, // [2:2] is the sub-list for extension extendee 0, // [0:2] is the sub-list for field type_name } func init() { file_pkg_agent_api_v1_api_proto_init() } func file_pkg_agent_api_v1_api_proto_init() { if File_pkg_agent_api_v1_api_proto != nil { return } if !protoimpl.UnsafeEnabled { file_pkg_agent_api_v1_api_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*GetNodeRequest); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_agent_api_v1_api_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*GetNodeReply); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_agent_api_v1_api_proto_msgTypes[2].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*JsonPatch); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_agent_api_v1_api_proto_msgTypes[3].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*PatchNodeRequest); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_agent_api_v1_api_proto_msgTypes[4].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*PatchNodeReply); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_agent_api_v1_api_proto_msgTypes[5].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*UpdateNodeCapacityRequest); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_agent_api_v1_api_proto_msgTypes[6].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*UpdateNodeCapacityReply); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_agent_api_v1_api_proto_msgTypes[7].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*HealthCheckRequest); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_agent_api_v1_api_proto_msgTypes[8].Exporter = func(v interface{}, i int) interface{} { 
switch v := v.(*HealthCheckReply); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } } type x struct{} out := protoimpl.TypeBuilder{ File: protoimpl.DescBuilder{ GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: file_pkg_agent_api_v1_api_proto_rawDesc, NumEnums: 0, NumMessages: 10, NumExtensions: 0, NumServices: 1, }, GoTypes: file_pkg_agent_api_v1_api_proto_goTypes, DependencyIndexes: file_pkg_agent_api_v1_api_proto_depIdxs, MessageInfos: file_pkg_agent_api_v1_api_proto_msgTypes, }.Build() File_pkg_agent_api_v1_api_proto = out.File file_pkg_agent_api_v1_api_proto_rawDesc = nil file_pkg_agent_api_v1_api_proto_goTypes = nil file_pkg_agent_api_v1_api_proto_depIdxs = nil } ================================================ FILE: pkg/agent/api/v1/api.proto ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ syntax = "proto3"; package v1; option go_package = "../v1"; service Agent{ rpc GetNode(GetNodeRequest) returns (GetNodeReply) {} rpc PatchNode(PatchNodeRequest) returns (PatchNodeReply) {} rpc UpdateNodeCapacity(UpdateNodeCapacityRequest) returns (UpdateNodeCapacityReply) {} rpc HealthCheck(HealthCheckRequest) returns (HealthCheckReply) {} } message GetNodeRequest { } message GetNodeReply { string node = 1; } // JsonPatch holds one JSON patch message JsonPatch { string op = 1; string path = 2; string value = 3; } message PatchNodeRequest { // List of JSON patches to apply on the node repeated JsonPatch patches = 1; } message PatchNodeReply { } message UpdateNodeCapacityRequest { // Name-value map of status.capacity to update map<string, string> capacities = 1; } message UpdateNodeCapacityReply { } message HealthCheckRequest { string query = 1; } message HealthCheckReply { string error = 1; } ================================================ FILE: pkg/agent/api/v1/api_grpc.pb.go ================================================ // Code generated by protoc-gen-go-grpc. DO NOT EDIT. // versions: // - protoc-gen-go-grpc v1.2.0 // - protoc v3.20.1 // source: pkg/agent/api/v1/api.proto package v1 import ( context "context" grpc "google.golang.org/grpc" codes "google.golang.org/grpc/codes" status "google.golang.org/grpc/status" ) // This is a compile-time assertion to ensure that this generated file // is compatible with the grpc package it is being compiled against. // Requires gRPC-Go v1.32.0 or later. const _ = grpc.SupportPackageIsVersion7 // AgentClient is the client API for Agent service. // // For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream.
type AgentClient interface { GetNode(ctx context.Context, in *GetNodeRequest, opts ...grpc.CallOption) (*GetNodeReply, error) PatchNode(ctx context.Context, in *PatchNodeRequest, opts ...grpc.CallOption) (*PatchNodeReply, error) UpdateNodeCapacity(ctx context.Context, in *UpdateNodeCapacityRequest, opts ...grpc.CallOption) (*UpdateNodeCapacityReply, error) HealthCheck(ctx context.Context, in *HealthCheckRequest, opts ...grpc.CallOption) (*HealthCheckReply, error) } type agentClient struct { cc grpc.ClientConnInterface } func NewAgentClient(cc grpc.ClientConnInterface) AgentClient { return &agentClient{cc} } func (c *agentClient) GetNode(ctx context.Context, in *GetNodeRequest, opts ...grpc.CallOption) (*GetNodeReply, error) { out := new(GetNodeReply) err := c.cc.Invoke(ctx, "/v1.Agent/GetNode", in, out, opts...) if err != nil { return nil, err } return out, nil } func (c *agentClient) PatchNode(ctx context.Context, in *PatchNodeRequest, opts ...grpc.CallOption) (*PatchNodeReply, error) { out := new(PatchNodeReply) err := c.cc.Invoke(ctx, "/v1.Agent/PatchNode", in, out, opts...) if err != nil { return nil, err } return out, nil } func (c *agentClient) UpdateNodeCapacity(ctx context.Context, in *UpdateNodeCapacityRequest, opts ...grpc.CallOption) (*UpdateNodeCapacityReply, error) { out := new(UpdateNodeCapacityReply) err := c.cc.Invoke(ctx, "/v1.Agent/UpdateNodeCapacity", in, out, opts...) if err != nil { return nil, err } return out, nil } func (c *agentClient) HealthCheck(ctx context.Context, in *HealthCheckRequest, opts ...grpc.CallOption) (*HealthCheckReply, error) { out := new(HealthCheckReply) err := c.cc.Invoke(ctx, "/v1.Agent/HealthCheck", in, out, opts...) if err != nil { return nil, err } return out, nil } // AgentServer is the server API for Agent service. // All implementations must embed UnimplementedAgentServer // for forward compatibility type AgentServer interface { GetNode(context.Context, *GetNodeRequest) (*GetNodeReply, error) PatchNode(context.Context, *PatchNodeRequest) (*PatchNodeReply, error) UpdateNodeCapacity(context.Context, *UpdateNodeCapacityRequest) (*UpdateNodeCapacityReply, error) HealthCheck(context.Context, *HealthCheckRequest) (*HealthCheckReply, error) mustEmbedUnimplementedAgentServer() } // UnimplementedAgentServer must be embedded to have forward compatible implementations. type UnimplementedAgentServer struct { } func (UnimplementedAgentServer) GetNode(context.Context, *GetNodeRequest) (*GetNodeReply, error) { return nil, status.Errorf(codes.Unimplemented, "method GetNode not implemented") } func (UnimplementedAgentServer) PatchNode(context.Context, *PatchNodeRequest) (*PatchNodeReply, error) { return nil, status.Errorf(codes.Unimplemented, "method PatchNode not implemented") } func (UnimplementedAgentServer) UpdateNodeCapacity(context.Context, *UpdateNodeCapacityRequest) (*UpdateNodeCapacityReply, error) { return nil, status.Errorf(codes.Unimplemented, "method UpdateNodeCapacity not implemented") } func (UnimplementedAgentServer) HealthCheck(context.Context, *HealthCheckRequest) (*HealthCheckReply, error) { return nil, status.Errorf(codes.Unimplemented, "method HealthCheck not implemented") } func (UnimplementedAgentServer) mustEmbedUnimplementedAgentServer() {} // UnsafeAgentServer may be embedded to opt out of forward compatibility for this service. // Use of this interface is not recommended, as added methods to AgentServer will // result in compilation errors. 
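// A minimal, hypothetical sketch of calling the Agent service through the
// generated client over the agent's unix socket; the socket path and the
// insecure dial option are assumptions, not mandated by this API. ConfigStatus
// is the query constant defined in pkg/agent/api/v1/constants.go:
//
//	conn, err := grpc.Dial("unix:///var/run/hypothetical/agent.sock", grpc.WithInsecure())
//	if err != nil {
//		// handle the dial error
//	}
//	cli := NewAgentClient(conn)
//	reply, err := cli.HealthCheck(context.Background(), &HealthCheckRequest{Query: ConfigStatus})
//	// on success, reply.Error is empty if the last configuration push to cri-resmgr succeeded
//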
type UnsafeAgentServer interface { mustEmbedUnimplementedAgentServer() } func RegisterAgentServer(s grpc.ServiceRegistrar, srv AgentServer) { s.RegisterService(&Agent_ServiceDesc, srv) } func _Agent_GetNode_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { in := new(GetNodeRequest) if err := dec(in); err != nil { return nil, err } if interceptor == nil { return srv.(AgentServer).GetNode(ctx, in) } info := &grpc.UnaryServerInfo{ Server: srv, FullMethod: "/v1.Agent/GetNode", } handler := func(ctx context.Context, req interface{}) (interface{}, error) { return srv.(AgentServer).GetNode(ctx, req.(*GetNodeRequest)) } return interceptor(ctx, in, info, handler) } func _Agent_PatchNode_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { in := new(PatchNodeRequest) if err := dec(in); err != nil { return nil, err } if interceptor == nil { return srv.(AgentServer).PatchNode(ctx, in) } info := &grpc.UnaryServerInfo{ Server: srv, FullMethod: "/v1.Agent/PatchNode", } handler := func(ctx context.Context, req interface{}) (interface{}, error) { return srv.(AgentServer).PatchNode(ctx, req.(*PatchNodeRequest)) } return interceptor(ctx, in, info, handler) } func _Agent_UpdateNodeCapacity_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { in := new(UpdateNodeCapacityRequest) if err := dec(in); err != nil { return nil, err } if interceptor == nil { return srv.(AgentServer).UpdateNodeCapacity(ctx, in) } info := &grpc.UnaryServerInfo{ Server: srv, FullMethod: "/v1.Agent/UpdateNodeCapacity", } handler := func(ctx context.Context, req interface{}) (interface{}, error) { return srv.(AgentServer).UpdateNodeCapacity(ctx, req.(*UpdateNodeCapacityRequest)) } return interceptor(ctx, in, info, handler) } func _Agent_HealthCheck_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { in := new(HealthCheckRequest) if err := dec(in); err != nil { return nil, err } if interceptor == nil { return srv.(AgentServer).HealthCheck(ctx, in) } info := &grpc.UnaryServerInfo{ Server: srv, FullMethod: "/v1.Agent/HealthCheck", } handler := func(ctx context.Context, req interface{}) (interface{}, error) { return srv.(AgentServer).HealthCheck(ctx, req.(*HealthCheckRequest)) } return interceptor(ctx, in, info, handler) } // Agent_ServiceDesc is the grpc.ServiceDesc for Agent service. // It's only intended for direct use with grpc.RegisterService, // and not to be introspected or modified (even as a copy) var Agent_ServiceDesc = grpc.ServiceDesc{ ServiceName: "v1.Agent", HandlerType: (*AgentServer)(nil), Methods: []grpc.MethodDesc{ { MethodName: "GetNode", Handler: _Agent_GetNode_Handler, }, { MethodName: "PatchNode", Handler: _Agent_PatchNode_Handler, }, { MethodName: "UpdateNodeCapacity", Handler: _Agent_UpdateNodeCapacity_Handler, }, { MethodName: "HealthCheck", Handler: _Agent_HealthCheck_Handler, }, }, Streams: []grpc.StreamDesc{}, Metadata: "pkg/agent/api/v1/api.proto", } ================================================ FILE: pkg/agent/api/v1/constants.go ================================================ /* Copyright Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package v1 const ( // ConfigStatus queries the status of the last configuration push to resmgr. ConfigStatus = "config-status" ) ================================================ FILE: pkg/agent/config-updater.go ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package agent import ( "fmt" "net" "sync" "time" "context" "encoding/json" "google.golang.org/grpc" resmgr "github.com/intel/cri-resource-manager/pkg/apis/resmgr/v1alpha1" resmgr_v1 "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/config/api/v1" "github.com/intel/cri-resource-manager/pkg/log" ) const ( // configuration update rate-limiting timeout rateLimitTimeout = 2 * time.Second // setConfigTimeout is the duration we wait at most for a SetConfig reply setConfigTimeout = 5 * time.Second // retryTimeout is the timeout after which we retry sending configuration updates upon failure retryTimeout = 5 * time.Second ) // configUpdater handles sending configuration to cri-resmgr type configUpdater interface { Start() error Stop() UpdateConfig(*resmgrConfig) UpdateAdjustment(*resmgrAdjustment) StatusChan() chan *resmgrStatus GetError() error } // updater implements configUpdater type updater struct { log.Logger resmgrCli resmgr_v1.ConfigClient newConfig chan *resmgrConfig newAdjustment chan *resmgrAdjustment newStatus chan *resmgrStatus cfgErrLock sync.RWMutex cfgErr error } func newConfigUpdater() (configUpdater, error) { u := &updater{Logger: log.NewLogger("config-updater")} c, err := newResmgrCli(opts.resmgrSocket) if err != nil { return nil, agentError("failed to create connection to cri-resmgr: %v", err) } u.resmgrCli = c u.newConfig = make(chan *resmgrConfig) u.newAdjustment = make(chan *resmgrAdjustment) u.newStatus = make(chan *resmgrStatus) return u, nil } func (u *updater) Start() error { u.Info("Starting config-updater") go func() { var pendingConfig *resmgrConfig var pendingAdjustment *resmgrAdjustment var ratelimit <-chan time.Time for { select { case cfg := <-u.newConfig: u.Info("scheduling update after %v rate-limiting timeout...", rateLimitTimeout) pendingConfig = cfg ratelimit = time.After(rateLimitTimeout) case adjust := <-u.newAdjustment: u.Info("scheduling update after %v rate-limiting timeout...", rateLimitTimeout) pendingAdjustment = adjust ratelimit = time.After(rateLimitTimeout) case <-ratelimit: if pendingConfig != nil { mgrErr, err := u.setConfig(pendingConfig) if err != nil { u.Error("failed to send configuration update: %v", err) ratelimit = time.After(retryTimeout) } else { if mgrErr != nil { u.Error("cri-resmgr configuration error: %v", mgrErr) } pendingConfig = nil ratelimit
= nil } } if pendingAdjustment != nil { errors, err := u.setAdjustment(pendingAdjustment) if err != nil { u.Error("failed to update adjustments: %+v", err) } if len(errors) > 0 { u.Error("some adjustment updates failed: %+v", errors) } u.newStatus <- &resmgrStatus{ request: err, errors: errors, } pendingAdjustment = nil ratelimit = nil } } } }() return nil } func (u *updater) Stop() { } func (u *updater) UpdateConfig(c *resmgrConfig) { u.newConfig <- c } func (u *updater) UpdateAdjustment(c *resmgrAdjustment) { u.newAdjustment <- c } func (u *updater) StatusChan() chan *resmgrStatus { return u.newStatus } func (u *updater) setError(err error) error { u.cfgErrLock.Lock() defer u.cfgErrLock.Unlock() u.cfgErr = err return err } func (u *updater) GetError() error { u.cfgErrLock.RLock() defer u.cfgErrLock.RUnlock() return u.cfgErr } func (u *updater) setConfig(cfg *resmgrConfig) (error, error) { ctx, cancel := context.WithTimeout(context.Background(), setConfigTimeout) defer cancel() req := &resmgr_v1.SetConfigRequest{NodeName: nodeName, Config: *cfg} u.Debug("sending SetConfig request to cri-resmgr") reply, err := u.resmgrCli.SetConfig(ctx, req, []grpc.CallOption{grpc.FailFast(false)}...) switch { case err != nil: return nil, u.setError(err) case reply.Error != "": return u.setError(fmt.Errorf("%s", reply.Error)), nil default: return u.setError(nil), nil } } func (u *updater) setAdjustment(adjust *resmgrAdjustment) (map[string]string, error) { ctx, cancel := context.WithTimeout(context.Background(), setConfigTimeout) defer cancel() specs := map[string]*resmgr.AdjustmentSpec{} for name, p := range *adjust { specs[name] = &resmgr.AdjustmentSpec{ Scope: p.Spec.NodeScope(nodeName), Resources: p.Spec.Resources, Classes: p.Spec.Classes, ToptierLimit: p.Spec.ToptierLimit, } } encoded, err := json.Marshal(specs) if err != nil { return nil, agentError("setAdjustment: failed to encode AdjustmentSpec: %v", err) } req := &resmgr_v1.SetAdjustmentRequest{NodeName: nodeName, Adjustment: string(encoded)} u.Debug("sending SetAdjustment request to cri-resmgr") reply, err := u.resmgrCli.SetAdjustment(ctx, req, []grpc.CallOption{grpc.FailFast(false)}...) if err != nil { return nil, err } return reply.Errors, nil } func newResmgrCli(socket string) (resmgr_v1.ConfigClient, error) { dialOpts := []grpc.DialOption{ grpc.WithInsecure(), grpc.WithDialer(func(sock string, timeout time.Duration) (net.Conn, error) { return net.Dial("unix", socket) }), } conn, err := grpc.Dial(socket, dialOpts...) if err != nil { return nil, agentError("failed to connect to cri-resmgr: %v", err) } return resmgr_v1.NewConfigClient(conn), nil } ================================================ FILE: pkg/agent/flags.go ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ package agent import ( "flag" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/kubernetes" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/sockets" ) type options struct { kubeconfig string agentSocket string resmgrSocket string configNs string configMapName string labelName string } var opts = options{} func init() { flag.StringVar(&opts.agentSocket, "agent-socket", sockets.ResourceManagerAgent, "Socket for incoming requests from cri-resmgr") flag.StringVar(&opts.resmgrSocket, "cri-resmgr-socket", sockets.ResourceManagerConfig, "cri-resmgr socket to connect to") flag.StringVar(&opts.kubeconfig, "kubeconfig", "", "Kubeconfig to use, empty string implies in-cluster config (i.e. running inside a Pod)") flag.StringVar(&opts.configNs, "config-ns", "kube-system", "Kubernetes namespace where to look for config") flag.StringVar(&opts.configMapName, "configmap-name", "cri-resmgr-config", "Name of the K8s ConfigMap to watch") flag.StringVar(&opts.labelName, "label-name", kubernetes.ResmgrKey("group"), "Name of the label used to assign a node to a configuration group.") } ================================================ FILE: pkg/agent/kubernetes.go ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ package agent import ( "context" "encoding/json" "fmt" "os" "time" core_v1 "k8s.io/api/core/v1" meta_v1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" k8swatch "k8s.io/apimachinery/pkg/watch" k8sclient "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" "k8s.io/client-go/tools/clientcmd" resmgr "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/clientset/versioned/typed/resmgr/v1alpha1" agent_v1 "github.com/intel/cri-resource-manager/pkg/agent/api/v1" ) type namespace string // nodeName contains the name of the k8s we're running on var nodeName string // getK8sClient initializes a new Kubernetes client func (a *agent) getK8sClient(kubeconfig string) (*k8sclient.Clientset, *resmgr.CriresmgrV1alpha1Client, error) { var config *rest.Config var err error if kubeconfig == "" { a.Info("using in-cluster kubeconfig") config, err = rest.InClusterConfig() } else { config, err = clientcmd.BuildConfigFromFlags("", kubeconfig) } if err != nil { return nil, nil, err } genCli, err := k8sclient.NewForConfig(config) if err != nil { return nil, nil, err } resmgr, err := resmgr.NewForConfig(config) if err != nil { return nil, nil, err } return genCli, resmgr, nil } // getNodeObject gets a k8s Node object func getNodeObject(cli *k8sclient.Clientset) (*core_v1.Node, error) { node, err := cli.CoreV1().Nodes().Get(context.TODO(), nodeName, meta_v1.GetOptions{}) if err != nil { return nil, agentError("failed to get node object for node %q: %v", nodeName, err) } return node, nil } // patchNodeObject is a helper for patching a k8s Node object func patchNode(cli *k8sclient.Clientset, patchList []*agent_v1.JsonPatch) error { // Convert patch list into bytes data, err := json.Marshal(patchList) if err != nil { return agentError("failed to marshal Node patches: %v", err) } // Patch our node pt := types.JSONPatchType _, err = cli.CoreV1().Nodes().Patch(context.TODO(), nodeName, pt, data, meta_v1.PatchOptions{}) if err != nil { return err } return nil } // patchNodeStatus is a helper for patching the status of a k8s Node object func patchNodeStatus(cli *k8sclient.Clientset, fields map[string]string) error { patch, sep := fmt.Sprintf(`{"status": {`), "" for f, v := range fields { patch += sep + fmt.Sprintf(`"%s": %s`, f, v) sep = "," } patch += "}}" _, err := cli.CoreV1().Nodes().PatchStatus(context.TODO(), nodeName, []byte(patch)) return err } // patchAdjustmentStatus is a helper for patching the status of a Adjustment CRD. func patchAdjustmentStatus(_ *resmgr.CriresmgrV1alpha1Client, _ *resmgrStatus, _ ...string) error { return nil } // watch is a wrapper around the k8s watch.Interface type watch struct { parent *watcher kind string ns namespace name string openfn func(namespace, string) (k8swatch.Interface, error) queryfn func(namespace, string) (interface{}, error) stop chan struct{} events chan k8swatch.Event } // openFn is the type for functions creating k8s watcher of a particular kind. type openFn func(ns namespace, name string) (k8swatch.Interface, error) // queryFn is the type for functions querying k8s objects being watched. type queryFn func(ns namespace, name string) (interface{}, error) const ( // SyntheticMissing is a synthetic initial event for currently non-existent object. 
SyntheticMissing = k8swatch.EventType("SyntheticMissing") ) func newWatch(parent *watcher, kind string, ns namespace, open openFn, query queryFn) *watch { return &watch{ parent: parent, kind: kind, ns: ns, stop: make(chan struct{}), events: make(chan k8swatch.Event), openfn: open, queryfn: query, } } // newNodeWatch creates a watch for k8s Node func newNodeWatch(parent *watcher) *watch { w := newWatch(parent, "Node", namespace(""), func(ns namespace, name string) (k8swatch.Interface, error) { selector := meta_v1.ListOptions{FieldSelector: "metadata.name=" + name} k8w, err := parent.k8sCli.CoreV1().Nodes().Watch(context.TODO(), selector) if err != nil { return nil, err } return k8w, nil }, func(ns namespace, name string) (interface{}, error) { noopts := meta_v1.GetOptions{} node, err := parent.k8sCli.CoreV1().Nodes().Get(context.TODO(), name, noopts) if err != nil { return nil, err } return node, nil }) w.Start(nodeName) return w } // newConfigMapWatch creates a watch for k8s ConfigMap func newConfigMapWatch(parent *watcher, name string, ns namespace) *watch { w := newWatch(parent, "ConfigMap", ns, func(ns namespace, name string) (k8swatch.Interface, error) { selector := meta_v1.ListOptions{FieldSelector: "metadata.name=" + name} k8w, err := parent.k8sCli.CoreV1().ConfigMaps(string(ns)).Watch(context.TODO(), selector) if err != nil { return nil, err } return k8w, nil }, func(ns namespace, name string) (interface{}, error) { noopts := meta_v1.GetOptions{} cm, err := parent.k8sCli.CoreV1().ConfigMaps(string(ns)).Get(context.TODO(), name, noopts) if err != nil { return nil, err } return cm, nil }) w.Start(name) return w } // newAdjustmentCRDWatch creates a watch for k8s Adjustment CRDs func newAdjustmentCRDWatch(parent *watcher, ns namespace) *watch { w := newWatch(parent, "AdjustmentCRD", ns, func(ns namespace, name string) (k8swatch.Interface, error) { k8w, err := parent.resmgrCli.Adjustments(string(ns)).Watch(meta_v1.ListOptions{}) if err != nil { return nil, err } return k8w, nil }, func(ns namespace, name string) (interface{}, error) { crds, err := parent.resmgrCli.Adjustments(string(ns)).List(meta_v1.ListOptions{}) if err != nil { return nil, err } if crds == nil || len(crds.Items) == 0 { crds = nil } return crds, nil }) w.Start("AdjustmentCRD") return w } func (w *watch) Name() string { ns, name := w.ns, w.name if ns != "" { ns += "/" } if name == "" { name = "" } return w.kind + ":" + string(ns) + name } // Query queries the object being watched. func (w *watch) Query() (interface{}, error) { if w.name == "" { return nil, nil } return w.queryfn(w.ns, w.name) } // Start watching an object. func (w *watch) Start(name string) { w.Stop() w.name = name if w.name == "" { return } // proxy events from a goroutine until we're told to stop.
go func() { var k8w k8swatch.Interface var events <-chan k8swatch.Event var ratelimit <-chan time.Time var err error // let the watcher know not to expect an initial event if objs, _ := w.queryfn(w.ns, w.name); objs == nil { w.events <- k8swatch.Event{Type: SyntheticMissing} } for { if events == nil { w.parent.Info("creating %s watch", w.Name()) if k8w, err = w.openfn(w.ns, w.name); err != nil { w.parent.Warn("failed to create %s watch: %v", w.Name(), err) ratelimit = time.After(1 * time.Second) } else { events = k8w.ResultChan() ratelimit = nil } } select { case <-w.stop: if events != nil { k8w.Stop() } return case e, ok := <-events: if ok { w.events <- e } else { w.parent.Warn("failed to get event from watch %s", w.Name()) k8w.Stop() events = nil } case <-ratelimit: } } }() } // Stop stops a watch. func (w *watch) Stop() { select { case w.stop <- struct{}{}: default: } } // ResultChan returns the event channel of the watch. func (w *watch) ResultChan() <-chan k8swatch.Event { return w.events } func init() { // Node name is expected to be set in an environment variable nodeName = os.Getenv("NODE_NAME") } ================================================ FILE: pkg/agent/server.go ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package agent import ( "context" "encoding/json" "fmt" "net" "os" "path/filepath" "strings" "google.golang.org/grpc" core_v1 "k8s.io/api/core/v1" k8sclient "k8s.io/client-go/kubernetes" v1 "github.com/intel/cri-resource-manager/pkg/agent/api/v1" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/sockets" "github.com/intel/cri-resource-manager/pkg/log" ) // agentServer is the interface for our gRPC server. type agentServer interface { Start(string) error Stop() } // server implements agentServer. type server struct { log.Logger cli *k8sclient.Clientset // client for accessing k8s api server *grpc.Server // gRPC server instance cfg configInterface } // newAgentServer creates a new agentServer instance. func newAgentServer(cli *k8sclient.Clientset, cfg configInterface) (agentServer, error) { s := &server{ Logger: log.NewLogger("server"), cli: cli, cfg: cfg, } return s, nil } // Start runs the server instance. func (s *server) Start(socket string) error { // Make sure we have a directory for the socket. if err := os.MkdirAll(filepath.Dir(socket), sockets.DirPermissions); err != nil { return agentError("failed to create directory for socket %s: %v", socket, err) } // Remove any leftover sockets. if err := os.Remove(socket); err != nil && !os.IsNotExist(err) { return agentError("failed to unlink socket file: %s", err) } // Create server listening for local unix domain socket lis, err := net.Listen("unix", socket) if err != nil { return agentError("failed to listen to socket: %v", err) } serverOpts := []grpc.ServerOption{} s.server = grpc.NewServer(serverOpts...)
gs := &grpcServer{ Logger: s.Logger, cli: s.cli, cfg: s.cfg, } v1.RegisterAgentServer(s.server, gs) s.Info("starting gRPC server at socket %s", socket) go func() { defer lis.Close() err := s.server.Serve(lis) if err != nil { s.Fatal("grpc server died: %v", err) } }() return nil } // Stop agentServer instance func (s *server) Stop() { s.server.Stop() } // grpcServer implements v1.AgentServer type grpcServer struct { v1.UnimplementedAgentServer log.Logger cli *k8sclient.Clientset cfg configInterface } // GetNode gets K8s node object. func (g *grpcServer) GetNode(_ context.Context, req *v1.GetNodeRequest) (*v1.GetNodeReply, error) { g.Debug("received GetNodeRequest: %v", req) rpl := &v1.GetNodeReply{} node, err := getNodeObject(g.cli) if err != nil { return rpl, agentError("failed to get node object: %v", err) } serialized, err := json.Marshal(node) if err != nil { return rpl, agentError("failed to serialized node object: %v", err) } rpl.Node = string(serialized) return rpl, nil } // PatchNode patches the K8s node object. func (g *grpcServer) PatchNode(_ context.Context, req *v1.PatchNodeRequest) (*v1.PatchNodeReply, error) { g.Debug("received PatchNodeRequest: %v", req) rpl := &v1.PatchNodeReply{} // Apply patches if len(req.Patches) > 0 { err := patchNode(g.cli, req.Patches) if err != nil { return rpl, agentError("failed to patch node object: %v", err) } } return rpl, nil } // UpdateNodeCapacity updates capacity in Node status func (g *grpcServer) UpdateNodeCapacity(_ context.Context, req *v1.UpdateNodeCapacityRequest) (*v1.UpdateNodeCapacityReply, error) { g.Debug("received UpdateNodeCapacityRequest: %v", req) rpl := &v1.UpdateNodeCapacityReply{} capacity, sep := "", "" for name, count := range req.Capacities { if isNativeResource(name) { err := agentError("refusing to update capacity of native resource '%s'", name) return rpl, err } if !strings.Contains(name, ".") || !strings.Contains(name, "/") { err := agentError("invalid resource '%s' in capacity update", name) return rpl, err } capacity += sep + fmt.Sprintf(`"%s": "%s"`, name, count) sep = ", " } err := patchNodeStatus(g.cli, map[string]string{"capacity": "{" + capacity + "}"}) return rpl, err } // HealthCheck checks if the agent is in healthy state func (g *grpcServer) HealthCheck(_ context.Context, req *v1.HealthCheckRequest) (*v1.HealthCheckReply, error) { g.Debug("received HealthCheckRequest: %v", req) reply := &v1.HealthCheckReply{} if req.Query == v1.ConfigStatus { if err := g.cfg.getError(); err != nil { reply.Error = fmt.Sprintf("configuration error: %v", err) } } return reply, nil } func isNativeResource(name string) bool { switch { case name == string(core_v1.ResourceCPU), name == string(core_v1.ResourceMemory): return true case strings.HasPrefix(name, core_v1.ResourceHugePagesPrefix): return true default: return false } } ================================================ FILE: pkg/agent/watcher.go ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ package agent import ( core_v1 "k8s.io/api/core/v1" k8swatch "k8s.io/apimachinery/pkg/watch" k8sclient "k8s.io/client-go/kubernetes" "sync" "time" "encoding/json" patch "github.com/evanphx/json-patch" pkgtypes "k8s.io/apimachinery/pkg/types" resmgrcli "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/clientset/versioned/typed/resmgr/v1alpha1" resmgr "github.com/intel/cri-resource-manager/pkg/apis/resmgr/v1alpha1" "github.com/intel/cri-resource-manager/pkg/log" ) type cachedConfig struct { sync.RWMutex nodeCfg *resmgrConfig // node-specific configuration groupCfg *resmgrConfig // group-specific configuration group string // group name, "" for default inscope resmgrAdjustment // external adjustments that apply to this node ignored resmgrAdjustment // external adjustments that do not apply to this node status *resmgrStatus // latest adjustment update status } // k8sWatcher is our interface to K8s control plane watcher type k8sWatcher interface { // Start the watcher instance Start() error // Stop the watcher instance Stop() // Get a chan through which to receive configuration updates ConfigChan() <-chan resmgrConfig // Get up-to-date config GetConfig() resmgrConfig // Get a chan through which to receive adjustment updates AdjustmentChan() <-chan resmgrAdjustment // Update the node Status for adjustment updates. UpdateStatus(*resmgrStatus) error } // watcher implements k8sWatcher type watcher struct { log.Logger stop chan struct{} // channel to stop watcher goroutine k8sCli *k8sclient.Clientset // k8s client interface resmgrCli *resmgrcli.CriresmgrV1alpha1Client // adjustment CRD interface currentConfig cachedConfig // current configuration, cached configChan chan resmgrConfig // channel for config updates adjustmentChan chan resmgrAdjustment // channel for adjustment updates } // newK8sWatcher creates a new K8sWatcher instance func newK8sWatcher(k8sCli *k8sclient.Clientset, resmgrCli *resmgrcli.CriresmgrV1alpha1Client) (k8sWatcher, error) { w := &watcher{ Logger: log.NewLogger("watcher"), k8sCli: k8sCli, resmgrCli: resmgrCli, stop: make(chan struct{}, 1), currentConfig: newCachedConfig(), configChan: make(chan resmgrConfig, 1), adjustmentChan: make(chan resmgrAdjustment, 1), } return w, nil } // Start runs a k8sWatcher instance func (w *watcher) Start() error { w.Info("starting watcher...") if nodeName == "" { return agentError("node name not set, NODE_NAME env variable should be set to match the name of this k8s Node") } go func() { w.watch() }() return nil } // Stop stops a running k8sWatcher instance func (w *watcher) Stop() { select { case w.stop <- struct{}{}: default: w.Debug("stop already sent") } } // ConfigChan returns the chan for config updates func (w *watcher) ConfigChan() <-chan resmgrConfig { return w.configChan } // AdjustmentChan returns the chan for adjustment updates func (w *watcher) AdjustmentChan() <-chan resmgrAdjustment { return w.adjustmentChan } // GetConfig returns the current cri-resmgr configuration func (w *watcher) GetConfig() resmgrConfig { cfg, kind := w.currentConfig.getConfig() w.Info("giving %s configuration in reply to query", kind) return cfg } // UpdateStatus updates the node status for adjustment updates. func (w *watcher) UpdateStatus(status *resmgrStatus) error { w.currentConfig.setStatus(status) return w.PatchAdjustmentStatus(status) } // PatchAdjustmentStatus updates the node status for adjustment updates. 
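The channel-based k8sWatcher interface above leaves update multiplexing to the consumer, which drains ConfigChan and AdjustmentChan in a select loop. A minimal, self-contained sketch of that consumption pattern follows; fakeWatcher, the simplified resmgrConfig/resmgrAdjustment aliases and all values are illustrative stand-ins, not the package's real definitions. The PatchAdjustmentStatus implementation continues below.

package main

import (
	"fmt"
	"time"
)

// Simplified stand-ins for the package's internal types.
type resmgrConfig map[string]string
type resmgrAdjustment map[string]string

// k8sWatcher mirrors only the channel-facing part of the interface above.
type k8sWatcher interface {
	ConfigChan() <-chan resmgrConfig
	AdjustmentChan() <-chan resmgrAdjustment
}

// fakeWatcher is a trivial in-memory implementation for the sketch.
type fakeWatcher struct {
	cfg chan resmgrConfig
	adj chan resmgrAdjustment
}

func (f *fakeWatcher) ConfigChan() <-chan resmgrConfig         { return f.cfg }
func (f *fakeWatcher) AdjustmentChan() <-chan resmgrAdjustment { return f.adj }

func main() {
	w := &fakeWatcher{cfg: make(chan resmgrConfig, 1), adj: make(chan resmgrAdjustment, 1)}
	w.cfg <- resmgrConfig{"policy": "topology-aware"}
	stop := time.After(100 * time.Millisecond)
	for {
		// Multiplex config and adjustment updates until stop fires.
		select {
		case cfg := <-w.ConfigChan():
			fmt.Printf("new configuration: %v\n", cfg)
		case adj := <-w.AdjustmentChan():
			fmt.Printf("new adjustments: %v\n", adj)
		case <-stop:
			return
		}
	}
}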
func (w *watcher) PatchAdjustmentStatus(status *resmgrStatus) error { errors := status.errors if errors == nil { errors = map[string]string{} } if status.request != nil { errors["request"] = status.request.Error() } inscope, ignored := w.currentConfig.getAdjustment() w.currentConfig.Lock() defer w.currentConfig.Unlock() errCnt := 0 for _, adjust := range inscope { if err := w.patchAdjustment(adjust, true, errors); err != nil { w.Error("%v", err) errCnt++ } } for _, adjust := range ignored { if err := w.patchAdjustment(adjust, false, errors); err != nil { w.Error("%v", err) errCnt++ } } if errCnt > 0 { return agentError("some adjustment status updates failed") } return nil } // patchAdjustment patches the status of an update to the given adjustment. func (w *watcher) patchAdjustment(adjust *resmgr.Adjustment, inscope bool, errors map[string]string) error { var pdata []byte var err error old, ok := adjust.Status.Nodes[nodeName] if !inscope { if !ok { w.Debug("adjustment %s does not need status patching...", adjust.Name) return nil } current := &resmgr.Adjustment{ Status: resmgr.AdjustmentStatus{ Nodes: map[string]resmgr.AdjustmentNodeStatus{ nodeName: old, }, }, } updated := &resmgr.Adjustment{ Status: resmgr.AdjustmentStatus{ Nodes: map[string]resmgr.AdjustmentNodeStatus{}, }, } oldData, _ := json.Marshal(current) newData, _ := json.Marshal(updated) pdata, err = patch.CreateMergePatch(oldData, newData) if err != nil { return agentError("failed to create adjustment status patch: %v", err) } } else { current := &resmgr.Adjustment{ Status: resmgr.AdjustmentStatus{ Nodes: map[string]resmgr.AdjustmentNodeStatus{}, }, } if ok { current.Status.Nodes[nodeName] = old } updated := &resmgr.Adjustment{ Status: resmgr.AdjustmentStatus{ Nodes: map[string]resmgr.AdjustmentNodeStatus{ nodeName: {Errors: errors}, }, }, } oldData, _ := json.Marshal(current) newData, _ := json.Marshal(updated) pdata, err = patch.CreateMergePatch(oldData, newData) if err != nil { return agentError("failed to create adjustment status patch: %v", err) } } ptype := pkgtypes.MergePatchType w.Debug("patching status of adjustment %s with %v...", adjust.Name, string(pdata)) if _, err := w.resmgrCli.Adjustments(opts.configNs).Patch(adjust.Name, ptype, pdata); err != nil { return agentError("failed to patch Adjustment CRD %q: %v", adjust.Name, err) } if inscope { if adjust.Status.Nodes == nil { adjust.Status.Nodes = make(map[string]resmgr.AdjustmentNodeStatus) } adjust.Status.Nodes[nodeName] = resmgr.AdjustmentNodeStatus{Errors: errors} } else { delete(adjust.Status.Nodes, nodeName) } return nil } // sendConfig sends the current configuration. func (w *watcher) sendConfig() { cfg, kind := w.currentConfig.getConfig() w.Info("pushing %s configuration to client", kind) w.configChan <- cfg } // sendAdjustment sends the current overridden policies.
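patchAdjustment above relies on JSON merge patches (RFC 7386): it marshals a minimal "current" and "updated" view of the status and lets the library compute the difference, so only this node's entry under status.nodes is touched on the server. A standalone sketch of the same CreateMergePatch call, with plain maps and a hypothetical node name in place of the real Adjustment types; sendAdjustment follows below.

package main

import (
	"encoding/json"
	"fmt"

	jsonpatch "github.com/evanphx/json-patch"
)

func main() {
	// Current state: this node ("node-1" is a placeholder) has a stale status entry.
	oldData, _ := json.Marshal(map[string]interface{}{
		"status": map[string]interface{}{
			"nodes": map[string]interface{}{
				"node-1": map[string]interface{}{"errors": map[string]string{"request": "stale"}},
			},
		},
	})
	// Desired state: the node's error map is reset.
	newData, _ := json.Marshal(map[string]interface{}{
		"status": map[string]interface{}{
			"nodes": map[string]interface{}{
				"node-1": map[string]interface{}{"errors": map[string]string{}},
			},
		},
	})
	// The resulting patch contains only the differing subtree.
	pdata, err := jsonpatch.CreateMergePatch(oldData, newData)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(pdata))
}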
func (w *watcher) sendAdjustment() { inscope, _ := w.currentConfig.getAdjustment() w.adjustmentChan <- inscope } func (w *watcher) watch() error { nodew := newNodeWatch(w) group := "" if node, err := nodew.Query(); err != nil { w.Warn("failed to query node %q: %v", nodeName, err) } else if node == nil { w.Warn("failed to query node %q, make sure that NODE_NAME is correctly set", nodeName) } else { group = node.(*core_v1.Node).Labels[opts.labelName] w.Info("configuration group is set to '%s'", group) } cfgw := newConfigMapWatch(w, opts.configMapName+".node."+nodeName, namespace(opts.configNs)) grpw := newConfigMapWatch(w, groupMapName(group), namespace(opts.configNs)) crdw := newAdjustmentCRDWatch(w, namespace(opts.configNs)) w.Info("watcher running") w.sendConfig() for { select { case <-w.stop: w.Info("stopping configuration watcher") nodew.Stop() cfgw.Stop() grpw.Stop() crdw.Stop() return nil case e, ok := <-nodew.ResultChan(): if ok { switch e.Type { case k8swatch.Added, k8swatch.Modified: w.Info("node (%s) configuration updated", nodeName) label, _ := e.Object.(*core_v1.Node).Labels[opts.labelName] if group != label { group = label w.Info("configuration group is set to '%s'", group) grpw.Start(groupMapName(group)) } case k8swatch.Deleted: w.Warn("Hmm, our node got removed...") } continue } case e, ok := <-cfgw.ResultChan(): if ok { switch e.Type { case k8swatch.Added, k8swatch.Modified: w.Info("node ConfigMap updated") cm := e.Object.(*core_v1.ConfigMap) w.currentConfig.setNode(&cm.Data) w.sendConfig() case k8swatch.Deleted, SyntheticMissing: w.Info("node ConfigMap deleted") w.currentConfig.setNode(nil) w.sendConfig() } continue } case e, ok := <-grpw.ResultChan(): if ok { switch e.Type { case k8swatch.Added, k8swatch.Modified: w.Info("group/default ConfigMap updated") cm := e.Object.(*core_v1.ConfigMap) if w.currentConfig.setGroup(group, &cm.Data) { w.sendConfig() } case k8swatch.Deleted, SyntheticMissing: w.Info("group/default ConfigMap deleted") if w.currentConfig.setGroup(group, nil) { w.sendConfig() } } continue } case e, ok := <-crdw.ResultChan(): if ok { switch e.Type { case k8swatch.Added, k8swatch.Modified: w.Info("Adjustment CRD(s) updated: %T, %+v", e.Object, e.Object) w.Info("Adjustment CRD(s): %+v", e.Object.(*resmgr.Adjustment).Spec) if w.currentConfig.setAdjustment(e.Object.(*resmgr.Adjustment)) { w.sendAdjustment() } case k8swatch.Deleted: w.Info("Adjustment CRD(s) (%T) deleted", e.Object) if w.currentConfig.deleteAdjustment(e.Object.(*resmgr.Adjustment)) { w.sendAdjustment() } case SyntheticMissing: w.Info("No Adjustment CRD(s)") w.sendAdjustment() } continue } } // shouldn't be necessary, but just in case avoid spinning on a closed channel time.Sleep(1 * time.Second) } } // groupMapName returns the name of our group ConfigMap, or the default one if we have no group. func groupMapName(group string) string { if group == "" { return opts.configMapName + ".default" } return opts.configMapName + ".group." + group } // newCachedConfig creates a new cachedConfig instance.
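To make the ConfigMap naming scheme used by watch() and groupMapName concrete: the watcher derives its watch targets from the -configmap-name option, the node name and the group label. A small sketch where the node and group names are purely illustrative; the cachedConfig constructor follows below.

package main

import "fmt"

// candidateMapNames restates the naming convention used above: a
// node-specific ConfigMap, plus either a group map or the shared
// default map when the node carries no group label.
func candidateMapNames(base, node, group string) (nodeMap, groupMap string) {
	nodeMap = base + ".node." + node
	if group == "" {
		groupMap = base + ".default"
	} else {
		groupMap = base + ".group." + group
	}
	return nodeMap, groupMap
}

func main() {
	// "worker-1" and "gold" are illustrative values only.
	n, g := candidateMapNames("cri-resmgr-config", "worker-1", "gold")
	fmt.Println(n) // cri-resmgr-config.node.worker-1
	fmt.Println(g) // cri-resmgr-config.group.gold
}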
func newCachedConfig() cachedConfig { return cachedConfig{ inscope: resmgrAdjustment{}, ignored: resmgrAdjustment{}, } } // getConfig is a helper method for getting the config data func (c *cachedConfig) getConfig() (resmgrConfig, string) { c.RLock() defer c.RUnlock() var cfg *resmgrConfig var kind string switch { case c.nodeCfg != nil: kind = "node" cfg = c.nodeCfg case c.group != "": kind = "group " + c.group cfg = c.groupCfg case c.groupCfg != nil: kind = "default" cfg = c.groupCfg default: kind = "fallback" } if cfg == nil { kind = "empty " + kind cfg = &resmgrConfig{} } return *cfg, kind } // getAdjustment is a helper method for getting a copy of external adjustments func (c *cachedConfig) getAdjustment() (resmgrAdjustment, resmgrAdjustment) { c.RLock() defer c.RUnlock() inscope := resmgrAdjustment{} for name, value := range c.inscope { inscope[name] = value } ignored := resmgrAdjustment{} for name, value := range c.ignored { ignored[name] = value } return inscope, ignored } // set node-specific configuration func (c *cachedConfig) setNode(data *map[string]string) bool { c.Lock() defer c.Unlock() c.nodeCfg = (*resmgrConfig)(data) return true } // set group-specific or default configuration func (c *cachedConfig) setGroup(group string, data *map[string]string) bool { c.Lock() defer c.Unlock() c.groupCfg = (*resmgrConfig)(data) c.group = group return c.nodeCfg == nil } // setAdjustment is a helper method for updating external adjustments func (c *cachedConfig) setAdjustment(adjust *resmgr.Adjustment) bool { var inscope, ignored bool var updated *resmgr.Adjustment c.Lock() defer c.Unlock() // // filter out updates // - for expired watches being recreated // - without any Spec changes (Status updates) // if updated, inscope = c.inscope[adjust.Name]; inscope { if adjust.HasSameVersion(updated) || adjust.Spec.Compare(&updated.Spec) { c.inscope[adjust.Name] = adjust return false } } else if updated, ignored = c.ignored[adjust.Name]; ignored { if adjust.HasSameVersion(updated) || adjust.Spec.Compare(&updated.Spec) { c.ignored[adjust.Name] = adjust return false } } // // we need to notify cri-resmgr if // - the adjustment applies to this node // - the adjustment used to apply to this node before the update // notify := false if adjust.Spec.IsNodeInScope(nodeName) { c.inscope[adjust.Name] = adjust if ignored { delete(c.ignored, adjust.Name) } notify = true } else { c.ignored[adjust.Name] = adjust if inscope { delete(c.inscope, adjust.Name) notify = true } } return notify } // deleteAdjustment is a helper method for updating external adjustments func (c *cachedConfig) deleteAdjustment(o *resmgr.Adjustment) bool { c.Lock() defer c.Unlock() // we need to notify cri-resmgr if the deleted adjustment used to apply to this node if _, ok := c.inscope[o.Name]; ok { delete(c.inscope, o.Name) return true } delete(c.ignored, o.Name) return false } // getAdjustmentNames returns the names of in scope and ignored adjustments. 
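The getConfig method above encodes a strict precedence: a node-specific ConfigMap always wins, then the group-specific (or shared default) map, with an empty configuration served only as a last resort. A compact restatement of that selection order with placeholder data; getAdjustmentNames follows below.

package main

import "fmt"

// pickConfig mirrors the precedence in cachedConfig.getConfig:
// node-specific data first, then group/default data, else empty.
func pickConfig(nodeCfg, groupCfg map[string]string, group string) (map[string]string, string) {
	var cfg map[string]string
	var kind string
	switch {
	case nodeCfg != nil:
		cfg, kind = nodeCfg, "node"
	case group != "":
		cfg, kind = groupCfg, "group "+group
	case groupCfg != nil:
		cfg, kind = groupCfg, "default"
	default:
		kind = "fallback"
	}
	if cfg == nil {
		cfg, kind = map[string]string{}, "empty "+kind
	}
	return cfg, kind
}

func main() {
	cfg, kind := pickConfig(nil, map[string]string{"policy": "balloons"}, "")
	fmt.Println(kind, cfg) // default map[policy:balloons]
}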
func (c *cachedConfig) getAdjustmentNames() ([]string, []string) { c.RLock() defer c.RUnlock() inscope := make([]string, 0, len(c.inscope)) ignored := make([]string, 0, len(c.ignored)) for name := range c.inscope { inscope = append(inscope, name) } for name := range c.ignored { ignored = append(ignored, name) } return inscope, ignored } // cache the status of the last adjustment update func (c *cachedConfig) setStatus(status *resmgrStatus) { c.Lock() defer c.Unlock() c.status = status } // get the last cached adjustment update status func (c *cachedConfig) getStatus() *resmgrStatus { c.RLock() defer c.RUnlock() return c.status } ================================================ FILE: pkg/apis/resmgr/expression.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package resmgr import ( "fmt" "path" "path/filepath" "strings" logger "github.com/intel/cri-resource-manager/pkg/log" ) // Evaluable is the interface objects need to implement to be evaluable against Expressions. type Evaluable interface { Eval(string) interface{} } // Expression is used to describe criteria for selecting objects within a domain. type Expression struct { Key string `json:"key"` // key to check values of/against Op Operator `json:"operator"` // operator to apply to value of Key and Values Values []string `json:"values,omitempty"` // value(s) for domain key } const ( KeyPod = "pod" KeyID = "id" KeyUID = "uid" KeyName = "name" KeyNamespace = "namespace" KeyQOSClass = "qosclass" KeyLabels = "labels" KeyTags = "tags" ) // Operator defines the possible operators for an Expression. type Operator string const ( // Equals tests for equality with a single value. Equals Operator = "Equals" // NotEqual tests for inequality with a single value. NotEqual Operator = "NotEqual" // In tests if the key's value is one of the specified set. In Operator = "In" // NotIn tests if the key's value is not one of the specified set. NotIn Operator = "NotIn" // Exists evaluates to true if the named key exists. Exists Operator = "Exists" // NotExist evaluates to true if the named key does not exist. NotExist Operator = "NotExist" // AlwaysTrue always evaluates to true. AlwaysTrue Operator = "AlwaysTrue" // Matches tests if the key value matches the single given globbing pattern. Matches Operator = "Matches" // MatchesNot is true if Matches would be false for the same key and pattern. MatchesNot Operator = "MatchesNot" // MatchesAny tests if the key value matches any of the given globbing patterns. MatchesAny Operator = "MatchesAny" // MatchesNone is true if MatchesAny would be false for the same key and patterns. MatchesNone Operator = "MatchesNone" ) // Our logger instance. var log = logger.NewLogger("expression") // Validate checks the expression for (obvious) invalidity.
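Because Expression carries json struct tags, selection criteria are normally written as JSON (or YAML) and deserialized into the struct above. A sketch of that wire format using a local mirror of the type (the key and values are illustrative); Validate follows below.

package main

import (
	"encoding/json"
	"fmt"
)

// Expression mirrors the struct defined above, including its json tags.
type Expression struct {
	Key    string   `json:"key"`
	Op     string   `json:"operator"`
	Values []string `json:"values,omitempty"`
}

func main() {
	data := []byte(`{"key": "pod/qosclass", "operator": "In", "values": ["Burstable", "BestEffort"]}`)
	var e Expression
	if err := json.Unmarshal(data, &e); err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", e) // {Key:pod/qosclass Op:In Values:[Burstable BestEffort]}
}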
func (e *Expression) Validate() error { if e == nil { return exprError("nil expression") } switch e.Op { case Equals, NotEqual: if len(e.Values) != 1 { return exprError("invalid expression, '%s' requires a single value", e.Op) } case Matches, MatchesNot: if len(e.Values) != 1 { return exprError("invalid expression, '%s' requires a single value", e.Op) } case Exists, NotExist: if e.Values != nil && len(e.Values) != 0 { return exprError("invalid expression, '%s' does not take any values", e.Op) } case In, NotIn: case MatchesAny, MatchesNone: case AlwaysTrue: default: return exprError("invalid expression, unknown operator: %q", e.Op) } return nil } // Evaluate evaluates an expression against a container. func (e *Expression) Evaluate(subject Evaluable) bool { log.Debug("evaluating %q @ %s...", *e, subject) if e.Op == AlwaysTrue { return true } value, ok := e.KeyValue(subject) result := false switch e.Op { case Equals: result = ok && (value == e.Values[0] || e.Values[0] == "*") case NotEqual: result = !ok || value != e.Values[0] case Matches, MatchesNot: match := false if ok { match, _ = filepath.Match(e.Values[0], value) } result = ok && match if e.Op == MatchesNot { result = !result } case In, NotIn: if ok { for _, v := range e.Values { if value == v || v == "*" { result = true } } } if e.Op == NotIn { result = !result } case MatchesAny, MatchesNone: if ok { for _, pattern := range e.Values { if match, _ := filepath.Match(pattern, value); match { result = true break } } } if e.Op == MatchesNone { result = !result } case Exists: result = ok case NotExist: result = !ok } log.Debug("%q @ %s => %v", *e, subject, result) return result } // KeyValue extracts the value of the expression key from a container. func (e *Expression) KeyValue(subject Evaluable) (string, bool) { log.Debug("looking up %q @ %s...", e.Key, subject) value := "" ok := false keys, vsep := splitKeys(e.Key) if len(keys) == 1 { value, ok, _ = ResolveRef(subject, keys[0]) } else { vals := make([]string, 0, len(keys)) for _, key := range keys { v, found, _ := ResolveRef(subject, key) vals = append(vals, v) ok = ok || found } value = strings.Join(vals, vsep) } log.Debug("%q @ %s => %q, %v", e.Key, subject, value, ok) return value, ok } func splitKeys(keys string) ([]string, string) { // joint key specs have two valid forms: // - ":<keylist>" (equivalent to ":::<keylist>") // - ":<ksep><vsep><keylist>" if len(keys) < 4 || keys[0] != ':' { return []string{keys}, "" } keys = keys[1:] ksep := keys[0:1] vsep := keys[1:2] if validSeparator(ksep[0]) && validSeparator(vsep[0]) { keys = keys[2:] } else { ksep = ":" vsep = ":" } return strings.Split(keys, ksep), vsep } func validSeparator(b byte) bool { switch { case '0' <= b && b <= '9': return false case 'a' <= b && b <= 'z': return false case 'A' <= b && b <= 'Z': return false case b == '/', b == '.': return false } return true } // ResolveRef walks an object trying to resolve a reference to a value.
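The joint-key syntax accepted by splitKeys is easy to misread: a key beginning with ':' optionally names a key separator and a value separator, then the separated key list, and KeyValue joins the resolved values with the value separator. A standalone restatement of that parsing (splitJointKey and isSep are local names for this sketch, not the package's functions); ResolveRef follows below.

package main

import (
	"fmt"
	"strings"
)

// isSep restates validSeparator: any byte that is not alphanumeric,
// '/' or '.' may act as a separator.
func isSep(b byte) bool {
	switch {
	case '0' <= b && b <= '9', 'a' <= b && b <= 'z', 'A' <= b && b <= 'Z', b == '/', b == '.':
		return false
	}
	return true
}

// splitJointKey restates splitKeys: ":<ksep><vsep><keylist>" picks explicit
// separators, while ":<keylist>" falls back to ':' for both.
func splitJointKey(spec string) ([]string, string) {
	if len(spec) < 4 || spec[0] != ':' {
		return []string{spec}, ""
	}
	rest := spec[1:]
	ksep, vsep := rest[0:1], rest[1:2]
	if isSep(ksep[0]) && isSep(vsep[0]) {
		rest = rest[2:]
	} else {
		ksep, vsep = ":", ":"
	}
	return strings.Split(rest, ksep), vsep
}

func main() {
	// ',' separates the keys, '-' joins the resolved values.
	keys, vsep := splitJointKey(":,-pod/qosclass,pod/namespace,pod/name,name")
	fmt.Println(keys, vsep) // [pod/qosclass pod/namespace pod/name name] -
}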
func ResolveRef(subject Evaluable, spec string) (string, bool, error) { var obj interface{} log.Debug("resolving %q @ %s...", spec, subject) spec = path.Clean(spec) ref := strings.Split(spec, "/") if len(ref) == 1 { if strings.Index(spec, ".") != -1 { ref = []string{"labels", spec} } } obj = subject for len(ref) > 0 { key := ref[0] log.Debug("resolve walking %q @ %s...", key, obj) switch v := obj.(type) { case string: obj = v case map[string]string: value, ok := v[key] if !ok { return "", false, nil } obj = value case error: return "", false, exprError("%s: failed to resolve %q: %v", subject, spec, v) default: e, ok := obj.(Evaluable) if !ok { return "", false, exprError("%s: failed to resolve %q, unexpected type %T", subject, spec, obj) } obj = e.Eval(key) } ref = ref[1:] } str, ok := obj.(string) if !ok { return "", false, exprError("%s: reference %q resolved to non-string: %T", subject, spec, obj) } log.Debug("resolved %q @ %s => %s", spec, subject, str) return str, true, nil } // String returns the expression as a string. func (e *Expression) String() string { return fmt.Sprintf("<%s %s %s>", e.Key, e.Op, strings.Join(e.Values, ",")) } // DeepCopy creates a deep copy of the expression. func (e *Expression) DeepCopy() *Expression { out := &Expression{} e.DeepCopyInto(out) return out } // DeepCopyInto copies the expression into another one. func (e *Expression) DeepCopyInto(out *Expression) { out.Key = e.Key out.Op = e.Op out.Values = make([]string, len(e.Values)) copy(out.Values, e.Values) } // exprError returns a formatted error specific to expressions. func exprError(format string, args ...interface{}) error { return fmt.Errorf("expression: "+format, args...) } ================================================ FILE: pkg/apis/resmgr/expression_test.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package resmgr import ( "fmt" "strings" "testing" logger "github.com/intel/cri-resource-manager/pkg/log" ) type evaluable struct { name string namespace string qosclass string labels map[string]string tags map[string]string parent Evaluable } func newEvaluable(name, ns, qos string, labels, tags map[string]string, p Evaluable) *evaluable { return &evaluable{ name: name, namespace: ns, qosclass: qos, labels: labels, tags: tags, parent: p, } } func (e *evaluable) Eval(key string) interface{} { switch key { case KeyName: return e.name case KeyNamespace: return e.namespace case KeyQOSClass: return e.qosclass case KeyLabels: return e.labels case KeyTags: return e.tags case KeyPod: if e.parent != nil { return e.parent } fallthrough default: return fmt.Errorf("evaluable: cannot evaluate %q", key) } } func (e *evaluable) String() string { s := fmt.Sprintf("{ name: %q, namespace: %q, qosclass: %q, ", e.name, e.namespace, e.qosclass) labels, t := "{", "" for k, v := range e.labels { labels += t + fmt.Sprintf("%q:%q", k, v) t = ", " } labels += "}" tags, t := "{", "" for k, v := range e.tags { tags += t + fmt.Sprintf("%q:%q", k, v) t = ", " } tags += "}" s = fmt.Sprintf("%s, labels: %s, tags: %s }", s, labels, tags) return s } func TestResolveRefAndKeyValue(t *testing.T) { defer logger.Flush() pod := newEvaluable("P1", "pns", "pqos", map[string]string{"l1": "plone", "l2": "pltwo", "l5": "plfive"}, nil, nil) tcases := []struct { name string subject Evaluable keys []string values []string ok []bool error []bool keyvalues []string }{ { name: "test resolving references", subject: newEvaluable("C1", "cns", "cqos", map[string]string{"l1": "clone", "l2": "cltwo", "l3": "clthree"}, map[string]string{"t1": "ctone", "t2": "cttwo", "t3": "ctthree"}, pod), keys: []string{ "name", "namespace", "qosclass", "labels/l1", "labels/l2", "labels/l3", "labels/l4", "tags/t1", "tags/t2", "tags/t3", "tags/t4", "pod/labels/l1", "pod/labels/l2", "pod/labels/l3", "pod/labels/l4", "pod/labels/l5", ":,-pod/qosclass,pod/namespace,pod/name,name", }, values: []string{ "C1", "cns", "cqos", "clone", "cltwo", "clthree", "", "ctone", "cttwo", "ctthree", "", "plone", "pltwo", "", "", "plfive", "", }, keyvalues: []string{ "C1", "cns", "cqos", "clone", "cltwo", "clthree", "", "ctone", "cttwo", "ctthree", "", "plone", "pltwo", "", "", "plfive", "pqos-pns-P1-C1", }, ok: []bool{ true, true, true, true, true, true, false, true, true, true, false, true, true, false, false, true, false, }, error: []bool{ false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, true, }, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { for i := range tc.keys { value, ok, err := ResolveRef(tc.subject, tc.keys[i]) if err != nil && !tc.error[i] { t.Errorf("ResolveRef %s/%q should have given %q, but failed: %v", tc.subject, tc.keys[i], tc.values[i], err) continue } if value != tc.values[i] || ok != tc.ok[i] { t.Errorf("ResolveRef %s@%q: expected %v, %v got %v, %v", tc.subject, tc.keys[i], tc.values[i], tc.ok[i], value, ok) continue } expr := &Expression{ Key: tc.keys[i], Op: Equals, Values: []string{}, } value, _ = expr.KeyValue(tc.subject) if value != tc.keyvalues[i] { t.Errorf("KeyValue %s@%q: expected %v, got %v", tc.subject, tc.keys[i], tc.keyvalues[i], value) } } }) } } func TestSimpleOperators(t *testing.T) { defer logger.Flush() pod := newEvaluable("P1", "pns", "pqos", map[string]string{"l1": "plone", "l2": "pltwo", "l5": "plfive"}, nil, nil) sub := newEvaluable("C1", "cns", "cqos", 
map[string]string{"l1": "clone", "l2": "cltwo", "l3": "clthree"}, map[string]string{"t1": "ctone", "t2": "cttwo", "t4": "ctfour"}, pod) tcases := []struct { name string subject Evaluable keys []string ops []Operator values [][][]string results [][]bool }{ { name: "test Equals, NotEqual, In, NotIn operators", subject: sub, keys: []string{ "name", "pod/name", "namespace", "pod/namespace", "qosclass", "pod/qosclass", "labels/l1", "labels/l2", "labels/l3", "labels/l4", "tags/t1", "tags/t2", "tags/t3", "tags/t4", "pod/labels/l1", "pod/labels/l2", "pod/labels/l3", "pod/labels/l4", "pod/labels/l5", }, ops: []Operator{Equals, NotEqual, In, NotIn}, values: [][][]string{ {{"C1"}, {"C1"}, {"foo", "C1"}, {"foo"}}, // name {{"P1"}, {"P1"}, {"foo", "P1"}, {"foo"}}, // pod/name {{"cns"}, {"cns"}, {"foo", "cns"}, {"foo"}}, // namespace {{"pns"}, {"pns"}, {"foo", "pns"}, {"pns"}}, // pod/namespace {{"cqos"}, {"cqos"}, {"foo", "cqos"}, {"foo"}}, // qosclass {{"pqos"}, {"pqos"}, {"foo", "pqos"}, {"pqos"}}, // pod/qosclass {{"clone"}, {"clone"}, {"foo", "clone"}, {"foo"}}, // labels/l1 {{"cltwo"}, {"cltwo"}, {"foo", "cltwo"}, {"foo"}}, // labels/l2 {{"clthree"}, {"clthree"}, {"foo", "clthree"}, {"clthree"}}, // labels/l3 {{"clfour"}, {"clfour"}, {"foo", "clfour"}, {"foo"}}, // labels/l4 {{"ctone"}, {"ctone"}, {"foo", "ctone"}, {"foo"}}, // tags/t1 {{"cttwo"}, {"cttwo"}, {"foo", "cttwo"}, {"foo"}}, // tags/t2 {{"ctthree"}, {"ctthree"}, {"foo", "ctthree"}, {"foo"}}, // tags/t3 {{"ctfour"}, {"ctfour"}, {"foo", "ctfour"}, {"ctfour"}}, // tags/t4 {{"plone"}, {"plone"}, {"foo", "plone"}, {"foo"}}, // pod/labels/l1 {{"pltwo"}, {"pltwo"}, {"foo", "pltwo"}, {"foo"}}, // pod/labels/l2 {{"plthree"}, {"plthree"}, {"foo", "plthree"}, {"foo"}}, // pod/labels/l3 {{"plfour"}, {"plfour"}, {"foo", "plfour"}, {"foo"}}, // pod/labels/l4 {{"plfive"}, {"plfive"}, {"foo", "plfive"}, {"foo"}}, // pod/labels/l5 }, results: [][]bool{ {true, false, true, true}, // name {true, false, true, true}, // pod/name {true, false, true, true}, // namespace {true, false, true, false}, // pod/namespace {true, false, true, true}, // qosclass {true, false, true, false}, // pod/qosclass {true, false, true, true}, // labels/l1 {true, false, true, true}, // labels/l2 {true, false, true, false}, // labels/l3 {false, true, false, true}, // labels/l4 {true, false, true, true}, // tags/t1 {true, false, true, true}, // tags/t2 {false, true, false, true}, // tags/t3 {true, false, true, false}, // tags/t4 {true, false, true, true}, // pod/labels/l1 {true, false, true, true}, // pod/labels/l2 {false, true, false, true}, // pod/labels/l3 {false, true, false, true}, // pod/labels/l4 {true, false, true, true}, // pod/labels/l5 }, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { for k := range tc.keys { for o := range tc.ops { expr := &Expression{ Key: tc.keys[k], Op: tc.ops[o], Values: tc.values[k][o], } expect := tc.results[k][o] result := expr.Evaluate(tc.subject) if result != expect { t.Errorf("%s for %s: expected %v, got %v", expr, tc.subject, expect, result) } } } }) } } func TestMatching(t *testing.T) { defer logger.Flush() p1 := newEvaluable("P1", "pns1", "pqos1", map[string]string{"l1": "plv1", "l2": "plv2", "l5": "plv5"}, nil, nil) c11 := newEvaluable("C11", "cns1", "cqos11", map[string]string{"l1": "clv1", "l2": "clv2", "l3": "clv3"}, map[string]string{"t1": "ctv1", "t2": "tag2", "t4": "ctv4"}, p1) c12 := newEvaluable("C12", "cns1", "cqos12", map[string]string{"l1": "clv1", "l2": "clv2", "l3": "clv3"}, map[string]string{"t1": "ctv1", 
"t2": "foo", "t4": "ctv4"}, p1) c13 := newEvaluable("C12", "cns1", "cqos13", map[string]string{"l1": "clv1", "l2": "clv2", "l3": "clv3"}, map[string]string{"t1": "ctv1", "t2": "ctv2", "t4": "ctv4"}, p1) p2 := newEvaluable("P2", "pns2", "pqos2", map[string]string{"l1": "plv1", "l2": "plv2", "l5": "plv5"}, nil, nil) c21 := newEvaluable("C21", "cns1", "cqos21", map[string]string{"l1": "clv1", "l2": "clv2", "l3": "clv3"}, map[string]string{"t1": "ctv1", "t2": "tag2", "t4": "ctv4"}, p2) c22 := newEvaluable("C22", "cns1", "cqos22", map[string]string{"l1": "clv1", "l2": "clv2", "l3": "clv3"}, map[string]string{"t1": "ctv1", "t2": "ctv2", "t4": "ctv4"}, p2) c23 := newEvaluable("C23", "cns1", "cqos23", map[string]string{"l1": "clv1", "l2": "clv2", "l3": "clv3"}, map[string]string{"t1": "ctv1", "t2": "foo", "t4": "ctv4"}, p2) p3 := newEvaluable("P3", "pns3", "pqos3", map[string]string{"l1": "plv1", "l2": "plv2", "l5": "plv5"}, nil, nil) c3 := newEvaluable("C3", "cns3", "cqos3", map[string]string{"l1": "clv1", "l2": "clv2", "l3": "clv3"}, map[string]string{"t1": "ctv1", "t2": "tag2", "t4": "ctv4"}, p3) tcases := []struct { name string subjects []Evaluable selectors []*Expression expected [][]string }{ { name: "test inverted membership operator", subjects: []Evaluable{c11, c12, c13, c21, c22, c23, c3}, selectors: []*Expression{ { Key: ":,:pod/qosclass,pod/namespace,pod/name,qosclass,name", Op: Matches, Values: []string{ "pqos2:*:*:*:*", }, }, { Key: "tags/t2", Op: Matches, Values: []string{"[tf][ao][go]*"}, }, }, expected: [][]string{ {"C21", "C22", "C23"}, {"C11", "C12", "C21", "C23", "C3"}, }, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { for i, expr := range tc.selectors { results := []string{} for _, s := range tc.subjects { if expr.Evaluate(s) { results = append(results, s.Eval("name").(string)) } } expected := strings.Join(tc.expected[i], ",") got := strings.Join(results, ",") if expected != got { t.Errorf("%s: expected %s, got %s", expr, expected, got) } } }) } } ================================================ FILE: pkg/apis/resmgr/generated/clientset/versioned/clientset.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by client-gen. DO NOT EDIT. package versioned import ( "fmt" criresmgrv1alpha1 "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/clientset/versioned/typed/resmgr/v1alpha1" discovery "k8s.io/client-go/discovery" rest "k8s.io/client-go/rest" flowcontrol "k8s.io/client-go/util/flowcontrol" ) type Interface interface { Discovery() discovery.DiscoveryInterface CriresmgrV1alpha1() criresmgrv1alpha1.CriresmgrV1alpha1Interface } // Clientset contains the clients for groups. Each group has exactly one // version included in a Clientset. 
type Clientset struct { *discovery.DiscoveryClient criresmgrV1alpha1 *criresmgrv1alpha1.CriresmgrV1alpha1Client } // CriresmgrV1alpha1 retrieves the CriresmgrV1alpha1Client func (c *Clientset) CriresmgrV1alpha1() criresmgrv1alpha1.CriresmgrV1alpha1Interface { return c.criresmgrV1alpha1 } // Discovery retrieves the DiscoveryClient func (c *Clientset) Discovery() discovery.DiscoveryInterface { if c == nil { return nil } return c.DiscoveryClient } // NewForConfig creates a new Clientset for the given config. // If config's RateLimiter is not set and QPS and Burst are acceptable, // NewForConfig will generate a rate-limiter in configShallowCopy. func NewForConfig(c *rest.Config) (*Clientset, error) { configShallowCopy := *c if configShallowCopy.RateLimiter == nil && configShallowCopy.QPS > 0 { if configShallowCopy.Burst <= 0 { return nil, fmt.Errorf("Burst is required to be greater than 0 when RateLimiter is not set and QPS is set to greater than 0") } configShallowCopy.RateLimiter = flowcontrol.NewTokenBucketRateLimiter(configShallowCopy.QPS, configShallowCopy.Burst) } var cs Clientset var err error cs.criresmgrV1alpha1, err = criresmgrv1alpha1.NewForConfig(&configShallowCopy) if err != nil { return nil, err } cs.DiscoveryClient, err = discovery.NewDiscoveryClientForConfig(&configShallowCopy) if err != nil { return nil, err } return &cs, nil } // NewForConfigOrDie creates a new Clientset for the given config and // panics if there is an error in the config. func NewForConfigOrDie(c *rest.Config) *Clientset { var cs Clientset cs.criresmgrV1alpha1 = criresmgrv1alpha1.NewForConfigOrDie(c) cs.DiscoveryClient = discovery.NewDiscoveryClientForConfigOrDie(c) return &cs } // New creates a new Clientset for the given RESTClient. func New(c rest.Interface) *Clientset { var cs Clientset cs.criresmgrV1alpha1 = criresmgrv1alpha1.New(c) cs.DiscoveryClient = discovery.NewDiscoveryClient(c) return &cs } ================================================ FILE: pkg/apis/resmgr/generated/clientset/versioned/doc.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by client-gen. DO NOT EDIT. // This package has the automatically generated clientset. package versioned ================================================ FILE: pkg/apis/resmgr/generated/clientset/versioned/fake/clientset_generated.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. // Code generated by client-gen. DO NOT EDIT. package fake import ( clientset "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/clientset/versioned" criresmgrv1alpha1 "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/clientset/versioned/typed/resmgr/v1alpha1" fakecriresmgrv1alpha1 "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/clientset/versioned/typed/resmgr/v1alpha1/fake" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/watch" "k8s.io/client-go/discovery" fakediscovery "k8s.io/client-go/discovery/fake" "k8s.io/client-go/testing" ) // NewSimpleClientset returns a clientset that will respond with the provided objects. // It's backed by a very simple object tracker that processes creates, updates and deletions as-is, // without applying any validations and/or defaults. It shouldn't be considered a replacement // for a real clientset and is mostly useful in simple unit tests. func NewSimpleClientset(objects ...runtime.Object) *Clientset { o := testing.NewObjectTracker(scheme, codecs.UniversalDecoder()) for _, obj := range objects { if err := o.Add(obj); err != nil { panic(err) } } cs := &Clientset{tracker: o} cs.discovery = &fakediscovery.FakeDiscovery{Fake: &cs.Fake} cs.AddReactor("*", "*", testing.ObjectReaction(o)) cs.AddWatchReactor("*", func(action testing.Action) (handled bool, ret watch.Interface, err error) { gvr := action.GetResource() ns := action.GetNamespace() watch, err := o.Watch(gvr, ns) if err != nil { return false, nil, err } return true, watch, nil }) return cs } // Clientset implements clientset.Interface. Meant to be embedded into a // struct to get a default implementation. This makes faking out just the method // you want to test easier. type Clientset struct { testing.Fake discovery *fakediscovery.FakeDiscovery tracker testing.ObjectTracker } func (c *Clientset) Discovery() discovery.DiscoveryInterface { return c.discovery } func (c *Clientset) Tracker() testing.ObjectTracker { return c.tracker } var _ clientset.Interface = &Clientset{} // CriresmgrV1alpha1 retrieves the CriresmgrV1alpha1Client func (c *Clientset) CriresmgrV1alpha1() criresmgrv1alpha1.CriresmgrV1alpha1Interface { return &fakecriresmgrv1alpha1.FakeCriresmgrV1alpha1{Fake: &c.Fake} } ================================================ FILE: pkg/apis/resmgr/generated/clientset/versioned/fake/doc.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by client-gen. DO NOT EDIT. // This package has the automatically generated fake clientset. package fake ================================================ FILE: pkg/apis/resmgr/generated/clientset/versioned/fake/register.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by client-gen. DO NOT EDIT. package fake import ( criresmgrv1alpha1 "github.com/intel/cri-resource-manager/pkg/apis/resmgr/v1alpha1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" schema "k8s.io/apimachinery/pkg/runtime/schema" serializer "k8s.io/apimachinery/pkg/runtime/serializer" utilruntime "k8s.io/apimachinery/pkg/util/runtime" ) var scheme = runtime.NewScheme() var codecs = serializer.NewCodecFactory(scheme) var parameterCodec = runtime.NewParameterCodec(scheme) var localSchemeBuilder = runtime.SchemeBuilder{ criresmgrv1alpha1.AddToScheme, } // AddToScheme adds all types of this clientset into the given scheme. This allows composition // of clientsets, like in: // // import ( // "k8s.io/client-go/kubernetes" // clientsetscheme "k8s.io/client-go/kubernetes/scheme" // aggregatorclientsetscheme "k8s.io/kube-aggregator/pkg/client/clientset_generated/clientset/scheme" // ) // // kclientset, _ := kubernetes.NewForConfig(c) // _ = aggregatorclientsetscheme.AddToScheme(clientsetscheme.Scheme) // // After this, RawExtensions in Kubernetes types will serialize kube-aggregator types // correctly. var AddToScheme = localSchemeBuilder.AddToScheme func init() { v1.AddToGroupVersion(scheme, schema.GroupVersion{Version: "v1"}) utilruntime.Must(AddToScheme(scheme)) } ================================================ FILE: pkg/apis/resmgr/generated/clientset/versioned/scheme/doc.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by client-gen. DO NOT EDIT. // This package contains the scheme of the automatically generated clientset. package scheme ================================================ FILE: pkg/apis/resmgr/generated/clientset/versioned/scheme/register.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. // Code generated by client-gen. DO NOT EDIT. package scheme import ( criresmgrv1alpha1 "github.com/intel/cri-resource-manager/pkg/apis/resmgr/v1alpha1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" schema "k8s.io/apimachinery/pkg/runtime/schema" serializer "k8s.io/apimachinery/pkg/runtime/serializer" utilruntime "k8s.io/apimachinery/pkg/util/runtime" ) var Scheme = runtime.NewScheme() var Codecs = serializer.NewCodecFactory(Scheme) var ParameterCodec = runtime.NewParameterCodec(Scheme) var localSchemeBuilder = runtime.SchemeBuilder{ criresmgrv1alpha1.AddToScheme, } // AddToScheme adds all types of this clientset into the given scheme. This allows composition // of clientsets, like in: // // import ( // "k8s.io/client-go/kubernetes" // clientsetscheme "k8s.io/client-go/kubernetes/scheme" // aggregatorclientsetscheme "k8s.io/kube-aggregator/pkg/client/clientset_generated/clientset/scheme" // ) // // kclientset, _ := kubernetes.NewForConfig(c) // _ = aggregatorclientsetscheme.AddToScheme(clientsetscheme.Scheme) // // After this, RawExtensions in Kubernetes types will serialize kube-aggregator types // correctly. var AddToScheme = localSchemeBuilder.AddToScheme func init() { v1.AddToGroupVersion(Scheme, schema.GroupVersion{Version: "v1"}) utilruntime.Must(AddToScheme(Scheme)) } ================================================ FILE: pkg/apis/resmgr/generated/clientset/versioned/typed/resmgr/v1alpha1/adjustment.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by client-gen. DO NOT EDIT. package v1alpha1 import ( "context" "time" scheme "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/clientset/versioned/scheme" v1alpha1 "github.com/intel/cri-resource-manager/pkg/apis/resmgr/v1alpha1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" types "k8s.io/apimachinery/pkg/types" watch "k8s.io/apimachinery/pkg/watch" rest "k8s.io/client-go/rest" ) // AdjustmentsGetter has a method to return a AdjustmentInterface. // A group's client should implement this interface. type AdjustmentsGetter interface { Adjustments(namespace string) AdjustmentInterface } // AdjustmentInterface has methods to work with Adjustment resources. 
type AdjustmentInterface interface { Create(*v1alpha1.Adjustment) (*v1alpha1.Adjustment, error) Update(*v1alpha1.Adjustment) (*v1alpha1.Adjustment, error) UpdateStatus(*v1alpha1.Adjustment) (*v1alpha1.Adjustment, error) Delete(name string, options *v1.DeleteOptions) error DeleteCollection(options *v1.DeleteOptions, listOptions v1.ListOptions) error Get(name string, options v1.GetOptions) (*v1alpha1.Adjustment, error) List(opts v1.ListOptions) (*v1alpha1.AdjustmentList, error) Watch(opts v1.ListOptions) (watch.Interface, error) Patch(name string, pt types.PatchType, data []byte, subresources ...string) (result *v1alpha1.Adjustment, err error) AdjustmentExpansion } // adjustments implements AdjustmentInterface type adjustments struct { client rest.Interface ns string } // newAdjustments returns a Adjustments func newAdjustments(c *CriresmgrV1alpha1Client, namespace string) *adjustments { return &adjustments{ client: c.RESTClient(), ns: namespace, } } // Get takes name of the adjustment, and returns the corresponding adjustment object, and an error if there is any. func (c *adjustments) Get(name string, options v1.GetOptions) (result *v1alpha1.Adjustment, err error) { result = &v1alpha1.Adjustment{} err = c.client.Get(). Namespace(c.ns). Resource("adjustments"). Name(name). VersionedParams(&options, scheme.ParameterCodec). Do(context.TODO()). Into(result) return } // List takes label and field selectors, and returns the list of Adjustments that match those selectors. func (c *adjustments) List(opts v1.ListOptions) (result *v1alpha1.AdjustmentList, err error) { var timeout time.Duration if opts.TimeoutSeconds != nil { timeout = time.Duration(*opts.TimeoutSeconds) * time.Second } result = &v1alpha1.AdjustmentList{} err = c.client.Get(). Namespace(c.ns). Resource("adjustments"). VersionedParams(&opts, scheme.ParameterCodec). Timeout(timeout). Do(context.TODO()). Into(result) return } // Watch returns a watch.Interface that watches the requested adjustments. func (c *adjustments) Watch(opts v1.ListOptions) (watch.Interface, error) { var timeout time.Duration if opts.TimeoutSeconds != nil { timeout = time.Duration(*opts.TimeoutSeconds) * time.Second } opts.Watch = true return c.client.Get(). Namespace(c.ns). Resource("adjustments"). VersionedParams(&opts, scheme.ParameterCodec). Timeout(timeout). Watch(context.TODO()) } // Create takes the representation of a adjustment and creates it. Returns the server's representation of the adjustment, and an error, if there is any. func (c *adjustments) Create(adjustment *v1alpha1.Adjustment) (result *v1alpha1.Adjustment, err error) { result = &v1alpha1.Adjustment{} err = c.client.Post(). Namespace(c.ns). Resource("adjustments"). Body(adjustment). Do(context.TODO()). Into(result) return } // Update takes the representation of a adjustment and updates it. Returns the server's representation of the adjustment, and an error, if there is any. func (c *adjustments) Update(adjustment *v1alpha1.Adjustment) (result *v1alpha1.Adjustment, err error) { result = &v1alpha1.Adjustment{} err = c.client.Put(). Namespace(c.ns). Resource("adjustments"). Name(adjustment.Name). Body(adjustment). Do(context.TODO()). Into(result) return } // UpdateStatus was generated because the type contains a Status member. // Add a +genclient:noStatus comment above the type to avoid generating UpdateStatus(). func (c *adjustments) UpdateStatus(adjustment *v1alpha1.Adjustment) (result *v1alpha1.Adjustment, err error) { result = &v1alpha1.Adjustment{} err = c.client.Put(). Namespace(c.ns). 
Resource("adjustments"). Name(adjustment.Name). SubResource("status"). Body(adjustment). Do(context.TODO()). Into(result) return } // Delete takes name of the adjustment and deletes it. Returns an error if one occurs. func (c *adjustments) Delete(name string, options *v1.DeleteOptions) error { return c.client.Delete(). Namespace(c.ns). Resource("adjustments"). Name(name). Body(options). Do(context.TODO()). Error() } // DeleteCollection deletes a collection of objects. func (c *adjustments) DeleteCollection(options *v1.DeleteOptions, listOptions v1.ListOptions) error { var timeout time.Duration if listOptions.TimeoutSeconds != nil { timeout = time.Duration(*listOptions.TimeoutSeconds) * time.Second } return c.client.Delete(). Namespace(c.ns). Resource("adjustments"). VersionedParams(&listOptions, scheme.ParameterCodec). Timeout(timeout). Body(options). Do(context.TODO()). Error() } // Patch applies the patch and returns the patched adjustment. func (c *adjustments) Patch(name string, pt types.PatchType, data []byte, subresources ...string) (result *v1alpha1.Adjustment, err error) { result = &v1alpha1.Adjustment{} err = c.client.Patch(pt). Namespace(c.ns). Resource("adjustments"). SubResource(subresources...). Name(name). Body(data). Do(context.TODO()). Into(result) return } ================================================ FILE: pkg/apis/resmgr/generated/clientset/versioned/typed/resmgr/v1alpha1/doc.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by client-gen. DO NOT EDIT. // This package has the automatically generated typed clients. package v1alpha1 ================================================ FILE: pkg/apis/resmgr/generated/clientset/versioned/typed/resmgr/v1alpha1/fake/doc.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by client-gen. DO NOT EDIT. // Package fake has the automatically generated clients. package fake ================================================ FILE: pkg/apis/resmgr/generated/clientset/versioned/typed/resmgr/v1alpha1/fake/fake_adjustment.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by client-gen. DO NOT EDIT. package fake import ( v1alpha1 "github.com/intel/cri-resource-manager/pkg/apis/resmgr/v1alpha1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" labels "k8s.io/apimachinery/pkg/labels" schema "k8s.io/apimachinery/pkg/runtime/schema" types "k8s.io/apimachinery/pkg/types" watch "k8s.io/apimachinery/pkg/watch" testing "k8s.io/client-go/testing" ) // FakeAdjustments implements AdjustmentInterface type FakeAdjustments struct { Fake *FakeCriresmgrV1alpha1 ns string } var adjustmentsResource = schema.GroupVersionResource{Group: "criresmgr.intel.com", Version: "v1alpha1", Resource: "adjustments"} var adjustmentsKind = schema.GroupVersionKind{Group: "criresmgr.intel.com", Version: "v1alpha1", Kind: "Adjustment"} // Get takes name of the adjustment, and returns the corresponding adjustment object, and an error if there is any. func (c *FakeAdjustments) Get(name string, options v1.GetOptions) (result *v1alpha1.Adjustment, err error) { obj, err := c.Fake. Invokes(testing.NewGetAction(adjustmentsResource, c.ns, name), &v1alpha1.Adjustment{}) if obj == nil { return nil, err } return obj.(*v1alpha1.Adjustment), err } // List takes label and field selectors, and returns the list of Adjustments that match those selectors. func (c *FakeAdjustments) List(opts v1.ListOptions) (result *v1alpha1.AdjustmentList, err error) { obj, err := c.Fake. Invokes(testing.NewListAction(adjustmentsResource, adjustmentsKind, c.ns, opts), &v1alpha1.AdjustmentList{}) if obj == nil { return nil, err } label, _, _ := testing.ExtractFromListOptions(opts) if label == nil { label = labels.Everything() } list := &v1alpha1.AdjustmentList{ListMeta: obj.(*v1alpha1.AdjustmentList).ListMeta} for _, item := range obj.(*v1alpha1.AdjustmentList).Items { if label.Matches(labels.Set(item.Labels)) { list.Items = append(list.Items, item) } } return list, err } // Watch returns a watch.Interface that watches the requested adjustments. func (c *FakeAdjustments) Watch(opts v1.ListOptions) (watch.Interface, error) { return c.Fake. InvokesWatch(testing.NewWatchAction(adjustmentsResource, c.ns, opts)) } // Create takes the representation of a adjustment and creates it. Returns the server's representation of the adjustment, and an error, if there is any. func (c *FakeAdjustments) Create(adjustment *v1alpha1.Adjustment) (result *v1alpha1.Adjustment, err error) { obj, err := c.Fake. Invokes(testing.NewCreateAction(adjustmentsResource, c.ns, adjustment), &v1alpha1.Adjustment{}) if obj == nil { return nil, err } return obj.(*v1alpha1.Adjustment), err } // Update takes the representation of a adjustment and updates it. Returns the server's representation of the adjustment, and an error, if there is any. func (c *FakeAdjustments) Update(adjustment *v1alpha1.Adjustment) (result *v1alpha1.Adjustment, err error) { obj, err := c.Fake. Invokes(testing.NewUpdateAction(adjustmentsResource, c.ns, adjustment), &v1alpha1.Adjustment{}) if obj == nil { return nil, err } return obj.(*v1alpha1.Adjustment), err } // UpdateStatus was generated because the type contains a Status member. 
// Add a +genclient:noStatus comment above the type to avoid generating UpdateStatus(). func (c *FakeAdjustments) UpdateStatus(adjustment *v1alpha1.Adjustment) (*v1alpha1.Adjustment, error) { obj, err := c.Fake. Invokes(testing.NewUpdateSubresourceAction(adjustmentsResource, "status", c.ns, adjustment), &v1alpha1.Adjustment{}) if obj == nil { return nil, err } return obj.(*v1alpha1.Adjustment), err } // Delete takes name of the adjustment and deletes it. Returns an error if one occurs. func (c *FakeAdjustments) Delete(name string, options *v1.DeleteOptions) error { _, err := c.Fake. Invokes(testing.NewDeleteAction(adjustmentsResource, c.ns, name), &v1alpha1.Adjustment{}) return err } // DeleteCollection deletes a collection of objects. func (c *FakeAdjustments) DeleteCollection(options *v1.DeleteOptions, listOptions v1.ListOptions) error { action := testing.NewDeleteCollectionAction(adjustmentsResource, c.ns, listOptions) _, err := c.Fake.Invokes(action, &v1alpha1.AdjustmentList{}) return err } // Patch applies the patch and returns the patched adjustment. func (c *FakeAdjustments) Patch(name string, pt types.PatchType, data []byte, subresources ...string) (result *v1alpha1.Adjustment, err error) { obj, err := c.Fake. Invokes(testing.NewPatchSubresourceAction(adjustmentsResource, c.ns, name, pt, data, subresources...), &v1alpha1.Adjustment{}) if obj == nil { return nil, err } return obj.(*v1alpha1.Adjustment), err } ================================================ FILE: pkg/apis/resmgr/generated/clientset/versioned/typed/resmgr/v1alpha1/fake/fake_resmgr_client.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by client-gen. DO NOT EDIT. package fake import ( v1alpha1 "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/clientset/versioned/typed/resmgr/v1alpha1" rest "k8s.io/client-go/rest" testing "k8s.io/client-go/testing" ) type FakeCriresmgrV1alpha1 struct { *testing.Fake } func (c *FakeCriresmgrV1alpha1) Adjustments(namespace string) v1alpha1.AdjustmentInterface { return &FakeAdjustments{c, namespace} } // RESTClient returns a RESTClient that is used to communicate // with API server by this client implementation. func (c *FakeCriresmgrV1alpha1) RESTClient() rest.Interface { var ret *rest.RESTClient return ret } ================================================ FILE: pkg/apis/resmgr/generated/clientset/versioned/typed/resmgr/v1alpha1/generated_expansion.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by client-gen. DO NOT EDIT. package v1alpha1 type AdjustmentExpansion interface{} ================================================ FILE: pkg/apis/resmgr/generated/clientset/versioned/typed/resmgr/v1alpha1/resmgr_client.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by client-gen. DO NOT EDIT. package v1alpha1 import ( "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/clientset/versioned/scheme" v1alpha1 "github.com/intel/cri-resource-manager/pkg/apis/resmgr/v1alpha1" rest "k8s.io/client-go/rest" ) type CriresmgrV1alpha1Interface interface { RESTClient() rest.Interface AdjustmentsGetter } // CriresmgrV1alpha1Client is used to interact with features provided by the criresmgr.intel.com group. type CriresmgrV1alpha1Client struct { restClient rest.Interface } func (c *CriresmgrV1alpha1Client) Adjustments(namespace string) AdjustmentInterface { return newAdjustments(c, namespace) } // NewForConfig creates a new CriresmgrV1alpha1Client for the given config. func NewForConfig(c *rest.Config) (*CriresmgrV1alpha1Client, error) { config := *c if err := setConfigDefaults(&config); err != nil { return nil, err } client, err := rest.RESTClientFor(&config) if err != nil { return nil, err } return &CriresmgrV1alpha1Client{client}, nil } // NewForConfigOrDie creates a new CriresmgrV1alpha1Client for the given config and // panics if there is an error in the config. func NewForConfigOrDie(c *rest.Config) *CriresmgrV1alpha1Client { client, err := NewForConfig(c) if err != nil { panic(err) } return client } // New creates a new CriresmgrV1alpha1Client for the given RESTClient. func New(c rest.Interface) *CriresmgrV1alpha1Client { return &CriresmgrV1alpha1Client{c} } func setConfigDefaults(config *rest.Config) error { gv := v1alpha1.SchemeGroupVersion config.GroupVersion = &gv config.APIPath = "/apis" config.NegotiatedSerializer = scheme.Codecs.WithoutConversion() if config.UserAgent == "" { config.UserAgent = rest.DefaultKubernetesUserAgent() } return nil } // RESTClient returns a RESTClient that is used to communicate // with API server by this client implementation. func (c *CriresmgrV1alpha1Client) RESTClient() rest.Interface { if c == nil { return nil } return c.restClient } ================================================ FILE: pkg/apis/resmgr/generated/informers/externalversions/factory.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. 
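A minimal usage sketch (not a file in this repository) for the typed client above; the *rest.Config is assumed to come from rest.InClusterConfig() or a kubeconfig loader, and the "default" namespace is illustrative.

package main

import (
	"fmt"

	criv1alpha1 "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/clientset/versioned/typed/resmgr/v1alpha1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/rest"
)

// listAdjustments lists all Adjustments in one namespace with the typed client.
func listAdjustments(cfg *rest.Config) error {
	client, err := criv1alpha1.NewForConfig(cfg)
	if err != nil {
		return err
	}
	list, err := client.Adjustments("default").List(metav1.ListOptions{})
	if err != nil {
		return err
	}
	for _, adj := range list.Items {
		fmt.Printf("adjustment %s/%s\n", adj.Namespace, adj.Name)
	}
	return nil
}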
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by informer-gen. DO NOT EDIT. package externalversions import ( reflect "reflect" sync "sync" time "time" versioned "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/clientset/versioned" internalinterfaces "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/informers/externalversions/internalinterfaces" resmgr "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/informers/externalversions/resmgr" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" schema "k8s.io/apimachinery/pkg/runtime/schema" cache "k8s.io/client-go/tools/cache" ) // SharedInformerOption defines the functional option type for SharedInformerFactory. type SharedInformerOption func(*sharedInformerFactory) *sharedInformerFactory type sharedInformerFactory struct { client versioned.Interface namespace string tweakListOptions internalinterfaces.TweakListOptionsFunc lock sync.Mutex defaultResync time.Duration customResync map[reflect.Type]time.Duration informers map[reflect.Type]cache.SharedIndexInformer // startedInformers is used for tracking which informers have been started. // This allows Start() to be called multiple times safely. startedInformers map[reflect.Type]bool } // WithCustomResyncConfig sets a custom resync period for the specified informer types. func WithCustomResyncConfig(resyncConfig map[v1.Object]time.Duration) SharedInformerOption { return func(factory *sharedInformerFactory) *sharedInformerFactory { for k, v := range resyncConfig { factory.customResync[reflect.TypeOf(k)] = v } return factory } } // WithTweakListOptions sets a custom filter on all listers of the configured SharedInformerFactory. func WithTweakListOptions(tweakListOptions internalinterfaces.TweakListOptionsFunc) SharedInformerOption { return func(factory *sharedInformerFactory) *sharedInformerFactory { factory.tweakListOptions = tweakListOptions return factory } } // WithNamespace limits the SharedInformerFactory to the specified namespace. func WithNamespace(namespace string) SharedInformerOption { return func(factory *sharedInformerFactory) *sharedInformerFactory { factory.namespace = namespace return factory } } // NewSharedInformerFactory constructs a new instance of sharedInformerFactory for all namespaces. func NewSharedInformerFactory(client versioned.Interface, defaultResync time.Duration) SharedInformerFactory { return NewSharedInformerFactoryWithOptions(client, defaultResync) } // NewFilteredSharedInformerFactory constructs a new instance of sharedInformerFactory. // Listers obtained via this SharedInformerFactory will be subject to the same filters // as specified here. 
// Deprecated: Please use NewSharedInformerFactoryWithOptions instead func NewFilteredSharedInformerFactory(client versioned.Interface, defaultResync time.Duration, namespace string, tweakListOptions internalinterfaces.TweakListOptionsFunc) SharedInformerFactory { return NewSharedInformerFactoryWithOptions(client, defaultResync, WithNamespace(namespace), WithTweakListOptions(tweakListOptions)) } // NewSharedInformerFactoryWithOptions constructs a new instance of a SharedInformerFactory with additional options. func NewSharedInformerFactoryWithOptions(client versioned.Interface, defaultResync time.Duration, options ...SharedInformerOption) SharedInformerFactory { factory := &sharedInformerFactory{ client: client, namespace: v1.NamespaceAll, defaultResync: defaultResync, informers: make(map[reflect.Type]cache.SharedIndexInformer), startedInformers: make(map[reflect.Type]bool), customResync: make(map[reflect.Type]time.Duration), } // Apply all options for _, opt := range options { factory = opt(factory) } return factory } // Start initializes all requested informers. func (f *sharedInformerFactory) Start(stopCh <-chan struct{}) { f.lock.Lock() defer f.lock.Unlock() for informerType, informer := range f.informers { if !f.startedInformers[informerType] { go informer.Run(stopCh) f.startedInformers[informerType] = true } } } // WaitForCacheSync waits for all started informers' cache were synced. func (f *sharedInformerFactory) WaitForCacheSync(stopCh <-chan struct{}) map[reflect.Type]bool { informers := func() map[reflect.Type]cache.SharedIndexInformer { f.lock.Lock() defer f.lock.Unlock() informers := map[reflect.Type]cache.SharedIndexInformer{} for informerType, informer := range f.informers { if f.startedInformers[informerType] { informers[informerType] = informer } } return informers }() res := map[reflect.Type]bool{} for informType, informer := range informers { res[informType] = cache.WaitForCacheSync(stopCh, informer.HasSynced) } return res } // InternalInformerFor returns the SharedIndexInformer for obj using an internal // client. func (f *sharedInformerFactory) InformerFor(obj runtime.Object, newFunc internalinterfaces.NewInformerFunc) cache.SharedIndexInformer { f.lock.Lock() defer f.lock.Unlock() informerType := reflect.TypeOf(obj) informer, exists := f.informers[informerType] if exists { return informer } resyncPeriod, exists := f.customResync[informerType] if !exists { resyncPeriod = f.defaultResync } informer = newFunc(f.client, resyncPeriod) f.informers[informerType] = informer return informer } // SharedInformerFactory provides shared informers for resources in all known // API group versions. type SharedInformerFactory interface { internalinterfaces.SharedInformerFactory ForResource(resource schema.GroupVersionResource) (GenericInformer, error) WaitForCacheSync(stopCh <-chan struct{}) map[reflect.Type]bool Criresmgr() resmgr.Interface } func (f *sharedInformerFactory) Criresmgr() resmgr.Interface { return resmgr.New(f, f.namespace, f.tweakListOptions) } ================================================ FILE: pkg/apis/resmgr/generated/informers/externalversions/generic.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
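A minimal wiring sketch (not a file in this repository) for the factory above. It assumes the standard client-gen constructor versioned.NewForConfig from clientset.go; the 30-second resync period and the empty handler bodies are placeholders.

package main

import (
	"time"

	versioned "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/clientset/versioned"
	externalversions "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/informers/externalversions"
	"k8s.io/client-go/rest"
	"k8s.io/client-go/tools/cache"
)

// watchAdjustments runs a shared Adjustment informer until stopCh is closed.
func watchAdjustments(cfg *rest.Config, stopCh <-chan struct{}) error {
	cs, err := versioned.NewForConfig(cfg)
	if err != nil {
		return err
	}
	factory := externalversions.NewSharedInformerFactory(cs, 30*time.Second)
	informer := factory.Criresmgr().V1alpha1().Adjustments().Informer()
	informer.AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc:    func(obj interface{}) { /* react to a new Adjustment */ },
		DeleteFunc: func(obj interface{}) { /* react to a removed Adjustment */ },
	})
	// Start is non-blocking: each requested informer runs in its own goroutine.
	factory.Start(stopCh)
	factory.WaitForCacheSync(stopCh)
	<-stopCh
	return nil
}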
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by informer-gen. DO NOT EDIT. package externalversions import ( "fmt" v1alpha1 "github.com/intel/cri-resource-manager/pkg/apis/resmgr/v1alpha1" schema "k8s.io/apimachinery/pkg/runtime/schema" cache "k8s.io/client-go/tools/cache" ) // GenericInformer is type of SharedIndexInformer which will locate and delegate to other // sharedInformers based on type type GenericInformer interface { Informer() cache.SharedIndexInformer Lister() cache.GenericLister } type genericInformer struct { informer cache.SharedIndexInformer resource schema.GroupResource } // Informer returns the SharedIndexInformer. func (f *genericInformer) Informer() cache.SharedIndexInformer { return f.informer } // Lister returns the GenericLister. func (f *genericInformer) Lister() cache.GenericLister { return cache.NewGenericLister(f.Informer().GetIndexer(), f.resource) } // ForResource gives generic access to a shared informer of the matching type // TODO extend this to unknown resources with a client pool func (f *sharedInformerFactory) ForResource(resource schema.GroupVersionResource) (GenericInformer, error) { switch resource { // Group=criresmgr.intel.com, Version=v1alpha1 case v1alpha1.SchemeGroupVersion.WithResource("adjustments"): return &genericInformer{resource: resource.GroupResource(), informer: f.Criresmgr().V1alpha1().Adjustments().Informer()}, nil } return nil, fmt.Errorf("no informer found for %v", resource) } ================================================ FILE: pkg/apis/resmgr/generated/informers/externalversions/internalinterfaces/factory_interfaces.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by informer-gen. DO NOT EDIT. package internalinterfaces import ( time "time" versioned "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/clientset/versioned" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" cache "k8s.io/client-go/tools/cache" ) // NewInformerFunc takes versioned.Interface and time.Duration to return a SharedIndexInformer. type NewInformerFunc func(versioned.Interface, time.Duration) cache.SharedIndexInformer // SharedInformerFactory a small interface to allow for adding an informer without an import cycle type SharedInformerFactory interface { Start(stopCh <-chan struct{}) InformerFor(obj runtime.Object, newFunc NewInformerFunc) cache.SharedIndexInformer } // TweakListOptionsFunc is a function that transforms a v1.ListOptions. 
type TweakListOptionsFunc func(*v1.ListOptions) ================================================ FILE: pkg/apis/resmgr/generated/informers/externalversions/resmgr/interface.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by informer-gen. DO NOT EDIT. package resmgr import ( internalinterfaces "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/informers/externalversions/internalinterfaces" v1alpha1 "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/informers/externalversions/resmgr/v1alpha1" ) // Interface provides access to each of this group's versions. type Interface interface { // V1alpha1 provides access to shared informers for resources in V1alpha1. V1alpha1() v1alpha1.Interface } type group struct { factory internalinterfaces.SharedInformerFactory namespace string tweakListOptions internalinterfaces.TweakListOptionsFunc } // New returns a new Interface. func New(f internalinterfaces.SharedInformerFactory, namespace string, tweakListOptions internalinterfaces.TweakListOptionsFunc) Interface { return &group{factory: f, namespace: namespace, tweakListOptions: tweakListOptions} } // V1alpha1 returns a new v1alpha1.Interface. func (g *group) V1alpha1() v1alpha1.Interface { return v1alpha1.New(g.factory, g.namespace, g.tweakListOptions) } ================================================ FILE: pkg/apis/resmgr/generated/informers/externalversions/resmgr/v1alpha1/adjustment.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by informer-gen. DO NOT EDIT. package v1alpha1 import ( time "time" versioned "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/clientset/versioned" internalinterfaces "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/informers/externalversions/internalinterfaces" v1alpha1 "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/listers/resmgr/v1alpha1" resmgrv1alpha1 "github.com/intel/cri-resource-manager/pkg/apis/resmgr/v1alpha1" v1 "k8s.io/apimachinery/pkg/apis/meta/v1" runtime "k8s.io/apimachinery/pkg/runtime" watch "k8s.io/apimachinery/pkg/watch" cache "k8s.io/client-go/tools/cache" ) // AdjustmentInformer provides access to a shared informer and lister for // Adjustments. 
type AdjustmentInformer interface { Informer() cache.SharedIndexInformer Lister() v1alpha1.AdjustmentLister } type adjustmentInformer struct { factory internalinterfaces.SharedInformerFactory tweakListOptions internalinterfaces.TweakListOptionsFunc namespace string } // NewAdjustmentInformer constructs a new informer for Adjustment type. // Always prefer using an informer factory to get a shared informer instead of getting an independent // one. This reduces memory footprint and number of connections to the server. func NewAdjustmentInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers) cache.SharedIndexInformer { return NewFilteredAdjustmentInformer(client, namespace, resyncPeriod, indexers, nil) } // NewFilteredAdjustmentInformer constructs a new informer for Adjustment type. // Always prefer using an informer factory to get a shared informer instead of getting an independent // one. This reduces memory footprint and number of connections to the server. func NewFilteredAdjustmentInformer(client versioned.Interface, namespace string, resyncPeriod time.Duration, indexers cache.Indexers, tweakListOptions internalinterfaces.TweakListOptionsFunc) cache.SharedIndexInformer { return cache.NewSharedIndexInformer( &cache.ListWatch{ ListFunc: func(options v1.ListOptions) (runtime.Object, error) { if tweakListOptions != nil { tweakListOptions(&options) } return client.CriresmgrV1alpha1().Adjustments(namespace).List(options) }, WatchFunc: func(options v1.ListOptions) (watch.Interface, error) { if tweakListOptions != nil { tweakListOptions(&options) } return client.CriresmgrV1alpha1().Adjustments(namespace).Watch(options) }, }, &resmgrv1alpha1.Adjustment{}, resyncPeriod, indexers, ) } func (f *adjustmentInformer) defaultInformer(client versioned.Interface, resyncPeriod time.Duration) cache.SharedIndexInformer { return NewFilteredAdjustmentInformer(client, f.namespace, resyncPeriod, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc}, f.tweakListOptions) } func (f *adjustmentInformer) Informer() cache.SharedIndexInformer { return f.factory.InformerFor(&resmgrv1alpha1.Adjustment{}, f.defaultInformer) } func (f *adjustmentInformer) Lister() v1alpha1.AdjustmentLister { return v1alpha1.NewAdjustmentLister(f.Informer().GetIndexer()) } ================================================ FILE: pkg/apis/resmgr/generated/informers/externalversions/resmgr/v1alpha1/interface.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by informer-gen. DO NOT EDIT. package v1alpha1 import ( internalinterfaces "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/informers/externalversions/internalinterfaces" ) // Interface provides access to all the informers in this group version. type Interface interface { // Adjustments returns a AdjustmentInformer. 
Adjustments() AdjustmentInformer } type version struct { factory internalinterfaces.SharedInformerFactory namespace string tweakListOptions internalinterfaces.TweakListOptionsFunc } // New returns a new Interface. func New(f internalinterfaces.SharedInformerFactory, namespace string, tweakListOptions internalinterfaces.TweakListOptionsFunc) Interface { return &version{factory: f, namespace: namespace, tweakListOptions: tweakListOptions} } // Adjustments returns a AdjustmentInformer. func (v *version) Adjustments() AdjustmentInformer { return &adjustmentInformer{factory: v.factory, namespace: v.namespace, tweakListOptions: v.tweakListOptions} } ================================================ FILE: pkg/apis/resmgr/generated/listers/resmgr/v1alpha1/adjustment.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by lister-gen. DO NOT EDIT. package v1alpha1 import ( v1alpha1 "github.com/intel/cri-resource-manager/pkg/apis/resmgr/v1alpha1" "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/labels" "k8s.io/client-go/tools/cache" ) // AdjustmentLister helps list Adjustments. type AdjustmentLister interface { // List lists all Adjustments in the indexer. List(selector labels.Selector) (ret []*v1alpha1.Adjustment, err error) // Adjustments returns an object that can list and get Adjustments. Adjustments(namespace string) AdjustmentNamespaceLister AdjustmentListerExpansion } // adjustmentLister implements the AdjustmentLister interface. type adjustmentLister struct { indexer cache.Indexer } // NewAdjustmentLister returns a new AdjustmentLister. func NewAdjustmentLister(indexer cache.Indexer) AdjustmentLister { return &adjustmentLister{indexer: indexer} } // List lists all Adjustments in the indexer. func (s *adjustmentLister) List(selector labels.Selector) (ret []*v1alpha1.Adjustment, err error) { err = cache.ListAll(s.indexer, selector, func(m interface{}) { ret = append(ret, m.(*v1alpha1.Adjustment)) }) return ret, err } // Adjustments returns an object that can list and get Adjustments. func (s *adjustmentLister) Adjustments(namespace string) AdjustmentNamespaceLister { return adjustmentNamespaceLister{indexer: s.indexer, namespace: namespace} } // AdjustmentNamespaceLister helps list and get Adjustments. type AdjustmentNamespaceLister interface { // List lists all Adjustments in the indexer for a given namespace. List(selector labels.Selector) (ret []*v1alpha1.Adjustment, err error) // Get retrieves the Adjustment from the indexer for a given namespace and name. Get(name string) (*v1alpha1.Adjustment, error) AdjustmentNamespaceListerExpansion } // adjustmentNamespaceLister implements the AdjustmentNamespaceLister // interface. type adjustmentNamespaceLister struct { indexer cache.Indexer namespace string } // List lists all Adjustments in the indexer for a given namespace. 
func (s adjustmentNamespaceLister) List(selector labels.Selector) (ret []*v1alpha1.Adjustment, err error) { err = cache.ListAllByNamespace(s.indexer, s.namespace, selector, func(m interface{}) { ret = append(ret, m.(*v1alpha1.Adjustment)) }) return ret, err } // Get retrieves the Adjustment from the indexer for a given namespace and name. func (s adjustmentNamespaceLister) Get(name string) (*v1alpha1.Adjustment, error) { obj, exists, err := s.indexer.GetByKey(s.namespace + "/" + name) if err != nil { return nil, err } if !exists { return nil, errors.NewNotFound(v1alpha1.Resource("adjustment"), name) } return obj.(*v1alpha1.Adjustment), nil } ================================================ FILE: pkg/apis/resmgr/generated/listers/resmgr/v1alpha1/expansion_generated.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by lister-gen. DO NOT EDIT. package v1alpha1 // AdjustmentListerExpansion allows custom methods to be added to // AdjustmentLister. type AdjustmentListerExpansion interface{} // AdjustmentNamespaceListerExpansion allows custom methods to be added to // AdjustmentNamespaceLister. type AdjustmentNamespaceListerExpansion interface{} ================================================ FILE: pkg/apis/resmgr/v1alpha1/adjustment-schema.yaml ================================================ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: name: adjustments.criresmgr.intel.com spec: group: criresmgr.intel.com names: kind: Adjustment singular: adjustment plural: adjustments scope: Namespaced versions: - name: v1alpha1 served: true storage: true schema: # openAPI V3 Schema for validating adjustments openAPIV3Schema: type: object required: [ spec ] properties: spec: type: object required: [ scope ] properties: scope: type: array items: type: object properties: nodes: type: array items: type: string containers: type: array items: type: object properties: key: type: string operator: type: string values: type: array items: type: string resources: type: object properties: requests: type: object properties: cpu: type: string memory: type: string limits: type: object properties: cpu: type: string memory: type: string classes: type: object properties: rdt: type: string blockio: type: string toptierLimit: type: string status: type: object properties: nodes: type: object additionalProperties: type: object properties: errors: type: object additionalProperties: type: string ================================================ FILE: pkg/apis/resmgr/v1alpha1/adjustment.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
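Once an informer has synced, the lister above serves reads from the local indexer instead of the API server. A minimal sketch (not a file in this repository), reusing a factory wired up as in the informer example earlier; the namespace is illustrative.

package main

import (
	"fmt"

	externalversions "github.com/intel/cri-resource-manager/pkg/apis/resmgr/generated/informers/externalversions"
	"k8s.io/apimachinery/pkg/labels"
)

// printCachedAdjustments lists Adjustments from the shared informer cache.
func printCachedAdjustments(factory externalversions.SharedInformerFactory) error {
	lister := factory.Criresmgr().V1alpha1().Adjustments().Lister()
	adjs, err := lister.Adjustments("default").List(labels.Everything())
	if err != nil {
		return err
	}
	for _, adj := range adjs {
		fmt.Println(adj.Name)
	}
	return nil
}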
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package v1alpha1

import (
	"fmt"
	"strings"

	resmgr "github.com/intel/cri-resource-manager/pkg/apis/resmgr"
	corev1 "k8s.io/api/core/v1"
)

// HasSameVersion checks if the adjustment has the same version as the other.
func (a *Adjustment) HasSameVersion(o *Adjustment) bool {
	if a.ResourceVersion != o.ResourceVersion {
		return false
	}
	if a.Generation != o.Generation {
		return false
	}
	return true
}

// NodeScope returns the sub-slice of scopes that apply to the given node.
func (spec *AdjustmentSpec) NodeScope(node string) []AdjustmentScope {
	filtered := []AdjustmentScope{}
	for _, scope := range spec.Scope {
		if scope.IsNodeInScope(node) {
			filtered = append(filtered, scope)
		}
	}
	return filtered
}

// GetResourceRequirements returns the k8s resource requirements for this adjustment.
func (spec *AdjustmentSpec) GetResourceRequirements() (corev1.ResourceRequirements, bool) {
	if spec.Resources != nil {
		return *spec.Resources, true
	}
	return corev1.ResourceRequirements{}, false
}

// GetRDTClass returns the RDT class for this adjustment.
func (spec *AdjustmentSpec) GetRDTClass() (string, bool) {
	if spec.Classes == nil || spec.Classes.RDT == nil {
		return "", false
	}
	return *spec.Classes.RDT, true
}

// GetBlockIOClass returns the Block I/O class for this adjustment.
func (spec *AdjustmentSpec) GetBlockIOClass() (string, bool) {
	if spec.Classes == nil || spec.Classes.BlockIO == nil {
		return "", false
	}
	return *spec.Classes.BlockIO, true
}

// IsNodeInScope tests if the node is within the scope of this spec.
func (spec *AdjustmentSpec) IsNodeInScope(node string) bool {
	if len(spec.Scope) == 0 {
		return true
	}
	for _, s := range spec.Scope {
		if s.IsNodeInScope(node) {
			return true
		}
	}
	return false
}

// IsContainerInScope tests if the container is within the scope of this spec.
func (spec *AdjustmentSpec) IsContainerInScope(container resmgr.Evaluable) bool {
	if len(spec.Scope) == 0 {
		return true
	}
	for _, s := range spec.Scope {
		if s.IsContainerInScope(container) {
			return true
		}
	}
	return false
}

// Compare checks if this spec is identical to another.
func (spec *AdjustmentSpec) Compare(other *AdjustmentSpec) bool {
	switch {
	case !CompareScopes(spec.Scope, other.Scope):
		return false
	case !spec.compareResources(other):
		return false
	case !spec.Classes.Compare(other.Classes):
		return false
	case spec.ToptierLimit == nil && other.ToptierLimit != nil:
		return false
	case spec.ToptierLimit != nil && other.ToptierLimit == nil:
		return false
	case spec.ToptierLimit != nil && spec.ToptierLimit.Value() != other.ToptierLimit.Value():
		return false
	}
	return true
}

// Verify checks the given spec for obvious errors.
func (spec *AdjustmentSpec) Verify() error {
	if err := spec.verifyResources(); err != nil {
		return err
	}
	if err := spec.verifyToptierLimit(); err != nil {
		return err
	}
	return nil
}

// Check if the resources in this spec are identical to another one.
func (spec *AdjustmentSpec) compareResources(other *AdjustmentSpec) bool {
	switch {
	case spec == nil && other == nil:
		return true
	// a nil spec is only identical to another nil spec
	case spec != nil && other == nil:
		return false
	case spec == nil && other != nil:
		return false
	case spec.Resources == nil && other.Resources == nil:
		return true
	case spec.Resources != nil && other.Resources == nil:
		return false
	case spec.Resources == nil && other.Resources != nil:
		return false
	}
	r := *spec.Resources
	o := *other.Resources
	if len(r.Requests) != len(o.Requests) {
		return false
	}
	if len(r.Limits) != len(o.Limits) {
		return false
	}
	for name, qty := range r.Requests {
		oqty, ok := o.Requests[name]
		if !ok || qty.Cmp(oqty) != 0 {
			return false
		}
	}
	for name, qty := range r.Limits {
		oqty, ok := o.Limits[name]
		if !ok || qty.Cmp(oqty) != 0 {
			return false
		}
	}
	return true
}

// verifyResources verifies the resource requirements of this spec.
func (spec *AdjustmentSpec) verifyResources() error {
	if spec.Resources == nil {
		return nil
	}
	r := *spec.Resources
	if r.Requests == nil {
		r.Requests = corev1.ResourceList{}
	}
	if r.Limits == nil {
		r.Limits = corev1.ResourceList{}
	}
	req, rok := r.Requests[corev1.ResourceCPU]
	lim, lok := r.Limits[corev1.ResourceCPU]
	switch {
	case !rok && lok:
		r.Requests[corev1.ResourceCPU] = lim
	case rok && lok:
		if lim.Cmp(req) < 0 {
			return apiError("invalid CPU limit %q < request %q", lim, req)
		}
	}
	req, rok = r.Requests[corev1.ResourceMemory]
	lim, lok = r.Limits[corev1.ResourceMemory]
	switch {
	case !rok && lok:
		r.Requests[corev1.ResourceMemory] = lim
	case rok && lok:
		if lim.Cmp(req) < 0 {
			return apiError("invalid memory limit %q < request %q", lim, req)
		}
	}
	for name := range r.Requests {
		switch name {
		case corev1.ResourceCPU, corev1.ResourceMemory:
		default:
			return apiError("invalid resource requests: unsupported resource %v", name)
		}
	}
	for name := range r.Limits {
		switch name {
		case corev1.ResourceCPU, corev1.ResourceMemory:
		default:
			return apiError("invalid resource limits: unsupported resource %v", name)
		}
	}
	return nil
}

// verifyToptierLimit verifies the top tier memory limit settings of this spec.
func (spec *AdjustmentSpec) verifyToptierLimit() error {
	if spec.ToptierLimit == nil {
		return nil
	}
	l := spec.ToptierLimit.Value()
	if l < 0 {
		return apiError("invalid ToptierLimit %v", l)
	}
	return nil
}

// IsNodeInScope tests if the node is within this scope.
func (scope *AdjustmentScope) IsNodeInScope(node string) bool {
	if len(scope.Nodes) == 0 {
		return true
	}
	for _, n := range scope.Nodes {
		if matches(n, node) {
			return true
		}
	}
	return false
}

// IsContainerInScope tests if the container is within this scope.
func (scope *AdjustmentScope) IsContainerInScope(container resmgr.Evaluable) bool {
	if len(scope.Containers) == 0 {
		return true
	}
	for _, expr := range scope.Containers {
		if expr.Evaluate(container) {
			return true
		}
	}
	return false
}

// match a string against a primitive pattern with a single optional trailing '*'.
func matches(pattern, name string) bool {
	if pattern == "" {
		return true
	}
	if !strings.HasSuffix(pattern, "*") {
		return pattern == name
	}
	return strings.HasPrefix(name, pattern[0:len(pattern)-1])
}

// CompareScopes checks if two slices of scopes are (syntactically) identical.
func CompareScopes(scopes []AdjustmentScope, others []AdjustmentScope) bool {
	if len(scopes) != len(others) {
		return false
	}
	for idx, s := range scopes {
		o := others[idx]
		if !s.Compare(&o) {
			return false
		}
	}
	return true
}

// Compare checks if the scope is identical to another one.
func (scope *AdjustmentScope) Compare(other *AdjustmentScope) bool {
	if len(scope.Nodes) != len(other.Nodes) || len(scope.Containers) != len(other.Containers) {
		return false
	}
	for idx, n := range scope.Nodes {
		if other.Nodes[idx] != n {
			return false
		}
	}
	for idx, c := range scope.Containers {
		if other.Containers[idx] != c {
			return false
		}
	}
	return true
}

// Compare checks if the classes are identical to another set.
func (c *Classes) Compare(o *Classes) bool {
	switch {
	case c == nil && o == nil:
		return true
	case c != nil && o == nil, c == nil && o != nil:
		return false
	case c.RDT != nil && o.RDT == nil, c.RDT == nil && o.RDT != nil:
		return false
	case c.BlockIO != nil && o.BlockIO == nil, c.BlockIO == nil && o.BlockIO != nil:
		return false
	case c.RDT == nil && c.BlockIO == nil:
		return true
	}
	// Dereference each class only when it is set; dereferencing both
	// unconditionally would panic when only one of them is assigned.
	if c.RDT != nil && *c.RDT != *o.RDT {
		return false
	}
	if c.BlockIO != nil && *c.BlockIO != *o.BlockIO {
		return false
	}
	return true
}

// apiError returns a format error specific to this API.
func apiError(format string, args ...interface{}) error {
	return fmt.Errorf("adjustment API error: "+format, args...)
}

================================================
FILE: pkg/apis/resmgr/v1alpha1/doc.go
================================================
// Copyright 2019-2020 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +k8s:deepcopy-gen=package
// +groupName=criresmgr.intel.com
package v1alpha1

================================================
FILE: pkg/apis/resmgr/v1alpha1/register.go
================================================
// Copyright 2019-2020 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package v1alpha1

import (
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/runtime/schema"
)

var (
	// SchemeBuilder initializes a scheme builder
	SchemeBuilder = runtime.NewSchemeBuilder(addKnownTypes)
	// AddToScheme is a global function that registers this API group & version to a scheme
	AddToScheme = SchemeBuilder.AddToScheme
)

// SchemeGroupVersion is group version used to register these objects.
var SchemeGroupVersion = schema.GroupVersion{
	Group:   GroupName,
	Version: Version,
}

func Resource(resource string) schema.GroupResource {
	return SchemeGroupVersion.WithResource(resource).GroupResource()
}

// Adds the list of known types to api.Scheme.
func addKnownTypes(scheme *runtime.Scheme) error { scheme.AddKnownTypes(SchemeGroupVersion, &Adjustment{}, &AdjustmentList{}, ) metav1.AddToGroupVersion(scheme, SchemeGroupVersion) return nil } ================================================ FILE: pkg/apis/resmgr/v1alpha1/types.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package v1alpha1 import ( corev1 "k8s.io/api/core/v1" resapi "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" resmgr "github.com/intel/cri-resource-manager/pkg/apis/resmgr" ) const ( GroupName string = "criresmgr.intel.com" // GroupName is the group of our CRD. Version string = "v1alpha1" // Version is the API version of our CRD. Kind string = "Adjustment" // Kind is the object kind of our CRD. Plural string = "adjustments" // Plural is Kind in plural form. Singular string = "adjustment" // Singular is Kind in singular form. Name string = Plural + "." + GroupName // Name is the full name of our CRD. ) // +genclient // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object // Adjustment is a CRD used to externally adjust containers resource assignments. type Adjustment struct { metav1.TypeMeta `json:",inline"` metav1.ObjectMeta `json:"metadata,omitempty"` Spec AdjustmentSpec `json:"spec"` Status AdjustmentStatus `json:"status"` } // AdjustmentSpec specifies the scope for an external adjustment. type AdjustmentSpec struct { Scope []AdjustmentScope `json:"scope"` Resources *corev1.ResourceRequirements `json:"resources"` Classes *Classes `json:"classes"` ToptierLimit *resapi.Quantity `json:"toptierLimit"` } // AdjustmentStatus represents the status of applying an adjustment. type AdjustmentStatus struct { Nodes map[string]AdjustmentNodeStatus `json:"nodes"` } // AdjustmentNodeStatus represents the status of an adjustment on a node. type AdjustmentNodeStatus struct { Errors map[string]string `json:"errors"` } // AdjustmentScope defines the scope for an adjustment. type AdjustmentScope struct { Nodes []string `json:"nodes"` Containers []*resmgr.Expression `json:"containers"` } // Classes defines RDT and BlockIO class assignments. type Classes struct { BlockIO *string `json:"blockio"` RDT *string `json:"rdt"` } // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object // AdjustmentList is a list of Adjustments. type AdjustmentList struct { metav1.TypeMeta `json:",inline"` metav1.ListMeta `json:"metadata"` Items []Adjustment `json:"items"` } ================================================ FILE: pkg/apis/resmgr/v1alpha1/zz_generated.deepcopy.go ================================================ //go:build !ignore_autogenerated // +build !ignore_autogenerated // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
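Putting the types above together with the helper methods from adjustment.go: a minimal sketch (not a file in this repository) that constructs a hypothetical AdjustmentSpec, validates it with Verify(), and exercises the trailing-'*' node pattern implemented by matches().

package main

import (
	"fmt"

	v1alpha1 "github.com/intel/cri-resource-manager/pkg/apis/resmgr/v1alpha1"
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

func exampleSpec() error {
	rdt := "gold" // hypothetical RDT class name
	spec := &v1alpha1.AdjustmentSpec{
		// "worker-*" matches any node whose name starts with "worker-".
		Scope: []v1alpha1.AdjustmentScope{{Nodes: []string{"worker-*"}}},
		Resources: &corev1.ResourceRequirements{
			Requests: corev1.ResourceList{
				corev1.ResourceCPU:    resource.MustParse("500m"),
				corev1.ResourceMemory: resource.MustParse("256Mi"),
			},
		},
		Classes: &v1alpha1.Classes{RDT: &rdt},
	}
	if err := spec.Verify(); err != nil {
		return err
	}
	fmt.Println(spec.IsNodeInScope("worker-1")) // true
	fmt.Println(spec.IsNodeInScope("master-1")) // false
	return nil
}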
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Code generated by deepcopy-gen. DO NOT EDIT. package v1alpha1 import ( resmgr "github.com/intel/cri-resource-manager/pkg/apis/resmgr" v1 "k8s.io/api/core/v1" runtime "k8s.io/apimachinery/pkg/runtime" ) // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *Adjustment) DeepCopyInto(out *Adjustment) { *out = *in out.TypeMeta = in.TypeMeta in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) in.Spec.DeepCopyInto(&out.Spec) in.Status.DeepCopyInto(&out.Status) return } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Adjustment. func (in *Adjustment) DeepCopy() *Adjustment { if in == nil { return nil } out := new(Adjustment) in.DeepCopyInto(out) return out } // DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. func (in *Adjustment) DeepCopyObject() runtime.Object { if c := in.DeepCopy(); c != nil { return c } return nil } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *AdjustmentList) DeepCopyInto(out *AdjustmentList) { *out = *in out.TypeMeta = in.TypeMeta in.ListMeta.DeepCopyInto(&out.ListMeta) if in.Items != nil { in, out := &in.Items, &out.Items *out = make([]Adjustment, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } } return } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AdjustmentList. func (in *AdjustmentList) DeepCopy() *AdjustmentList { if in == nil { return nil } out := new(AdjustmentList) in.DeepCopyInto(out) return out } // DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. func (in *AdjustmentList) DeepCopyObject() runtime.Object { if c := in.DeepCopy(); c != nil { return c } return nil } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *AdjustmentNodeStatus) DeepCopyInto(out *AdjustmentNodeStatus) { *out = *in if in.Errors != nil { in, out := &in.Errors, &out.Errors *out = make(map[string]string, len(*in)) for key, val := range *in { (*out)[key] = val } } return } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AdjustmentNodeStatus. func (in *AdjustmentNodeStatus) DeepCopy() *AdjustmentNodeStatus { if in == nil { return nil } out := new(AdjustmentNodeStatus) in.DeepCopyInto(out) return out } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *AdjustmentScope) DeepCopyInto(out *AdjustmentScope) { *out = *in if in.Nodes != nil { in, out := &in.Nodes, &out.Nodes *out = make([]string, len(*in)) copy(*out, *in) } if in.Containers != nil { in, out := &in.Containers, &out.Containers *out = make([]*resmgr.Expression, len(*in)) for i := range *in { if (*in)[i] != nil { in, out := &(*in)[i], &(*out)[i] *out = (*in).DeepCopy() } } } return } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AdjustmentScope. 
func (in *AdjustmentScope) DeepCopy() *AdjustmentScope { if in == nil { return nil } out := new(AdjustmentScope) in.DeepCopyInto(out) return out } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *AdjustmentSpec) DeepCopyInto(out *AdjustmentSpec) { *out = *in if in.Scope != nil { in, out := &in.Scope, &out.Scope *out = make([]AdjustmentScope, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } } if in.Resources != nil { in, out := &in.Resources, &out.Resources *out = new(v1.ResourceRequirements) (*in).DeepCopyInto(*out) } if in.Classes != nil { in, out := &in.Classes, &out.Classes *out = new(Classes) (*in).DeepCopyInto(*out) } if in.ToptierLimit != nil { in, out := &in.ToptierLimit, &out.ToptierLimit x := (*in).DeepCopy() *out = &x } return } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AdjustmentSpec. func (in *AdjustmentSpec) DeepCopy() *AdjustmentSpec { if in == nil { return nil } out := new(AdjustmentSpec) in.DeepCopyInto(out) return out } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *AdjustmentStatus) DeepCopyInto(out *AdjustmentStatus) { *out = *in if in.Nodes != nil { in, out := &in.Nodes, &out.Nodes *out = make(map[string]AdjustmentNodeStatus, len(*in)) for key, val := range *in { (*out)[key] = *val.DeepCopy() } } return } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AdjustmentStatus. func (in *AdjustmentStatus) DeepCopy() *AdjustmentStatus { if in == nil { return nil } out := new(AdjustmentStatus) in.DeepCopyInto(out) return out } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *Classes) DeepCopyInto(out *Classes) { *out = *in if in.BlockIO != nil { in, out := &in.BlockIO, &out.BlockIO *out = new(string) **out = **in } if in.RDT != nil { in, out := &in.RDT, &out.RDT *out = new(string) **out = **in } return } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Classes. func (in *Classes) DeepCopy() *Classes { if in == nil { return nil } out := new(Classes) in.DeepCopyInto(out) return out }
================================================ FILE: pkg/avx/collector.go ================================================
/* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package avx //go:generate go run elfdump.go import ( "bytes" "flag" "fmt" "os" "path/filepath" "regexp" "strconv" "strings" "sync" "syscall" "unsafe" bpf "github.com/cilium/ebpf" "github.com/intel/cri-resource-manager/pkg/cgroups" logger "github.com/intel/cri-resource-manager/pkg/log" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "golang.org/x/sys/unix" ) const ( // LastCPUName is the Prometheus Gauge name for last CPU with AVX512 instructions.
LastCPUName = "last_cpu_avx_task_switches" // AVXSwitchCountName is the Prometheus Gauge name for AVX switch count per cgroup. AVXSwitchCountName = "avx_switch_count_per_cgroup" // AllSwitchCountName is the Prometheus Gauge name for all switch count per cgroup. AllSwitchCountName = "all_switch_count_per_cgroup" // LastUpdateNs is the Prometheus Gauge name for the per-cgroup AVX512 activity timestamp. LastUpdateNs = "last_update_ns" // Path to kernel tracepoints kernelTracepointPath = "/sys/kernel/debug/tracing/events" // rlimit value (512k) needed to lock map data in memory mapMemLockLimit = 524288 ) // Prometheus Metric descriptor indices and descriptor table const ( lastCPUDesc = iota avxSwitchCountDesc allSwitchCountDesc lastUpdateNsDesc numDescriptors ) var descriptors = [numDescriptors]*prometheus.Desc{ lastCPUDesc: prometheus.NewDesc( LastCPUName, "Number of task switches on the CPU where AVX512 instructions were used.", []string{ "cpu_id", }, nil, ), avxSwitchCountDesc: prometheus.NewDesc( AVXSwitchCountName, "Number of task switches where AVX512 instructions were used in a particular cgroup.", []string{ "container_id", }, nil, ), allSwitchCountDesc: prometheus.NewDesc( AllSwitchCountName, "Total number of task switches in a particular cgroup.", []string{ "container_id", }, nil, ), lastUpdateNsDesc: prometheus.NewDesc( LastUpdateNs, "Time since last AVX512 activity in a particular cgroup.", []string{ "container_id", }, nil, ), } var ( bpfBinaryName = "avx512.o" bpfInstallpath = "/usr/libexec/bpf" // our logger instance log = logger.NewLogger("avx") ) type collector struct { root string ebpf *bpf.Collection fds []int } func enablePerfTracepoint(prog *bpf.Program, tracepoint string) (int, error) { id, err := os.ReadFile(filepath.Join(kernelTracepointPath, tracepoint, "id")) if err != nil { return -1, errors.Wrap(err, "unable to read tracepoint ID") } tid, err := strconv.Atoi(strings.TrimSpace(string(id))) if err != nil { return -1, errors.New("unable to convert tracepoint ID") } attr := unix.PerfEventAttr{ Type: unix.PERF_TYPE_TRACEPOINT, Config: uint64(tid), // tracepoint id Sample_type: unix.PERF_SAMPLE_RAW, Sample: 1, Wakeup: 1, } pfd, err := unix.PerfEventOpen(&attr, -1, 0, -1, unix.PERF_FLAG_FD_CLOEXEC) if err != nil { return -1, errors.Wrap(err, "unable to open perf events") } if _, _, errno := unix.Syscall(unix.SYS_IOCTL, uintptr(pfd), unix.PERF_EVENT_IOC_ENABLE, 0); errno != 0 { return -1, errors.Errorf("unable to set up perf events: %s", errno) } if _, _, errno := unix.Syscall(unix.SYS_IOCTL, uintptr(pfd), unix.PERF_EVENT_IOC_SET_BPF, uintptr(prog.FD())); errno != 0 { return -1, errors.Errorf("unable to attach bpf program to perf events: %s", errno) } return pfd, nil } func getKernelVersion() uint32 { var uts unix.Utsname err := unix.Uname(&uts) if err != nil { return 0 } str := string(bytes.SplitN(uts.Release[:], []byte{0}, 2)[0]) ver := strings.SplitN(str, ".", 3) major, err := strconv.ParseUint(ver[0], 10, 8) if err != nil { return 0 } minor, err := strconv.ParseUint(ver[1], 10, 8) if err != nil { return uint32(major << 16) } // ignore patch version return uint32(major<<16 + minor<<8) } func kernelVersionStr(v uint32) string { return fmt.Sprintf("%d.%d.0", v>>16, (v>>8)&0xff) } // NewCollector creates new Prometheus collector for AVX metrics func NewCollector() (prometheus.Collector, error) { // Set rlimit to be able to lock map values in memory memlockLimit := &unix.Rlimit{ Cur: mapMemLockLimit, Max: mapMemLockLimit, } err := unix.Setrlimit(unix.RLIMIT_MEMLOCK,
memlockLimit) if err != nil { return nil, errors.Wrap(err, "unable to set rlimit") } spec, err := bpf.LoadCollectionSpec(filepath.Join(bpfInstallpath, bpfBinaryName)) if err != nil { log.Info("Unable to load user eBPF (%v). Using default CollectionSpec from ELF program bytes", err) spec, err = bpf.LoadCollectionSpecFromReader(bytes.NewReader(program[:])) if err != nil { return nil, errors.Wrap(err, "unable to load default CollectionSpec from ELF program bytes") } } hostVer := getKernelVersion() progVer := spec.Programs["tracepoint__x86_fpu_regs_deactivated"].KernelVersion if hostVer < progVer { return nil, errors.Errorf("host kernel version (v%s) is too old to run the AVX512 collector program, minimum version is v%s", kernelVersionStr(hostVer), kernelVersionStr(progVer)) } collection, err := bpf.NewCollection(spec) if err != nil { return nil, errors.Wrap(err, "unable to create new Collection") } ffd, err := enablePerfTracepoint(collection.Programs["tracepoint__x86_fpu_regs_deactivated"], "x86_fpu/x86_fpu_regs_deactivated") if err != nil { return nil, errors.Wrap(err, "unable to enable fpu tracepoint") } sfd, err := enablePerfTracepoint(collection.Programs["tracepoint__sched_switch"], "sched/sched_switch") if err != nil { return nil, errors.Wrap(err, "unable to enable sched tracepoint") } return &collector{ root: cgroups.GetV2Dir(), ebpf: collection, fds: []int{ffd, sfd}, }, nil } // Describe implements prometheus.Collector interface func (c *collector) Describe(ch chan<- *prometheus.Desc) { for _, d := range descriptors { ch <- d } } // from iovisor/gobpf: bpf.NowNanoseconds() // nowNanoseconds returns a time that can be compared to bpf_ktime_get_ns() func nowNanoseconds() uint64 { var ts syscall.Timespec syscall.Syscall(syscall.SYS_CLOCK_GETTIME, 1 /* CLOCK_MONOTONIC */, uintptr(unsafe.Pointer(&ts)), 0) sec, nsec := ts.Unix() return 1000*1000*1000*uint64(sec) + uint64(nsec) } // Collect implements prometheus.Collector interface func (c collector) Collect(ch chan<- prometheus.Metric) { var ( wg sync.WaitGroup key uint64 perCPUVal []uint32 ) cgroupids := make(map[uint64]uint32) lastCPUs := make(map[string]uint32) cg := cgroups.NewCgroupID(c.root) m := c.ebpf.Maps["avx_context_switch_count_hash"] iter := m.Iterate() for iter.Next(&key, &perCPUVal) { var sum uint32 for cpuID, count := range perCPUVal { if count == 0 { continue } sum = sum + count cpuX := fmt.Sprintf("CPU%d", cpuID) lastCPUs[cpuX] = lastCPUs[cpuX] + count } cgroupids[key] = sum log.Debug("cgroupid %d => counter %d", key, sum) // reset the counter by deleting the key err := m.Delete(key) if err != nil { log.Error("%+v", err) } } if iter.Err() != nil { log.Error("unable to iterate all elements of avx_context_switch_count: %+v", iter.Err()) } for lastCPU, count := range lastCPUs { ch <- prometheus.MustNewConstMetric( descriptors[lastCPUDesc], prometheus.GaugeValue, float64(count), lastCPU) } for cgroupid, counter := range cgroupids { wg.Add(1) go func(cgroupid_ uint64, counter_ uint32) { var allCount uint32 var lastUpdate uint64 defer wg.Done() path, err := cg.Find(cgroupid_) if err != nil { log.Error("failed to find cgroup by id: %v", err) return } re := regexp.MustCompile(`[a-z0-9]{64}`) matches := re.FindStringSubmatch(filepath.Base(path)) if len(matches) == 0 { return } ch <- prometheus.MustNewConstMetric( descriptors[avxSwitchCountDesc], prometheus.GaugeValue, float64(counter_), matches[0]) if err := c.ebpf.Maps["all_context_switch_count_hash"].Lookup(uint64(cgroupid_), &allCount); err != nil { log.Error("unable to
find 'all' context switch count: %+v", err) return } log.Debug("all: %d", allCount) if err := c.ebpf.Maps["last_update_ns_hash"].Lookup(uint64(cgroupid_), &lastUpdate); err != nil { log.Error("unable to find last update timestamp: %+v", err) return } log.Debug("last: %d", lastUpdate) ch <- prometheus.MustNewConstMetric( descriptors[allSwitchCountDesc], prometheus.GaugeValue, float64(allCount), re.FindStringSubmatch(filepath.Base(path))[0]) ch <- prometheus.MustNewConstMetric( descriptors[lastUpdateNsDesc], prometheus.GaugeValue, float64(nowNanoseconds()-lastUpdate), re.FindStringSubmatch(filepath.Base(path))[0]) }(cgroupid, counter) } // We need to wait so that the response channel doesn't get closed. wg.Wait() m = c.ebpf.Maps["all_context_switch_count_hash"] iter = m.Iterate() var val uint32 for iter.Next(&key, &val) { // reset the counter by deleting the key err := m.Delete(key) if err != nil { log.Error("%+v", err) } } if iter.Err() != nil { log.Error("unable to reset all elements of all_context_switch_count: %+v", iter.Err()) } } func init() { flag.StringVar(&bpfInstallpath, "bpf-install-path", bpfInstallpath, "Path to eBPF install directory") } ================================================ FILE: pkg/avx/elfdump.go ================================================ //go:build ignore // +build ignore /* Copyright 2020 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package main import ( "encoding/hex" "fmt" "os" "strings" "text/template" ) const ( blocksPerRow = 12 ) type Program struct { ProgramLines []string } func main() { f, err := os.ReadFile("../../libexec/avx512.o") if err != nil { fmt.Println("Note: AVX512 eBPF ELF not available.") } enc := make([]byte, hex.EncodedLen(len(f))) enclen := hex.Encode(enc, f) var j int var row strings.Builder program := make([]string, 0) for i := 0; i < enclen-1; i = i + 2 { fmt.Fprintf(&row, "0x%s, ", enc[i:i+2]) j++ if j%blocksPerRow == 0 { program = append(program, row.String()) row.Reset() } } // flush last row program = append(program, row.String()) p := Program{ ProgramLines: program, } template := template.Must(template.New("").Parse(`// Code generated by go generate; DO NOT EDIT. package avx var program = [...]byte{ {{- range .ProgramLines }} {{ printf "%s" . }} {{- end }} } `)) outfile, err := os.Create("programbytes_gendata.go") if err != nil { fmt.Println("elfdump:", err) os.Exit(1) } defer outfile.Close() err = template.Execute(outfile, p) if err != nil { fmt.Println("elfdump:", err) } } ================================================ FILE: pkg/avx/register.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !noavx // +build !noavx package avx import ( "github.com/intel/cri-resource-manager/pkg/metrics" ) func init() { err := metrics.RegisterCollector("avx", NewCollector) if err != nil { log.Error("Failed to register AVX collector: %v", err) } }
================================================ FILE: pkg/blockio/blockio.go ================================================
/* Copyright 2020 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package blockio import ( "errors" "fmt" "os" "path/filepath" "sort" "strings" "syscall" "golang.org/x/sys/unix" "k8s.io/apimachinery/pkg/api/resource" "github.com/intel/cri-resource-manager/pkg/cgroups" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache" logger "github.com/intel/cri-resource-manager/pkg/log" ) const ( // ConfigModuleName is the configuration section of blockio class definitions ConfigModuleName = "blockio" // sysfsBlockDeviceIOSchedulerPaths expands (with glob) to block device scheduler files. // If modified, check how to parse device node from expanded paths. sysfsBlockDeviceIOSchedulerPaths = "/sys/block/*/queue/scheduler" ) // Class represents a block I/O class, a class name together with its associated // parameters, essentially a single key/value pair from staticOciBlockIO below. // This type is only used for querying all (static) block I/O classes in a sorting-friendly // form. type Class struct { Name string Parameters cgroups.OciBlockIOParameters } // BlockDeviceInfo holds information on a block device to be configured. // As users can specify block devices using wildcards ("/dev/disk/by-id/*SSD*") // BlockDeviceInfo.Origin is maintained for traceability: why this // block device is included in configuration. // BlockDeviceInfo.DevNode contains the resolved device node, like "/dev/sda". type BlockDeviceInfo struct { Major int64 Minor int64 DevNode string Origin string } // Our logger instance. var log logger.Logger = logger.NewLogger("blockio") // staticOciBlockIO connects user-defined block I/O classes to // corresponding OCI BlockIO parameters. "Static" means that // new/current block devices matching device wildcards in these // classes are not expanded every time new containers are assigned to // these classes. Devices are scanned only at the beginning and on // blockio configuration changes. var staticOciBlockIO = map[string]cgroups.OciBlockIOParameters{} // currentIOSchedulers contains io-schedulers (found in // sysfsBlockDeviceIOSchedulerPaths) of device nodes: // {"/dev/sda": "bfq"} var currentIOSchedulers map[string]string
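Before the definitions that follow, a hedged sketch of how this query API might be used once UpdateOciConfig (below) has populated staticOciBlockIO; logAvailableClasses is a hypothetical helper, not part of the package:

// logAvailableClasses enumerates the configured block I/O classes via
// GetClasses (defined below) and logs each class and its default weight.
func logAvailableClasses() {
	for _, class := range GetClasses() {
		log.Info("block I/O class %q: weight %d, %d per-device weight(s)",
			class.Name, class.Parameters.Weight, len(class.Parameters.WeightDevice))
	}
}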
// GetClasses returns block I/O class names and associated parameters in a sorted slice. func GetClasses() []*Class { classes := make([]*Class, 0, len(staticOciBlockIO)) for name, params := range staticOciBlockIO { classes = append(classes, &Class{Name: name, Parameters: params}) } sort.Slice(classes, func(i, j int) bool { return strings.Compare(classes[i].Name, classes[j].Name) < 0 }) return classes } // UpdateOciConfig converts the configuration in the opt variable into staticOciBlockIO func UpdateOciConfig(ignoreErrors bool) error { var ioSchedulerDetectionError error currentIOSchedulers, ioSchedulerDetectionError = getCurrentIOSchedulers() if ioSchedulerDetectionError != nil { log.Warn("configuration validation partly disabled due to IO scheduler detection error %#v", ioSchedulerDetectionError.Error()) } staticOciBlockIO = map[string]cgroups.OciBlockIOParameters{} // Create static OCI BlockIO structures for each blockio class for class := range opt.Classes { ociBlockIO, err := devicesParametersToOci(opt.Classes[class], currentIOSchedulers) if err != nil { if ignoreErrors { log.Error("ignoring: %v", err) } else { return err } } // Handle all configurations as static for now. That // is, the list of block devices matching Devices // wildcards will not be updated without new // configNotify(). class.DynamicDevices not supported // yet. staticOciBlockIO[class] = ociBlockIO } return nil } // SetContainerClass assigns a container to a blockio class. func SetContainerClass(c cache.Container, class string) error { ociBlockIO, classIsStatic := staticOciBlockIO[class] if !classIsStatic { return blockioError("no OCI BlockIO parameters for class %#v", class) } blkioCgroupRoot := cgroups.Blkio.Path() containerCgroupDir := c.GetCgroupDir() if containerCgroupDir == "" { return blockioError("failed to find cgroup directory for container %s under %#v, container id %#v", c.PrettyName(), blkioCgroupRoot, c.GetID()) } containerCgroupPath := filepath.Join(blkioCgroupRoot, containerCgroupDir) err := cgroups.ResetBlkioParameters(containerCgroupPath, ociBlockIO) if err != nil { return blockioError("assigning container %v to class %#v failed: %w", c.PrettyName(), class, err) } return nil } // getCurrentIOSchedulers returns the currently active io-scheduler for each block device in the system. func getCurrentIOSchedulers() (map[string]string, error) { var ios = map[string]string{} schedulerFiles, err := filepath.Glob(sysfsBlockDeviceIOSchedulerPaths) if err != nil { return ios, blockioError("error in IO scheduler wildcards %#v: %w", sysfsBlockDeviceIOSchedulerPaths, err) } for _, schedulerFile := range schedulerFiles { devName := strings.SplitN(schedulerFile, "/", 5)[3] schedulerDataB, err := os.ReadFile(schedulerFile) if err != nil { // A block device may be disconnected. Continue without error. log.Error("failed to read current IO scheduler %#v: %v", schedulerFile, err) continue } schedulerData := strings.Trim(string(schedulerDataB), "\n") currentScheduler := "" if strings.IndexByte(schedulerData, ' ') == -1 { currentScheduler = schedulerData } else { openB := strings.Index(schedulerData, "[") closeB := strings.Index(schedulerData, "]") if -1 < openB && openB < closeB { currentScheduler = schedulerData[openB+1 : closeB] } } if currentScheduler == "" { return ios, blockioError("could not parse current scheduler in %#v", schedulerFile) } ios["/dev/"+devName] = currentScheduler } return ios, nil }
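A hedged sketch of the conversion implemented next; the device node, io-scheduler map, and values are assumptions for illustration (DevicesParameters is defined in config.go further below):

// exampleDeviceThrottling is a hypothetical helper showing the inputs and
// outputs of devicesParametersToOci.
func exampleDeviceThrottling() (cgroups.OciBlockIOParameters, error) {
	dps := []DevicesParameters{{
		Devices:         []string{"/dev/sda"}, // assumed device node
		ThrottleReadBps: "1G",                 // resource.ParseQuantity: "1G" => 1000000000
		Weight:          "100",                // validated range below: 10..1000
	}}
	// With bfq active on /dev/sda, the weight applies without warnings; the
	// result carries one WeightDevice and one ThrottleReadBpsDevice entry
	// keyed by /dev/sda's major:minor numbers.
	return devicesParametersToOci(dps, map[string]string{"/dev/sda": "bfq"})
}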
// devicesParametersToOci converts the parameters of a single blockio class into an OCI BlockIO structure. func devicesParametersToOci(dps []DevicesParameters, currentIOSchedulers map[string]string) (cgroups.OciBlockIOParameters, error) { errs := []error{} oci := cgroups.NewOciBlockIOParameters() for _, dp := range dps { var err error var weight, throttleReadBps, throttleWriteBps, throttleReadIOPS, throttleWriteIOPS int64 weight, err = parseAndValidateInt64("Weight", dp.Weight, -1, 10, 1000) errs = append(errs, err) throttleReadBps, err = parseAndValidateInt64("ThrottleReadBps", dp.ThrottleReadBps, -1, 0, -1) errs = append(errs, err) throttleWriteBps, err = parseAndValidateInt64("ThrottleWriteBps", dp.ThrottleWriteBps, -1, 0, -1) errs = append(errs, err) throttleReadIOPS, err = parseAndValidateInt64("ThrottleReadIOPS", dp.ThrottleReadIOPS, -1, 0, -1) errs = append(errs, err) throttleWriteIOPS, err = parseAndValidateInt64("ThrottleWriteIOPS", dp.ThrottleWriteIOPS, -1, 0, -1) errs = append(errs, err) if dp.Devices == nil { if weight > -1 { oci.Weight = weight } if throttleReadBps > -1 || throttleWriteBps > -1 || throttleReadIOPS > -1 || throttleWriteIOPS > -1 { errs = append(errs, fmt.Errorf("ignoring throttling (rbps=%#v wbps=%#v riops=%#v wiops=%#v): Devices not listed", dp.ThrottleReadBps, dp.ThrottleWriteBps, dp.ThrottleReadIOPS, dp.ThrottleWriteIOPS)) } } else { blockDevices, err := currentPlatform.configurableBlockDevices(dp.Devices) if err != nil { // Problems in matching block device wildcards and resolving symlinks // are worth reporting, but must not block configuring blkio where possible. log.Error(err.Error()) } if len(blockDevices) == 0 { log.Warn("no matches on any of Devices: %v, parameters ignored", dp.Devices) } for _, blockDeviceInfo := range blockDevices { if weight != -1 { if ios, found := currentIOSchedulers[blockDeviceInfo.DevNode]; found { if ios != "bfq" && ios != "cfq" { log.Warn("weight has no effect on device %#v due to "+ "incompatible io-scheduler %#v (bfq or cfq required)", blockDeviceInfo.DevNode, ios) } } oci.WeightDevice.Update(blockDeviceInfo.Major, blockDeviceInfo.Minor, weight) } if throttleReadBps != -1 { oci.ThrottleReadBpsDevice.Update(blockDeviceInfo.Major, blockDeviceInfo.Minor, throttleReadBps) } if throttleWriteBps != -1 { oci.ThrottleWriteBpsDevice.Update(blockDeviceInfo.Major, blockDeviceInfo.Minor, throttleWriteBps) } if throttleReadIOPS != -1 { oci.ThrottleReadIOPSDevice.Update(blockDeviceInfo.Major, blockDeviceInfo.Minor, throttleReadIOPS) } if throttleWriteIOPS != -1 { oci.ThrottleWriteIOPSDevice.Update(blockDeviceInfo.Major, blockDeviceInfo.Minor, throttleWriteIOPS) } } } } return oci, errors.Join(errs...) } // parseAndValidateInt64 parses quantities, like "64 M", and validates that they are in a given range. func parseAndValidateInt64(fieldName string, fieldContent string, defaultValue int64, min int64, max int64) (int64, error) { // Return the default value for empty field content. if fieldContent == "" { return defaultValue, nil } qty, err := resource.ParseQuantity(fieldContent) if err != nil { return defaultValue, fmt.Errorf("syntax error in %#v (%#v)", fieldName, fieldContent) } value := qty.Value() if min != -1 && min > value { return defaultValue, fmt.Errorf("value of %#v (%#v) smaller than minimum (%#v)", fieldName, value, min) } if max != -1 && value > max { return defaultValue, fmt.Errorf("value of %#v (%#v) bigger than maximum (%#v)", fieldName, value, max) } return value, nil } // platformInterface includes functions that access the system. Enables mocking the system.
type platformInterface interface { configurableBlockDevices(devWildcards []string) ([]BlockDeviceInfo, error) } // defaultPlatform versions of platformInterface functions access the underlying system. type defaultPlatform struct{} // currentPlatform defines which platformInterface is used: defaultPlatform or a mock, for instance. var currentPlatform platformInterface = defaultPlatform{} // configurableBlockDevices finds major:minor numbers for device filenames (wildcards allowed) func (dpm defaultPlatform) configurableBlockDevices(devWildcards []string) ([]BlockDeviceInfo, error) { // Returns a BlockDeviceInfo for each matching block device. // Example: {Major:8, Minor:0, DevNode:"/dev/sda", Origin:"from symlink /dev/disk/by-id/ata-VendorXSSD from wildcard /dev/disk/by-id/*SSD*"} errs := []error{} blockDevices := []BlockDeviceInfo{} var origin string // 1. Expand wildcards to device filenames (may be symlinks) // Example: devMatches["/dev/disk/by-id/ata-VendorSSD"] == "from wildcard \"/dev/disk/by-id/*SSD*\"" devMatches := map[string]string{} // {devNodeOrSymlink: origin} for _, devWildcard := range devWildcards { devWildcardMatches, err := filepath.Glob(devWildcard) if err != nil { errs = append(errs, fmt.Errorf("bad device wildcard %#v: %w", devWildcard, err)) continue } if len(devWildcardMatches) == 0 { errs = append(errs, fmt.Errorf("device wildcard %#v does not match any device nodes", devWildcard)) continue } for _, devMatch := range devWildcardMatches { if devMatch != devWildcard { origin = fmt.Sprintf("from wildcard %#v", devWildcard) } else { origin = "" } devMatches[devMatch] = strings.TrimSpace(fmt.Sprintf("%v %v", devMatches[devMatch], origin)) } } // 2. Find out real device nodes behind symlinks // Example: devRealpaths["/dev/sda"] == "from symlink \"/dev/disk/by-id/ata-VendorSSD\"" devRealpaths := map[string]string{} // {devNode: origin} for devMatch, devOrigin := range devMatches { realDevNode, err := filepath.EvalSymlinks(devMatch) if err != nil { errs = append(errs, fmt.Errorf("cannot filepath.EvalSymlinks(%#v): %w", devMatch, err)) continue } if realDevNode != devMatch { origin = fmt.Sprintf("from symlink %#v %v", devMatch, devOrigin) } else { origin = devOrigin } devRealpaths[realDevNode] = strings.TrimSpace(fmt.Sprintf("%v %v", devRealpaths[realDevNode], origin)) }
// 3. Keep only block devices that are not partitions // Example: blockDevices[0] == {Major: 8, Minor: 0, DevNode: "/dev/sda", Origin: "..."} for devRealpath, devOrigin := range devRealpaths { origin := "" if devOrigin != "" { origin = fmt.Sprintf(" (origin: %s)", devOrigin) } fileInfo, err := os.Stat(devRealpath) if err != nil { errs = append(errs, fmt.Errorf("cannot os.Stat(%#v): %w%s", devRealpath, err, origin)) continue } fileMode := fileInfo.Mode() if fileMode&os.ModeDevice == 0 { errs = append(errs, fmt.Errorf("file %#v is not a device%s", devRealpath, origin)) continue } if fileMode&os.ModeCharDevice != 0 { errs = append(errs, fmt.Errorf("file %#v is a character device%s", devRealpath, origin)) continue } sys, ok := fileInfo.Sys().(*syscall.Stat_t) if !ok { errs = append(errs, fmt.Errorf("cannot get syscall stat_t from %#v%s", devRealpath, origin)) continue } major := unix.Major(sys.Rdev) minor := unix.Minor(sys.Rdev) if minor&0xf != 0 { errs = append(errs, fmt.Errorf("skipping %#v: cannot weight/throttle partitions%s", devRealpath, origin)) continue } blockDevices = append(blockDevices, BlockDeviceInfo{ Major: int64(major), Minor: int64(minor), DevNode: devRealpath, Origin: devOrigin, }) } return blockDevices, errors.Join(errs...) } // blockioError creates a formatted error message. func blockioError(format string, args ...interface{}) error { return fmt.Errorf(format, args...) }
================================================ FILE: pkg/blockio/blockio_test.go ================================================
// Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License.
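Before the unit tests that follow, a hedged usage sketch of the device resolution defined above; the wildcard is an assumption for illustration:

// resolveExampleDevices is a hypothetical helper: it expands a wildcard to
// whole block devices, resolving symlinks and rejecting partitions and
// character devices (errors for rejected paths are joined into err).
func resolveExampleDevices() {
	devs, err := defaultPlatform{}.configurableBlockDevices(
		[]string{"/dev/disk/by-id/*SSD*"}) // assumed wildcard
	if err != nil {
		log.Warn("some device wildcards did not fully resolve: %v", err)
	}
	for _, d := range devs {
		log.Info("resolved %s (%d:%d), origin: %s", d.DevNode, d.Major, d.Minor, d.Origin)
	}
}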
package blockio import ( "fmt" "path/filepath" "strings" "testing" "github.com/intel/cri-resource-manager/pkg/cgroups" "github.com/intel/cri-resource-manager/pkg/testutils" ) var knownIOSchedulers = map[string]bool{ "bfq": true, "cfq": true, "deadline": true, "kyber": true, "mq-deadline": true, "none": true, "noop": true, } // TestGetCurrentIOSchedulers: unit test for getCurrentIOSchedulers() func TestGetCurrentIOSchedulers(t *testing.T) { currentIOSchedulers, err := getCurrentIOSchedulers() testutils.VerifyError(t, err, 0, nil) for blockDev, ioScheduler := range currentIOSchedulers { s, ok := knownIOSchedulers[ioScheduler] if !ok || !s { t.Errorf("unknown io scheduler %#v on block device %#v", ioScheduler, blockDev) } } } // TestConfigurableBlockDevices: unit tests for configurableBlockDevices() func TestConfigurableBlockDevices(t *testing.T) { sysfsBlockDevs, err := filepath.Glob("/sys/block/*") if err != nil { sysfsBlockDevs = []string{} } devBlockDevs := []string{} for _, sysfsBlockDev := range sysfsBlockDevs { if strings.HasPrefix(sysfsBlockDev, "/sys/block/sd") || strings.HasPrefix(sysfsBlockDev, "/sys/block/vd") { devBlockDevs = append(devBlockDevs, strings.Replace(sysfsBlockDev, "/sys/block/", "/dev/", 1)) } } devPartitions := []string{} for _, devBlockDev := range devBlockDevs { devPartitions, _ = filepath.Glob(devBlockDev + "[0-9]") if len(devPartitions) > 0 { break } } t.Logf("test real block devices: %v", devBlockDevs) t.Logf("test partitions: %v", devPartitions) tcases := []struct { name string devWildcards []string expectedErrorCount int expectedErrorSubstrings []string expectedMatches int disabled bool disabledReason string }{ { name: "no device wildcards", devWildcards: nil, expectedErrorCount: 0, }, { name: "bad wildcard", devWildcards: []string{"/[-/verybadwildcard]"}, expectedErrorCount: 1, expectedErrorSubstrings: []string{"verybadwildcard", "syntax error"}, }, { name: "not matching wildcard", devWildcards: []string{"/dev/path that should not exist/*"}, expectedErrorCount: 1, expectedErrorSubstrings: []string{"does not match any"}, }, { name: "two wildcards: empty string and a character device", devWildcards: []string{"/dev/null", ""}, expectedErrorCount: 2, expectedErrorSubstrings: []string{"\"/dev/null\" is a character device", "\"\" does not match any"}, }, { name: "not a device or even a file", devWildcards: []string{"/proc", "/proc/meminfo", "/proc/notexistingfile"}, expectedErrorCount: 3, expectedErrorSubstrings: []string{"\"/proc\" is not a device", "\"/proc/meminfo\" is not a device"}, }, { name: "real block devices", devWildcards: devBlockDevs, expectedMatches: len(devBlockDevs), }, { name: "partition", devWildcards: devPartitions, expectedErrorCount: len(devPartitions), expectedErrorSubstrings: []string{"cannot weight/throttle partitions"}, disabled: len(devPartitions) == 0, disabledReason: "no block device partitions found", }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { if tc.disabled { t.Skip(tc.disabledReason) } realPlatform := defaultPlatform{} bdis, err := realPlatform.configurableBlockDevices(tc.devWildcards) testutils.VerifyError(t, err, tc.expectedErrorCount, tc.expectedErrorSubstrings) if len(bdis) != tc.expectedMatches { t.Errorf("expected %d matching block devices, got %d", tc.expectedMatches, len(bdis)) } }) } } // TestDevicesParametersToOci: unit tests for devicesParametersToOci func TestDevicesParametersToOci(t *testing.T) { // switch real devicesParametersToOci to call mockPlatform.configurableBlockDevices currentPlatform 
= mockPlatform{} tcases := []struct { name string dps []DevicesParameters iosched map[string]string expectedOci *cgroups.OciBlockIOParameters expectedErrorCount int expectedErrorSubstrings []string }{ { name: "all OCI fields", dps: []DevicesParameters{ { Weight: "144", }, { Devices: []string{"/dev/sda"}, ThrottleReadBps: "1G", ThrottleWriteBps: "2M", ThrottleReadIOPS: "3k", ThrottleWriteIOPS: "4", Weight: "50", }, }, iosched: map[string]string{"/dev/sda": "bfq"}, expectedOci: &cgroups.OciBlockIOParameters{ Weight: 144, WeightDevice: cgroups.OciDeviceWeights{ {Major: 11, Minor: 12, Weight: 50}, }, ThrottleReadBpsDevice: cgroups.OciDeviceRates{ {Major: 11, Minor: 12, Rate: 1000000000}, }, ThrottleWriteBpsDevice: cgroups.OciDeviceRates{ {Major: 11, Minor: 12, Rate: 2000000}, }, ThrottleReadIOPSDevice: cgroups.OciDeviceRates{ {Major: 11, Minor: 12, Rate: 3000}, }, ThrottleWriteIOPSDevice: cgroups.OciDeviceRates{ {Major: 11, Minor: 12, Rate: 4}, }, }, }, { name: "later match overrides value", dps: []DevicesParameters{ { Devices: []string{"/dev/sda", "/dev/sdb", "/dev/sdc"}, ThrottleReadBps: "100", Weight: "110", }, { Devices: []string{"/dev/sdb", "/dev/sdc"}, ThrottleReadBps: "300", Weight: "330", }, { Devices: []string{"/dev/sdb"}, ThrottleReadBps: "200", Weight: "220", }, }, iosched: map[string]string{"/dev/sda": "bfq", "/dev/sdb": "bfq", "/dev/sdc": "cfq"}, expectedOci: &cgroups.OciBlockIOParameters{ Weight: -1, WeightDevice: cgroups.OciDeviceWeights{ {Major: 11, Minor: 12, Weight: 110}, {Major: 21, Minor: 22, Weight: 220}, {Major: 31, Minor: 32, Weight: 330}, }, ThrottleReadBpsDevice: cgroups.OciDeviceRates{ {Major: 11, Minor: 12, Rate: 100}, {Major: 21, Minor: 22, Rate: 200}, {Major: 31, Minor: 32, Rate: 300}, }, }, }, { name: "invalid weights, many errors in different parameter sets", dps: []DevicesParameters{ { Weight: "99999", }, { Devices: []string{"/dev/sda"}, Weight: "1", }, { Devices: []string{"/dev/sdb"}, Weight: "-2", }, }, expectedErrorCount: 3, expectedErrorSubstrings: []string{ "(99999) bigger than maximum", "(1) smaller than minimum", "(-2) smaller than minimum", }, }, { name: "throttling without listing Devices", dps: []DevicesParameters{ { ThrottleReadBps: "100M", ThrottleWriteIOPS: "20k", }, }, expectedErrorCount: 1, expectedErrorSubstrings: []string{ "Devices not listed", "\"100M\"", "\"20k\"", }, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { oci, err := devicesParametersToOci(tc.dps, tc.iosched) testutils.VerifyError(t, err, tc.expectedErrorCount, tc.expectedErrorSubstrings) if tc.expectedOci != nil { testutils.VerifyDeepEqual(t, "OCI parameters", *tc.expectedOci, oci) } }) } } // mockPlatform implements mock versions of platformInterface functions. type mockPlatform struct{} // configurableBlockDevices mock always returns a set of block devices. 
func (mpf mockPlatform) configurableBlockDevices(devWildcards []string) ([]BlockDeviceInfo, error) { blockDevices := []BlockDeviceInfo{} for _, devWildcard := range devWildcards { if devWildcard == "/dev/sda" { blockDevices = append(blockDevices, BlockDeviceInfo{ Major: 11, Minor: 12, DevNode: devWildcard, Origin: fmt.Sprintf("from wildcards %v", devWildcard), }) } else if devWildcard == "/dev/sdb" { blockDevices = append(blockDevices, BlockDeviceInfo{ Major: 21, Minor: 22, DevNode: devWildcard, Origin: fmt.Sprintf("from wildcards %v", devWildcard), }) } else if devWildcard == "/dev/sdc" { blockDevices = append(blockDevices, BlockDeviceInfo{ Major: 31, Minor: 32, DevNode: devWildcard, Origin: fmt.Sprintf("from wildcards %v", devWildcard), }) } } return blockDevices, nil } ================================================ FILE: pkg/blockio/config.go ================================================ /* Copyright 2020 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package blockio import ( pkgcfg "github.com/intel/cri-resource-manager/pkg/config" ) // options captures our configurable parameters. type options struct { // Classes define weights and throttling parameters for sets of devices. Classes map[string][]DevicesParameters `json:",omitempty"` } // DevicesParameters defines Block IO parameters for a set of devices. type DevicesParameters struct { Devices []string `json:",omitempty"` ThrottleReadBps string `json:",omitempty"` ThrottleWriteBps string `json:",omitempty"` ThrottleReadIOPS string `json:",omitempty"` ThrottleWriteIOPS string `json:",omitempty"` Weight string `json:",omitempty"` } // Currently active set of "raw" options var opt = defaultOptions().(*options) // defaultOptions returns a new instance of "raw" options set to their defaults func defaultOptions() interface{} { return &options{} } func init() { pkgcfg.Register(ConfigModuleName, "Block I/O class control", opt, defaultOptions) } ================================================ FILE: pkg/cgroups/cgroupblkio.go ================================================ // Copyright 2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cgroups import ( "errors" "fmt" "os" "path/filepath" "strconv" "strings" logger "github.com/intel/cri-resource-manager/pkg/log" ) const ( blkioCgroupDir = "/sys/fs/cgroup/blkio/" ) // logger var log logger.Logger = logger.NewLogger("cgroupblkio") // cgroups blkio parameter filenames. 
var blkioWeightFiles = []string{"blkio.bfq.weight", "blkio.weight"} var blkioWeightDeviceFiles = []string{"blkio.bfq.weight_device", "blkio.weight_device"} var blkioThrottleReadBpsFiles = []string{"blkio.throttle.read_bps_device"} var blkioThrottleWriteBpsFiles = []string{"blkio.throttle.write_bps_device"} var blkioThrottleReadIOPSFiles = []string{"blkio.throttle.read_iops_device"} var blkioThrottleWriteIOPSFiles = []string{"blkio.throttle.write_iops_device"} // OciBlockIOParameters contains OCI standard configuration of cgroups blkio parameters. // // Effects of Weight and Rate values in SetBlkioParameters(): // Value | Effect // -------+------------------------------------------------------------------- // // -1 | Do not write to cgroups, value is missing // 0 | Write to cgroups, will remove the setting as specified in cgroups blkio interface // other | Write to cgroups, sets the value type OciBlockIOParameters struct { Weight int64 WeightDevice OciDeviceWeights ThrottleReadBpsDevice OciDeviceRates ThrottleWriteBpsDevice OciDeviceRates ThrottleReadIOPSDevice OciDeviceRates ThrottleWriteIOPSDevice OciDeviceRates } // OciDeviceWeight contains values for // - blkio.[io-scheduler].weight type OciDeviceWeight struct { Major int64 Minor int64 Weight int64 } // OciDeviceRate contains values for // - blkio.throttle.read_bps_device // - blkio.throttle.write_bps_device // - blkio.throttle.read_iops_device // - blkio.throttle.write_iops_device type OciDeviceRate struct { Major int64 Minor int64 Rate int64 } // OciDeviceWeights contains weights for devices type OciDeviceWeights []OciDeviceWeight // OciDeviceRates contains throttling rates for devices type OciDeviceRates []OciDeviceRate // OciDeviceParameters interface provides functions common to OciDeviceWeights and OciDeviceRates type OciDeviceParameters interface { Append(maj, min, val int64) Update(maj, min, val int64) } // Append appends (major, minor, value) to OciDeviceWeights slice. func (w *OciDeviceWeights) Append(maj, min, val int64) { *w = append(*w, OciDeviceWeight{Major: maj, Minor: min, Weight: val}) } // Append appends (major, minor, value) to OciDeviceRates slice. func (r *OciDeviceRates) Append(maj, min, val int64) { *r = append(*r, OciDeviceRate{Major: maj, Minor: min, Rate: val}) } // Update updates device weight in OciDeviceWeights slice, or appends it if not found. func (w *OciDeviceWeights) Update(maj, min, val int64) { for index, devWeight := range *w { if devWeight.Major == maj && devWeight.Minor == min { (*w)[index].Weight = val return } } w.Append(maj, min, val) } // Update updates device rate in OciDeviceRates slice, or appends it if not found. func (r *OciDeviceRates) Update(maj, min, val int64) { for index, devRate := range *r { if devRate.Major == maj && devRate.Minor == min { (*r)[index].Rate = val return } } r.Append(maj, min, val) } // NewOciBlockIOParameters creates new OciBlockIOParameters instance. func NewOciBlockIOParameters() OciBlockIOParameters { return OciBlockIOParameters{ Weight: -1, } } // NewOciDeviceWeight creates new OciDeviceWeight instance. func NewOciDeviceWeight() OciDeviceWeight { return OciDeviceWeight{ Major: -1, Minor: -1, Weight: -1, } } // NewOciDeviceRate creates new OciDeviceRate instance. func NewOciDeviceRate() OciDeviceRate { return OciDeviceRate{ Major: -1, Minor: -1, Rate: -1, } } // GetBlkioDir returns the cgroups blkio controller directory. 
func GetBlkioDir() string { return blkioCgroupDir } type devMajMin struct { Major int64 Minor int64 } // ResetBlkioParameters adds new, changes existing and removes missing blockIO parameters in cgroupsDir func ResetBlkioParameters(cgroupsDir string, blockIO OciBlockIOParameters) error { errs := []error{} oldBlockIO, getErr := GetBlkioParameters(cgroupsDir) errs = append(errs, getErr) newBlockIO := NewOciBlockIOParameters() newBlockIO.Weight = blockIO.Weight // Set new device weights seenDev := map[devMajMin]bool{} for _, ociWDP := range blockIO.WeightDevice { seenDev[devMajMin{ociWDP.Major, ociWDP.Minor}] = true newBlockIO.WeightDevice = append(newBlockIO.WeightDevice, ociWDP) } // Reset old device weights that were missing from blockIO.WeightDevice for _, ociWDP := range oldBlockIO.WeightDevice { if !seenDev[devMajMin{ociWDP.Major, ociWDP.Minor}] { newBlockIO.WeightDevice = append(newBlockIO.WeightDevice, OciDeviceWeight{ociWDP.Major, ociWDP.Minor, 0}) } } newBlockIO.ThrottleReadBpsDevice = resetDevRates(oldBlockIO.ThrottleReadBpsDevice, blockIO.ThrottleReadBpsDevice) newBlockIO.ThrottleWriteBpsDevice = resetDevRates(oldBlockIO.ThrottleWriteBpsDevice, blockIO.ThrottleWriteBpsDevice) newBlockIO.ThrottleReadIOPSDevice = resetDevRates(oldBlockIO.ThrottleReadIOPSDevice, blockIO.ThrottleReadIOPSDevice) newBlockIO.ThrottleWriteIOPSDevice = resetDevRates(oldBlockIO.ThrottleWriteIOPSDevice, blockIO.ThrottleWriteIOPSDevice) errs = append(errs, SetBlkioParameters(cgroupsDir, newBlockIO)) return errors.Join(errs...) } // resetDevRates adds the wanted rate parameters and resets unwanted old rates func resetDevRates(old, wanted []OciDeviceRate) []OciDeviceRate { rates := []OciDeviceRate{} seenDev := map[devMajMin]bool{} for _, rdp := range wanted { rates = append(rates, rdp) seenDev[devMajMin{rdp.Major, rdp.Minor}] = true } for _, rdp := range old { if !seenDev[devMajMin{rdp.Major, rdp.Minor}] { rates = append(rates, OciDeviceRate{rdp.Major, rdp.Minor, 0}) } } return rates } // GetBlkioParameters returns OCI BlockIO parameters from files in cgroups blkio controller directory. func GetBlkioParameters(cgroupsDir string) (OciBlockIOParameters, error) { errs := []error{} blockIO := NewOciBlockIOParameters() content, err := readFromFileInDir(cgroupsDir, blkioWeightFiles) if err == nil { weight, err := strconv.ParseInt(strings.TrimSuffix(content, "\n"), 10, 64) if err == nil { blockIO.Weight = weight } else { errs = append(errs, fmt.Errorf("parsing weight from %#v failed: %w", content, err)) } } else { errs = append(errs, err) } errs = append(errs, readOciDeviceParameters(cgroupsDir, blkioWeightDeviceFiles, &blockIO.WeightDevice)) errs = append(errs, readOciDeviceParameters(cgroupsDir, blkioThrottleReadBpsFiles, &blockIO.ThrottleReadBpsDevice)) errs = append(errs, readOciDeviceParameters(cgroupsDir, blkioThrottleWriteBpsFiles, &blockIO.ThrottleWriteBpsDevice)) errs = append(errs, readOciDeviceParameters(cgroupsDir, blkioThrottleReadIOPSFiles, &blockIO.ThrottleReadIOPSDevice)) errs = append(errs, readOciDeviceParameters(cgroupsDir, blkioThrottleWriteIOPSFiles, &blockIO.ThrottleWriteIOPSDevice)) return blockIO, errors.Join(errs...)
} // readOciDeviceParameters parses device lines used for weights and throttling rates func readOciDeviceParameters(baseDir string, filenames []string, params OciDeviceParameters) error { errs := []error{} contents, err := readFromFileInDir(baseDir, filenames) if err != nil { return err } for _, line := range strings.Split(contents, "\n") { // Device weight files may have "default NNN" line at the beginning. Skip it. if line == "" || strings.HasPrefix(line, "default ") { continue } // Expect syntax MAJOR:MINOR VALUE devVal := strings.Split(line, " ") if len(devVal) != 2 { errs = append(errs, fmt.Errorf("invalid line %q, single space expected", line)) continue } majMin := strings.Split(devVal[0], ":") if len(majMin) != 2 { errs = append(errs, fmt.Errorf("invalid line %q, single colon expected before space", line)) continue } major, majErr := strconv.ParseInt(majMin[0], 10, 64) minor, minErr := strconv.ParseInt(majMin[1], 10, 64) value, valErr := strconv.ParseInt(devVal[1], 10, 64) if majErr != nil || minErr != nil || valErr != nil { errs = append(errs, fmt.Errorf("invalid number when parsing \"major:minor value\" from \"%s:%s %s\"", majMin[0], majMin[1], devVal[1])) continue } params.Append(major, minor, value) } return errors.Join(errs...) } // readFromFileInDir returns content from the first successfully read file. func readFromFileInDir(baseDir string, filenames []string) (string, error) { errs := []error{} // If reading all the files fails, return list of read errors. for _, filename := range filenames { filepath := filepath.Join(baseDir, filename) content, err := currentPlatform.readFromFile(filepath) if err == nil { return content, nil } errs = append(errs, err) } err := errors.Join(errs...) if err != nil { return "", fmt.Errorf("could not read any of files %q: %w", filenames, err) } return "", nil } // SetBlkioParameters writes OCI BlockIO parameters to files in cgroups blkio controller directory. func SetBlkioParameters(cgroupsDir string, blockIO OciBlockIOParameters) error { log.Debug("configuring cgroups blkio controller in directory %#v with parameters %+v", cgroupsDir, blockIO) errs := []error{} if blockIO.Weight >= 0 { errs = append(errs, writeToFileInDir(cgroupsDir, blkioWeightFiles, strconv.FormatInt(blockIO.Weight, 10))) } for _, weightDevice := range blockIO.WeightDevice { errs = append(errs, writeDevValueToFileInDir(cgroupsDir, blkioWeightDeviceFiles, weightDevice.Major, weightDevice.Minor, weightDevice.Weight)) } for _, rateDevice := range blockIO.ThrottleReadBpsDevice { errs = append(errs, writeDevValueToFileInDir(cgroupsDir, blkioThrottleReadBpsFiles, rateDevice.Major, rateDevice.Minor, rateDevice.Rate)) } for _, rateDevice := range blockIO.ThrottleWriteBpsDevice { errs = append(errs, writeDevValueToFileInDir(cgroupsDir, blkioThrottleWriteBpsFiles, rateDevice.Major, rateDevice.Minor, rateDevice.Rate)) } for _, rateDevice := range blockIO.ThrottleReadIOPSDevice { errs = append(errs, writeDevValueToFileInDir(cgroupsDir, blkioThrottleReadIOPSFiles, rateDevice.Major, rateDevice.Minor, rateDevice.Rate)) } for _, rateDevice := range blockIO.ThrottleWriteIOPSDevice { errs = append(errs, writeDevValueToFileInDir(cgroupsDir, blkioThrottleWriteIOPSFiles, rateDevice.Major, rateDevice.Minor, rateDevice.Rate)) } return errors.Join(errs...)
} // writeDevValueToFileInDir writes MAJOR:MINOR VALUE to the first existing file under baseDir func writeDevValueToFileInDir(baseDir string, filenames []string, major, minor, value int64) error { content := fmt.Sprintf("%d:%d %d", major, minor, value) return writeToFileInDir(baseDir, filenames, content) } // writeToFileInDir writes content to the first existing file in the list under baseDir. func writeToFileInDir(baseDir string, filenames []string, content string) error { errs := []error{} // On failure, returns an error joining the errors from all attempted writes; returns nil on success. for _, filename := range filenames { filepath := filepath.Join(baseDir, filename) err := currentPlatform.writeToFile(filepath, content) if err == nil { return nil } errs = append(errs, err) } err := errors.Join(errs...) if err != nil { return fmt.Errorf("could not write content %#v to any of files %q: %w", content, filenames, err) } return nil } // platformInterface includes functions that access the system. Enables mocking the platform. type platformInterface interface { readFromFile(filename string) (string, error) writeToFile(filename string, content string) error } // defaultPlatform versions of platformInterface functions access the underlying system. type defaultPlatform struct{} // currentPlatform defines which platformInterface is used: defaultPlatform or a mock, for instance. var currentPlatform platformInterface = defaultPlatform{} // readFromFile returns file contents as a string. func (dpm defaultPlatform) readFromFile(filename string) (string, error) { content, err := os.ReadFile(filename) return string(content), err } // writeToFile writes content to an existing file. func (dpm defaultPlatform) writeToFile(filename string, content string) error { f, err := os.OpenFile(filename, os.O_WRONLY, 0666) if err != nil { return err } defer f.Close() _, err = f.Write([]byte(content)) return err }
================================================ FILE: pkg/cgroups/cgroupblkio_test.go ================================================
// Copyright 2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License.
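Before the tests that follow, a hedged sketch of the write format produced above; the cgroup path and device numbers are assumptions for illustration:

// exampleThrottleWrite is a hypothetical helper: throttling reads on device
// 8:0 to 1 MB/s makes SetBlkioParameters write the line "8:0 1000000" into
// the first existing file of blkioThrottleReadBpsFiles under the given
// cgroup directory. Weight stays at -1, so no weight file is written.
func exampleThrottleWrite() error {
	params := NewOciBlockIOParameters()
	params.ThrottleReadBpsDevice.Update(8, 0, 1000000)
	return SetBlkioParameters("/sys/fs/cgroup/blkio/kubepods/pod-uid/ctr-id", params)
}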
package cgroups import ( "fmt" "testing" "github.com/intel/cri-resource-manager/pkg/testutils" ) func TestUpdateAppend(t *testing.T) { tcases := []struct { name string inputMajMinVals [][]int64 inputItem []int64 expectedMajMinVal [][]int64 expectedErrorCount int expectedErrorSubstrings []string }{ { name: "update empty list", inputItem: []int64{1, 2, 3}, expectedMajMinVal: [][]int64{{1, 2, 3}}, }, { name: "update appends non-existing element", inputMajMinVals: [][]int64{{10, 20, 30}, {40, 50, 60}}, inputItem: []int64{1, 2, 3}, expectedMajMinVal: [][]int64{{10, 20, 30}, {40, 50, 60}, {1, 2, 3}}, }, { name: "update the first existing element", inputMajMinVals: [][]int64{{10, 20, 30}, {40, 50, 60}, {40, 50, 60}}, inputItem: []int64{40, 50, 66}, expectedMajMinVal: [][]int64{{10, 20, 30}, {40, 50, 66}, {40, 50, 60}}, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { devWeights := OciDeviceWeights{} devRates := OciDeviceRates{} expDevWeights := OciDeviceWeights{} expDevRates := OciDeviceRates{} for _, item := range tc.inputMajMinVals { devWeights.Append(item[0], item[1], item[2]) devRates.Append(item[0], item[1], item[2]) } devWeights.Update(tc.inputItem[0], tc.inputItem[1], tc.inputItem[2]) devRates.Update(tc.inputItem[0], tc.inputItem[1], tc.inputItem[2]) for _, item := range tc.expectedMajMinVal { expDevWeights = append(expDevWeights, OciDeviceWeight{item[0], item[1], item[2]}) expDevRates = append(expDevRates, OciDeviceRate{item[0], item[1], item[2]}) } testutils.VerifyDeepEqual(t, "device weights", expDevWeights, devWeights) testutils.VerifyDeepEqual(t, "device rates", expDevRates, devRates) }) } } // TestResetBlkioParameters: unit test for ResetBlkioParameters() func TestResetBlkioParameters(t *testing.T) { tcases := []struct { name string cgroupsDir string blockIO OciBlockIOParameters fsContent map[string]string expectedFsWrites map[string]string expectedBlockIO *OciBlockIOParameters expectedErrorCount int expectedErrorSubstrings []string }{ { name: "write to clean cgroups", cgroupsDir: "/write/to/clean", blockIO: OciBlockIOParameters{ Weight: 222, WeightDevice: OciDeviceWeights{{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, ThrottleReadBpsDevice: OciDeviceRates{{11, 12, 13}, {111, 112, 113}}, ThrottleWriteBpsDevice: OciDeviceRates{{21, 22, 23}, {221, 222, 223}}, ThrottleReadIOPSDevice: OciDeviceRates{{31, 32, 33}, {331, 332, 333}}, ThrottleWriteIOPSDevice: OciDeviceRates{{41, 42, 43}, {441, 442, 443}}, }, fsContent: map[string]string{ "/write/to/clean/blkio.bfq.weight": "100\n", "/write/to/clean/blkio.bfq.weight_device": "", "/write/to/clean/blkio.throttle.read_bps_device": "", "/write/to/clean/blkio.throttle.write_bps_device": "", "/write/to/clean/blkio.throttle.read_iops_device": "", "/write/to/clean/blkio.throttle.write_iops_device": "", }, expectedFsWrites: map[string]string{ "/write/to/clean/blkio.bfq.weight": "222", "/write/to/clean/blkio.bfq.weight_device": "1:2 3+4:5 6+7:8 9", "/write/to/clean/blkio.throttle.read_bps_device": "11:12 13+111:112 113", "/write/to/clean/blkio.throttle.write_bps_device": "21:22 23+221:222 223", "/write/to/clean/blkio.throttle.read_iops_device": "31:32 33+331:332 333", "/write/to/clean/blkio.throttle.write_iops_device": "41:42 43+441:442 443", }, }, { name: "reset all existing", cgroupsDir: "/reset/all", blockIO: NewOciBlockIOParameters(), fsContent: map[string]string{ "/reset/all/blkio.bfq.weight": "200\n", "/reset/all/blkio.bfq.weight_device": "default 200\n1:2 3\n4:5 6\n", "/reset/all/blkio.throttle.read_bps_device": "11:12 13\n14:15 16\n", 
"/reset/all/blkio.throttle.write_bps_device": "21:22 23\n", "/reset/all/blkio.throttle.read_iops_device": "31:32 33\n", "/reset/all/blkio.throttle.write_iops_device": "41:42 43\n", }, expectedFsWrites: map[string]string{ "/reset/all/blkio.bfq.weight_device": "1:2 0+4:5 0", "/reset/all/blkio.throttle.read_bps_device": "11:12 0+14:15 0", "/reset/all/blkio.throttle.write_bps_device": "21:22 0", "/reset/all/blkio.throttle.read_iops_device": "31:32 0", "/reset/all/blkio.throttle.write_iops_device": "41:42 0", }, }, { name: "merge", cgroupsDir: "/merge", blockIO: OciBlockIOParameters{ Weight: 80, WeightDevice: OciDeviceWeights{{1, 2, 1113}, {7, 8, 9}}, // drop middle, update first, keep last ThrottleReadBpsDevice: OciDeviceRates{{11, 12, 13}}, // keep the first entry ThrottleWriteBpsDevice: OciDeviceRates{{24, 25, 26}}, // keep the last entry ThrottleReadIOPSDevice: OciDeviceRates{{31, 32, 33}, {331, 332, 333}}, // keep all ThrottleWriteIOPSDevice: OciDeviceRates{{41, 42, 430}, {441, 442, 4430}}, // change all }, fsContent: map[string]string{ "/merge/blkio.bfq.weight": "200\n", "/merge/blkio.bfq.weight_device": "default 200\n1:2 3\n4:5 6\n7:8 9", "/merge/blkio.throttle.read_bps_device": "11:12 13\n14:15 16\n", "/merge/blkio.throttle.write_bps_device": "21:22 23\n24:25 26\n", "/merge/blkio.throttle.read_iops_device": "31:32 33\n331:332 333\n", "/merge/blkio.throttle.write_iops_device": "41:42 43\n441:442 443\n", }, expectedFsWrites: map[string]string{ "/merge/blkio.bfq.weight": "80", "/merge/blkio.bfq.weight_device": "1:2 1113+7:8 9+4:5 0", "/merge/blkio.throttle.read_bps_device": "11:12 13+14:15 0", "/merge/blkio.throttle.write_bps_device": "24:25 26+21:22 0", "/merge/blkio.throttle.read_iops_device": "31:32 33+331:332 333", "/merge/blkio.throttle.write_iops_device": "41:42 430+441:442 4430", }, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { mpf := mockPlatform{ fsOrigContent: tc.fsContent, fsWrites: make(map[string]string), } currentPlatform = &mpf err := ResetBlkioParameters(tc.cgroupsDir, tc.blockIO) testutils.VerifyError(t, err, tc.expectedErrorCount, tc.expectedErrorSubstrings) if tc.expectedFsWrites != nil { testutils.VerifyDeepEqual(t, "filesystem writes", tc.expectedFsWrites, mpf.fsWrites) } }) } } // TestGetBlkioParameters: unit test for GetBlkioParameters() func TestGetBlkioParameters(t *testing.T) { tcases := []struct { name string cgroupsDir string readsFail int fsContent map[string]string expectedBlockIO *OciBlockIOParameters expectedErrorCount int expectedErrorSubstrings []string }{ { name: "empty files", cgroupsDir: "/empty/ok", fsContent: map[string]string{ "/empty/ok/blkio.bfq.weight": "", "/empty/ok/blkio.bfq.weight_device": "", "/empty/ok/blkio.throttle.read_bps_device": "", "/empty/ok/blkio.throttle.write_bps_device": "", "/empty/ok/blkio.throttle.read_iops_device": "", "/empty/ok/blkio.throttle.write_iops_device": "", }, expectedBlockIO: &OciBlockIOParameters{Weight: -1}, expectedErrorCount: 1, // weight is not expected to be empty expectedErrorSubstrings: []string{"parsing weight"}, }, { name: "everything defined", cgroupsDir: "/read/ok", fsContent: map[string]string{ "/read/ok/blkio.bfq.weight": "1", // test weight_device file with real "default" line "/read/ok/blkio.bfq.weight_device": "default 10\n1:2 3\n", // test parsing two lines and skipping empty lines "/read/ok/blkio.throttle.read_bps_device": "\n11:22 33\n\n111:222 333\n", // test single line file "/read/ok/blkio.throttle.write_bps_device": "1111:2222 3333\n", // test single line, missing LF 
at the end "/read/ok/blkio.throttle.read_iops_device": "11111:22222 33333", // test small and large values "/read/ok/blkio.throttle.write_iops_device": "0:0 0\n4294967296:4294967297 9223372036854775807\n", }, expectedBlockIO: &OciBlockIOParameters{ Weight: 1, WeightDevice: OciDeviceWeights{{1, 2, 3}}, ThrottleReadBpsDevice: OciDeviceRates{{11, 22, 33}, {111, 222, 333}}, ThrottleWriteBpsDevice: OciDeviceRates{{1111, 2222, 3333}}, ThrottleReadIOPSDevice: OciDeviceRates{{11111, 22222, 33333}}, ThrottleWriteIOPSDevice: OciDeviceRates{{0, 0, 0}, {4294967296, 4294967297, 9223372036854775807}}, }, }, { name: "test bad lines", cgroupsDir: "read/bad", fsContent: map[string]string{ "read/bad/blkio.bfq.weight": "xyz", // test bad line in the middle "read/bad/blkio.bfq.weight_device": "default 10\n1:2 3\nbad\n4:5 6\n", // test no spaces "read/bad/blkio.throttle.read_bps_device": "11:22:33", // test too many spaces "read/bad/blkio.throttle.write_bps_device": "1111 2222 3333 \n", // test no colons "read/bad/blkio.throttle.read_iops_device": "1111122222 33333", // test missing number "read/bad/blkio.throttle.write_iops_device": "0: 0\n", }, expectedErrorCount: 6, expectedErrorSubstrings: []string{"bad", "xyz", "11:22:33", "1111 2222 3333 ", "1111122222 33333", "0: 0"}, expectedBlockIO: &OciBlockIOParameters{ Weight: -1, WeightDevice: OciDeviceWeights{{1, 2, 3}, {4, 5, 6}}, }, }, { name: "all files missing", cgroupsDir: "/missing/err", fsContent: map[string]string{}, expectedBlockIO: &OciBlockIOParameters{Weight: -1}, expectedErrorCount: 6, expectedErrorSubstrings: []string{ "file not found", "blkio.bfq.weight", "blkio.bfq.weight_device", "blkio.throttle.read_bps_device", "blkio.throttle.write_bps_device", "blkio.throttle.read_iops_device", "blkio.throttle.write_iops_device", }, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { mpf := mockPlatform{ fsOrigContent: tc.fsContent, readsFail: tc.readsFail, } currentPlatform = &mpf blockIO, err := GetBlkioParameters(tc.cgroupsDir) testutils.VerifyError(t, err, tc.expectedErrorCount, tc.expectedErrorSubstrings) if tc.expectedBlockIO != nil { testutils.VerifyDeepEqual(t, "blockio parameters", *tc.expectedBlockIO, blockIO) } }) } } // TestSetBlkioParameters: unit test for SetBlkioParameters() func TestSetBlkioParameters(t *testing.T) { tcases := []struct { name string cgroupsDir string blockIO OciBlockIOParameters writesFail int expectedFsWrites map[string]string expectedErrorCount int expectedErrorSubstrings []string }{ { name: "write full OCI struct", cgroupsDir: "/my/full", blockIO: OciBlockIOParameters{ Weight: 10, WeightDevice: OciDeviceWeights{{Major: 1, Minor: 2, Weight: 3}}, ThrottleReadBpsDevice: OciDeviceRates{{Major: 11, Minor: 12, Rate: 13}}, ThrottleWriteBpsDevice: OciDeviceRates{{Major: 21, Minor: 22, Rate: 23}}, ThrottleReadIOPSDevice: OciDeviceRates{{Major: 31, Minor: 32, Rate: 33}}, ThrottleWriteIOPSDevice: OciDeviceRates{{Major: 41, Minor: 42, Rate: 43}}, }, expectedFsWrites: map[string]string{ "/my/full/blkio.bfq.weight": "10", "/my/full/blkio.bfq.weight_device": "1:2 3", "/my/full/blkio.throttle.read_bps_device": "11:12 13", "/my/full/blkio.throttle.write_bps_device": "21:22 23", "/my/full/blkio.throttle.read_iops_device": "31:32 33", "/my/full/blkio.throttle.write_iops_device": "41:42 43", }, }, { name: "write empty struct", cgroupsDir: "/my/empty", blockIO: OciBlockIOParameters{}, expectedFsWrites: map[string]string{ "/my/empty/blkio.bfq.weight": "0", }, }, { name: "multidevice weight and throttling, no weight write on -1", 
cgroupsDir: "/my/multidev", blockIO: OciBlockIOParameters{ Weight: -1, WeightDevice: OciDeviceWeights{{1, 2, 3}, {4, 5, 6}}, ThrottleReadBpsDevice: OciDeviceRates{{11, 12, 13}, {111, 112, 113}}, ThrottleWriteBpsDevice: OciDeviceRates{{21, 22, 23}, {221, 222, 223}}, ThrottleReadIOPSDevice: OciDeviceRates{{31, 32, 33}, {331, 332, 333}}, ThrottleWriteIOPSDevice: OciDeviceRates{{41, 42, 43}, {441, 442, 443}}, }, expectedFsWrites: map[string]string{ "/my/multidev/blkio.bfq.weight_device": "1:2 3+4:5 6", "/my/multidev/blkio.throttle.read_bps_device": "11:12 13+111:112 113", "/my/multidev/blkio.throttle.write_bps_device": "21:22 23+221:222 223", "/my/multidev/blkio.throttle.read_iops_device": "31:32 33+331:332 333", "/my/multidev/blkio.throttle.write_iops_device": "41:42 43+441:442 443", }, }, { name: "no bfq.weight", cgroupsDir: "/my/nobfq", blockIO: OciBlockIOParameters{Weight: 100}, writesFail: 1, expectedFsWrites: map[string]string{"/my/nobfq/blkio.weight": "100"}, }, { name: "all writes fail", cgroupsDir: "/my/writesfail", blockIO: OciBlockIOParameters{ Weight: -1, WeightDevice: OciDeviceWeights{{1, 0, 100}}, }, writesFail: 9999, expectedErrorCount: 1, expectedErrorSubstrings: []string{ "could not write content \"1:0 100\" to any of files", "\"blkio.bfq.weight_device\"", "\"blkio.weight_device\"", }, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { mpf := mockPlatform{ fsWrites: make(map[string]string), writesFail: tc.writesFail, } currentPlatform = &mpf err := SetBlkioParameters(tc.cgroupsDir, tc.blockIO) testutils.VerifyError(t, err, tc.expectedErrorCount, tc.expectedErrorSubstrings) if tc.expectedFsWrites != nil { testutils.VerifyDeepEqual(t, "filesystem writes", tc.expectedFsWrites, mpf.fsWrites) } }) } } // mockPlatform implements mock versions of platformInterface functions. type mockPlatform struct { fsOrigContent map[string]string fsWrites map[string]string readsFail int writesFail int } func (mpf *mockPlatform) readFromFile(filename string) (string, error) { if mpf.readsFail > 0 { mpf.readsFail-- return "", fmt.Errorf("mockPlatofrm: reading from %#v failed", filename) } if content, ok := mpf.fsOrigContent[filename]; ok { return content, nil } return "", fmt.Errorf("mockPlatform: file not found %#v", filename) } func (mpf *mockPlatform) writeToFile(filename string, content string) error { var newContent string if mpf.writesFail > 0 { mpf.writesFail-- return fmt.Errorf("mockPlatform: writing to %#v failed", filename) } if oldContent, ok := mpf.fsWrites[filename]; ok { newContent = fmt.Sprintf("%s+%s", oldContent, content) } else { newContent = content } mpf.fsWrites[filename] = newContent return nil } ================================================ FILE: pkg/cgroups/cgroupcontrol.go ================================================ // Copyright 2020-2021 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package cgroups import ( "bufio" "errors" "fmt" "os" "path" "strings" "syscall" ) // Controller is our enumerated type for cgroup controllers. type Controller int // Group represents a control group. type Group string // nolint const ( // UnkownController represents a controller of unknown type. UnknownController Controller = iota // blkio cgroup controller. Blkio // cpu cgroup controller. Cpu // cpuacct cgroup controller. Cpuacct // cpuset cgroup controller. Cpuset // devices cgroup controller. Devices // freezer cgroup controller. Freezer // hugetlb cgroup controller. Hugetlb // memory cgroup controller. Memory // net_cls cgroup controller. NetCls // net_prio cgroup controller. NetPrio // per_event cgroup controller. PerfEvent // pids cgroup controller. Pids ) var ( // controllerNames maps controllers to names/relative paths. controllerNames = map[Controller]string{ Blkio: "blkio", Cpu: "cpu", Cpuacct: "cpuacct", Cpuset: "cpuset", Devices: "devices", Freezer: "freezer", Hugetlb: "hugetlb", Memory: "memory", NetCls: "net_cls", NetPrio: "net_prio", PerfEvent: "perf_event", Pids: "pids", } // controllerNames maps controllers to names/relative paths. controllerDirs = map[string]Controller{ "blkio": Blkio, "cpu": Cpu, "cpuacct": Cpuacct, "cpuset": Cpuset, "devices": Devices, "freezer": Freezer, "hugetlb": Hugetlb, "memory": Memory, "net_cls": NetCls, "net_prio": NetPrio, "perf_event": PerfEvent, "pids": Pids, } ) // String returns the name of the given controller. func (c Controller) String() string { if name, ok := controllerNames[c]; ok { return name } return "unknown" } // Path returns the absolute path of the given controller. func (c Controller) Path() string { DetectSystemCgroupVersion() if systemCgroupVersion == 2 { return GetMountDir() } return path.Join(mountDir, c.String()) } // RelPath returns the relative path of the given controller. func (c Controller) RelPath() string { DetectSystemCgroupVersion() if systemCgroupVersion == 2 { return "" } return c.String() } // Group returns the given group for the controller. func (c Controller) Group(group string) Group { return Group(path.Join(c.Path(), group)) } // AsGroup returns the group for the given absolute directory path. func AsGroup(absDir string) Group { return Group(absDir) } // Controller returns the controller for the group. func (g Group) Controller() Controller { DetectSystemCgroupVersion() if systemCgroupVersion == 2 { return UnknownController } relPath := strings.TrimPrefix(string(g), mountDir+"/") split := strings.SplitN(relPath, "/", 2) if len(split) > 0 { return controllerDirs[split[0]] } return UnknownController } // GetTasks reads the pids of threads currently assigned to the group. func (g Group) GetTasks() ([]string, error) { return g.readPids(Tasks) } // GetProcesses reads the pids of processes currently assigned to the group. func (g Group) GetProcesses() ([]string, error) { return g.readPids(Procs) } // AddTasks writes the given thread pids to the group. func (g Group) AddTasks(pids ...string) error { return g.writePids(Tasks, pids...) } // AddProcesses writes the given process pids to the group. func (g Group) AddProcesses(pids ...string) error { return g.writePids(Procs, pids...) } // Write writes the formatted data to the groups entry. 
func (g Group) Write(entry, format string, args ...interface{}) error { entryPath := path.Join(string(g), entry) f, err := os.OpenFile(entryPath, os.O_WRONLY, 0644) if err != nil { return g.errorf("%q: failed to open: %v", entry, err) } defer f.Close() data := fmt.Sprintf(format, args...) if _, err := f.Write([]byte(data)); err != nil { return g.errorf("%q: failed to write %q: %v", entry, data, err) } return nil } // readPids reads pids from a cgroup's tasks or procs entry. func (g Group) readPids(entry string) ([]string, error) { var pids []string pidFile := path.Join(string(g), entry) f, err := os.OpenFile(pidFile, os.O_RDONLY, 0644) if err != nil { return nil, g.errorf("failed to open %q: %v", entry, err) } defer f.Close() s := bufio.NewScanner(f) for s.Scan() { pids = append(pids, s.Text()) } if err := s.Err(); err != nil { return nil, g.errorf("failed to read %q: %v", entry, err) } return pids, nil } // writePids writes pids to a cgroup's tasks or procs entry. func (g Group) writePids(entry string, pids ...string) error { pidFile := path.Join(string(g), entry) f, err := os.OpenFile(pidFile, os.O_WRONLY, 0644) if err != nil { return g.errorf("failed to open %q for writing pids: %v", pidFile, err) } defer f.Close() for _, pid := range pids { if _, err := f.Write([]byte(pid)); err != nil { if !errors.Is(err, syscall.ESRCH) { return g.errorf("failed to write pid %s to %q: %v", pid, pidFile, err) } } } return nil } // errorf returns a formatted group-specific error. func (g Group) errorf(format string, args ...interface{}) error { name := strings.TrimPrefix(string(g), mountDir+"/") return fmt.Errorf("cgroup "+name+": "+format, args...) } ================================================ FILE: pkg/cgroups/cgroupid.go ================================================ package cgroups import ( "encoding/binary" "fmt" "os" "path/filepath" "sync" "golang.org/x/sys/unix" ) // CgroupID implements mapping kernel cgroup IDs to cgroupfs paths with transparent caching. type CgroupID struct { root string cache map[uint64]string sync.Mutex } // NewCgroupID creates a new CgroupID map/cache. func NewCgroupID(root string) *CgroupID { return &CgroupID{ root: root, cache: make(map[uint64]string), } } func getID(path string) uint64 { h, _, err := unix.NameToHandleAt(unix.AT_FDCWD, path, 0) if err != nil { return 0 } return binary.LittleEndian.Uint64(h.Bytes()) } // Find finds the path for the given cgroup id. func (cgid *CgroupID) Find(id uint64) (string, error) { found := false var p string cgid.Lock() defer cgid.Unlock() if path, ok := cgid.cache[id]; ok { return path, nil } err := filepath.Walk(cgid.root, func(path string, info os.FileInfo, err error) error { if err != nil { if os.IsNotExist(err) { return nil } fmt.Printf("WalkFunc called with an error (path %q): %v\n", path, err) return err } if found { return filepath.SkipDir } if info.IsDir() && id == getID(path) { found = true p = path return filepath.SkipDir } return nil }) if err != nil { return "", err } else if !found { return "", fmt.Errorf("cgroupid %v not found", id) } cgid.cache[id] = p return p, nil }
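Taken together, cgroupcontrol.go and cgroupid.go provide two complementary lookups: from a controller and group name to a cgroupfs directory, and from a kernel cgroup ID back to a path. A minimal usage sketch, assuming a cgroup v1 host; the group name and the IDs are hypothetical:

package main

import (
	"fmt"

	"github.com/intel/cri-resource-manager/pkg/cgroups"
)

func main() {
	// Resolve the cpuset controller directory for a (hypothetical) group
	// and move a process into it.
	grp := cgroups.Cpuset.Group("kubepods.slice/mypod")
	if err := grp.AddProcesses("12345"); err != nil {
		fmt.Println("failed to add process:", err)
	}

	// Map a kernel cgroup ID (e.g. from a BPF event) back to its path.
	cgid := cgroups.NewCgroupID(cgroups.GetMountDir())
	if path, err := cgid.Find(4711); err == nil {
		fmt.Println("cgroup id 4711 lives at", path)
	}
}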
================================================ FILE: pkg/cgroups/cgrouppath.go ================================================ // Copyright 2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cgroups import ( "flag" "os" "path" "path/filepath" ) // nolint const ( // Tasks is a cgroup's "tasks" entry. Tasks = "tasks" // Procs is a cgroup's "cgroup.procs" entry. Procs = "cgroup.procs" // CpuShares is the cpu controller's "cpu.shares" entry. CpuShares = "cpu.shares" // CpuPeriod is the cpu controller's "cpu.cfs_period_us" entry. CpuPeriod = "cpu.cfs_period_us" // CpuQuota is the cpu controller's "cpu.cfs_quota_us" entry. CpuQuota = "cpu.cfs_quota_us" // CpusetCpus is the cpuset controller's cpuset.cpus entry. CpusetCpus = "cpuset.cpus" // CpusetMems is the cpuset controller's cpuset.mems entry. CpusetMems = "cpuset.mems" // Controllers is the cgroup v2 controllers file. Controllers = "cgroup.controllers" ) var ( // mountDir is the parent directory for per-controller cgroupfs mounts. mountDir = "/sys/fs/cgroup" // v2Dir is the cgroup v2 unified mount directory. v2Dir = path.Join(mountDir, "unified") // KubeletRoot is the --cgroup-root option the kubelet is running with. KubeletRoot = "" // systemCgroupVersion is the detected system cgroup version, 0 if undetected. systemCgroupVersion = 0 ) // GetMountDir returns the common mount point for cgroup v1 controllers. func GetMountDir() string { return mountDir } // SetMountDir sets the common mount point for the cgroup v1 controllers. func SetMountDir(dir string) { v2, _ := filepath.Rel(mountDir, v2Dir) mountDir = dir if v2 != "" { v2Dir = path.Join(mountDir, v2) } } // GetV2Dir returns the cgroup v2 unified mount directory. func GetV2Dir() string { return v2Dir } // SetV2Dir sets the unified cgroup v2 mount directory. func SetV2Dir(dir string) { if dir[0] == '/' { v2Dir = dir } else { v2Dir = path.Join(mountDir, dir) } } func init() { flag.StringVar(&mountDir, "cgroup-mount", mountDir, "directory under which cgroup v1 controllers are mounted") flag.StringVar(&v2Dir, "cgroup-v2-dir", v2Dir, "cgroup v2 unified mount directory") flag.StringVar(&KubeletRoot, "kubelet-cgroup-root", KubeletRoot, "--cgroup-root option the kubelet is running with") } // DetectSystemCgroupVersion detects and caches the system cgroup version (1 or 2). func DetectSystemCgroupVersion() int { if systemCgroupVersion == 0 { if _, err := os.Stat(path.Join(GetMountDir(), Controllers)); err == nil { systemCgroupVersion = 2 } else { systemCgroupVersion = 1 } } return systemCgroupVersion }
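The version detection above changes what Controller.Path returns: on a cgroup v2 host every controller resolves to the unified mount point, while on v1 each controller gets its own subdirectory. A small sketch of the difference, assuming the default /sys/fs/cgroup mount:

package main

import (
	"fmt"

	"github.com/intel/cri-resource-manager/pkg/cgroups"
)

func main() {
	v := cgroups.DetectSystemCgroupVersion()
	fmt.Println("detected cgroup version:", v)

	// On v1 this prints /sys/fs/cgroup/blkio, on v2 just /sys/fs/cgroup.
	fmt.Println("blkio controller path:", cgroups.Blkio.Path())

	// RelPath is the controller-relative part: "blkio" on v1, "" on v2.
	fmt.Println("blkio relative path:", cgroups.Blkio.RelPath())
}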
package cgroups import ( "fmt" "os" "path" "path/filepath" "strconv" "strings" "github.com/intel/cri-resource-manager/pkg/sysfs" ) // BlkioDeviceBytes contains a single operations line of blkio.throttle.io_service_bytes_recursive file type BlkioDeviceBytes struct { Major int Minor int Operations map[string]int64 } // BlkioThrottleBytes has parsed contents of blkio.throttle.io_service_bytes_recursive file type BlkioThrottleBytes struct { DeviceBytes []*BlkioDeviceBytes TotalBytes int64 } // CPUAcctUsage has a parsed line of cpuacct.usage_all file type CPUAcctUsage struct { CPU int User int64 System int64 } // HugetlbUsage has parsed contents of huge pages usage in bytes. type HugetlbUsage struct { Size string Bytes int64 MaxBytes int64 } // MemoryUsage has parsed contents of memory usage in bytes. type MemoryUsage struct { Bytes int64 MaxBytes int64 } // NumaLine represents one line in the NUMA statistics file. type NumaLine struct { Total int64 Nodes map[string]int64 } // NumaStat has parsed contets of a NUMA statistics file. type NumaStat struct { Total NumaLine File NumaLine Anon NumaLine Unevictable NumaLine HierarchicalTotal NumaLine HierarchicalFile NumaLine HierarchicalAnon NumaLine HierarchicalUnevictable NumaLine } // GlobalNumaStats has the statistics from one global NUMA nodestats file. type GlobalNumaStats struct { NumaHit int64 NumaMiss int64 NumaForeign int64 InterleaveHit int64 LocalNode int64 OtherNode int64 } func readCgroupFileLines(filePath string) ([]string, error) { f, err := os.ReadFile(filePath) if err != nil { return nil, err } data := string(f) rawLines := strings.Split(data, "\n") lines := make([]string, 0) // Sanitize the lines and remove empty ones. for _, rawLine := range rawLines { if len(strings.TrimSpace(rawLine)) > 0 { lines = append(lines, rawLine) } } return lines, nil } func readCgroupSingleNumber(filePath string) (int64, error) { // File looks like this: // // 4 lines, err := readCgroupFileLines(filePath) if err != nil { return 0, err } if len(lines) != 1 { return 0, fmt.Errorf("error parsing file") } number, err := strconv.ParseInt(lines[0], 10, 64) if err != nil { return 0, err } return number, nil } // GetBlkioThrottleBytes returns amount of bytes transferred to/from the disk. 
func GetBlkioThrottleBytes(cgroupPath string) (BlkioThrottleBytes, error) { const ( cgroupEntry = "blkio.throttle.io_service_bytes_recursive" ) // File looks like this: // // 8:16 Read 4223325184 // 8:16 Write 3207528448 // 8:16 Sync 5387592704 // 8:16 Async 2043260928 // 8:16 Discard 0 // 8:16 Total 7430853632 // 8:0 Read 5246572032 // 8:0 Write 2361737216 // 8:0 Sync 5575892480 // 8:0 Async 2032416768 // 8:0 Discard 0 // 8:0 Total 7608309248 // Total 15039162880 entry := path.Join(cgroupPath, cgroupEntry) lines, err := readCgroupFileLines(entry) if err != nil { return BlkioThrottleBytes{}, err } if len(lines) == 1 && lines[0] == "Total 0" { return BlkioThrottleBytes{}, nil } result := BlkioThrottleBytes{DeviceBytes: make([]*BlkioDeviceBytes, 0)} devidx := map[string]int{} for _, line := range lines { split := strings.Split(line, " ") key := split[0] if key == "Total" { if len(split) != 2 { continue } totalBytes, err := strconv.ParseInt(split[1], 10, 64) if err != nil { return BlkioThrottleBytes{}, err } result.TotalBytes = totalBytes } else { var dev *BlkioDeviceBytes majmin := strings.Split(key, ":") if len(majmin) != 2 { return BlkioThrottleBytes{}, fmt.Errorf("error parsing file %s", entry) } maj64, err := strconv.ParseInt(string(majmin[0]), 10, 32) if err != nil { return BlkioThrottleBytes{}, err } min64, err := strconv.ParseInt(string(majmin[1]), 10, 32) if err != nil { return BlkioThrottleBytes{}, err } major := int(maj64) minor := int(min64) idx, ok := devidx[split[0]] if ok { dev = result.DeviceBytes[idx] } else { dev = &BlkioDeviceBytes{ Major: major, Minor: minor, Operations: make(map[string]int64), } idx = len(result.DeviceBytes) devidx[key] = idx result.DeviceBytes = append(result.DeviceBytes, dev) } op, count := split[1], split[2] bytes, err := strconv.ParseInt(count, 10, 64) if err != nil { return BlkioThrottleBytes{}, err } dev.Operations[op] = bytes } } return result, nil } // GetCPUAcctStats retrieves CPU account statistics for a given cgroup. func GetCPUAcctStats(cgroupPath string) ([]CPUAcctUsage, error) { // File looks like this: // // cpu user system // 0 3723082232186 2456599218 // 1 3748398003001 1149546796 lines, err := readCgroupFileLines(path.Join(cgroupPath, "cpuacct.usage_all")) if err != nil { return nil, err } result := make([]CPUAcctUsage, 0, len(lines)-1) for _, line := range lines[1:] { tokens := strings.Split(line, " ") if len(tokens) != 3 { continue } cpu, err := strconv.ParseInt(tokens[0], 10, 32) if err != nil { return nil, err } user, err := strconv.ParseInt(tokens[1], 10, 64) if err != nil { return nil, err } system, err := strconv.ParseInt(tokens[2], 10, 64) if err != nil { return nil, err } result = append(result, CPUAcctUsage{CPU: int(cpu), User: user, System: system}) } return result, nil } // GetCPUSetMemoryMigrate returns boolean indicating whether memory migration is enabled. func GetCPUSetMemoryMigrate(cgroupPath string) (bool, error) { // File looks like this: // // 0 number, err := readCgroupSingleNumber(path.Join(cgroupPath, "cpuset.memory_migrate")) if err != nil { return false, err } if number == 0 { return false, nil } else if number == 1 { return true, nil } return false, fmt.Errorf("error parsing file") } // GetHugetlbUsage retrieves huge pages statistics for a given cgroup. func GetHugetlbUsage(cgroupPath string) ([]HugetlbUsage, error) { const ( prefix = "/hugetlb." 
usageSuffix = ".usage_in_bytes" maxUsageSuffix = ".max_usage_in_bytes" ) // Files look like this: // // 124 usageFiles, err := filepath.Glob(path.Join(cgroupPath, prefix+"*"+usageSuffix)) if err != nil { return nil, err } result := make([]HugetlbUsage, 0, len(usageFiles)) for _, file := range usageFiles { if strings.Contains(filepath.Base(file), ".rsvd") { // Skip reservations files. continue } size := strings.SplitN(filepath.Base(file), ".", 3)[1] bytes, err := readCgroupSingleNumber(file) if err != nil { return nil, err } max, err := readCgroupSingleNumber(strings.TrimSuffix(file, usageSuffix) + maxUsageSuffix) if err != nil { return nil, err } result = append(result, HugetlbUsage{ Size: size, Bytes: bytes, MaxBytes: max, }) } return result, nil } // GetMemoryUsage retrieves cgroup memory usage. func GetMemoryUsage(cgroupPath string) (MemoryUsage, error) { // Files look like this: // // 142 usage, err := readCgroupSingleNumber(path.Join(cgroupPath, "memory.usage_in_bytes")) if err != nil { return MemoryUsage{}, err } maxUsage, err := readCgroupSingleNumber(path.Join(cgroupPath, "memory.max_usage_in_bytes")) if err != nil { return MemoryUsage{}, err } result := MemoryUsage{ Bytes: usage, MaxBytes: maxUsage, } return result, nil } // GetNumaStats returns parsed cgroup NUMA statistics. func GetNumaStats(cgroupPath string) (NumaStat, error) { const ( cgroupEntry = "memory.numa_stat" ) // File looks like this: // // total=44611 N0=32631 N1=7501 N2=1982 N3=2497 // file=44428 N0=32614 N1=7335 N2=1982 N3=2497 // anon=183 N0=17 N1=166 N2=0 N3=0 // unevictable=0 N0=0 N1=0 N2=0 N3=0 // hierarchical_total=768133 N0=509113 N1=138887 N2=20464 N3=99669 // hierarchical_file=722017 N0=496516 N1=119997 N2=20181 N3=85323 // hierarchical_anon=46096 N0=12597 N1=18890 N2=283 N3=14326 // hierarchical_unevictable=20 N0=0 N1=0 N2=0 N3=20 entry := path.Join(cgroupPath, cgroupEntry) lines, err := readCgroupFileLines(entry) if err != nil { return NumaStat{}, err } result := NumaStat{} for _, line := range lines { split := strings.Split(line, " ") if len(line) < 2 { return NumaStat{}, fmt.Errorf("error parsing file %s", entry) } keytotal := strings.Split(split[0], "=") if len(keytotal) != 2 { return NumaStat{}, fmt.Errorf("error parsing file %s", entry) } key, tot := keytotal[0], keytotal[1] total, err := strconv.ParseInt(tot, 10, 64) if err != nil { return NumaStat{}, fmt.Errorf("error parsing file %s: %v", entry, err) } nodes := make(map[string]int64) for _, nodeEntry := range split[1:] { nodeamount := strings.Split(nodeEntry, "=") if len(nodeamount) != 2 { return NumaStat{}, fmt.Errorf("error parsing file %s", entry) } node, amount := nodeamount[0], nodeamount[1] number, err := strconv.ParseInt(amount, 10, 64) if err != nil { return NumaStat{}, fmt.Errorf("error parsing file %s: %v", entry, err) } nodes[node] = number } switch key { case "total": result.Total.Total = total result.Total.Nodes = nodes case "file": result.File.Total = total result.File.Nodes = nodes case "anon": result.Anon.Total = total result.Anon.Nodes = nodes case "unevictable": result.Unevictable.Total = total result.Unevictable.Nodes = nodes case "hierarchical_total": result.HierarchicalTotal.Total = total result.HierarchicalTotal.Nodes = nodes case "hierarchical_file": result.HierarchicalFile.Total = total result.HierarchicalFile.Nodes = nodes case "hierarchical_anon": result.HierarchicalAnon.Total = total result.HierarchicalAnon.Nodes = nodes case "hierarchical_unevictable": result.HierarchicalUnevictable.Total = total 
result.HierarchicalUnevictable.Nodes = nodes default: return NumaStat{}, fmt.Errorf("error parsing file, unknown key %s", key) } } return result, nil } // GetGlobalNumaStats returns the global (non-cgroup) NUMA statistics per node. func GetGlobalNumaStats() (map[int]GlobalNumaStats, error) { const ( prefix = "/sys/devices/system/node/node" ) // Files look like this: // // numa_hit 1851614569 // numa_miss 0 // numa_foreign 0 // interleave_hit 49101 // local_node 1851614569 // other_node 0 result := make(map[int]GlobalNumaStats) nodeDirs, err := filepath.Glob(prefix + "*") if err != nil { return map[int]GlobalNumaStats{}, err } for _, dir := range nodeDirs { id := strings.TrimPrefix(dir, prefix) node, err := strconv.ParseInt(id, 10, 0) if err != nil { return map[int]GlobalNumaStats{}, fmt.Errorf("error parsing directory name") } nodeStat := GlobalNumaStats{} numastat := path.Join(dir, "numastat") err = sysfs.ParseFileEntries(numastat, map[string]interface{}{ "numa_hit": &nodeStat.NumaHit, "numa_miss": &nodeStat.NumaMiss, "numa_foreign": &nodeStat.NumaForeign, "interleave_hit": &nodeStat.InterleaveHit, "local_node": &nodeStat.LocalNode, "other_node": &nodeStat.OtherNode, }, func(line string) (string, string, error) { fields := strings.Fields(strings.TrimSpace(line)) if len(fields) != 2 { return "", "", fmt.Errorf("failed to parse line '%s'", line) } return fields[0], fields[1], nil }, ) if err != nil { return map[int]GlobalNumaStats{}, fmt.Errorf("error parsing numastat file: %v", err) } result[int(node)] = nodeStat } return result, nil } ================================================ FILE: pkg/cgroupstats/collector.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
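The stats readers above all follow the same shape: take a cgroup directory, parse one well-known file, return a typed struct. A hedged sketch of reading memory and NUMA statistics for one container cgroup (the directory is hypothetical):

package main

import (
	"fmt"

	"github.com/intel/cri-resource-manager/pkg/cgroups"
)

func main() {
	// Hypothetical memory cgroup directory of a container.
	dir := "/sys/fs/cgroup/memory/kubepods.slice/mycontainer"

	if mem, err := cgroups.GetMemoryUsage(dir); err == nil {
		fmt.Printf("memory: %d bytes (max %d)\n", mem.Bytes, mem.MaxBytes)
	}

	if numa, err := cgroups.GetNumaStats(dir); err == nil {
		for node, pages := range numa.Total.Nodes {
			fmt.Printf("node %s: %d pages total\n", node, pages)
		}
	}
}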
package cgroupstats import ( "flag" "os" "path/filepath" "regexp" "strconv" "strings" "sync" "github.com/intel/cri-resource-manager/pkg/cgroups" logger "github.com/intel/cri-resource-manager/pkg/log" "github.com/intel/cri-resource-manager/pkg/metrics" "github.com/prometheus/client_golang/prometheus" ) // Prometheus Metric descriptor indices and descriptor table const ( numaStatsDesc = iota memoryUsageDesc memoryMigrateDesc cpuAcctUsageDesc hugeTlbUsageDesc blkioDeviceUsageDesc numDescriptors ) var descriptors = [numDescriptors]*prometheus.Desc{ numaStatsDesc: prometheus.NewDesc( "cgroup_numa_stats", "NUMA statistics for a given container and pod.", []string{ // cgroup path "container_id", // NUMA node ID "numa_node_id", // NUMA memory type "type", }, nil, ), memoryUsageDesc: prometheus.NewDesc( "cgroup_memory_usage", "Memory usage statistics for a given container and pod.", []string{ "container_id", "type", }, nil, ), memoryMigrateDesc: prometheus.NewDesc( "cgroup_memory_migrate", "Memory migrate status for a given container and pod.", []string{ "container_id", }, nil, ), cpuAcctUsageDesc: prometheus.NewDesc( "cgroup_cpu_acct", "CPU accounting for a given container and pod.", []string{ "container_id", // CPU ID "cpu", "type", }, nil, ), hugeTlbUsageDesc: prometheus.NewDesc( "cgroup_hugetlb_usage", "Hugepages usage for a given container and pod.", []string{ "container_id", "size", "type", }, nil, ), blkioDeviceUsageDesc: prometheus.NewDesc( "cgroup_blkio_device_usage", "Blkio Device bytes usage for a given container and pod.", []string{ "container_id", "major", "minor", "operation", }, nil, ), } var ( // cgroupRoot is the mount point for the cgroup (v1) filesystem cgroupRoot = "/sys/fs/cgroup" // our logger instance log = logger.NewLogger("cgroupstats") ) const ( kubepodsDir = "kubepods.slice" ) type collector struct { } // NewCollector creates new Prometheus collector func NewCollector() (prometheus.Collector, error) { return &collector{}, nil } // Describe implements prometheus.Collector interface func (c *collector) Describe(ch chan<- *prometheus.Desc) { for _, d := range descriptors { ch <- d } } func updateCPUAcctUsageMetric(ch chan<- prometheus.Metric, path string, metric []cgroups.CPUAcctUsage) { for i, acct := range metric { ch <- prometheus.MustNewConstMetric( descriptors[cpuAcctUsageDesc], prometheus.CounterValue, float64(acct.CPU), path, strconv.FormatInt(int64(i), 10), "CPU", ) ch <- prometheus.MustNewConstMetric( descriptors[cpuAcctUsageDesc], prometheus.CounterValue, float64(acct.User), path, strconv.FormatInt(int64(i), 10), "User", ) ch <- prometheus.MustNewConstMetric( descriptors[cpuAcctUsageDesc], prometheus.CounterValue, float64(acct.System), path, strconv.FormatInt(int64(i), 10), "System", ) } } func updateMemoryMigrateMetric(ch chan<- prometheus.Metric, path string, migrate bool) { migrateValue := 0 if migrate { migrateValue = 1 } ch <- prometheus.MustNewConstMetric( descriptors[memoryMigrateDesc], prometheus.GaugeValue, float64(migrateValue), path, ) } func updateMemoryUsageMetric(ch chan<- prometheus.Metric, path string, metric cgroups.MemoryUsage) { ch <- prometheus.MustNewConstMetric( descriptors[memoryUsageDesc], prometheus.GaugeValue, float64(metric.Bytes), path, "Bytes", ) ch <- prometheus.MustNewConstMetric( descriptors[memoryUsageDesc], prometheus.GaugeValue, float64(metric.MaxBytes), path, "MaxBytes", ) } func updateNumaStatMetric(ch chan<- prometheus.Metric, path string, metric cgroups.NumaStat) { // TODO: use "reflect" to iterate through the struct fields of 
NumaStat? for key, value := range metric.Total.Nodes { ch <- prometheus.MustNewConstMetric( descriptors[numaStatsDesc], prometheus.GaugeValue, float64(value), path, key, "Total", ) } for key, value := range metric.File.Nodes { ch <- prometheus.MustNewConstMetric( descriptors[numaStatsDesc], prometheus.GaugeValue, float64(value), path, key, "File", ) } for key, value := range metric.Anon.Nodes { ch <- prometheus.MustNewConstMetric( descriptors[numaStatsDesc], prometheus.GaugeValue, float64(value), path, key, "Anon", ) } for key, value := range metric.Unevictable.Nodes { ch <- prometheus.MustNewConstMetric( descriptors[numaStatsDesc], prometheus.GaugeValue, float64(value), path, key, "Unevictable", ) } for key, value := range metric.HierarchicalTotal.Nodes { ch <- prometheus.MustNewConstMetric( descriptors[numaStatsDesc], prometheus.GaugeValue, float64(value), path, key, "HierarchicalTotal", ) } for key, value := range metric.HierarchicalFile.Nodes { ch <- prometheus.MustNewConstMetric( descriptors[numaStatsDesc], prometheus.GaugeValue, float64(value), path, key, "HierarchicalFile", ) } for key, value := range metric.HierarchicalAnon.Nodes { ch <- prometheus.MustNewConstMetric( descriptors[numaStatsDesc], prometheus.GaugeValue, float64(value), path, key, "HierarchicalAnon", ) } for key, value := range metric.HierarchicalUnevictable.Nodes { ch <- prometheus.MustNewConstMetric( descriptors[numaStatsDesc], prometheus.GaugeValue, float64(value), path, key, "HierarchicalUnevictable", ) } } func updateHugeTlbUsageMetric(ch chan<- prometheus.Metric, path string, metric []cgroups.HugetlbUsage) { // One HugeTlbUsage for each size. for _, hugeTlbUsage := range metric { ch <- prometheus.MustNewConstMetric( descriptors[hugeTlbUsageDesc], prometheus.GaugeValue, float64(hugeTlbUsage.Bytes), path, hugeTlbUsage.Size, "Bytes", ) ch <- prometheus.MustNewConstMetric( descriptors[hugeTlbUsageDesc], prometheus.GaugeValue, float64(hugeTlbUsage.MaxBytes), path, hugeTlbUsage.Size, "MaxBytes", ) } } func updateBlkioDeviceUsageMetric(ch chan<- prometheus.Metric, path string, metric cgroups.BlkioThrottleBytes) { for _, deviceBytes := range metric.DeviceBytes { for operation, val := range deviceBytes.Operations { ch <- prometheus.MustNewConstMetric( descriptors[blkioDeviceUsageDesc], prometheus.CounterValue, float64(val), path, strconv.FormatInt(int64(deviceBytes.Major), 10), strconv.FormatInt(int64(deviceBytes.Minor), 10), operation, ) } } } func walkCgroups() []string { // XXX TODO: add support for kubelet cgroupfs cgroup driver. 
containerDirs := []string{} cpuset := filepath.Join(cgroupRoot, "cpuset") filepath.Walk(filepath.Join(cpuset, kubepodsDir), func(path string, info os.FileInfo, err error) error { if err != nil { if os.IsNotExist(err) { return nil } return err } if !info.IsDir() { return nil } dir := info.Name() if !strings.HasSuffix(dir, ".scope") { return nil } switch { case strings.HasPrefix(dir, "cri-containerd-"): break case strings.HasPrefix(dir, "crio-"): break case strings.HasPrefix(dir, "docker-"): break default: return filepath.SkipDir } path = strings.TrimPrefix(path, cpuset+"/") containerDirs = append(containerDirs, path) return nil }) return containerDirs } func cgroupPath(controller, path string) string { return filepath.Join(cgroupRoot, controller, path) } // Collect implements prometheus.Collector interface func (c collector) Collect(ch chan<- prometheus.Metric) { var wg sync.WaitGroup // We don't bail out on errors because those can happen if there is a race condition between // the destruction of a container and us getting to read the cgroup data. We just don't report // the values we don't get. collectors := []func(string, *regexp.Regexp){ func(path string, re *regexp.Regexp) { defer wg.Done() numa, err := cgroups.GetNumaStats(cgroupPath("memory", path)) if err == nil { updateNumaStatMetric(ch, re.FindStringSubmatch(filepath.Base(path))[0], numa) } else { log.Error("failed to collect NUMA stats for %s: %v", path, err) } }, func(path string, re *regexp.Regexp) { defer wg.Done() memory, err := cgroups.GetMemoryUsage(cgroupPath("memory", path)) if err == nil { updateMemoryUsageMetric(ch, re.FindStringSubmatch(filepath.Base(path))[0], memory) } else { log.Error("failed to collect memory usage stats for %s: %v", path, err) } }, func(path string, re *regexp.Regexp) { defer wg.Done() migrate, err := cgroups.GetCPUSetMemoryMigrate(cgroupPath("cpuset", path)) if err == nil { updateMemoryMigrateMetric(ch, re.FindStringSubmatch(filepath.Base(path))[0], migrate) } else { log.Error("failed to collect memory migration stats for %s: %v", path, err) } }, func(path string, re *regexp.Regexp) { defer wg.Done() cpuAcctUsage, err := cgroups.GetCPUAcctStats(cgroupPath("cpuacct", path)) if err == nil { updateCPUAcctUsageMetric(ch, re.FindStringSubmatch(filepath.Base(path))[0], cpuAcctUsage) } else { log.Error("failed to collect CPU accounting stats for %s: %v", path, err) } }, func(path string, re *regexp.Regexp) { defer wg.Done() hugeTlbUsage, err := cgroups.GetHugetlbUsage(cgroupPath("hugetlb", path)) if err == nil { updateHugeTlbUsageMetric(ch, re.FindStringSubmatch(filepath.Base(path))[0], hugeTlbUsage) } else { log.Error("failed to collect hugetlb stats for %s: %v", path, err) } }, func(path string, re *regexp.Regexp) { defer wg.Done() blkioDeviceUsage, err := cgroups.GetBlkioThrottleBytes(cgroupPath("blkio", path)) if err == nil { updateBlkioDeviceUsageMetric(ch, re.FindStringSubmatch(filepath.Base(path))[0], blkioDeviceUsage) } else { log.Error("failed to collect blkio stats for %s: %v", path, err) } }, } containerIDRegexp := regexp.MustCompile(`[a-z0-9]{64}`) for _, path := range walkCgroups() { wg.Add(len(collectors)) for _, fn := range collectors { go fn(path, containerIDRegexp) } } // We need to wait so that the response channel doesn't get closed. 
wg.Wait() } func init() { flag.StringVar(&cgroupRoot, "cgroup-path", cgroupRoot, "Path to cgroup filesystem mountpoint") err := metrics.RegisterCollector("cgroupstats", NewCollector) if err != nil { log.Error("failed to register cgroupstats collector: %v", err) } }
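Because the collector implements the plain prometheus.Collector interface, it can also be exercised outside the metrics package, which is handy for ad-hoc debugging. A sketch under that assumption; this is not how cri-resmgr itself wires it up, which goes through metrics.RegisterCollector above:

package main

import (
	"fmt"

	"github.com/intel/cri-resource-manager/pkg/cgroupstats"
	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	c, err := cgroupstats.NewCollector()
	if err != nil {
		panic(err)
	}
	reg := prometheus.NewRegistry()
	reg.MustRegister(c)

	// Gather walks the kubepods cgroup tree and emits one metric family
	// per descriptor that produced samples (none on a non-k8s machine).
	mfs, err := reg.Gather()
	if err != nil {
		panic(err)
	}
	for _, mf := range mfs {
		fmt.Println(mf.GetName(), "samples:", len(mf.GetMetric()))
	}
}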
================================================ FILE: pkg/config/config.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package config import ( "reflect" "sigs.k8s.io/yaml" "strings" ) const ( // MainModule is the default parent for all configuration. MainModule = "main" ) // GetConfigFn is used to query a module for its default configuration. type GetConfigFn func() interface{} // NotifyFn is used to notify a module about configuration changes. type NotifyFn func(Event, Source) error // Event describes what triggered an invocation of a configuration notification callback. type Event string const ( // UpdateEvent corresponds to a normal configuration update. UpdateEvent = "update" // RevertEvent corresponds to a configuration rollback in case of errors. RevertEvent = "rollback" ) // Source describes where configuration originates from. type Source string const ( // ConfigFile is a YAML/JSON file configuration source. ConfigFile Source = "configuration file" // ConfigExternal is an external configuration source, for instance a node agent. ConfigExternal Source = "external configuration" // ConfigBackup is a backup of the previous configuration. ConfigBackup Source = "backed up configuration" ) // Module is a logical unit of configuration, declared using Register(). type Module struct { path string // fully qualified path in dotted notation, parent.name description string // short one-line module description help string // verbose description/help about this module ptr interface{} // pointer to module configuration data parent *Module // parent module name string // name relative to parent, last part of path children map[string]*Module // modules nested under this module getdefault GetConfigFn // getter for default configuration notifiers []NotifyFn // update notification callbacks noValidate bool // omit data validation } // main is the root of our configuration. var main = &Module{ path: MainModule, name: MainModule, children: make(map[string]*Module), } // GetConfig returns the current configuration. func GetConfig() (Data, error) { return main.getconfig() } // SetConfig updates the configuration using data from an external source. func SetConfig(cfg map[string]string) error { data, err := DataFromStringMap(cfg) if err != nil { return configError("failed to update configuration: %v", err) } return setconfig(data, ConfigExternal) } // SetConfigFromFile updates the configuration from the given file. func SetConfigFromFile(path string) error { data, err := DataFromFile(path) if err != nil { return configError("failed to apply configuration from file: %v", err) } return setconfig(data, ConfigFile) } // GetModule looks up the module for the given path, implicitly creating it if necessary. func GetModule(path string) *Module { return lookup(path) } // AddNotify attaches the given update notification callback to the module. func (m *Module) AddNotify(fn NotifyFn) error { return WithNotify(fn).apply(m) } // Register registers a unit of configuration data to be handled by this package. func Register(path, description string, ptr interface{}, getfn GetConfigFn, opts ...Option) *Module { m := lookup(path) if !m.isImplicit() { log.Fatal("module %s: conflicting module with same path already declared (%s)", path, m.description) } m.setDescription(description) m.ptr = ptr m.getdefault = getfn m.check() foreign := m.notifiers m.notifiers = nil for _, opt := range opts { opt.apply(m) } m.notifiers = append(m.notifiers, foreign...) return m } // setconfig updates the configuration, notifies all modules, and does a rollback if necessary. func setconfig(data Data, source Source) error { snapshot, err := main.getconfig() if err != nil { return configError("pre-update configuration snapshot failed: %v", err) } log.Info("validating configuration...") err = main.validate(data) if err != nil { return err } log.Info("applying configuration...") err = main.configure(data, false) if err != nil { revertconfig(snapshot, false) return err } log.Info("activating configuration...") err = main.notify(UpdateEvent, source) if err != nil { log.Error("configuration rejected: %v", err) revertconfig(snapshot, true) return err } return nil } // revertconfig reverts the configuration using a previously taken snapshot. func revertconfig(snapshot Data, notify bool) { err := main.configure(snapshot, true) if err != nil { log.Error("failed to revert configuration: %v", err) } if !notify { return } err = main.notify(RevertEvent, ConfigBackup) if err != nil { log.Error("reverted configuration rejected: %v", err) } } // getconfig returns the configuration for the given module and its submodules. func (m *Module) getconfig() (Data, error) { var mcfg, ccfg Data var err error if m.isImplicit() { mcfg = make(Data) } else { mcfg, err = DataFromObject(m.ptr) if err != nil { return nil, configError("module %s: failed to get configuration: %v", m.path, err) } } for name, child := range m.children { ccfg, err = child.getconfig() if err != nil { return nil, configError("module %s: failed to get child configuration for %s: %v", m.path, child.path, err) } mcfg[name] = ccfg } return mcfg, nil } // isImplicit returns true if the module has not been explicitly declared. func (m *Module) isImplicit() bool { return m.description == "" } // hasChild checks if the module has a child with the given name. func (m *Module) hasChild(name string) bool { _, ok := m.children[name] return ok } // configure reconfigures the given module and its submodules with the provided data.
func (m *Module) configure(data Data, force bool) error { log.Debug("module %s: reconfiguring...", m.path) modcfg, subcfg := data.split(m.hasChild) if err := m.apply(modcfg); err != nil { if !force { return err } log.Error("%v", err) } for name, child := range m.children { childcfg, err := subcfg.pick(name, true) if err != nil { err = configError("module %s: failed to pick configuration: %v", child.path, err) if !force { return err } log.Error("%v", err) } err = child.configure(childcfg, force) if err != nil { if !force { return err } log.Error("%v", err) } } return nil } // apply applies the given module-local configuration to the module. func (m *Module) apply(cfg Data) error { if m.isImplicit() { return nil } log.Debug("module %s: applying module configuration...", m.path) // First, reset module config to defaults defcfg, err := DataFromObject(m.getdefault()) if err != nil { return configError("module %s: failed to retrieve default configuration: %v", m.path, err) } raw, err := yaml.Marshal(defcfg) if err != nil { return configError("module %s: failed to marshal default configuration: %v", m.path, err) } if err = yaml.Unmarshal(raw, m.ptr); err != nil { return configError("module %s: failed to pre-reset to default configuration: %v", m.path, err) } // Second, apply the given config on top of the defaults if len(cfg) > 0 { raw, err = yaml.Marshal(cfg) if err != nil { return configError("module %s: failed to marshal configuration: %v", m.path, err) } if err = yaml.Unmarshal(raw, m.ptr); err != nil { return configError("module %s: failed to apply configuration: %v", m.path, err) } } return nil } // notify notifies this module and its children about a configuration change. func (m *Module) notify(event Event, source Source) error { for _, child := range m.children { if err := child.notify(event, source); err != nil { return err } } for _, fn := range m.notifiers { if err := fn(event, source); err != nil { return configError("module %s rejected %v configuration: %v", m.path, event, err) } } return nil } // check performs basic sanity checks on the module. func (m *Module) check() { ptrType := reflect.TypeOf(m.ptr) ptr := reflect.ValueOf(m.ptr).Elem() if ptrType.Kind() != reflect.Ptr || ptr.Kind() != reflect.Struct { log.Fatal("module %s: configuration data must be a pointer to a struct, not %T", m.path, m.ptr) } if m.parent == nil || m.parent.isImplicit() { return } ptr = reflect.ValueOf(m.parent.ptr).Elem() for i := 0; i < ptr.NumField(); i++ { field := ptr.Type().Field(i) if m.name == fieldName(field) { log.Fatal("module %s: parent has configuration data with conflicting field", m.name) } } } // getFields does a deep discovery of all fields of a struct, including embedded (struct composition) fields. func getFields(typ reflect.Type) map[string]struct{} { fields := make(map[string]struct{}) var get func(t reflect.Type) get = func(t reflect.Type) { for i := 0; i < t.NumField(); i++ { f := t.Field(i) if f.Type.Kind() == reflect.Struct && f.Anonymous { get(f.Type) } else { fields[fieldName(f)] = struct{}{} } } } get(typ) return fields } // validate checks that each field of data refers to either module data or a submodule.
func (m *Module) validate(data Data) error { log.Debug("validating data for module %s...", m.path) modcfg, subcfg := data.split(m.hasChild) fields := map[string]struct{}{} if m.isImplicit() { if len(modcfg) > 0 { names := []string{} for name := range modcfg { names = append(names, name) } if !m.noValidate { return configError("implicit module %s: given configuration data %s", m.path, strings.Join(names, ",")) } log.Error("implicit module %s: given configuration data %s", m.path, strings.Join(names, ",")) } } else { fields = getFields(reflect.TypeOf(m.ptr).Elem()) } for field := range modcfg { if _, ok := fields[field]; !ok { if !m.noValidate { return configError("module %s: given unknown configuration data %s", m.path, field) } log.Error("module %s: given unknown configuration data %s", m.path, field) } } subcfg = subcfg.copy() for name, child := range m.children { childcfg, err := subcfg.pick(name, true) if err != nil { return configError("module %s: failed to pick configuration for child %s: %v", m.path, child.path, err) } err = child.validate(childcfg) if err != nil { return err } } if len(subcfg) > 0 { unconsumed := []string{} for name := range subcfg { unconsumed = append(unconsumed, name) } return configError("module %s: no child corresponding to data %s", m.path, strings.Join(unconsumed, ",")) } return nil } // fieldName returns the name used to refer to the struct field in JSON/YAML encoding. func fieldName(f reflect.StructField) string { val, ok := f.Tag.Lookup("json") if !ok { return f.Name } tags := strings.Split(val, ",") if len(tags) < 1 { return f.Name } name := tags[0] if name == "" { return f.Name } return name } // lookup finds/creates a module corresponding to the given dotted module path. func lookup(path string) *Module { names := strings.Split(path, ".") path = "" module := main for _, name := range names { if path != "" { path += "." + name } else { path = name } m, ok := module.children[name] if !ok { m = &Module{ path: path, parent: module, name: name, children: make(map[string]*Module), } module.children[name] = m } module = m } return module } // Print prints the current configuration, using the given function or fmt.Printf. func Print(printfn func(string, ...interface{})) { data, err := GetConfig() if err != nil { log.Error("error: failed to get configuration: %v", err) return } data.Print(printfn) }
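To make the registration flow concrete, here is a hedged sketch of how a module typically hooks into this package. The module name and config struct are made up for illustration; Register, GetConfigFn and NotifyFn are used exactly as declared above:

package mymodule // hypothetical module using pkg/config

import (
	"github.com/intel/cri-resource-manager/pkg/config"
)

// options is this module's configuration data; json tags pick the YAML keys.
type options struct {
	Enabled bool            `json:"Enabled"`
	Timeout config.Duration `json:"Timeout"`
}

var opt = defaultOptions().(*options)

// defaultOptions returns a fresh copy of the defaults (the GetConfigFn).
func defaultOptions() interface{} {
	return &options{Enabled: true}
}

func init() {
	// Register under a dotted path; configuration for "mymodule" in the
	// cri-resmgr config now lands in opt, with validation and rollback.
	m := config.Register("mymodule", "example module configuration", opt, defaultOptions)
	_ = m.AddNotify(func(event config.Event, source config.Source) error {
		// React to configuration changes here; returning an error rejects
		// the new configuration and triggers a revert.
		return nil
	})
}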
================================================ FILE: pkg/config/data.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package config import ( "fmt" "os" "sigs.k8s.io/yaml" "strings" ) // Data is our internal representation of configuration data. type Data map[string]interface{} // DataFromObject remarshals the given object into configuration data. func DataFromObject(obj interface{}) (Data, error) { raw, err := yaml.Marshal(obj) if err != nil { return nil, configError("failed to marshal object %T to data: %v", obj, err) } data := make(Data) if err = yaml.Unmarshal(raw, &data); err != nil { return nil, configError("failed to unmarshal object %T to data: %v", obj, err) } return data, nil } // DataFromStringMap remarshals the given map into configuration data. func DataFromStringMap(smap map[string]string) (Data, error) { data := make(Data) for key, val := range smap { var obj interface{} if err := yaml.Unmarshal([]byte(val), &obj); err != nil { return nil, configError("failed to unmarshal data from map: %v", err) } data[key] = obj } return data, nil } // DataFromFile unmarshals the content of the given file into configuration data. func DataFromFile(path string) (Data, error) { raw, err := os.ReadFile(path) if err != nil { return nil, configError("failed to read file %q: %v", path, err) } data := make(Data) if err := yaml.Unmarshal(raw, &data); err != nil { return nil, configError("failed to load configuration from file %q: %v", path, err) } return data, nil } // copy does a shallow copy of the given data. func (d Data) copy() Data { data := make(Data) for key, value := range d { data[key] = value } return data } // split splits up the given data to module- and child-specific parts. func (d Data) split(hasChild func(string) bool) (Data, Data) { mod, sub := make(Data), make(Data) for key, val := range d { if hasChild(key) || strings.IndexByte(key, '.') != -1 { sub[key] = val } else { mod[key] = val } } return mod, sub } // pick picks data for the given key. func (d Data) pick(key string, removePicked bool) (Data, error) { var data Data var err error if obj, ok := d[key]; ok { data, err = DataFromObject(obj) if err != nil { return nil, err } if removePicked { delete(d, key) } } // pick/remove data for all dotted keys matching the key being picked for k, v := range d { split := strings.Split(k, ".") if len(split) > 1 && split[0] == key { if data == nil { data = make(Data) } subkey := strings.Join(split[1:], ".") if _, ok := data[subkey]; ok { return nil, configError("dotted key %q conflicts with nested key %q", k, subkey) } data[subkey] = v if removePicked { delete(d, k) } } } return data, nil } // String returns configuration data as a string. func (d Data) String() string { raw, err := yaml.Marshal(d) if err != nil { return fmt.Sprintf("<failed to marshal configuration data: %v>", err) } return string(raw) } // Print prints the configuration data using the given function or fmt.Printf. func (d Data) Print(fn func(string, ...interface{})) { if fn == nil { fn = func(format string, args ...interface{}) { fmt.Printf(format+"\n", args...) } } for _, line := range strings.Split(d.String(), "\n") { fn("%s", line) } } ================================================ FILE: pkg/config/duration.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License.
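One detail of the Data helpers worth showing is the dotted-key handling in pick: a flat external key such as "logger.Debug" is routed to the nested module during validation and configure. A small sketch using only the exported entry points; the keys and values are hypothetical:

package main

import (
	"fmt"

	"github.com/intel/cri-resource-manager/pkg/config"
)

func main() {
	// External sources (e.g. the node agent) hand configuration over as a
	// flat string map; each value is parsed as YAML.
	data, err := config.DataFromStringMap(map[string]string{
		"policy":       "Active: topology-aware",
		"logger.Debug": "cri-resmgr", // dotted key, routed to a nested module
	})
	if err != nil {
		panic(err)
	}
	fmt.Println(data.String())
}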
package config import ( "fmt" "time" ) // Duration is a time.Duration which implements JSON marshalling/unmarshalling. type Duration time.Duration // MarshalJSON is the JSON marshaller for (time.)Duration. func (d Duration) MarshalJSON() ([]byte, error) { return []byte("\"" + time.Duration(d).String() + "\""), nil } // UnmarshalJSON is the JSON unmarshaller for (time.)Duration. func (d *Duration) UnmarshalJSON(data []byte) error { if len(data) < 2 { return fmt.Errorf("invalid Duration data") } parsed, err := time.ParseDuration(string(data[1 : len(data)-1])) if err != nil { return err } *d = Duration(parsed) return nil } // String returns the value of Duration as a string. func (d *Duration) String() string { return time.Duration(*d).String() } ================================================ FILE: pkg/config/error.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package config import ( "fmt" ) // configError creates a formatted configuration-specific error. func configError(format string, args ...interface{}) error { return fmt.Errorf("config error: "+format, args...) } ================================================ FILE: pkg/config/help.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package config import ( "fmt" "reflect" "sort" "strings" ) // Describe provides help about configuration of the given modules. func Describe(names ...string) { modules := findModules(names, nil) if len(modules) == 0 { fmt.Printf("No matching modules found.\n") return } for _, m := range modules { m.showHelp() fmt.Printf("\n\n") } } func (m *Module) setDescription(description string) { description = strings.Trim(description, "\n") if description == "" { m.description = "Module " + m.path + " has no description." 
return } if strings.IndexByte(description, '\n') == -1 { m.description = description } else { lines := strings.Split(description, "\n") m.description = lines[0] m.help = strings.Trim(strings.Join(lines[1:], "\n"), "\n") } } func (m *Module) showHelp() { kind := "module" if m.isImplicit() { kind = "implicit module" } fmt.Printf("- %s %s: %s\n", kind, m.name, m.description) fmt.Printf(" full path: %s\n", m.path) if len(m.children) > 0 { submodules, sep := "", "" for _, child := range m.children { submodules += sep + child.path sep = ", " } fmt.Printf(" sub-modules: %s\n", submodules) } fmt.Printf(" description:\n") if m.help != "" { fmt.Printf("\n") for _, line := range strings.Split(m.help, "\n") { fmt.Printf(" %s\n", line) } } else { m.describeData() } } func (m *Module) describeData() { if m.isImplicit() { return } cfg := reflect.ValueOf(m.ptr).Elem() fmt.Printf(" No runtime configuration documentation for this package...\n") fmt.Printf(" Package runtime configuration data type: %s %s.\n", cfg.Type().Kind().String(), cfg.Type().String()) } func findModules(names []string, m *Module) []*Module { if m == nil { m = main } matches := []*Module{} if len(names) == 0 { matches = append(matches, m) } else { for _, name := range names { switch { case name == m.name || name == m.path: matches = append(matches, m) case name[0] == '.' && name[len(name)-1] == '.' && strings.Index(m.path, name) > 0: matches = append(matches, m) case name[0] == '.' && strings.HasSuffix(m.path, name): matches = append(matches, m) case name[len(name)-1] == '.' && strings.HasPrefix(m.path, name): matches = append(matches, m) } } } children := []*Module{} for _, child := range m.children { children = append(children, child) } sort.Slice(children, func(i, j int) bool { return strings.Compare(children[i].path, children[j].path) < 0 }, ) for _, child := range children { matches = append(matches, findModules(names, child)...) } return matches } ================================================ FILE: pkg/config/log.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package config import ( "fmt" "os" ) // // Notes: // Unless we split the Logger interface (pkg/log.Logger) from its actual implementation // we cannot import it here. pkg/log itself implements its runtime configurability // using this module so we would end up with an import cycle. As a workaround for now we // let our logger be set externally and we set it from pkg/log. // // Logger is our set of logging functions. type Logger struct { DebugEnabled func() bool Debug func(string, ...interface{}) Info func(string, ...interface{}) Warning func(string, ...interface{}) Error func(string, ...interface{}) Fatal func(string, ...interface{}) Panic func(string, ...interface{}) } // log is our Logger. var log = defaultLogger()
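// NOTE: editor-added usage sketch, not part of the original source. Per the
// notes above, pkg/log injects its real implementation through SetLogger.
// Only non-nil fields override the defaults, so a partial Logger is enough
// to redirect, say, debug messages:
func exampleSetLogger() {
	SetLogger(Logger{
		Debug: func(format string, args ...interface{}) {
			fmt.Printf("D: [custom] "+format+"\n", args...)
		},
	})
}

// SetLogger sets our logger.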
func SetLogger(logger Logger) { if logger.DebugEnabled != nil { log.DebugEnabled = logger.DebugEnabled } if logger.Debug != nil { log.Debug = logger.Debug } if logger.Info != nil { log.Info = logger.Info } if logger.Warning != nil { log.Warning = logger.Warning } if logger.Error != nil { log.Error = logger.Error } if logger.Panic != nil { log.Panic = logger.Panic } if logger.Fatal != nil { log.Fatal = logger.Fatal } } func defaultLogger() Logger { return Logger{ DebugEnabled: debugEnabled, Debug: debugmsg, Info: infomsg, Warning: warningmsg, Error: errormsg, Fatal: fatalmsg, Panic: panicmsg, } } func debugEnabled() bool { return true } func debugmsg(format string, args ...interface{}) { fmt.Printf("D: [config] "+format+"\n", args...) } func infomsg(format string, args ...interface{}) { fmt.Printf("I: [config] "+format+"\n", args...) } func warningmsg(format string, args ...interface{}) { fmt.Printf("W: [config] "+format+"\n", args...) } func errormsg(format string, args ...interface{}) { fmt.Printf("E: [config] "+format+"\n", args...) } func fatalmsg(format string, args ...interface{}) { fmt.Printf("E: [config] fatal error: "+format+"\n", args...) os.Exit(1) } func panicmsg(format string, args ...interface{}) { errormsg(format, args...) panic(fmt.Sprintf("fatal error: "+format+"\n", args...)) } ================================================ FILE: pkg/config/options.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package config // WithNotify specifies a notification function to be called after configuration updates. func WithNotify(fn NotifyFn) Option { return newFuncOption(func(o interface{}) error { switch o.(type) { case *Module: m := o.(*Module) m.notifiers = append(m.notifiers, fn) default: return configError("WithNotify is not valid for object of type %T", o) } return nil }) } // WithoutDataValidation specifies that data passed to this module should not be validated. func WithoutDataValidation() Option { return newFuncOption(func(o interface{}) error { switch o.(type) { case *Module: m := o.(*Module) m.noValidate = true default: return configError("WithoutDataValidation is not valid for object of type %T", o) } return nil }) } // Option is the generic interface for any option applicable to a Module or Config. type Option interface { apply(interface{}) error } // funcOption is a generic functional option. type funcOption struct { f func(interface{}) error } // apply applies a functional option to an object. func (fo *funcOption) apply(o interface{}) error { return fo.f(o) } // newFuncOption creates a new option instance. func newFuncOption(f func(interface{}) error) *funcOption { return &funcOption{ f: f, } } ================================================ FILE: pkg/cpuallocator/allocator.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cpuallocator import ( "fmt" "sort" logger "github.com/intel/cri-resource-manager/pkg/log" "github.com/intel/cri-resource-manager/pkg/sysfs" "github.com/intel/cri-resource-manager/pkg/utils" "github.com/intel/cri-resource-manager/pkg/utils/cpuset" "github.com/intel/goresctrl/pkg/sst" idset "github.com/intel/goresctrl/pkg/utils" ) // AllocFlag represents CPU allocation preferences. type AllocFlag uint const ( // AllocIdlePackages requests allocation of full idle packages. AllocIdlePackages AllocFlag = 1 << iota // AllocIdleNodes requests allocation of full idle NUMA nodes. AllocIdleNodes // AllocIdleCores requests allocation of full idle cores (all threads in core). AllocIdleCores // AllocDefault is the default set of allocation preferences. AllocDefault = AllocIdlePackages | AllocIdleCores logSource = "cpuallocator" ) // allocatorHelper encapsulates state for allocating CPUs. type allocatorHelper struct { logger.Logger // allocatorHelper logger instance sys sysfs.System // sysfs CPU and topology information topology topologyCache // cached topology information flags AllocFlag // allocation preferences from cpuset.CPUSet // set of CPUs to allocate from prefer CPUPriority // CPU priority to prefer cnt int // number of CPUs to allocate result cpuset.CPUSet // set of CPUs allocated pkgs []sysfs.CPUPackage // physical CPU packages, sorted by preference cpus []sysfs.CPU // CPU cores, sorted by preference } // CPUAllocator is an interface for a generic CPU allocator type CPUAllocator interface { AllocateCpus(from *cpuset.CPUSet, cnt int, prefer CPUPriority) (cpuset.CPUSet, error) ReleaseCpus(from *cpuset.CPUSet, cnt int, prefer CPUPriority) (cpuset.CPUSet, error) } type CPUPriority int const ( PriorityHigh CPUPriority = iota PriorityNormal PriorityLow NumCPUPriorities PriorityNone = NumCPUPriorities ) type cpuAllocator struct { logger.Logger sys sysfs.System // wrapped sysfs.System instance topologyCache topologyCache // topology lookups } // topologyCache caches topology lookups type topologyCache struct { pkg map[idset.ID]cpuset.CPUSet node map[idset.ID]cpuset.CPUSet core map[idset.ID]cpuset.CPUSet cpuPriorities cpuPriorities // CPU priority mapping } type cpuPriorities [NumCPUPriorities]cpuset.CPUSet // IDFilter helps filter IDs. type IDFilter func(idset.ID) bool // IDSorter helps sort IDs. type IDSorter func(int, int) bool // our logger instance var log = logger.NewLogger(logSource) // NewCPUAllocator returns a new cpuAllocator instance func NewCPUAllocator(sys sysfs.System) CPUAllocator { ca := cpuAllocator{ Logger: log, sys: sys, topologyCache: newTopologyCache(sys), } return &ca } // Pick packages, nodes or CPUs by filtering according to a function. func pickIds(idSlice []idset.ID, f IDFilter) []idset.ID { ids := make([]idset.ID, len(idSlice)) idx := 0 for _, id := range idSlice { if f == nil || f(id) { ids[idx] = id idx++ } } return ids[0:idx] }
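// NOTE: editor-added usage sketch, not part of the original source. A typical
// caller carves CPUs out of a free pool; the pool is shrunk in place and the
// allocated set is returned. The sysfs.System instance is assumed to come
// from sysfs discovery (the package tests use sysfs.DiscoverSystemAt).
func exampleAllocate(sys sysfs.System) (cpuset.CPUSet, error) {
	pool := cpuset.New(0, 1, 2, 3, 4, 5, 6, 7) // free CPUs to allocate from
	ca := NewCPUAllocator(sys)
	// Take two CPUs, preferring high-priority ones; pool loses the taken CPUs.
	return ca.AllocateCpus(&pool, 2, PriorityHigh)
}

// newAllocatorHelper creates a new CPU allocatorHelper.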
func newAllocatorHelper(sys sysfs.System, topo topologyCache) *allocatorHelper { a := &allocatorHelper{ Logger: log, sys: sys, topology: topo, flags: AllocDefault, } return a } // Allocate full idle CPU packages. func (a *allocatorHelper) takeIdlePackages() { a.Debug("* takeIdlePackages()...") offline := a.sys.Offlined() // pick idle packages pkgs := pickIds(a.sys.PackageIDs(), func(id idset.ID) bool { cset := a.topology.pkg[id].Difference(offline) return cset.Intersection(a.from).Equals(cset) }) // sorted by number of preferred cpus and then by cpu id sort.Slice(pkgs, func(i, j int) bool { if res := a.topology.cpuPriorities.cmpCPUSet(a.topology.pkg[pkgs[i]], a.topology.pkg[pkgs[j]], a.prefer, -1); res != 0 { return res > 0 } return pkgs[i] < pkgs[j] }) a.Debug(" => idle packages sorted by preference: %v", pkgs) // take as many idle packages as we need/can for _, id := range pkgs { cset := a.topology.pkg[id].Difference(offline) a.Debug(" => considering package %v (#%s)...", id, cset) if a.cnt >= cset.Size() { a.Debug(" => taking package %v...", id) a.result = a.result.Union(cset) a.from = a.from.Difference(cset) a.cnt -= cset.Size() if a.cnt == 0 { break } } } } // Allocate full idle CPU cores. func (a *allocatorHelper) takeIdleCores() { a.Debug("* takeIdleCores()...") offline := a.sys.Offlined() // pick (first id for all) idle cores cores := pickIds(a.sys.CPUIDs(), func(id idset.ID) bool { cset := a.topology.core[id].Difference(offline) if cset.IsEmpty() { return false } return cset.Intersection(a.from).Equals(cset) && cset.List()[0] == int(id) }) // sorted by number of preferred cpus and then by core id sort.Slice(cores, func(i, j int) bool { if res := a.topology.cpuPriorities.cmpCPUSet(a.topology.core[cores[i]], a.topology.core[cores[j]], a.prefer, -1); res != 0 { return res > 0 } return cores[i] < cores[j] }) a.Debug(" => idle cores sorted by preference: %v", cores) // take as many idle cores as we can for _, id := range cores { cset := a.topology.core[id].Difference(offline) a.Debug(" => considering core %v (#%s)...", id, cset) if a.cnt >= cset.Size() { a.Debug(" => taking core %v...", id) a.result = a.result.Union(cset) a.from = a.from.Difference(cset) a.cnt -= cset.Size() if a.cnt == 0 { break } } } } // Allocate idle CPU hyperthreads.
func (a *allocatorHelper) takeIdleThreads() { offline := a.sys.Offlined() // pick all threads with free capacity cores := pickIds(a.sys.CPUIDs(), func(id idset.ID) bool { return a.from.Difference(offline).Contains(int(id)) }) a.Debug(" => idle threads unsorted: %v", cores) // sorted for preference by id, mimicking cpus_assignment.go for now: // IOW, prefer CPUs // - from packages with higher number of CPUs/cores already in a.result // - from packages having larger number of available cpus with preferred priority // - from a single package // - from the list of cpus with preferred priority // - from packages with fewer remaining free CPUs/cores in a.from // - from cores with fewer remaining free CPUs/cores in a.from // - from packages with lower id // - with lower id sort.Slice(cores, func(i, j int) bool { iCore := cores[i] jCore := cores[j] iPkg := a.sys.CPU(iCore).PackageID() jPkg := a.sys.CPU(jCore).PackageID() iCoreSet := a.topology.core[iCore] jCoreSet := a.topology.core[jCore] iPkgSet := a.topology.pkg[iPkg] jPkgSet := a.topology.pkg[jPkg] iPkgColo := iPkgSet.Intersection(a.result).Size() jPkgColo := jPkgSet.Intersection(a.result).Size() if iPkgColo != jPkgColo { return iPkgColo > jPkgColo } // Always sort cores in package order if res := a.topology.cpuPriorities.cmpCPUSet(iPkgSet.Intersection(a.from), jPkgSet.Intersection(a.from), a.prefer, a.cnt); res != 0 { return res > 0 } if iPkg != jPkg { return iPkg < jPkg } iCset := cpuset.New(int(cores[i])) jCset := cpuset.New(int(cores[j])) if res := a.topology.cpuPriorities.cmpCPUSet(iCset, jCset, a.prefer, 0); res != 0 { return res > 0 } iPkgFree := iPkgSet.Intersection(a.from).Size() jPkgFree := jPkgSet.Intersection(a.from).Size() if iPkgFree != jPkgFree { return iPkgFree < jPkgFree } iCoreFree := iCoreSet.Intersection(a.from).Size() jCoreFree := jCoreSet.Intersection(a.from).Size() if iCoreFree != jCoreFree { return iCoreFree < jCoreFree } return iCore < jCore }) a.Debug(" => idle threads sorted: %v", cores) // take as many idle threads as we need for _, id := range cores { cset := a.topology.core[id].Difference(offline) a.Debug(" => considering thread %v (#%s)...", id, cset) cset = cpuset.New(int(id)) a.result = a.result.Union(cset) a.from = a.from.Difference(cset) a.cnt -= cset.Size() if a.cnt == 0 { break } } } // takeAny is a dummy allocator not dependent on sysfs topology information func (a *allocatorHelper) takeAny() { a.Debug("* takeAny()...") cpus := a.from.List() if len(cpus) >= a.cnt { cset := cpuset.New(cpus[0:a.cnt]...) a.result = a.result.Union(cset) a.from = a.from.Difference(cset) a.cnt = 0 } }
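// NOTE: editor-added sketch, not part of the original source. allocate()
// below walks the stages gated by the helper's flags: whole idle packages
// first (AllocIdlePackages), then whole idle cores (AllocIdleCores), then
// individual idle threads as the final fallback. Clearing a flag skips that
// stage; e.g. a helper that never grabs whole packages:
func exampleCoreGranularityHelper(sys sysfs.System, topo topologyCache) *allocatorHelper {
	a := newAllocatorHelper(sys, topo)
	a.flags = AllocIdleCores // no AllocIdlePackages: skip the package stage
	return a
}

// Perform CPU allocation.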
func (a *allocatorHelper) allocate() cpuset.CPUSet { if a.sys != nil { if (a.flags & AllocIdlePackages) != 0 { a.takeIdlePackages() } if a.cnt > 0 && (a.flags&AllocIdleCores) != 0 { a.takeIdleCores() } if a.cnt > 0 { a.takeIdleThreads() } } else { a.takeAny() } if a.cnt == 0 { return a.result } return cpuset.New() } func (ca *cpuAllocator) allocateCpus(from *cpuset.CPUSet, cnt int, prefer CPUPriority) (cpuset.CPUSet, error) { var result cpuset.CPUSet var err error switch { case from.Size() < cnt: result, err = cpuset.New(), fmt.Errorf("cpuset %s does not have %d CPUs", from, cnt) case from.Size() == cnt: result, err, *from = from.Clone(), nil, cpuset.New() default: a := newAllocatorHelper(ca.sys, ca.topologyCache) a.from = from.Clone() a.cnt = cnt a.prefer = prefer result, err, *from = a.allocate(), nil, a.from.Clone() a.Debug("%d cpus from #%v (preferring #%v) => #%v", cnt, from.Union(result), a.prefer, result) } return result, err } // AllocateCpus allocates a number of CPUs from the given set. func (ca *cpuAllocator) AllocateCpus(from *cpuset.CPUSet, cnt int, prefer CPUPriority) (cpuset.CPUSet, error) { result, err := ca.allocateCpus(from, cnt, prefer) return result, err } // ReleaseCpus releases a number of CPUs from the given set. func (ca *cpuAllocator) ReleaseCpus(from *cpuset.CPUSet, cnt int, prefer CPUPriority) (cpuset.CPUSet, error) { oset := from.Clone() result, err := ca.allocateCpus(from, from.Size()-cnt, prefer) ca.Debug("ReleaseCpus(#%s, %d) => kept: #%s, released: #%s", oset, cnt, from, result) return result, err } func newTopologyCache(sys sysfs.System) topologyCache { c := topologyCache{ pkg: make(map[idset.ID]cpuset.CPUSet), node: make(map[idset.ID]cpuset.CPUSet), core: make(map[idset.ID]cpuset.CPUSet)} if sys != nil { for _, id := range sys.PackageIDs() { c.pkg[id] = sys.Package(id).CPUSet() } for _, id := range sys.NodeIDs() { c.node[id] = sys.Node(id).CPUSet() } for _, id := range sys.CPUIDs() { c.core[id] = sys.CPU(id).ThreadCPUSet() } } c.discoverCPUPriorities(sys) return c } func (c *topologyCache) discoverCPUPriorities(sys sysfs.System) { if sys == nil { return } var prio cpuPriorities // Discover on per-package basis for id := range c.pkg { cpuPriorities, sstActive := c.discoverSstCPUPriority(sys, id) if !sstActive { cpuPriorities = c.discoverCpufreqPriority(sys, id) } for p, cpus := range cpuPriorities { source := map[bool]string{true: "sst", false: "cpufreq"}[sstActive] cset := sysfs.CPUSetFromIDSet(idset.NewIDSet(cpus...)) log.Debug("package #%d (%s): %d %s priority cpus (%v)", id, source, len(cpus), CPUPriority(p), cset) prio[p] = prio[p].Union(cset) } } c.cpuPriorities = prio } func (c *topologyCache) discoverSstCPUPriority(sys sysfs.System, pkgID idset.ID) ([NumCPUPriorities][]idset.ID, bool) { active := false pkg := sys.Package(pkgID) sst := pkg.SstInfo() cpuIDs := c.pkg[pkgID].List() prios := make(map[idset.ID]CPUPriority, len(cpuIDs)) // Determine SST-based priority. Based on experimentation there is some // hierarchy between the SST features. Without trying to be too smart // we follow the principles below: // 1. SST-TF has highest preference, mastering over SST-BF and making most // of SST-CP settings ineffective // 2. SST-CP dictates over SST-BF // 3. 
SST-BF is meaningful if neither SST-TF nor SST-CP is enabled switch { case sst == nil: case sst.TFEnabled: log.Debug("package #%d: using SST-TF based CPU prioritization", pkgID) // We only look at the CLOS id as SST-TF (seems to) follow ordered CLOS priority for _, i := range cpuIDs { id := idset.ID(i) p := PriorityLow // First two CLOSes are prioritized by SST if sys.CPU(id).SstClos() < 2 { p = PriorityHigh } prios[id] = p } active = true case sst.CPEnabled: closPrio := c.sstClosPriority(sys, pkgID) log.Debug("package #%d: using SST-CP based CPU prioritization with CLOS mapping %v", pkgID, closPrio) active = false for _, i := range cpuIDs { id := idset.ID(i) clos := sys.CPU(id).SstClos() p := closPrio[clos] if p != PriorityNormal { active = true } prios[id] = p } } if !active && sst != nil && sst.BFEnabled { log.Debug("package #%d: using SST-BF based CPU prioritization", pkgID) for _, i := range cpuIDs { id := idset.ID(i) p := PriorityLow if sst.BFCores.Has(id) { p = PriorityHigh } prios[id] = p } active = true } var ret [NumCPUPriorities][]idset.ID for cpu, prio := range prios { ret[prio] = append(ret[prio], cpu) } return ret, active } func (c *topologyCache) sstClosPriority(sys sysfs.System, pkgID idset.ID) map[int]CPUPriority { sortedKeys := func(m map[int]int) []int { keys := make([]int, 0, len(m)) for k := range m { keys = append(keys, k) } sort.Ints(keys) return keys } pkg := sys.Package(pkgID) sstinfo := pkg.SstInfo() // Get a list of unique CLOS proportional priority values closPps := make(map[int]int) closIds := make(map[int]int) for _, cpuID := range c.pkg[pkgID].List() { clos := sys.CPU(idset.ID(cpuID)).SstClos() pp := sstinfo.ClosInfo[clos].ProportionalPriority closPps[pp] = clos closIds[clos] = 0 // 0 is a dummy value here } // Form a list of (active) CLOS ids in sorted order var closSorted []int if sstinfo.CPPriority == sst.Ordered { // In ordered mode the priority is simply the CLOS id closSorted = sortedKeys(closIds) log.Debug("package #%d, ordered SST-CP priority with CLOS ids %v", pkgID, closSorted) } else { // In proportional mode we sort by the proportional priority parameter closPpSorted := sortedKeys(closPps) for _, pp := range closPpSorted { closSorted = append(closSorted, closPps[pp]) } log.Debug("package #%d, proportional SST-CP priority with PP-to-CLOS mapping %v", pkgID, closPps) } // Map from CLOS id to cpuallocator CPU priority closPriority := make(map[int]CPUPriority, len(closSorted)) for _, id := range closSorted { // Default to normal priority closPriority[id] = PriorityNormal } if len(closSorted) > 1 { // First CLOS in the sorted order maps to high CPU priority, the last one to low closPriority[closSorted[0]] = PriorityHigh closPriority[closSorted[len(closSorted)-1]] = PriorityLow } return closPriority } func (c *topologyCache) discoverCpufreqPriority(sys sysfs.System, pkgID idset.ID) [NumCPUPriorities][]idset.ID { var prios [NumCPUPriorities][]idset.ID // Group cpus by base frequency and energy performance profile freqs := map[uint64][]idset.ID{} epps := map[sysfs.EPP][]idset.ID{} cpuIDs := c.pkg[pkgID].List() for _, num := range cpuIDs { id := idset.ID(num) cpu := sys.CPU(id) bf := cpu.BaseFrequency() freqs[bf] = append(freqs[bf], id) epp := cpu.EPP() epps[epp] = append(epps[epp], id) } // Construct sorted lists of detected frequencies and epp values freqList := []uint64{} for freq := range freqs { if freq > 0 { freqList = append(freqList, freq) } } utils.SortUint64s(freqList) eppList := []int{} for e := range epps { if e != sysfs.EPPUnknown { eppList = append(eppList, int(e)) } }
sort.Ints(eppList) // Finally, determine priority of each CPU for _, num := range cpuIDs { id := idset.ID(num) cpu := sys.CPU(id) p := PriorityNormal if len(freqList) > 1 { bf := cpu.BaseFrequency() // All cpus NOT in the lowest base frequency bin are considered high prio if bf > freqList[0] { p = PriorityHigh } else { p = PriorityLow } } // All cpus NOT in the lowest performance epp are considered high prio // NOTE: higher EPP value denotes lower performance preference if len(eppList) > 1 { epp := cpu.EPP() if int(epp) < eppList[len(eppList)-1] { p = PriorityHigh } else { p = PriorityLow } } prios[p] = append(prios[p], id) } return prios } func (p CPUPriority) String() string { switch p { case PriorityHigh: return "high" case PriorityNormal: return "normal" case PriorityLow: return "low" } return "none" } // cmpCPUSet compares two cpusets in terms of preferred cpu priority. Returns: // // > 0 if cpuset A is preferred // < 0 if cpuset B is preferred // 0 if cpusets A and B are equal in terms of cpu priority func (c *cpuPriorities) cmpCPUSet(csetA, csetB cpuset.CPUSet, prefer CPUPriority, cpuCnt int) int { if prefer == PriorityNone { return 0 } // Favor cpuset having CPUs with priorities equal to or lower than what was requested for prio := prefer; prio < NumCPUPriorities; prio++ { prefA := csetA.Intersection(c[prio]).Size() prefB := csetB.Intersection(c[prio]).Size() if cpuCnt > 0 && prio == prefer && prefA >= cpuCnt && prefB >= cpuCnt { // Prefer the tightest fitting if both cpusets satisfy the // requested amount of CPUs with the preferred priority return prefB - prefA } if prefA != prefB { return prefA - prefB } } // Repel cpuset having CPUs with higher priority than what was requested for prio := PriorityHigh; prio < prefer; prio++ { nonprefA := csetA.Intersection(c[prio]).Size() nonprefB := csetB.Intersection(c[prio]).Size() if nonprefA != nonprefB { return nonprefB - nonprefA } } return 0 } ================================================ FILE: pkg/cpuallocator/cpuallocator_test.go ================================================ // Copyright 2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package cpuallocator import ( "os" "path" "testing" "github.com/intel/cri-resource-manager/pkg/sysfs" "github.com/intel/cri-resource-manager/pkg/utils" "github.com/intel/cri-resource-manager/pkg/utils/cpuset" ) func TestAllocatorHelper(t *testing.T) { // Create tmpdir and decompress testdata there tmpdir, err := os.MkdirTemp("", "cri-resource-manager-test-") if err != nil { t.Fatalf("failed to create tmpdir: %v", err) } defer os.RemoveAll(tmpdir) if err := utils.UncompressTbz2(path.Join("testdata", "sysfs.tar.bz2"), tmpdir); err != nil { t.Fatalf("failed to decompress testdata: %v", err) } // Discover mock system from the testdata sys, err := sysfs.DiscoverSystemAt(path.Join(tmpdir, "sysfs", "2-socket-4-node-40-core", "sys")) if err != nil { t.Fatalf("failed to discover mock system: %v", err) } topoCache := newTopologyCache(sys) // Fake cpu priorities: 5 cores from pkg #0 as high prio // Package CPUs: #0: [0-19,40-59], #1: [20-39,60-79] topoCache.cpuPriorities = [NumCPUPriorities]cpuset.CPUSet{ cpuset.MustParse("2,5,8,15,17,42,45,48,55,57"), cpuset.MustParse("20-39,60-79"), cpuset.MustParse("0,1,3,4,6,7,9-14,16,18,19,40,41,43,44,46,47,49-54,56,58,59"), } tcs := []struct { description string from cpuset.CPUSet prefer CPUPriority cnt int expected cpuset.CPUSet }{ { description: "too few available CPUs", from: cpuset.MustParse("2,3,10-14,20"), prefer: PriorityNormal, cnt: 9, expected: cpuset.New(), }, { description: "request all available CPUs", from: cpuset.MustParse("2,3,10-14,20"), prefer: PriorityNormal, cnt: 8, expected: cpuset.MustParse("2,3,10-14,20"), }, { description: "prefer high priority cpus", from: cpuset.MustParse("2,3,10-25"), prefer: PriorityHigh, cnt: 4, expected: cpuset.New(2, 3, 15, 17), }, } // Run tests for _, tc := range tcs { t.Run(tc.description, func(t *testing.T) { a := newAllocatorHelper(sys, topoCache) a.from = tc.from a.prefer = tc.prefer a.cnt = tc.cnt result := a.allocate() if !result.Equals(tc.expected) { t.Errorf("expected %q, result was %q", tc.expected, result) } }) } } ================================================ FILE: pkg/cri/client/client.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package client import ( "context" "fmt" "net" "os" "syscall" "time" "google.golang.org/grpc" "google.golang.org/grpc/connectivity" criv1 "k8s.io/cri-api/pkg/apis/runtime/v1" "github.com/intel/cri-resource-manager/pkg/instrumentation" logger "github.com/intel/cri-resource-manager/pkg/log" "github.com/intel/cri-resource-manager/pkg/utils" v1 "github.com/intel/cri-resource-manager/pkg/cri/client/v1" ) // DialNotifyFn is a function to call after a successful net.Dial[Timeout](). type DialNotifyFn func(string, int, int, os.FileMode, error) // Options contains the configurable options of our CRI client. type Options struct { // ImageSocket is the socket path for the CRI image service. ImageSocket string // RuntimeSocket is the socket path for the CRI runtime service. 
RuntimeSocket string // DialNotify is an optional function to notify after net.Dial returns for a socket. DialNotify DialNotifyFn } // ConnectOptions contains options for connecting to the server. type ConnectOptions struct { // Wait indicates whether Connect() should wait (indefinitely) for the server. Wait bool // Reconnect indicates whether CheckConnection() should attempt to Connect(). Reconnect bool } // Client is the interface we expose to our CRI client. type Client interface { // Connect tries to connect the client to the specified image and runtime services. Connect(ConnectOptions) error // Close closes any existing client connections. Close() // CheckConnection checks if we have (un-Close()'d as opposed to working) connections. CheckConnection(ConnectOptions) error // HasRuntimeService checks if the client is configured with runtime services. HasRuntimeService() bool // We expose full image and runtime client services. criv1.ImageServiceClient criv1.RuntimeServiceClient } type criClient interface { criv1.ImageServiceClient criv1.RuntimeServiceClient } // client is the implementation of Client. type client struct { logger.Logger criv1.ImageServiceClient criv1.RuntimeServiceClient options Options // client options icc *grpc.ClientConn // our gRPC connection to the image service rcc *grpc.ClientConn // our gRPC connection to the runtime service client criClient } const ( // DontConnect is used to mark a socket to not be connected. DontConnect = "-" ) // NewClient creates a new client instance. func NewClient(options Options) (Client, error) { if options.ImageSocket == DontConnect && options.RuntimeSocket == DontConnect { return nil, clientError("neither image nor runtime socket specified") } c := &client{ Logger: logger.NewLogger("cri/client"), options: options, } return c, nil } // Connect attempts to establish gRPC client connections to the configured services. func (c *client) Connect(options ConnectOptions) error { var err error kind, socket := "image services", c.options.ImageSocket if c.icc, err = c.connect(kind, socket, options); err != nil { return err } kind, socket = "runtime services", c.options.RuntimeSocket if socket == c.options.ImageSocket { c.rcc = c.icc } else { if c.rcc, err = c.connect(kind, socket, options); err != nil { c.icc = nil return err } } client, err := v1.Connect(c.rcc, c.icc) if err != nil { return err } c.client = client return nil } // Close any open service connection. func (c *client) Close() { if c.icc != nil { c.Debug("closing image service connection...") c.icc.Close() } if c.rcc != nil { c.Debug("closing runtime service connection...") if c.rcc != c.icc { c.rcc.Close() } } c.icc = nil c.rcc = nil } // Check if the connection to CRI services is up, try to reconnect if requested. func (c *client) CheckConnection(options ConnectOptions) error { if (c.icc == nil || c.icc.GetState() == connectivity.Ready) && (c.rcc == nil || c.rcc.GetState() == connectivity.Ready) { return nil } c.Close() if options.Reconnect { c.Warn("client connections are down") if err := c.Connect(ConnectOptions{Wait: false}); err == nil { return nil } } return clientError("client connections are down") }
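// NOTE: editor-added usage sketch, not part of the original source. A
// relay-style caller typically connects both CRI services over a single
// runtime socket and waits for the server to come up. The socket path here
// is illustrative only.
func exampleClient() (Client, error) {
	c, err := NewClient(Options{
		ImageSocket:   "/var/run/runtime.sock",
		RuntimeSocket: "/var/run/runtime.sock",
	})
	if err != nil {
		return nil, err
	}
	if err := c.Connect(ConnectOptions{Wait: true}); err != nil {
		return nil, err
	}
	return c, nil
}

// HasRuntimeService checks if the client is configured with runtime services.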
func (c *client) HasRuntimeService() bool { return c.options.RuntimeSocket != "" && c.options.RuntimeSocket != DontConnect } func (c *client) checkRuntimeService() error { if c.client == nil || c.rcc == nil { return clientError("no CRI RuntimeService client") } return nil } func (c *client) checkImageService() error { if c.client == nil || c.icc == nil { return clientError("no CRI ImageService client") } return nil } // connect attempts to create a gRPC client connection to the given socket. func (c *client) connect(kind, socket string, options ConnectOptions) (*grpc.ClientConn, error) { var cc *grpc.ClientConn var err error if socket == DontConnect { return nil, nil } dialOpts := instrumentation.InjectGrpcClientTrace( grpc.WithInsecure(), grpc.WithBlock(), grpc.FailOnNonTempDialError(true), grpc.WithDialer(func(socket string, timeout time.Duration) (net.Conn, error) { conn, err := net.DialTimeout("unix", socket, timeout) if err != nil { return conn, err } c.dialNotify(socket) return conn, err })) if options.Wait { c.Info("waiting for %s on socket %s...", kind, socket) if err = utils.WaitForServer(socket, -1, dialOpts, &cc); err != nil { return nil, clientError("failed to connect to %s: %v", kind, err) } } else { if cc, err = grpc.Dial(socket, dialOpts...); err != nil { return nil, clientError("failed to connect to %s: %v", kind, err) } } return cc, nil } func (c *client) dialNotify(socket string) { if c.options.DialNotify == nil { return } info, err := os.Stat(socket) if err != nil { c.options.DialNotify(socket, -1, -1, 0, err) return } st, ok := info.Sys().(*syscall.Stat_t) if !ok { err := clientError("no syscall stat info available for socket %q", socket) c.options.DialNotify(socket, -1, -1, 0, err) return } uid, gid := int(st.Uid), int(st.Gid) mode := info.Mode() & os.ModePerm c.options.DialNotify(socket, uid, gid, mode, nil) } func (c *client) Version(ctx context.Context, in *criv1.VersionRequest, _ ...grpc.CallOption) (*criv1.VersionResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.Version(ctx, in) } func (c *client) RunPodSandbox(ctx context.Context, in *criv1.RunPodSandboxRequest, _ ...grpc.CallOption) (*criv1.RunPodSandboxResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.RunPodSandbox(ctx, in) } func (c *client) StopPodSandbox(ctx context.Context, in *criv1.StopPodSandboxRequest, _ ...grpc.CallOption) (*criv1.StopPodSandboxResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.StopPodSandbox(ctx, in) } func (c *client) RemovePodSandbox(ctx context.Context, in *criv1.RemovePodSandboxRequest, _ ...grpc.CallOption) (*criv1.RemovePodSandboxResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.RemovePodSandbox(ctx, in) } func (c *client) PodSandboxStatus(ctx context.Context, in *criv1.PodSandboxStatusRequest, _ ...grpc.CallOption) (*criv1.PodSandboxStatusResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.PodSandboxStatus(ctx, in) } func (c *client) ListPodSandbox(ctx context.Context, in *criv1.ListPodSandboxRequest, _ ...grpc.CallOption) (*criv1.ListPodSandboxResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.ListPodSandbox(ctx, in) } func (c *client) CreateContainer(ctx context.Context, in *criv1.CreateContainerRequest, _ ...grpc.CallOption) (*criv1.CreateContainerResponse, error) { if err :=
c.checkRuntimeService(); err != nil { return nil, err } return c.client.CreateContainer(ctx, in) } func (c *client) StartContainer(ctx context.Context, in *criv1.StartContainerRequest, _ ...grpc.CallOption) (*criv1.StartContainerResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.StartContainer(ctx, in) } func (c *client) StopContainer(ctx context.Context, in *criv1.StopContainerRequest, _ ...grpc.CallOption) (*criv1.StopContainerResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.StopContainer(ctx, in) } func (c *client) RemoveContainer(ctx context.Context, in *criv1.RemoveContainerRequest, _ ...grpc.CallOption) (*criv1.RemoveContainerResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.RemoveContainer(ctx, in) } func (c *client) ListContainers(ctx context.Context, in *criv1.ListContainersRequest, _ ...grpc.CallOption) (*criv1.ListContainersResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.ListContainers(ctx, in) } func (c *client) ContainerStatus(ctx context.Context, in *criv1.ContainerStatusRequest, _ ...grpc.CallOption) (*criv1.ContainerStatusResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.ContainerStatus(ctx, in) } func (c *client) UpdateContainerResources(ctx context.Context, in *criv1.UpdateContainerResourcesRequest, _ ...grpc.CallOption) (*criv1.UpdateContainerResourcesResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.UpdateContainerResources(ctx, in) } func (c *client) ReopenContainerLog(ctx context.Context, in *criv1.ReopenContainerLogRequest, _ ...grpc.CallOption) (*criv1.ReopenContainerLogResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.ReopenContainerLog(ctx, in) } func (c *client) ExecSync(ctx context.Context, in *criv1.ExecSyncRequest, _ ...grpc.CallOption) (*criv1.ExecSyncResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.ExecSync(ctx, in) } func (c *client) Exec(ctx context.Context, in *criv1.ExecRequest, _ ...grpc.CallOption) (*criv1.ExecResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.Exec(ctx, in) } func (c *client) Attach(ctx context.Context, in *criv1.AttachRequest, _ ...grpc.CallOption) (*criv1.AttachResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.Attach(ctx, in) } func (c *client) PortForward(ctx context.Context, in *criv1.PortForwardRequest, _ ...grpc.CallOption) (*criv1.PortForwardResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.PortForward(ctx, in) } func (c *client) ContainerStats(ctx context.Context, in *criv1.ContainerStatsRequest, _ ...grpc.CallOption) (*criv1.ContainerStatsResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.ContainerStats(ctx, in) } func (c *client) ListContainerStats(ctx context.Context, in *criv1.ListContainerStatsRequest, _ ...grpc.CallOption) (*criv1.ListContainerStatsResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.ListContainerStats(ctx, in) } func (c *client) PodSandboxStats(ctx context.Context, in *criv1.PodSandboxStatsRequest, _ ...grpc.CallOption) (*criv1.PodSandboxStatsResponse, 
error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.PodSandboxStats(ctx, in) } func (c *client) ListPodSandboxStats(ctx context.Context, in *criv1.ListPodSandboxStatsRequest, _ ...grpc.CallOption) (*criv1.ListPodSandboxStatsResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.ListPodSandboxStats(ctx, in) } func (c *client) UpdateRuntimeConfig(ctx context.Context, in *criv1.UpdateRuntimeConfigRequest, _ ...grpc.CallOption) (*criv1.UpdateRuntimeConfigResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.UpdateRuntimeConfig(ctx, in) } func (c *client) Status(ctx context.Context, in *criv1.StatusRequest, _ ...grpc.CallOption) (*criv1.StatusResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.client.Status(ctx, in) } func (c *client) CheckpointContainer(ctx context.Context, in *criv1.CheckpointContainerRequest, _ ...grpc.CallOption) (*criv1.CheckpointContainerResponse, error) { return c.client.CheckpointContainer(ctx, in) } func (c *client) GetContainerEvents(ctx context.Context, in *criv1.GetEventsRequest, _ ...grpc.CallOption) (criv1.RuntimeService_GetContainerEventsClient, error) { return c.client.GetContainerEvents(ctx, in) } func (c *client) ListMetricDescriptors(ctx context.Context, in *criv1.ListMetricDescriptorsRequest, _ ...grpc.CallOption) (*criv1.ListMetricDescriptorsResponse, error) { return c.client.ListMetricDescriptors(ctx, in) } func (c *client) ListPodSandboxMetrics(ctx context.Context, in *criv1.ListPodSandboxMetricsRequest, _ ...grpc.CallOption) (*criv1.ListPodSandboxMetricsResponse, error) { return c.client.ListPodSandboxMetrics(ctx, in) } func (c *client) RuntimeConfig(ctx context.Context, in *criv1.RuntimeConfigRequest, _ ...grpc.CallOption) (*criv1.RuntimeConfigResponse, error) { return c.client.RuntimeConfig(ctx, in) } func (c *client) ListImages(ctx context.Context, in *criv1.ListImagesRequest, _ ...grpc.CallOption) (*criv1.ListImagesResponse, error) { if err := c.checkImageService(); err != nil { return nil, err } return c.client.ListImages(ctx, in) } func (c *client) ImageStatus(ctx context.Context, in *criv1.ImageStatusRequest, _ ...grpc.CallOption) (*criv1.ImageStatusResponse, error) { if err := c.checkImageService(); err != nil { return nil, err } return c.client.ImageStatus(ctx, in) } func (c *client) PullImage(ctx context.Context, in *criv1.PullImageRequest, _ ...grpc.CallOption) (*criv1.PullImageResponse, error) { if err := c.checkImageService(); err != nil { return nil, err } return c.client.PullImage(ctx, in) } func (c *client) RemoveImage(ctx context.Context, in *criv1.RemoveImageRequest, _ ...grpc.CallOption) (*criv1.RemoveImageResponse, error) { if err := c.checkImageService(); err != nil { return nil, err } return c.client.RemoveImage(ctx, in) } func (c *client) ImageFsInfo(ctx context.Context, in *criv1.ImageFsInfoRequest, _ ...grpc.CallOption) (*criv1.ImageFsInfoResponse, error) { if err := c.checkImageService(); err != nil { return nil, err } return c.client.ImageFsInfo(ctx, in) } // Return a formatted client-specific error. func clientError(format string, args ...interface{}) error { return fmt.Errorf("cri/client: "+format, args...) } ================================================ FILE: pkg/cri/client/v1/client.go ================================================ // Copyright Intel Corporation. All Rights Reserved. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package v1 import ( "context" "fmt" "google.golang.org/grpc" criv1 "k8s.io/cri-api/pkg/apis/runtime/v1" logger "github.com/intel/cri-resource-manager/pkg/log" ) type Client interface { criv1.ImageServiceClient criv1.RuntimeServiceClient } type client struct { logger.Logger isc criv1.ImageServiceClient rsc criv1.RuntimeServiceClient rcc *grpc.ClientConn icc *grpc.ClientConn } // Connect connects the CRI v1 RuntimeService and ImageService clients. func Connect(runtime, image *grpc.ClientConn) (Client, error) { c := &client{ Logger: logger.Get("cri/client"), rcc: runtime, icc: image, } if c.rcc != nil { c.Info("probing CRI v1 RuntimeService client...") c.rsc = criv1.NewRuntimeServiceClient(c.rcc) _, err := c.rsc.Version(context.Background(), &criv1.VersionRequest{}) if err != nil { return nil, err } } if c.icc != nil { c.Info("probing CRI v1 ImageService client...") c.isc = criv1.NewImageServiceClient(c.icc) _, err := c.isc.ListImages(context.Background(), &criv1.ListImagesRequest{}) if err != nil { return nil, err } } return c, nil } func (c *client) checkRuntimeService() error { if c.rcc == nil { return fmt.Errorf("no CRI v1 RuntimeService client") } return nil } func (c *client) checkImageService() error { if c.icc == nil { return fmt.Errorf("no CRI v1 ImageService client") } return nil } func (c *client) Version(ctx context.Context, in *criv1.VersionRequest, _ ...grpc.CallOption) (*criv1.VersionResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.Version(ctx, in) } func (c *client) RunPodSandbox(ctx context.Context, in *criv1.RunPodSandboxRequest, _ ...grpc.CallOption) (*criv1.RunPodSandboxResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.RunPodSandbox(ctx, in) } func (c *client) StopPodSandbox(ctx context.Context, in *criv1.StopPodSandboxRequest, _ ...grpc.CallOption) (*criv1.StopPodSandboxResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.StopPodSandbox(ctx, in) } func (c *client) RemovePodSandbox(ctx context.Context, in *criv1.RemovePodSandboxRequest, _ ...grpc.CallOption) (*criv1.RemovePodSandboxResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.RemovePodSandbox(ctx, in) } func (c *client) PodSandboxStatus(ctx context.Context, in *criv1.PodSandboxStatusRequest, _ ...grpc.CallOption) (*criv1.PodSandboxStatusResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.PodSandboxStatus(ctx, in) } func (c *client) ListPodSandbox(ctx context.Context, in *criv1.ListPodSandboxRequest, _ ...grpc.CallOption) (*criv1.ListPodSandboxResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.ListPodSandbox(ctx, in) } func (c *client) CreateContainer(ctx context.Context, in *criv1.CreateContainerRequest, _ ...grpc.CallOption) (*criv1.CreateContainerResponse, error) { if err := c.checkRuntimeService(); err !=
nil { return nil, err } return c.rsc.CreateContainer(ctx, in) } func (c *client) StartContainer(ctx context.Context, in *criv1.StartContainerRequest, _ ...grpc.CallOption) (*criv1.StartContainerResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.StartContainer(ctx, in) } func (c *client) StopContainer(ctx context.Context, in *criv1.StopContainerRequest, _ ...grpc.CallOption) (*criv1.StopContainerResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.StopContainer(ctx, in) } func (c *client) RemoveContainer(ctx context.Context, in *criv1.RemoveContainerRequest, _ ...grpc.CallOption) (*criv1.RemoveContainerResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.RemoveContainer(ctx, in) } func (c *client) ListContainers(ctx context.Context, in *criv1.ListContainersRequest, _ ...grpc.CallOption) (*criv1.ListContainersResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.ListContainers(ctx, in) } func (c *client) ContainerStatus(ctx context.Context, in *criv1.ContainerStatusRequest, _ ...grpc.CallOption) (*criv1.ContainerStatusResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.ContainerStatus(ctx, in) } func (c *client) UpdateContainerResources(ctx context.Context, in *criv1.UpdateContainerResourcesRequest, _ ...grpc.CallOption) (*criv1.UpdateContainerResourcesResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.UpdateContainerResources(ctx, in) } func (c *client) ReopenContainerLog(ctx context.Context, in *criv1.ReopenContainerLogRequest, _ ...grpc.CallOption) (*criv1.ReopenContainerLogResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.ReopenContainerLog(ctx, in) } func (c *client) ExecSync(ctx context.Context, in *criv1.ExecSyncRequest, _ ...grpc.CallOption) (*criv1.ExecSyncResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.ExecSync(ctx, in) } func (c *client) Exec(ctx context.Context, in *criv1.ExecRequest, _ ...grpc.CallOption) (*criv1.ExecResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.Exec(ctx, in) } func (c *client) Attach(ctx context.Context, in *criv1.AttachRequest, _ ...grpc.CallOption) (*criv1.AttachResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.Attach(ctx, in) } func (c *client) PortForward(ctx context.Context, in *criv1.PortForwardRequest, _ ...grpc.CallOption) (*criv1.PortForwardResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.PortForward(ctx, in) } func (c *client) ContainerStats(ctx context.Context, in *criv1.ContainerStatsRequest, _ ...grpc.CallOption) (*criv1.ContainerStatsResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.ContainerStats(ctx, in) } func (c *client) ListContainerStats(ctx context.Context, in *criv1.ListContainerStatsRequest, _ ...grpc.CallOption) (*criv1.ListContainerStatsResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.ListContainerStats(ctx, in) } func (c *client) PodSandboxStats(ctx context.Context, in *criv1.PodSandboxStatsRequest, _ ...grpc.CallOption) (*criv1.PodSandboxStatsResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } 
return c.rsc.PodSandboxStats(ctx, in) } func (c *client) ListPodSandboxStats(ctx context.Context, in *criv1.ListPodSandboxStatsRequest, _ ...grpc.CallOption) (*criv1.ListPodSandboxStatsResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.ListPodSandboxStats(ctx, in) } func (c *client) UpdateRuntimeConfig(ctx context.Context, in *criv1.UpdateRuntimeConfigRequest, _ ...grpc.CallOption) (*criv1.UpdateRuntimeConfigResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.UpdateRuntimeConfig(ctx, in) } func (c *client) Status(ctx context.Context, in *criv1.StatusRequest, _ ...grpc.CallOption) (*criv1.StatusResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.Status(ctx, in) } func (c *client) CheckpointContainer(ctx context.Context, in *criv1.CheckpointContainerRequest, _ ...grpc.CallOption) (*criv1.CheckpointContainerResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.CheckpointContainer(ctx, in) } func (c *client) GetContainerEvents(ctx context.Context, in *criv1.GetEventsRequest, _ ...grpc.CallOption) (criv1.RuntimeService_GetContainerEventsClient, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } eventsClient, err := c.rsc.GetContainerEvents(ctx, in) if err != nil { return nil, err } return eventsClient, err } func (c *client) ListMetricDescriptors(ctx context.Context, in *criv1.ListMetricDescriptorsRequest, _ ...grpc.CallOption) (*criv1.ListMetricDescriptorsResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.ListMetricDescriptors(ctx, in) } func (c *client) ListPodSandboxMetrics(ctx context.Context, in *criv1.ListPodSandboxMetricsRequest, _ ...grpc.CallOption) (*criv1.ListPodSandboxMetricsResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.ListPodSandboxMetrics(ctx, in) } func (c *client) RuntimeConfig(ctx context.Context, in *criv1.RuntimeConfigRequest, _ ...grpc.CallOption) (*criv1.RuntimeConfigResponse, error) { if err := c.checkRuntimeService(); err != nil { return nil, err } return c.rsc.RuntimeConfig(ctx, in) } func (c *client) ListImages(ctx context.Context, in *criv1.ListImagesRequest, _ ...grpc.CallOption) (*criv1.ListImagesResponse, error) { if err := c.checkImageService(); err != nil { return nil, err } return c.isc.ListImages(ctx, in) } func (c *client) ImageStatus(ctx context.Context, in *criv1.ImageStatusRequest, _ ...grpc.CallOption) (*criv1.ImageStatusResponse, error) { if err := c.checkImageService(); err != nil { return nil, err } return c.isc.ImageStatus(ctx, in) } func (c *client) PullImage(ctx context.Context, in *criv1.PullImageRequest, _ ...grpc.CallOption) (*criv1.PullImageResponse, error) { if err := c.checkImageService(); err != nil { return nil, err } return c.isc.PullImage(ctx, in) } func (c *client) RemoveImage(ctx context.Context, in *criv1.RemoveImageRequest, _ ...grpc.CallOption) (*criv1.RemoveImageResponse, error) { if err := c.checkImageService(); err != nil { return nil, err } return c.isc.RemoveImage(ctx, in) } func (c *client) ImageFsInfo(ctx context.Context, in *criv1.ImageFsInfoRequest, _ ...grpc.CallOption) (*criv1.ImageFsInfoResponse, error) { if err := c.checkImageService(); err != nil { return nil, err } return c.isc.ImageFsInfo(ctx, in) } // Return a formatted client-specific error. 
func clientError(format string, args ...interface{}) error { return fmt.Errorf("cri/client: "+format, args...) } ================================================ FILE: pkg/cri/relay/image-service.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package relay import ( "context" criv1 "k8s.io/cri-api/pkg/apis/runtime/v1" ) func (r *relay) ListImages(ctx context.Context, req *criv1.ListImagesRequest) (*criv1.ListImagesResponse, error) { return r.client.ListImages(ctx, req) } func (r *relay) ImageStatus(ctx context.Context, req *criv1.ImageStatusRequest) (*criv1.ImageStatusResponse, error) { return r.client.ImageStatus(ctx, req) } func (r *relay) PullImage(ctx context.Context, req *criv1.PullImageRequest) (*criv1.PullImageResponse, error) { return r.client.PullImage(ctx, req) } func (r *relay) RemoveImage(ctx context.Context, req *criv1.RemoveImageRequest) (*criv1.RemoveImageResponse, error) { return r.client.RemoveImage(ctx, req) } func (r *relay) ImageFsInfo(ctx context.Context, req *criv1.ImageFsInfoRequest) (*criv1.ImageFsInfoResponse, error) { return r.client.ImageFsInfo(ctx, req) } ================================================ FILE: pkg/cri/relay/relay.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package relay import ( "fmt" "os" "sync" criv1 "k8s.io/cri-api/pkg/apis/runtime/v1" "github.com/intel/cri-resource-manager/pkg/cri/client" "github.com/intel/cri-resource-manager/pkg/cri/server" logger "github.com/intel/cri-resource-manager/pkg/log" ) const ( // DisableService is used to mark a socket/service to not be connected. DisableService = client.DontConnect // DefaultImageSocket uses the runtime socket for the image service, too. DefaultImageSocket = "default" ) // Options contains the configurable options of our CRI relay. type Options struct { // RelaySocket is the socket path for the CRI relay services. RelaySocket string // ImageSocket is the socket path for the (real) CRI image services. ImageSocket string // RuntimeSocket is the socket path for the (real) CRI runtime services. RuntimeSocket string // QualifyReqFn produces context for disambiguating a CRI request/reply. QualifyReqFn func(interface{}) string } // Relay is the interface we expose for controlling our CRI relay. type Relay interface { // Setup prepares the relay to start processing CRI requests.
	Setup() error
	// Start starts the relay.
	Start() error
	// Stop stops the relay.
	Stop()
	// Client returns the relay's client interface.
	Client() client.Client
	// Server returns the relay's server interface.
	Server() server.Server
}

// relay is the implementation of Relay.
type relay struct {
	logger.Logger
	sync.Mutex
	options   Options       // relay options
	client    client.Client // relay CRI client
	server    server.Server // relay CRI server
	evtClient criv1.RuntimeService_GetContainerEventsClient
	evtChans  map[*criv1.GetEventsRequest]chan *criv1.ContainerEventResponse
}

// NewRelay creates a new relay instance.
func NewRelay(options Options) (Relay, error) {
	var err error

	r := &relay{
		Logger:   logger.NewLogger("cri/relay"),
		options:  options,
		evtChans: map[*criv1.GetEventsRequest]chan *criv1.ContainerEventResponse{},
	}

	imageSocket := r.options.ImageSocket
	if imageSocket == DefaultImageSocket {
		imageSocket = r.options.RuntimeSocket
	}
	cltopts := client.Options{
		ImageSocket:   imageSocket,
		RuntimeSocket: r.options.RuntimeSocket,
		DialNotify:    r.dialNotify,
	}
	if r.client, err = client.NewClient(cltopts); err != nil {
		return nil, relayError("failed to create relay client: %v", err)
	}

	srvopts := server.Options{
		Socket:       r.options.RelaySocket,
		User:         -1,
		Group:        -1,
		Mode:         0660,
		QualifyReqFn: r.options.QualifyReqFn,
	}
	if r.server, err = server.NewServer(srvopts); err != nil {
		return nil, relayError("failed to create relay server: %v", err)
	}

	return r, nil
}

// Setup prepares the relay to start processing requests.
func (r *relay) Setup() error {
	if err := r.client.Connect(client.ConnectOptions{Wait: true}); err != nil {
		return relayError("client connection failed: %v", err)
	}
	if r.options.ImageSocket != DisableService {
		if err := r.server.RegisterImageService(r); err != nil {
			return relayError("failed to register image service: %v", err)
		}
	}
	if r.options.RuntimeSocket != DisableService {
		if err := r.server.RegisterRuntimeService(r); err != nil {
			return relayError("failed to register runtime service: %v", err)
		}
	}
	return nil
}

// Start starts the relay's request processing goroutine.
func (r *relay) Start() error {
	if err := r.server.Start(); err != nil {
		return relayError("failed to start relay: %v", err)
	}
	return nil
}

// Stop stops the relay.
func (r *relay) Stop() {
	r.client.Close()
	r.server.Stop()
}

// Client returns the relay's Client interface.
func (r *relay) Client() client.Client {
	return r.client
}

// Server returns the relay's Server interface.
func (r *relay) Server() server.Server {
	return r.server
}

func (r *relay) dialNotify(socket string, uid int, gid int, mode os.FileMode, err error) {
	if err != nil {
		r.Error("failed to determine permissions/ownership of client socket %q: %v", socket, err)
		return
	}

	// Notes:
	//   Kubelet has separate configuration/command line options for the container
	//   runtime's Image and Runtime Services. Hence, in principle it is possible
	//   that we run with two separate sockets for those. However, we always expose
	//   both services over our single relay socket. Since we cannot set two sets of
	//   ownership and permissions on a single socket, if this situation arises in
	//   practice we choose to go with the runtime socket's properties.
if r.options.ImageSocket != r.options.RuntimeSocket { if socket != r.options.RuntimeSocket && r.options.RuntimeSocket != client.DontConnect { r.Warn("ignoring ownership/permissions of dedicated CR Image Service socket...") return } } if err := r.server.Chown(uid, gid); err != nil { r.Error("server socket ownership change request failed: %v", err) } else { if err := r.server.Chmod(mode); err != nil { r.Error("server socket permissions change request failed: %v", err) } } } // relayError creates a formatted relay-specific error. func relayError(format string, args ...interface{}) error { return fmt.Errorf("cri/relay: "+format, args...) } ================================================ FILE: pkg/cri/relay/runtime-service.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package relay import ( "context" "fmt" "time" criv1 "k8s.io/cri-api/pkg/apis/runtime/v1" "github.com/intel/cri-resource-manager/pkg/dump" ) func (r *relay) dump(method string, req interface{}) { if r.DebugEnabled() { qualif := r.qualifier(req) dump.RequestMessage("relayed", method, qualif, req, true) } } // qualifier pulls a qualifier for disambiguation from a CRI request message. func (r *relay) qualifier(msg interface{}) string { if fn := r.options.QualifyReqFn; fn != nil { return fn(msg) } return "" } func (r *relay) Version(ctx context.Context, req *criv1.VersionRequest) (*criv1.VersionResponse, error) { r.dump("Version", req) return r.client.Version(ctx, req) } func (r *relay) RunPodSandbox(ctx context.Context, req *criv1.RunPodSandboxRequest) (*criv1.RunPodSandboxResponse, error) { r.dump("RunPodSandbox", req) return r.client.RunPodSandbox(ctx, req) } func (r *relay) StopPodSandbox(ctx context.Context, req *criv1.StopPodSandboxRequest) (*criv1.StopPodSandboxResponse, error) { r.dump("StopPodSandbox", req) return r.client.StopPodSandbox(ctx, req) } func (r *relay) RemovePodSandbox(ctx context.Context, req *criv1.RemovePodSandboxRequest) (*criv1.RemovePodSandboxResponse, error) { r.dump("RemovePodSandbox", req) return r.client.RemovePodSandbox(ctx, req) } func (r *relay) PodSandboxStatus(ctx context.Context, req *criv1.PodSandboxStatusRequest) (*criv1.PodSandboxStatusResponse, error) { r.dump("PodSandboxStatus", req) return r.client.PodSandboxStatus(ctx, req) } func (r *relay) ListPodSandbox(ctx context.Context, req *criv1.ListPodSandboxRequest) (*criv1.ListPodSandboxResponse, error) { r.dump("ListPodSandbox", req) return r.client.ListPodSandbox(ctx, req) } func (r *relay) CreateContainer(ctx context.Context, req *criv1.CreateContainerRequest) (*criv1.CreateContainerResponse, error) { r.dump("CreateContainer", req) return r.client.CreateContainer(ctx, req) } func (r *relay) StartContainer(ctx context.Context, req *criv1.StartContainerRequest) (*criv1.StartContainerResponse, error) { r.dump("StartContainer", req) return r.client.StartContainer(ctx, req) } func (r *relay) StopContainer(ctx context.Context, req 
*criv1.StopContainerRequest) (*criv1.StopContainerResponse, error) { r.dump("StopContainer", req) return r.client.StopContainer(ctx, req) } func (r *relay) RemoveContainer(ctx context.Context, req *criv1.RemoveContainerRequest) (*criv1.RemoveContainerResponse, error) { r.dump("RemoveContainer", req) return r.client.RemoveContainer(ctx, req) } func (r *relay) ListContainers(ctx context.Context, req *criv1.ListContainersRequest) (*criv1.ListContainersResponse, error) { r.dump("ListContainers", req) return r.client.ListContainers(ctx, req) } func (r *relay) ContainerStatus(ctx context.Context, req *criv1.ContainerStatusRequest) (*criv1.ContainerStatusResponse, error) { r.dump("ContainerStatus", req) return r.client.ContainerStatus(ctx, req) } func (r *relay) UpdateContainerResources(ctx context.Context, req *criv1.UpdateContainerResourcesRequest) (*criv1.UpdateContainerResourcesResponse, error) { r.dump("UpdateContainerResources", req) return r.client.UpdateContainerResources(ctx, req) } func (r *relay) ReopenContainerLog(ctx context.Context, req *criv1.ReopenContainerLogRequest) (*criv1.ReopenContainerLogResponse, error) { r.dump("ReopenContainerLog", req) return r.client.ReopenContainerLog(ctx, req) } func (r *relay) ExecSync(ctx context.Context, req *criv1.ExecSyncRequest) (*criv1.ExecSyncResponse, error) { r.dump("ExecSync", req) return r.client.ExecSync(ctx, req) } func (r *relay) Exec(ctx context.Context, req *criv1.ExecRequest) (*criv1.ExecResponse, error) { r.dump("Exec", req) return r.client.Exec(ctx, req) } func (r *relay) Attach(ctx context.Context, req *criv1.AttachRequest) (*criv1.AttachResponse, error) { r.dump("Attach", req) return r.client.Attach(ctx, req) } func (r *relay) PortForward(ctx context.Context, req *criv1.PortForwardRequest) (*criv1.PortForwardResponse, error) { r.dump("PortForward", req) return r.client.PortForward(ctx, req) } func (r *relay) ContainerStats(ctx context.Context, req *criv1.ContainerStatsRequest) (*criv1.ContainerStatsResponse, error) { r.dump("ContainerStats", req) return r.client.ContainerStats(ctx, req) } func (r *relay) ListContainerStats(ctx context.Context, req *criv1.ListContainerStatsRequest) (*criv1.ListContainerStatsResponse, error) { r.dump("ListContainerStats", req) return r.client.ListContainerStats(ctx, req) } func (r *relay) PodSandboxStats(ctx context.Context, req *criv1.PodSandboxStatsRequest) (*criv1.PodSandboxStatsResponse, error) { r.dump("PodSandboxStats", req) return r.client.PodSandboxStats(ctx, req) } func (r *relay) ListPodSandboxStats(ctx context.Context, req *criv1.ListPodSandboxStatsRequest) (*criv1.ListPodSandboxStatsResponse, error) { r.dump("ListPodSandboxStats", req) return r.client.ListPodSandboxStats(ctx, req) } func (r *relay) UpdateRuntimeConfig(ctx context.Context, req *criv1.UpdateRuntimeConfigRequest) (*criv1.UpdateRuntimeConfigResponse, error) { r.dump("UpdateRuntimeConfig", req) return r.client.UpdateRuntimeConfig(ctx, req) } func (r *relay) Status(ctx context.Context, req *criv1.StatusRequest) (*criv1.StatusResponse, error) { r.dump("Status", req) return r.client.Status(ctx, req) } func (r *relay) CheckpointContainer(ctx context.Context, req *criv1.CheckpointContainerRequest) (*criv1.CheckpointContainerResponse, error) { r.dump("CheckpointContainer", req) return r.client.CheckpointContainer(ctx, req) } func (r *relay) GetContainerEvents(req *criv1.GetEventsRequest, srv criv1.RuntimeService_GetContainerEventsServer) error { evtC := r.addEventServer(req) if err := r.startEventRelay(req); err != nil { 
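// Starting the upstream event relay failed, so unregister the event
// channel we just added above; otherwise it would leak in evtChans.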
r.delEventServer(req) return err } for evt := range evtC { if err := srv.Send(evt); err != nil { r.Errorf("failed to relay/send container event: %v", err) r.delEventServer(req) return err } } return nil } func (r *relay) ListMetricDescriptors(ctx context.Context, req *criv1.ListMetricDescriptorsRequest) (*criv1.ListMetricDescriptorsResponse, error) { r.dump("ListMetricDescriptors", req) return r.client.ListMetricDescriptors(ctx, req) } func (r *relay) ListPodSandboxMetrics(ctx context.Context, req *criv1.ListPodSandboxMetricsRequest) (*criv1.ListPodSandboxMetricsResponse, error) { r.dump("ListPodSandboxMetrics", req) return r.client.ListPodSandboxMetrics(ctx, req) } func (r *relay) RuntimeConfig(ctx context.Context, req *criv1.RuntimeConfigRequest) (*criv1.RuntimeConfigResponse, error) { r.dump("RuntimeConfig", req) return r.client.RuntimeConfig(ctx, req) } const ( eventRelayTimeout = 1 * time.Second ) func (r *relay) addEventServer(req *criv1.GetEventsRequest) chan *criv1.ContainerEventResponse { r.Lock() defer r.Unlock() evtC := make(chan *criv1.ContainerEventResponse, 128) r.evtChans[req] = evtC return evtC } func (r *relay) delEventServer(req *criv1.GetEventsRequest) chan *criv1.ContainerEventResponse { r.Lock() defer r.Unlock() evtC := r.evtChans[req] delete(r.evtChans, req) return evtC } func (r *relay) startEventRelay(req *criv1.GetEventsRequest) error { r.Lock() defer r.Unlock() if r.evtClient != nil { return nil } c, err := r.client.GetContainerEvents(context.Background(), req) if err != nil { return fmt.Errorf("failed to create container event client: %w", err) } r.evtClient = c go r.relayEvents() return nil } func (r *relay) relayEvents() { for { evt, err := r.evtClient.Recv() if err != nil { r.Errorf("failed to relay/receive container event: %v", err) } r.Lock() if err != nil { for req, evtC := range r.evtChans { delete(r.evtChans, req) close(evtC) } r.evtClient = nil } else { for req, evtC := range r.evtChans { select { case evtC <- evt: case _ = <-time.After(eventRelayTimeout): delete(r.evtChans, req) close(evtC) } } } r.Unlock() if err != nil { return } } } ================================================ FILE: pkg/cri/resource-manager/agent/agent.go ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/

package agent

import (
	"context"
	"encoding/json"
	"fmt"
	"net"
	"strings"
	"time"

	"google.golang.org/grpc"
	core_v1 "k8s.io/api/core/v1"

	agent_v1 "github.com/intel/cri-resource-manager/pkg/agent/api/v1"
)

const (
	// SocketDisabled is the reserved socket path for disabling the agent interface.
	SocketDisabled = "disabled"
)

// Interface describes the interface to the cri-resource-manager agent.
type Interface interface {
	IsDisabled() bool
	GetNode(time.Duration) (core_v1.Node, error)
	PatchNode([]*agent_v1.JsonPatch, time.Duration) error
	UpdateNodeCapacity(map[string]string, time.Duration) error
	GetLabels(time.Duration) (map[string]string, error)
	SetLabels(map[string]string, time.Duration) error
	RemoveLabels([]string, time.Duration) error
	GetAnnotations(time.Duration) (map[string]string, error)
	SetAnnotations(map[string]string, time.Duration) error
	RemoveAnnotations([]string, time.Duration) error
	GetTaints(time.Duration) ([]core_v1.Taint, error)
	SetTaints([]core_v1.Taint, time.Duration) error
	RemoveTaints([]core_v1.Taint, time.Duration) error
	FindTaintIndex([]core_v1.Taint, *core_v1.Taint) (int, bool)
}

// agentInterface implements Interface.
type agentInterface struct {
	socket string
	cli    agent_v1.AgentClient
}

// NewAgentInterface connects to the cri-resource-manager-agent gRPC server
// and returns a new Interface.
func NewAgentInterface(socket string) (Interface, error) {
	a := &agentInterface{
		socket: socket,
	}
	if a.IsDisabled() {
		return a, nil
	}

	dialOpts := []grpc.DialOption{
		// grpc.WithBlock(),
		// grpc.WithTimeout(10 * time.Second),
		grpc.WithInsecure(),
		// grpc.FailOnNonTempDialError(true),
		grpc.WithDialer(func(sock string, timeout time.Duration) (net.Conn, error) {
			return net.Dial("unix", sock)
		}),
	}
	conn, err := grpc.Dial(socket, dialOpts...)
	if err != nil {
		return nil, agentError("failed to connect to cri-resmgr agent: %v", err)
	}
	a.cli = agent_v1.NewAgentClient(conn)

	return a, nil
}

// IsDisabled returns true if the agent interface is disabled.
func (a *agentInterface) IsDisabled() bool {
	return a.socket == SocketDisabled || a.socket == ""
}

func (a *agentInterface) GetNode(timeout time.Duration) (core_v1.Node, error) {
	if a.IsDisabled() {
		return core_v1.Node{}, agentError("agent interface is disabled")
	}

	ctx, cancel, callOpts := prepareCall(timeout)
	defer cancel()

	req := &agent_v1.GetNodeRequest{}
	node := core_v1.Node{}
	rsp, err := a.cli.GetNode(ctx, req, callOpts...)
	if err != nil {
		return node, agentError("failed to get node object: %v", err)
	}
	if err = json.Unmarshal([]byte(rsp.Node), &node); err != nil {
		return node, agentError("invalid response, failed to unmarshal v1.Node: %v", err)
	}
	return node, nil
}

func (a *agentInterface) PatchNode(patches []*agent_v1.JsonPatch, timeout time.Duration) error {
	if a.IsDisabled() {
		return agentError("agent interface is disabled")
	}

	ctx, cancel, callOpts := prepareCall(timeout)
	defer cancel()

	req := &agent_v1.PatchNodeRequest{
		Patches: patches,
	}
	_, err := a.cli.PatchNode(ctx, req, callOpts...)
	if err != nil {
		return agentError("failed to patch node object: %v", err)
	}
	return nil
}

func (a *agentInterface) UpdateNodeCapacity(caps map[string]string, timeout time.Duration) error {
	if a.IsDisabled() {
		return agentError("agent interface is disabled")
	}

	ctx, cancel, callOpts := prepareCall(timeout)
	defer cancel()

	req := &agent_v1.UpdateNodeCapacityRequest{
		Capacities: caps,
	}
	_, err := a.cli.UpdateNodeCapacity(ctx, req, callOpts...)
	if err != nil {
		return agentError("failed to update node capacities: %v", err)
	}
	return nil
}

const (
	// PatchAdd specifies an add operation.
	PatchAdd string = "add"
	// PatchRemove specifies a remove operation.
	PatchRemove string = "remove"
	// PatchReplace specifies a replace operation.
	PatchReplace string = "replace"
)

// patchPath returns the JSON patch path for a metadata key, escaping '/' in
// the key as '~1' per JSON Pointer (RFC 6901).
func patchPath(class, key string) string {
	return "/metadata/" + class + "/" + strings.Replace(key, "/", "~1", -1)
}

func labelPatchPath(key string) string {
	return patchPath("labels", key)
}

func annotationPatchPath(key string) string {
	return patchPath("annotations", key)
}

func taintPatchPath(idx int) string {
	return fmt.Sprintf("/spec/taints/%d", idx)
}

func (a *agentInterface) GetLabels(timeout time.Duration) (map[string]string, error) {
	if a.IsDisabled() {
		return nil, agentError("agent interface is disabled")
	}

	node, err := a.GetNode(timeout)
	if err != nil {
		return nil, err
	}
	return node.Labels, nil
}

func (a *agentInterface) SetLabels(labels map[string]string, timeout time.Duration) error {
	if a.IsDisabled() {
		return agentError("agent interface is disabled")
	}
	if len(labels) == 0 {
		return nil
	}

	node, err := a.GetNode(timeout)
	if err != nil {
		return err
	}

	patches := []*agent_v1.JsonPatch{}
	for key, val := range labels {
		patch := &agent_v1.JsonPatch{
			Path: labelPatchPath(key),
			// Value is supposed to be in marshalled JSON format. Thus, we need
			// to add quotes so that it will be interpreted as a string.
			Value: "\"" + val + "\"",
		}
		if _, ok := node.Labels[key]; ok {
			patch.Op = PatchReplace
		} else {
			patch.Op = PatchAdd
		}
		patches = append(patches, patch)
	}
	return a.PatchNode(patches, timeout)
}

func (a *agentInterface) RemoveLabels(keys []string, timeout time.Duration) error {
	if a.IsDisabled() {
		return agentError("agent interface is disabled")
	}
	if len(keys) == 0 {
		return nil
	}

	node, err := a.GetNode(timeout)
	if err != nil {
		return err
	}

	patches := []*agent_v1.JsonPatch{}
	for _, key := range keys {
		if _, ok := node.Labels[key]; !ok {
			continue
		}
		patch := &agent_v1.JsonPatch{
			Op:   PatchRemove,
			Path: labelPatchPath(key),
		}
		patches = append(patches, patch)
	}
	if len(patches) == 0 {
		return nil
	}
	return a.PatchNode(patches, timeout)
}

func (a *agentInterface) GetAnnotations(timeout time.Duration) (map[string]string, error) {
	if a.IsDisabled() {
		return nil, agentError("agent interface is disabled")
	}

	node, err := a.GetNode(timeout)
	if err != nil {
		return nil, err
	}
	return node.Annotations, nil
}

func (a *agentInterface) SetAnnotations(annotations map[string]string, timeout time.Duration) error {
	if a.IsDisabled() {
		return agentError("agent interface is disabled")
	}
	if len(annotations) == 0 {
		return nil
	}

	node, err := a.GetNode(timeout)
	if err != nil {
		return err
	}

	patches := []*agent_v1.JsonPatch{}
	for key, val := range annotations {
		patch := &agent_v1.JsonPatch{
			Path:  annotationPatchPath(key),
			Value: val,
		}
		if _, ok := node.Annotations[key]; ok {
			patch.Op = PatchReplace
		} else {
			patch.Op = PatchAdd
		}
		patches = append(patches, patch)
	}
	return a.PatchNode(patches, timeout)
}

func (a *agentInterface) RemoveAnnotations(keys []string, timeout time.Duration) error {
	if a.IsDisabled() {
		return agentError("agent interface is disabled")
	}
	if len(keys) == 0 {
		return nil
	}

	node, err := a.GetNode(timeout)
	if err != nil {
		return err
	}

	patches := []*agent_v1.JsonPatch{}
	for _, key := range keys {
		if _, ok := node.Annotations[key]; !ok {
			continue
		}
		patch := &agent_v1.JsonPatch{
			Op:   PatchRemove,
			Path: annotationPatchPath(key),
		}
		patches = append(patches, patch)
	}
	if len(patches) == 0 {
		return nil
	}
	return a.PatchNode(patches, timeout)
}

func (a *agentInterface) GetTaints(timeout time.Duration) ([]core_v1.Taint, error) {
	if a.IsDisabled() {
		return nil, agentError("agent interface is disabled")
	}

	node,
err := a.GetNode(timeout) if err != nil { return nil, err } return node.Spec.Taints, nil } func (a *agentInterface) SetTaints(taints []core_v1.Taint, timeout time.Duration) error { if a.IsDisabled() { return agentError("agent interface is disabled") } if len(taints) == 0 { return nil } node, err := a.GetNode(timeout) if err != nil { return err } patches := []*agent_v1.JsonPatch{} if node.Spec.Taints == nil { patch := &agent_v1.JsonPatch{ Op: PatchAdd, Path: "/spec/taints", Value: "[]"} patches = append(patches, patch) } for _, t := range taints { value, err := json.Marshal(t) if err != nil { return agentError("BUG: failed to marshal taint %v: %v", t, err) } idx, found := findTaintIndex(node.Spec.Taints, &t) patch := &agent_v1.JsonPatch{Value: string(value)} patch.Path = taintPatchPath(idx) if !found { patch.Op = PatchAdd } else { patch.Op = PatchReplace } patches = append(patches, patch) } return a.PatchNode(patches, timeout) } func (a *agentInterface) RemoveTaints(taints []core_v1.Taint, timeout time.Duration) error { if a.IsDisabled() { return agentError("agent interface is disabled") } if len(taints) == 0 { return nil } node, err := a.GetNode(timeout) if err != nil { return err } if node.Spec.Taints == nil { return nil } patches := []*agent_v1.JsonPatch{} for _, t := range taints { idx, found := findTaintIndex(node.Spec.Taints, &t) if found { patch := &agent_v1.JsonPatch{ Op: "remove", Path: taintPatchPath(idx), } patches = append(patches, patch) } } if len(patches) == 0 { return nil } return a.PatchNode(patches, timeout) } func findTaintIndex(taints []core_v1.Taint, taint *core_v1.Taint) (int, bool) { for idx, t := range taints { if t.Key == taint.Key && t.Value == taint.Value && t.Effect == taint.Effect { return idx, true } } return 0, false } func (a *agentInterface) FindTaintIndex(taints []core_v1.Taint, taint *core_v1.Taint) (int, bool) { return findTaintIndex(taints, taint) } func agentError(format string, args ...interface{}) error { return fmt.Errorf("agent-client: "+format, args...) } func prepareCall(timeout time.Duration) (context.Context, context.CancelFunc, []grpc.CallOption) { callOpts := []grpc.CallOption{grpc.FailFast(false)} ctx := context.Background() cancel := func() {} if timeout >= 0 { ctx, cancel = context.WithTimeout(context.Background(), timeout) } return ctx, cancel, callOpts } ================================================ FILE: pkg/cri/resource-manager/builtin-policies.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
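
// The file below enables builtin policies purely through blank imports: each
// policy package is expected to register itself from an init() function when
// imported. A minimal sketch of that pattern, with hypothetical names (the
// actual registration API lives in the policy package and may differ):
//
//	package mypolicy
//
//	import "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy"
//
//	func init() {
//		// Registering here makes a blank import sufficient to enable the policy.
//		policy.Register("my-policy", "an example policy", NewMyPolicy)
//	}
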
package resmgr import ( // List of builtin policies _ "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy/builtin/balloons" _ "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy/builtin/dynamic-pools" _ "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy/builtin/none" _ "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy/builtin/podpools" _ "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy/builtin/static" _ "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy/builtin/static-plus" _ "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy/builtin/static-pools" _ "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy/builtin/topology-aware" ) // TODO: add unit tests to verify that all builtin policies are found ================================================ FILE: pkg/cri/resource-manager/cache/affinity.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cache import ( "fmt" "sigs.k8s.io/yaml" "github.com/intel/cri-resource-manager/pkg/apis/resmgr" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/kubernetes" ) const ( // annotation key for specifying container affinity rules keyAffinity = "affinity" // annotation key for specifying container anti-affinity rules keyAntiAffinity = "anti-affinity" ) // Expression is used to describe affinity container scope and matching criteria. type Expression struct { resmgr.Expression } // simpleAffinity is an alternative, simplified syntax for intra-pod container affinity. type simpleAffinity map[string][]string // PodContainerAffinity defines a set of per-container affinities and anti-affinities. type podContainerAffinity map[string][]*Affinity // Affinity specifies a single container affinity. type Affinity struct { Scope *resmgr.Expression `json:"scope,omitempty"` // scope for evaluating this affinity Match *resmgr.Expression `json:"match"` // affinity expression Weight int32 `json:"weight,omitempty"` // (optional) weight for this affinity } const ( // UserWeightCutoff is the cutoff we clamp user-provided weights to. UserWeightCutoff = 1000 // DefaultWeight is the default assigned weight if omitted in annotations. DefaultWeight int32 = 1 ) // ImplicitAffinity can implicitly inject affinities to containers. type ImplicitAffinity func(Container, bool) *Affinity // Validate checks the affinity for (obvious) invalidity. func (a *Affinity) Validate() error { if err := a.Scope.Validate(); err != nil { return cacheError("invalid affinity scope: %v", err) } if err := a.Match.Validate(); err != nil { return cacheError("invalid affinity match: %v", err) } switch { case a.Weight > UserWeightCutoff: a.Weight = UserWeightCutoff case a.Weight < -UserWeightCutoff: a.Weight = -UserWeightCutoff } return nil } // EvaluateAffinity evaluates the given affinity against all known in-scope containers. 
func (cch *cache) EvaluateAffinity(a *Affinity) map[string]int32 {
	results := make(map[string]int32)
	for _, c := range cch.FilterScope(a.Scope) {
		if a.Match.Evaluate(c) {
			id := c.GetCacheID()
			results[id] += a.Weight
		}
	}
	return results
}

// FilterScope returns the containers selected by the scope expression.
func (cch *cache) FilterScope(scope *resmgr.Expression) []Container {
	cch.Debug("calculating scope %s", scope.String())
	result := []Container{}
	for _, c := range cch.GetContainers() {
		if scope.Evaluate(c) {
			cch.Debug(" + container %s: IN scope", c.PrettyName())
			result = append(result, c)
		} else {
			cch.Debug(" - container %s: NOT IN scope", c.PrettyName())
		}
	}
	return result
}

// String returns the affinity as a string.
func (a *Affinity) String() string {
	kind := ""
	if a.Weight < 0 {
		kind = "anti-"
	}
	return fmt.Sprintf("<%saffinity: scope %s %s => %d>",
		kind, a.Scope.String(), a.Match.String(), a.Weight)
}

// Try to parse affinities in simplified notation from the given annotation value.
func (pca *podContainerAffinity) parseSimple(pod *pod, value string, weight int32) bool {
	parsed := simpleAffinity{}
	if err := yaml.UnmarshalStrict([]byte(value), &parsed); err != nil {
		return false
	}

	podScope := pod.ScopeExpression()

	//
	// Notes:
	//   We turn affinities given in the simple notation into a symmetric set of
	//   affinities. IOW, if X has affinity on Y with weight W, then Y will have
	//   affinity on X with W as well. In practice this is done by
	//     1) ensuring there is an affinity Y: X for every affinity X: Y
	//     2) generating an affinity expression for every container with affinities
	//   The generated expression uses the operator Equals or In depending on
	//   whether the container has affinities on exactly one container in the
	//   symmetric set.
	//

	symmetric := map[string]map[string]struct{}{}
	for name, values := range parsed {
		for _, v := range values {
			forw, ok := symmetric[name]
			if !ok {
				forw = map[string]struct{}{}
				symmetric[name] = forw
			}
			back, ok := symmetric[v]
			if !ok {
				back = map[string]struct{}{}
				symmetric[v] = back
			}
			forw[v], back[name] = struct{}{}, struct{}{}
		}
	}

	var op resmgr.Operator
	for name, affinities := range symmetric {
		others := []string{}
		for o := range affinities {
			others = append(others, o)
		}
		if len(others) == 1 {
			op = resmgr.Equals
		} else {
			op = resmgr.In
		}
		(*pca)[name] = append((*pca)[name],
			&Affinity{
				Scope: podScope,
				Match: &resmgr.Expression{
					Key:    kubernetes.ContainerNameLabel,
					Op:     op,
					Values: others,
				},
				Weight: weight,
			})
	}

	return true
}

// Try to parse affinities in full notation from the given annotation value.
func (pca *podContainerAffinity) parseFull(pod *pod, value string, weight int32) error {
	parsed := podContainerAffinity{}
	if err := yaml.UnmarshalStrict([]byte(value), &parsed); err != nil {
		return cacheError("failed to parse affinity annotation '%s': %v", value, err)
	}

	podScope := pod.ScopeExpression()
	for name, pa := range parsed {
		ca, ok := (*pca)[name]
		if !ok {
			ca = make([]*Affinity, 0, len(pa))
		}
		for _, a := range pa {
			if a.Scope == nil {
				a.Scope = podScope
			}
			if a.Weight == 0 {
				a.Weight = weight
			} else {
				if weight < 0 {
					a.Weight *= -1
				}
			}
			if err := a.Validate(); err != nil {
				return err
			}
			ca = append(ca, a)
		}
		(*pca)[name] = ca
	}

	return nil
}

// GlobalAffinity creates an affinity with all containers in scope.
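
// For reference, the two annotation notations fed to parseSimple and
// parseFull above look roughly like this (a sketch based on the unit tests
// below; the full annotation keys carry the resmgr annotation namespace):
//
//	affinity: |            # simple notation: symmetric affinities
//	  c1: [ c2, c3 ]
//
//	affinity: |            # full notation: explicit scope/match/weight
//	  c1:
//	    - match:
//	        key: name
//	        operator: In
//	        values: [ c2, c3 ]
//	      weight: 10
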
func GlobalAffinity(key string, weight int32) *Affinity { return &Affinity{ Scope: &resmgr.Expression{ Op: resmgr.AlwaysTrue, // evaluate against all containers }, Match: &resmgr.Expression{ Key: key, Op: resmgr.Exists, }, Weight: weight, } } // GlobalAntiAffinity creates an anti-affinity with all containers in scope. func GlobalAntiAffinity(key string, weight int32) *Affinity { return GlobalAffinity(key, -weight) } // AddImplicitAffinities registers a set of implicit affinities. func (cch *cache) AddImplicitAffinities(implicit map[string]ImplicitAffinity) error { for name := range implicit { if _, ok := cch.implicit[name]; ok { return cacheError("implicit affinity %s already defined", name) } } for name, a := range implicit { cch.implicit[name] = a } return nil } // DeleteImplicitAffinities removes a previously registered set of implicit affinities. func (cch *cache) DeleteImplicitAffinities(names []string) { for _, name := range names { delete(cch.implicit, name) } } ================================================ FILE: pkg/cri/resource-manager/cache/affinity_test.go ================================================ // Copyright 2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cache import ( "testing" ) func TestSimpleParsingSymmetry(t *testing.T) { c1, c2, c3, c4, c5 := "c1", "c2", "c3", "c4", "c5" tcases := []struct { name string source string result map[string][]string }{ { name: "trivial 2 by 2", source: `c1: [ c2 ]`, result: map[string][]string{ c1: {c2}, c2: {c1}, }, }, { name: "simple", source: `c1: [ c2, c3, c4, c5 ]`, result: map[string][]string{ c1: {c2, c3, c4, c5}, c2: {c1}, c3: {c1}, c4: {c1}, c5: {c1}, }, }, { name: "a bit more complex", source: ` c1: [ c2 ] c2: [ c3, c4, c5 ] c4: [ c5 ] `, result: map[string][]string{ c1: {c2}, c2: {c1, c3, c4, c5}, c3: {c2}, c4: {c2, c5}, c5: {c2, c4}, }, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { pca := podContainerAffinity{} if !pca.parseSimple(&pod{Name: "testpod"}, tc.source, 1) { t.Errorf("failed to parse simple container affinity %q", tc.source) return } found := map[string]map[string]struct{}{} for name, affinities := range pca { for _, a := range affinities { for _, o := range a.Match.Values { forw, ok := found[name] if !ok { forw = map[string]struct{}{} found[name] = forw } back, ok := found[o] if !ok { back = map[string]struct{}{} found[o] = back } forw[o] = struct{}{} back[name] = struct{}{} } } } for name, others := range tc.result { for _, o := range others { if _, ok := found[name][o]; !ok { t.Errorf("simple affinity %q did not produce %s: %s", tc.source, name, o) } else { delete(found[name], o) if len(found[name]) == 0 { delete(found, name) } } } } for name, others := range found { val := "" sep := "" for o := range others { val += sep + o sep = ", " } t.Errorf("simple affinity %q produced unexpected %s: [ %s ]", tc.source, name, val) } }) } } func TestStrictParsing(t *testing.T) { tcases := []struct { name string source string invalid bool }{ { name: 
"invalid annotation", source: ` memtier-benchmark: - scope: key: pod/name operator: Matches values: - redis-* match: key: name operator: Equals values: - redis weight: 10 `, invalid: true, }, { name: "valid annotation", source: ` memtier-benchmark: - scope: key: pod/name operator: Matches values: - redis-* match: key: name operator: Equals values: - redis weight: 10 `, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { pca := podContainerAffinity{} err := pca.parseFull(&pod{Name: "testpod"}, tc.source, 1) if tc.invalid && err == nil { t.Errorf("parsing invalid affinity expression should have failed") return } if !tc.invalid && err != nil { t.Errorf("parsing valid affinity expression should not fail") } }) } } ================================================ FILE: pkg/cri/resource-manager/cache/cache.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cache import ( "encoding/json" "errors" "fmt" "os" "path/filepath" "strconv" "strings" "sync" v1 "k8s.io/api/core/v1" criv1 "k8s.io/cri-api/pkg/apis/runtime/v1" "github.com/intel/cri-resource-manager/pkg/apis/resmgr" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/config" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/kubernetes" logger "github.com/intel/cri-resource-manager/pkg/log" "github.com/intel/cri-resource-manager/pkg/topology" "github.com/intel/cri-resource-manager/pkg/utils/cpuset" idset "github.com/intel/goresctrl/pkg/utils" ) const ( // CPU marks changes that can be applied by the CPU controller. CPU = "cpu" // CRI marks changes that can be applied by the CRI controller. CRI = "cri" // RDT marks changes that can be applied by the RDT controller. RDT = "rdt" // BlockIO marks changes that can be applied by the BlockIO controller. BlockIO = "blockio" // Memory marks changes that can be applied by the Memory controller. Memory = "memory" // PageMigration marks changes that can be applied by the PageMigration controller. PageMigration = "page-migration" // TagAVX512 tags containers that use AVX512 instructions. TagAVX512 = "AVX512" // RDTClassKey is the pod annotation key for specifying a container RDT class. RDTClassKey = "rdtclass" + "." + kubernetes.ResmgrKeyNamespace // BlockIOClassKey is the pod annotation key for specifying a container Block I/O class. BlockIOClassKey = "blockioclass" + "." + kubernetes.ResmgrKeyNamespace // ToptierLimitKey is the pod annotation key for specifying container top tier memory limits. ToptierLimitKey = "toptierlimit" + "." + kubernetes.ResmgrKeyNamespace // RDTClassPodQoS denotes that the RDTClass should be taken from PodQosClass RDTClassPodQoS = "/PodQos" // ToptierLimitUnset is the reserved value for indicating unset top tier limits. ToptierLimitUnset int64 = -1 // TopologyHintsKey can be used to opt out from automatic topology hint generation. TopologyHintsKey = "topologyhints" + "." 
	+ kubernetes.ResmgrKeyNamespace
)

// allControllers is a slice of all controller domains.
var allControllers = []string{CPU, CRI, RDT, BlockIO, Memory}

// PodState is the pod state in the runtime.
type PodState int32

const (
	// PodStateReady marks a pod ready.
	PodStateReady = PodState(int32(criv1.PodSandboxState_SANDBOX_READY))
	// PodStateNotReady marks a pod as not ready.
	PodStateNotReady = PodState(int32(criv1.PodSandboxState_SANDBOX_NOTREADY))
	// PodStateStale marks a pod as removed.
	PodStateStale = PodState(int32(PodStateNotReady) + 1)
)

// PodResourceRequirements are per container resource requirements, annotated by our webhook.
type PodResourceRequirements struct {
	// InitContainers is the resource requirements by init containers.
	InitContainers map[string]v1.ResourceRequirements `json:"initContainers"`
	// Containers is the resource requirements by normal containers.
	Containers map[string]v1.ResourceRequirements `json:"containers"`
}

// PodStatus wraps a PodSandboxStatus response for data extraction.
type PodStatus struct {
	CgroupParent string // extracted CgroupParent
}

// Pod is the exposed interface from a cached pod.
type Pod interface {
	resmgr.Evaluable
	fmt.Stringer
	// GetInitContainers returns the init containers of the pod.
	GetInitContainers() []Container
	// GetContainers returns the (non-init) containers of the pod.
	GetContainers() []Container
	// GetContainer returns the named container of the pod.
	GetContainer(string) (Container, bool)
	// GetID returns the pod id of the pod.
	GetID() string
	// GetUID returns the (kubernetes) unique id of the pod.
	GetUID() string
	// GetName returns the name of the pod.
	GetName() string
	// GetNamespace returns the namespace of the pod.
	GetNamespace() string
	// GetState returns the PodState of the pod.
	GetState() PodState
	// GetQOSClass returns the PodQOSClass of the pod.
	GetQOSClass() v1.PodQOSClass
	// GetLabelKeys returns the keys of all pod labels as a string slice.
	GetLabelKeys() []string
	// GetLabel returns the value of the given label and whether it was found.
	GetLabel(string) (string, bool)
	// GetResmgrLabelKeys returns pod label keys (without the namespace
	// part) in cri-resource-manager namespace.
	GetResmgrLabelKeys() []string
	// GetResmgrLabel returns the value of a pod label from the
	// cri-resource-manager namespace.
	GetResmgrLabel(string) (string, bool)
	// GetAnnotationKeys returns the keys of all pod annotations as a string slice.
	GetAnnotationKeys() []string
	// GetAnnotation returns the value of the given annotation and whether it was found.
	GetAnnotation(key string) (string, bool)
	// GetAnnotationObject decodes the value of the given annotation with the given function.
	GetAnnotationObject(key string, objPtr interface{},
		decode func([]byte, interface{}) error) (bool, error)
	// GetResmgrAnnotationKeys returns pod annotation keys (without the
	// namespace part) in cri-resource-manager namespace as a string slice.
	GetResmgrAnnotationKeys() []string
	// GetResmgrAnnotation returns the value of a pod annotation from the
	// cri-resource-manager namespace and whether it was found.
	GetResmgrAnnotation(key string) (string, bool)
	// GetResmgrAnnotationObject decodes the value of the given annotation in the
	// cri-resource-manager namespace.
	GetResmgrAnnotationObject(key string, objPtr interface{},
		decode func([]byte, interface{}) error) (bool, error)
	// GetEffectiveAnnotation returns the effective annotation for a container.
	// For any given key $K and container $C it will look for annotations in
	// this order:
	//     $K/container.$C
	//     $K/pod
	//     $K
	// and return the value of the first key found.
	GetEffectiveAnnotation(key, container string) (string, bool)
	// GetCgroupParentDir returns the pod's cgroup parent directory.
	GetCgroupParentDir() string
	// GetPodResourceRequirements returns container resource requirements if the
	// necessary associated annotation put in place by the CRI resource manager
	// webhook was found.
	GetPodResourceRequirements() PodResourceRequirements
	// GetContainerAffinity returns the affinity expressions for the named container.
	GetContainerAffinity(string) ([]*Affinity, error)
	// ScopeExpression returns an affinity expression for defining this pod as the scope.
	ScopeExpression() *resmgr.Expression
	// GetProcesses returns the pids of all processes in the pod either excluding
	// container processes, if called with false, or including those if called
	// with true.
	GetProcesses(bool) ([]string, error)
	// GetTasks returns the pids of all threads in the pod either excluding
	// container processes, if called with false, or including those if called
	// with true.
	GetTasks(bool) ([]string, error)
}

// A cached pod.
type pod struct {
	cache        *cache                   // our cache of objects
	ID           string                   // pod sandbox runtime id
	UID          string                   // (k8s) unique id
	Name         string                   // pod sandbox name
	Namespace    string                   // pod namespace
	State        PodState                 // ready/not ready
	QOSClass     v1.PodQOSClass           // pod QoS class
	Labels       map[string]string        // pod labels
	Annotations  map[string]string        // pod annotations
	CgroupParent string                   // cgroup parent directory
	containers   map[string]string        // container name to ID map
	Resources    *PodResourceRequirements // annotated resource requirements
	Affinity     *podContainerAffinity    // annotated container affinity
}

// ContainerState is the container state in the runtime.
type ContainerState int32

const (
	// ContainerStateCreated marks a container created, not running.
	ContainerStateCreated = ContainerState(int32(criv1.ContainerState_CONTAINER_CREATED))
	// ContainerStateRunning marks a container created, running.
	ContainerStateRunning = ContainerState(int32(criv1.ContainerState_CONTAINER_RUNNING))
	// ContainerStateExited marks a container exited.
	ContainerStateExited = ContainerState(int32(criv1.ContainerState_CONTAINER_EXITED))
	// ContainerStateUnknown marks a container to be in an unknown state.
	ContainerStateUnknown = ContainerState(int32(criv1.ContainerState_CONTAINER_UNKNOWN))
	// ContainerStateCreating marks a container as being created.
	ContainerStateCreating = ContainerState(int32(ContainerStateUnknown) + 1)
	// ContainerStateStale marks a container removed.
	ContainerStateStale = ContainerState(int32(ContainerStateUnknown) + 2)
)

// Container is the exposed interface from a cached container.
type Container interface {
	resmgr.Evaluable
	fmt.Stringer
	// PrettyName returns the user-friendly <pod>:<container> name for the container.
	PrettyName() string
	// GetPod returns the pod of the container and a boolean indicating if there was one.
	GetPod() (Pod, bool)
	// GetID returns the ID of the container.
	GetID() string
	// GetPodID returns the pod ID of the container.
	GetPodID() string
	// GetCacheID returns the cacheID of the container.
	GetCacheID() string
	// GetName returns the name of the container.
	GetName() string
	// GetNamespace returns the namespace of the container.
	GetNamespace() string
	// UpdateState updates the state of the container.
	UpdateState(ContainerState)
	// GetState returns the ContainerState of the container.
	GetState() ContainerState
	// GetQOSClass returns the QoS class the pod would have if this was its only container.
	GetQOSClass() v1.PodQOSClass
	// GetImage returns the image of the container.
	GetImage() string
	// GetCommand returns the container command.
	GetCommand() []string
	// GetArgs returns the container command arguments.
	GetArgs() []string
	// GetLabelKeys returns the keys of all labels of the container.
	GetLabelKeys() []string
	// GetLabel returns the value of a container label.
	GetLabel(string) (string, bool)
	// GetLabels returns a copy of all container labels.
	GetLabels() map[string]string
	// GetResmgrLabelKeys returns container label keys (without the namespace
	// part) in cri-resource-manager namespace.
	GetResmgrLabelKeys() []string
	// GetResmgrLabel returns the value of a container label from the
	// cri-resource-manager namespace.
	GetResmgrLabel(string) (string, bool)
	// GetAnnotationKeys returns the keys of all annotations of the container.
	GetAnnotationKeys() []string
	// GetAnnotation returns the value of a container annotation.
	GetAnnotation(key string, objPtr interface{}) (string, bool)
	// GetResmgrAnnotationKeys returns container annotation keys (without the
	// namespace part) in cri-resource-manager namespace.
	GetResmgrAnnotationKeys() []string
	// GetResmgrAnnotation returns the value of a container annotation from the
	// cri-resource-manager namespace.
	GetResmgrAnnotation(key string, objPtr interface{}) (string, bool)
	// GetEffectiveAnnotation returns the effective annotation for the container from the pod.
	GetEffectiveAnnotation(key string) (string, bool)
	// GetAnnotations returns a copy of all container annotations.
	GetAnnotations() map[string]string
	// GetEnvKeys returns the keys of all container environment variables.
	GetEnvKeys() []string
	// GetEnv returns the value of a container environment variable.
	GetEnv(string) (string, bool)
	// GetMounts returns all the mounts of the container.
	GetMounts() []Mount
	// GetMountByHost returns the container path corresponding to the host path.
	// XXX We should remove this as it might not be unique.
	GetMountByHost(string) *Mount
	// GetMountByContainer returns the host path mounted to a container path.
	GetMountByContainer(string) *Mount
	// GetDevices returns the devices of the container.
	GetDevices() []Device
	// GetDeviceByHost returns the device for a host path.
	GetDeviceByHost(string) *Device
	// GetDeviceByContainer returns the device for a container path.
	GetDeviceByContainer(string) *Device
	// GetResourceRequirements returns the webhook-annotated requirements for this container.
	GetResourceRequirements() v1.ResourceRequirements
	// GetLinuxResources returns the CRI linux resource request of the container.
	GetLinuxResources() *criv1.LinuxContainerResources
	// SetCommand sets the container command.
	SetCommand([]string)
	// SetArgs sets the container command arguments.
	SetArgs([]string)
	// SetLabel sets the value for a container label.
	SetLabel(string, string)
	// DeleteLabel removes a container label.
	DeleteLabel(string)
	// SetAnnotation sets the value for a container annotation.
	SetAnnotation(string, string)
	// DeleteAnnotation removes a container annotation.
	DeleteAnnotation(string)
	// SetEnv sets a container environment variable.
	SetEnv(string, string)
	// UnsetEnv unsets a container environment variable.
	UnsetEnv(string)
	// InsertMount inserts a mount into the container.
	InsertMount(*Mount)
	// DeleteMount removes a mount from the container.
	DeleteMount(string)
	// InsertDevice inserts a device into the container.
	InsertDevice(*Device)
	// DeleteDevice removes a device from the container.
	DeleteDevice(string)
	// GetTopologyHints returns any attached topology hints.
	GetTopologyHints() topology.Hints
	// GetCPUPeriod gets the CFS CPU period of the container.
	GetCPUPeriod() int64
	// GetCPUQuota gets the CFS CPU quota of the container.
	GetCPUQuota() int64
	// GetCPUShares gets the CFS CPU shares of the container.
	GetCPUShares() int64
	// GetMemoryLimit gets the memory limit in bytes for the container.
	GetMemoryLimit() int64
	// GetOomScoreAdj gets the OOM score adjustment for the container.
	GetOomScoreAdj() int64
	// GetCpusetCpus gets the cgroup cpuset.cpus of the container.
	GetCpusetCpus() string
	// GetCpusetMems gets the cgroup cpuset.mems of the container.
	GetCpusetMems() string
	// SetLinuxResources sets the Linux-specific resource request of the container.
	SetLinuxResources(*criv1.LinuxContainerResources)
	// SetCPUPeriod sets the CFS CPU period of the container.
	SetCPUPeriod(int64)
	// SetCPUQuota sets the CFS CPU quota of the container.
	SetCPUQuota(int64)
	// SetCPUShares sets the CFS CPU shares of the container.
	SetCPUShares(int64)
	// SetMemoryLimit sets the memory limit in bytes for the container.
	SetMemoryLimit(int64)
	// SetOomScoreAdj sets the OOM score adjustment for the container.
	SetOomScoreAdj(int64)
	// SetCpusetCpus sets the cgroup cpuset.cpus of the container.
	SetCpusetCpus(string)
	// SetCpusetMems sets the cgroup cpuset.mems of the container.
	SetCpusetMems(string)
	// GetAffinity returns the annotated affinity expressions for this container.
	GetAffinity() ([]*Affinity, error)
	// GetCgroupDir returns the relative path of the cgroup directory for the container.
	GetCgroupDir() string
	// SetRDTClass assigns this container to the given RDT class.
	SetRDTClass(string)
	// GetRDTClass returns the RDT class for this container.
	GetRDTClass() string
	// SetBlockIOClass assigns this container to the given BlockIO class.
	SetBlockIOClass(string)
	// GetBlockIOClass returns the BlockIO class for this container.
	GetBlockIOClass() string
	// SetToptierLimit sets the top tier memory limit for the container.
	SetToptierLimit(int64)
	// GetToptierLimit returns the top tier memory limit for the container.
	GetToptierLimit() int64
	// SetPageMigration sets the page migration policy/options for the container.
	SetPageMigration(*PageMigrate)
	// GetPageMigration returns the current page migration policy/options for the container.
	GetPageMigration() *PageMigrate
	// GetProcesses returns the pids of processes in the container.
	GetProcesses() ([]string, error)
	// GetTasks returns the pids of threads in the container.
	GetTasks() ([]string, error)
	// SetCRIRequest sets the current pending CRI request of the container.
	SetCRIRequest(req interface{}) error
	// GetCRIRequest returns the current pending CRI request of the container.
	GetCRIRequest() (interface{}, bool)
	// ClearCRIRequest clears and returns the current pending CRI request of the container.
	ClearCRIRequest() (interface{}, bool)
	// GetCRIEnvs returns container environment variables.
	GetCRIEnvs() []*criv1.KeyValue
	// GetCRIMounts returns container mounts.
	GetCRIMounts() []*criv1.Mount
	// GetCRIDevices returns container devices.
	GetCRIDevices() []*criv1.Device
	// GetPending gets the names of the controllers with pending changes.
	GetPending() []string
	// HasPending checks if the container has pending changes for the given controller.
	HasPending(string) bool
	// ClearPending clears the pending change marker for the given controller.
	ClearPending(string)
	// GetTag gets the value of the given tag.
	GetTag(string) (string, bool)
	// SetTag sets the value of the given tag and returns its previous value.
	SetTag(string, string) (string, bool)
	// DeleteTag deletes the given tag, returning its deleted value.
	DeleteTag(string) (string, bool)
}

// A cached container.
type container struct {
	cache         *cache                         // our cache of objects
	ID            string                         // container runtime id
	PodID         string                         // associated pod's runtime id
	CacheID       string                         // our cache id
	Name          string                         // container name
	Namespace     string                         // container namespace
	State         ContainerState                 // created/running/exited/unknown
	Image         string                         // container image
	Command       []string                       // command to run in container
	Args          []string                       // arguments for command
	Labels        map[string]string              // container labels
	Annotations   map[string]string              // container annotations
	Env           map[string]string              // environment variables
	Mounts        map[string]*Mount              // mounts
	Devices       map[string]*Device             // devices
	TopologyHints topology.Hints                 // set of topology hints for all containers within the pod
	Tags          map[string]string              // container tags (local dynamic labels)
	Adjustment    string                         // name of applicable external adjustment, if any
	Resources     v1.ResourceRequirements        // container resources (from webhook annotation)
	LinuxReq      *criv1.LinuxContainerResources // used to estimate Resources if we lack annotations
	req           *interface{}                   // pending CRI request
	CgroupDir     string                         // cgroup directory relative to a(ny) controller
	RDTClass      string                         // RDT class this container is assigned to
	BlockIOClass  string                         // Block I/O class this container is assigned to
	ToptierLimit  int64                          // top tier memory limit
	PageMigrate   *PageMigrate                   // page migration policy/options for this container
	pending       map[string]struct{}            // controllers with pending changes for this container
	prettyName    string                         // cached PrettyName()
}

// MountType is a propagation type.
type MountType int32

const (
	// MountPrivate is a private container mount.
	MountPrivate MountType = MountType(criv1.MountPropagation_PROPAGATION_PRIVATE)
	// MountHostToContainer is a host-to-container mount.
	MountHostToContainer MountType = MountType(criv1.MountPropagation_PROPAGATION_HOST_TO_CONTAINER)
	// MountBidirectional is a bidirectional mount.
	MountBidirectional MountType = MountType(criv1.MountPropagation_PROPAGATION_BIDIRECTIONAL)
)

// Mount is a filesystem entry mounted inside a container.
type Mount struct {
	// Container is the path inside the container.
	Container string
	// Host is the path on the host.
	Host string
	// Readonly specifies if the mount is read-only or read-write.
	Readonly bool
	// Relabel denotes SELinux relabeling.
	Relabel bool
	// Propagation identifies the mount propagation type.
	Propagation MountType
}

// Device is a device exposed to a container.
type Device struct {
	// Container is the device path inside the container.
	Container string
	// Host is the device path on the host side.
	Host string
	// Permissions specify the device permissions for the container.
	Permissions string
}

// PageMigrate contains the policy/preferences for container page migration.
type PageMigrate struct {
	SourceNodes idset.IDSet // idle memory pages on these NUMA nodes
	TargetNodes idset.IDSet // should be migrated to these NUMA nodes
}

// Clone creates a copy of the page migration policy/preferences.
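// Clone is nil-safe: invoked on a nil *PageMigrate it returns nil, so callers
// need not special-case containers without a migration policy.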
func (pm *PageMigrate) Clone() *PageMigrate {
	if pm == nil {
		return nil
	}
	c := &PageMigrate{}
	if pm.SourceNodes != nil {
		c.SourceNodes = pm.SourceNodes.Clone()
	}
	if pm.TargetNodes != nil {
		c.TargetNodes = pm.TargetNodes.Clone()
	}
	return c
}

// Cachable is an interface opaque cachable data must implement.
type Cachable interface {
	// Set value (via a pointer receiver) to the object.
	Set(value interface{})
	// Get the object that should be cached.
	Get() interface{}
}

// Cache is the primary interface exposed for tracking pods and containers.
//
// Cache tracks pods and containers in the runtime, mostly by processing CRI
// requests and responses which the cache is fed as these are being processed.
// Cache also saves its state to secondary storage upon changes and restores
// itself upon startup.
type Cache interface {
	// InsertPod inserts a pod into the cache, using a runtime request or reply.
	InsertPod(id string, msg interface{}, status *PodStatus) (Pod, error)
	// DeletePod deletes a pod from the cache.
	DeletePod(id string) Pod
	// LookupPod looks up a pod in the cache.
	LookupPod(id string) (Pod, bool)
	// InsertContainer inserts a container into the cache, using a runtime request or reply.
	InsertContainer(msg interface{}) (Container, error)
	// UpdateContainerID updates a container's runtime id.
	UpdateContainerID(cacheID string, msg interface{}) (Container, error)
	// DeleteContainer deletes a container from the cache.
	DeleteContainer(id string) Container
	// LookupContainer looks up a container in the cache.
	LookupContainer(id string) (Container, bool)
	// LookupContainerByCgroup looks up a container for the given cgroup path.
	LookupContainerByCgroup(path string) (Container, bool)
	// GetPendingContainers returns all containers with pending changes.
	GetPendingContainers() []Container
	// GetPods returns all the pods known to the cache.
	GetPods() []Pod
	// GetContainers returns all the containers known to the cache.
	GetContainers() []Container
	// GetContainerCacheIds returns the cache ids of all containers.
	GetContainerCacheIds() []string
	// GetContainerIds returns the ids of all containers.
	GetContainerIds() []string
	// FilterScope returns the containers selected by the scope expression.
	FilterScope(*resmgr.Expression) []Container
	// EvaluateAffinity evaluates the given affinity against all known in-scope containers.
	EvaluateAffinity(*Affinity) map[string]int32
	// AddImplicitAffinities adds a set of implicit affinities (added to all containers).
	AddImplicitAffinities(map[string]ImplicitAffinity) error
	// GetActivePolicy returns the name of the active policy stored in the cache.
	GetActivePolicy() string
	// SetActivePolicy updates the name of the active policy stored in the cache.
	SetActivePolicy(string) error
	// ResetActivePolicy clears the active policy and any policy-specific data from the cache.
	ResetActivePolicy() error
	// SetPolicyEntry sets the policy entry for a key.
	SetPolicyEntry(string, interface{})
	// GetPolicyEntry gets the policy entry for a key.
	GetPolicyEntry(string, interface{}) bool
	// SetConfig caches the given configuration.
	SetConfig(*config.RawConfig) error
	// GetConfig returns the current/cached configuration.
	GetConfig() *config.RawConfig
	// ResetConfig clears any stored configuration from the cache.
	ResetConfig() error
	// SetAdjustment updates external adjustments and containers based on this.
	SetAdjustment(*config.Adjustment) (bool, map[string]error)
	// Save requests a cache save.
	Save() error
	// RefreshPods purges/inserts stale/new pods/containers using a pod sandbox list response.
RefreshPods(*criv1.ListPodSandboxResponse, map[string]*PodStatus) ([]Pod, []Pod, []Container) // RefreshContainers purges/inserts stale/new containers using a container list response. RefreshContainers(*criv1.ListContainersResponse) ([]Container, []Container) // Get the container (data) directory for a container. ContainerDirectory(string) string // OpenFile opens the named container data file, creating it if necessary. OpenFile(string, string, os.FileMode) (*os.File, error) // WriteFile writes a container data file, creating it if necessary. WriteFile(string, string, os.FileMode, []byte) error } const ( // CacheVersion is the running version of the cache. CacheVersion = "1" ) // permissions describe preferred/expected ownership and permissions for a file or directory. type permissions struct { prefer os.FileMode // permissions to create file/directory with reject os.FileMode // bits that cause rejection to use an existing entry } // permissions to create with/check against var ( cacheDirPerm = &permissions{prefer: 0710, reject: 0022} cacheFilePerm = &permissions{prefer: 0644, reject: 0022} dataDirPerm = &permissions{prefer: 0755, reject: 0022} dataFilePerm = &permissions{prefer: 0644, reject: 0022} ) // Our cache of objects. type cache struct { sync.Mutex `json:"-"` // we're lockable logger.Logger `json:"-"` // cache logger instance filePath string // where to store to/load from dataDir string // container data directory Pods map[string]*pod // known/cached pods Containers map[string]*container // known/cached containers NextID uint64 // next container cache id to use Cfg *config.RawConfig // cached/current configuration External *config.Adjustment // cached/current external adjustments PolicyName string // name of the active policy policyData map[string]interface{} // opaque policy data PolicyJSON map[string]string // ditto in raw, marshaled (JSON) form pending map[string]struct{} // cache IDs of containers with pending changes implicit map[string]ImplicitAffinity // implicit affinities } // Make sure cache implements Cache. var _ Cache = &cache{} // Options contains the configurable cache options. type Options struct { // CacheDir is the directory the cache should save its state in. CacheDir string } // NewCache instantiates a new cache, loading it from the given path if it exists. func NewCache(options Options) (Cache, error) { cch := &cache{ filePath: filepath.Join(options.CacheDir, "cache"), dataDir: filepath.Join(options.CacheDir, "containers"), Logger: logger.NewLogger("cache"), Pods: make(map[string]*pod), Containers: make(map[string]*container), NextID: 1, policyData: make(map[string]interface{}), PolicyJSON: make(map[string]string), implicit: make(map[string]ImplicitAffinity), } if _, err := cch.checkPerm("cache", cch.filePath, false, cacheFilePerm); err != nil { return nil, cacheError("refusing to use existing cache file: %v", err) } if err := cch.mkdirAll("cache", options.CacheDir, cacheDirPerm); err != nil { return nil, err } if err := cch.mkdirAll("container", cch.dataDir, dataDirPerm); err != nil { return nil, err } if err := cch.Load(); err != nil { return nil, err } return cch, nil } // GetActivePolicy returns the name of the active policy stored in the cache. func (cch *cache) GetActivePolicy() string { return cch.PolicyName }
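// Illustrative sketch (not part of the original source): typical cache bring-up
// as the resource manager or a test might do it. NewCache, Options and the
// policy bookkeeping methods are the real package APIs defined in this file;
// the function name and the "topology-aware" policy name are made up.
func exampleCacheBringUp(stateDir string) (Cache, error) {
	cch, err := NewCache(Options{CacheDir: stateDir})
	if err != nil {
		return nil, err
	}
	// A restored cache remembers which policy saved its state; a policy
	// switch is the natural point to drop stale policy-specific data.
	if active := cch.GetActivePolicy(); active != "" && active != "topology-aware" {
		if err := cch.ResetActivePolicy(); err != nil {
			return nil, err
		}
	}
	if err := cch.SetActivePolicy("topology-aware"); err != nil {
		return nil, err
	}
	return cch, nil
}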
// SetActivePolicy updates the name of the active policy stored in the cache. func (cch *cache) SetActivePolicy(policy string) error { cch.PolicyName = policy return cch.Save() } // ResetActivePolicy clears the active policy and any policy-specific data from the cache. func (cch *cache) ResetActivePolicy() error { cch.Warn("clearing all data for active policy (%q) from cache...", cch.PolicyName) cch.PolicyName = "" cch.policyData = make(map[string]interface{}) cch.PolicyJSON = make(map[string]string) return cch.Save() } // SetConfig caches the given configuration. func (cch *cache) SetConfig(cfg *config.RawConfig) error { old := cch.Cfg cch.Cfg = cfg if err := cch.Save(); err != nil { cch.Cfg = old return err } return nil } // GetConfig returns the current/cached configuration. func (cch *cache) GetConfig() *config.RawConfig { return cch.Cfg } // ResetConfig clears any stored configuration from the cache. func (cch *cache) ResetConfig() error { old := cch.Cfg cch.Cfg = nil if err := cch.Save(); err != nil { cch.Cfg = old return err } return nil } // SetAdjustment updates external adjustments and containers based on this. func (cch *cache) SetAdjustment(external *config.Adjustment) (bool, map[string]error) { effective := map[*container]string{} // collect per container external adjustments, checking for obvious errors errors := map[string]error{} for id, c := range cch.Containers { if id != c.GetCacheID() { continue } adjustments := cch.getApplicableAdjustments(external, c) if len(adjustments) == 0 { continue } // conflict: multiple adjustments per container if len(adjustments) > 1 { errors[c.GetID()] = cacheError("conflicting adjustments for %s: %s", c.PrettyName(), strings.Join(adjustments, ",")) continue } adjust := external.Adjustments[adjustments[0]] // error: trying to override resources for BestEffort container if c.GetQOSClass() == v1.PodQOSBestEffort { if adjust.Resources != nil { errors[c.GetID()] = cacheError("%s: can't override resources for BestEffort %s", adjustments[0], c.PrettyName()) continue } } effective[c] = adjustments[0] } if len(errors) > 0 { return false, errors } // update per container external adjustments, mark all containers with pending changes for id, c := range cch.Containers { if id != c.GetCacheID() { continue } uptodate := effective[c] previous := c.setEffectiveAdjustment(uptodate) effective[c] = previous if previous != uptodate { cch.Info("%s effective external adjustment changed from %q to %q", c.PrettyName(), previous, uptodate) } c.markPending(allControllers...) } if err := cch.Save(); err != nil { for id, c := range cch.Containers { if id != c.GetCacheID() { continue } c.setEffectiveAdjustment(effective[c]) } return false, map[string]error{"cache": err} } cch.External = external return true, nil } // Get all external adjustments applicable to the given container. func (cch *cache) getApplicableAdjustments(ext *config.Adjustment, c *container) []string { if ext == nil { return []string{} } applicable := []string{} for name, adjust := range ext.Adjustments { if adjust.IsContainerInScope(c) { applicable = append(applicable, name) } } return applicable }
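// Illustrative sketch (not part of the original source): applying an external
// adjustment update with SetAdjustment above. The update is all-or-nothing:
// on a conflict (or a failed cache save) it returns false with per-container
// errors, and the previous adjustments stay in effect. The function name is
// hypothetical.
func exampleApplyAdjustment(cch Cache, adjust *config.Adjustment) error {
	ok, errs := cch.SetAdjustment(adjust)
	if ok {
		return nil
	}
	// Surface one of the per-container (or cache save) errors.
	for id, err := range errs {
		return cacheError("external adjustment update rejected (%s): %v", id, err)
	}
	return cacheError("external adjustment update rejected")
}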
// setEffectiveAdjustment updates the effective adjustments of all containers. func (cch *cache) setEffectiveAdjustment(effective map[*container]string) { for id, c := range cch.Containers { if id != c.GetCacheID() { continue } uptodate := effective[c] previous := c.setEffectiveAdjustment(uptodate) if previous != uptodate { cch.Info("%s effective external adjustment changed from %q to %q", c.PrettyName(), previous, uptodate) } // we forcibly mark the container as updated in all controller domains for _, ctrl := range allControllers { c.markPending(ctrl) } } } // Derive cache id using pod uid, or allocate a new unused local cache id. func (cch *cache) createCacheID(c *container) string { if pod, ok := c.cache.LookupPod(c.PodID); ok { uid := pod.GetUID() if uid != "" { return uid + ":" + c.Name } } cch.Warn("can't find unique id for pod %s, assigning local cache id", c.PodID) id := "cache:" + strconv.FormatUint(cch.NextID, 16) cch.NextID++ return id } // Insert a pod into the cache. func (cch *cache) InsertPod(id string, msg interface{}, status *PodStatus) (Pod, error) { var err error p := &pod{cache: cch, ID: id} switch msg.(type) { case *criv1.RunPodSandboxRequest: err = p.fromRunRequest(msg.(*criv1.RunPodSandboxRequest)) case *criv1.PodSandbox: err = p.fromListResponse(msg.(*criv1.PodSandbox), status) default: err = fmt.Errorf("cannot create pod from message %T", msg) } if err != nil { cch.Error("failed to insert pod %s: %v", id, err) return nil, err } cch.Pods[p.ID] = p cch.Save() return p, nil } // Delete a pod from the cache. func (cch *cache) DeletePod(id string) Pod { p, ok := cch.Pods[id] if !ok { return nil } cch.Debug("removing pod %s (%s)", p.Name, p.ID) delete(cch.Pods, id) cch.Save() return p } // Look up a pod in the cache. func (cch *cache) LookupPod(id string) (Pod, bool) { p, ok := cch.Pods[id] return p, ok } // Insert a container into the cache. func (cch *cache) InsertContainer(msg interface{}) (Container, error) { var err error c := &container{ cache: cch, } switch msg.(type) { case *criv1.CreateContainerRequest: err = c.fromCreateRequest(msg.(*criv1.CreateContainerRequest)) case *criv1.Container: err = c.fromListResponse(msg.(*criv1.Container)) default: err = fmt.Errorf("cannot create container from message %T", msg) } if err != nil { return nil, cacheError("failed to insert container %s: %v", c.CacheID, err) } c.CacheID = cch.createCacheID(c) cch.Containers[c.CacheID] = c if c.ID != "" { cch.Containers[c.ID] = c } cch.createContainerDirectory(c.CacheID) adjustments := cch.getApplicableAdjustments(cch.External, c) switch { case len(adjustments) > 1: cch.Error("conflicting adjustments for %s: %s", c.PrettyName(), strings.Join(adjustments, ",")) case len(adjustments) == 1: c.setEffectiveAdjustment(adjustments[0]) } cch.Save() return c, nil } // UpdateContainerID updates a container's runtime id. func (cch *cache) UpdateContainerID(cacheID string, msg interface{}) (Container, error) { c, ok := cch.Containers[cacheID] if !ok { return nil, cacheError("%s: failed to update ID, container not found", cacheID) } reply, ok := msg.(*criv1.CreateContainerResponse) if !ok { return nil, cacheError("%s: failed to update ID from message %T", c.PrettyName(), msg) } c.ID = reply.ContainerId cch.Containers[c.ID] = c cch.Save() return c, nil }
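// Illustrative sketch (not part of the original source): the relation between
// cache IDs and runtime IDs. InsertContainer indexes a container under its
// stable cache ID ("<pod UID>:<container name>"); once the runtime assigns
// the real ID, UpdateContainerID indexes it under that as well, so both IDs
// resolve to the same container. The function name is hypothetical.
func exampleContainerIDs(cch Cache, req *criv1.CreateContainerRequest, reply *criv1.CreateContainerResponse) (Container, error) {
	c, err := cch.InsertContainer(req) // indexed under the cache ID only
	if err != nil {
		return nil, err
	}
	return cch.UpdateContainerID(c.GetCacheID(), reply) // now also under the runtime ID
}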
// Delete a container from the cache. func (cch *cache) DeleteContainer(id string) Container { c, ok := cch.Containers[id] if !ok { return nil } cch.Debug("removing container %s", c.PrettyName()) cch.removeContainerDirectory(c.CacheID) delete(cch.Containers, c.ID) delete(cch.Containers, c.CacheID) cch.Save() return c } // Look up a container in the cache. func (cch *cache) LookupContainer(id string) (Container, bool) { c, ok := cch.Containers[id] return c, ok } // LookupContainerByCgroup looks up the container for the given cgroup path. func (cch *cache) LookupContainerByCgroup(path string) (Container, bool) { cch.Debug("resolving %s to a container...", path) for id, c := range cch.Containers { if id != c.CacheID { continue } parent := "" if pod, ok := c.GetPod(); ok { parent = pod.GetCgroupParentDir() } if parent == "" { continue } if !strings.HasPrefix(path, parent+"/") { continue } if strings.Contains(path, c.GetID()) { return c, true } } return nil, false } // RefreshPods purges/inserts stale/new pods/containers using a pod sandbox list response. func (cch *cache) RefreshPods(msg *criv1.ListPodSandboxResponse, status map[string]*PodStatus) ([]Pod, []Pod, []Container) { valid := make(map[string]struct{}) add := []Pod{} del := []Pod{} containers := []Container{} for _, item := range msg.Items { valid[item.Id] = struct{}{} if _, ok := cch.Pods[item.Id]; !ok { cch.Debug("inserting discovered pod %s...", item.Id) pod, err := cch.InsertPod(item.Id, item, status[item.Id]) if err != nil { cch.Error("failed to insert discovered pod %s to cache: %v", item.Id, err) } else { add = append(add, pod) } } } for _, pod := range cch.Pods { if _, ok := valid[pod.ID]; !ok { cch.Debug("purging stale pod %s...", pod.ID) pod.State = PodStateStale del = append(del, cch.DeletePod(pod.ID)) } } for id, c := range cch.Containers { if _, ok := valid[c.PodID]; !ok { cch.Debug("purging container %s of stale pod %s...", c.CacheID, c.PodID) cch.DeleteContainer(c.CacheID) c.State = ContainerStateStale if id == c.CacheID { containers = append(containers, c) } } } return add, del, containers } // RefreshContainers purges/inserts stale/new containers using a container list response. func (cch *cache) RefreshContainers(msg *criv1.ListContainersResponse) ([]Container, []Container) { valid := make(map[string]struct{}) add := []Container{} del := []Container{} for _, c := range msg.Containers { if ContainerState(c.State) == ContainerStateExited { continue } valid[c.Id] = struct{}{} if _, ok := cch.Containers[c.Id]; !ok { cch.Debug("inserting discovered container %s...", c.Id) inserted, err := cch.InsertContainer(c) if err != nil { cch.Error("failed to insert discovered container %s to cache: %v", c.Id, err) } else { add = append(add, inserted) } } } for id, c := range cch.Containers { if _, ok := valid[c.ID]; !ok { cch.Debug("purging stale container %s (state: %v)...", c.CacheID, c.GetState()) cch.DeleteContainer(c.CacheID) c.State = ContainerStateStale if id == c.CacheID { del = append(del, c) } } } return add, del } // Mark a container as having pending changes. func (cch *cache) markPending(c *container) { if cch.pending == nil { cch.pending = make(map[string]struct{}) } cch.pending[c.CacheID] = struct{}{} } // Get all containers with pending changes. func (cch *cache) GetPendingContainers() []Container { pending := make([]Container, 0, len(cch.pending)) for id := range cch.pending { c, ok := cch.LookupContainer(id) if ok { pending = append(pending, c) } } return pending }
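// Illustrative sketch (not part of the original source): draining pending
// changes. Mutating a container (SetCpusetCpus, SetRDTClass, and so on)
// marks it pending in the corresponding controller domains; a controller
// loop like the hypothetical one below enforces the changes and clears
// the marks with ClearPending.
func exampleDrainPending(cch Cache) {
	for _, c := range cch.GetPendingContainers() {
		for _, controller := range c.GetPending() {
			// ... enforce the pending change in this controller domain ...
			c.ClearPending(controller)
		}
	}
}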
// clear the pending state of the given container. func (cch *cache) clearPending(c *container) { delete(cch.pending, c.CacheID) } // Get the cache ids of all cached containers. func (cch *cache) GetContainerCacheIds() []string { ids := make([]string, len(cch.Containers)) idx := 0 for id, c := range cch.Containers { if id != c.CacheID { continue } ids[idx] = c.CacheID idx++ } return ids[0:idx] } // Get the ids of all cached containers. func (cch *cache) GetContainerIds() []string { ids := make([]string, len(cch.Containers)) idx := 0 for id, c := range cch.Containers { if id == c.CacheID { continue } ids[idx] = c.ID idx++ } return ids[0:idx] } // GetPods returns all pods present in the cache. func (cch *cache) GetPods() []Pod { pods := make([]Pod, 0, len(cch.Pods)) for _, pod := range cch.Pods { pods = append(pods, pod) } return pods } // GetContainers returns all the containers present in the cache. func (cch *cache) GetContainers() []Container { containers := make([]Container, 0, len(cch.Containers)/2) for id, container := range cch.Containers { if id != container.CacheID { continue } containers = append(containers, container) } return containers } // Set the policy entry for a key. func (cch *cache) SetPolicyEntry(key string, obj interface{}) { cch.policyData[key] = obj if cch.DebugEnabled() { if data, err := marshalEntry(obj); err != nil { cch.Error("marshalling of policy entry '%s' failed: %v", key, err) } else { cch.Debug("policy entry '%s' set to '%s'", key, string(data)) } } } // Get the policy entry for a key. func (cch *cache) GetPolicyEntry(key string, ptr interface{}) bool { // // Notes: // We try to serve requests from the unmarshaled cache (policyData). // If that fails (may be a first access since load) we look for the // entry in the marshaled cache (PolicyJSON), unmarshal it, and cache // the result if found. // Note the quirk: in the latter case we first directly unmarshal to // the pointer provided by the caller, only then Get() and cache the // result. // obj, ok := cch.policyData[key] if !ok { entry, ok := cch.PolicyJSON[key] if !ok { return false } // first access to key since startup if err := unmarshalEntry([]byte(entry), ptr); err != nil { cch.Fatal("failed to unmarshal '%s' policy entry for key '%s' (%T): %v", cch.PolicyName, key, ptr, err) } if err := cch.cacheEntry(key, ptr); err != nil { cch.Fatal("failed to cache '%s' policy entry for key '%s': %v", cch.PolicyName, key, err) } } else { // subsequent accesses to key if err := cch.setEntry(ptr, obj); err != nil { cch.Fatal("failed to use cached entry for key '%s' of policy '%s': %v", key, cch.PolicyName, err) } } return true } // Marshal an opaque policy entry, special-casing cpusets and maps of cpusets. func marshalEntry(obj interface{}) ([]byte, error) { switch obj.(type) { case cpuset.CPUSet: return []byte("\"" + obj.(cpuset.CPUSet).String() + "\""), nil case map[string]cpuset.CPUSet: dst := make(map[string]string) for key, cset := range obj.(map[string]cpuset.CPUSet) { dst[key] = cset.String() } return json.Marshal(dst) default: return json.Marshal(obj) } }
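// Illustrative sketch (not part of the original source): persisting opaque
// policy state through the cache. cpuset.CPUSet values are special-cased by
// marshalEntry/unmarshalEntry, so they survive Save/Load cycles. The
// function name and the "reserved" key are hypothetical.
func examplePolicyEntry(cch Cache) (cpuset.CPUSet, error) {
	reserved, err := cpuset.Parse("0-1")
	if err != nil {
		return cpuset.CPUSet{}, err
	}
	cch.SetPolicyEntry("reserved", reserved)
	// ... later, possibly after a restart and cache reload ...
	var cset cpuset.CPUSet
	if !cch.GetPolicyEntry("reserved", &cset) {
		return cpuset.CPUSet{}, cacheError("no cached policy entry for key %q", "reserved")
	}
	return cset, nil
}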
// Unmarshal an opaque policy entry, special-casing cpusets and maps of cpusets. func unmarshalEntry(data []byte, ptr interface{}) error { switch ptr.(type) { case *cpuset.CPUSet: cset, err := cpuset.Parse(string(data[1 : len(data)-1])) if err != nil { return err } *ptr.(*cpuset.CPUSet) = cset return nil case *map[string]cpuset.CPUSet: src := make(map[string]string) if err := json.Unmarshal(data, &src); err != nil { return cacheError("failed to unmarshal map[string]cpuset.CPUSet: %v", err) } dst := make(map[string]cpuset.CPUSet) for key, str := range src { cset, err := cpuset.Parse(str) if err != nil { return cacheError("failed to unmarshal cpuset.CPUSet '%s': %v", str, err) } dst[key] = cset } *ptr.(*map[string]cpuset.CPUSet) = dst return nil default: err := json.Unmarshal(data, ptr) return err } } // Cache an unmarshaled opaque policy entry, special-casing some simple/common types. func (cch *cache) cacheEntry(key string, ptr interface{}) error { if cachable, ok := ptr.(Cachable); ok { cch.policyData[key] = cachable.Get() return nil } switch ptr.(type) { case *cpuset.CPUSet: cch.policyData[key] = *ptr.(*cpuset.CPUSet) case *map[string]cpuset.CPUSet: cch.policyData[key] = *ptr.(*map[string]cpuset.CPUSet) case *map[string]string: cch.policyData[key] = *ptr.(*map[string]string) case *string: cch.policyData[key] = *ptr.(*string) case *bool: cch.policyData[key] = *ptr.(*bool) case *int32: cch.policyData[key] = *ptr.(*int32) case *uint32: cch.policyData[key] = *ptr.(*uint32) case *int64: cch.policyData[key] = *ptr.(*int64) case *uint64: cch.policyData[key] = *ptr.(*uint64) case *int: cch.policyData[key] = *ptr.(*int) case *uint: cch.policyData[key] = *ptr.(*uint) default: return cacheError("can't handle policy data of type %T", ptr) } return nil } // Serve an unmarshaled opaque policy entry, special-casing some simple/common types. func (cch *cache) setEntry(ptr, obj interface{}) error { if cachable, ok := ptr.(Cachable); ok { cachable.Set(obj) return nil } switch ptr.(type) { case *cpuset.CPUSet: *ptr.(*cpuset.CPUSet) = obj.(cpuset.CPUSet) case *map[string]cpuset.CPUSet: *ptr.(*map[string]cpuset.CPUSet) = obj.(map[string]cpuset.CPUSet) case *map[string]string: *ptr.(*map[string]string) = obj.(map[string]string) case *string: *ptr.(*string) = obj.(string) case *bool: *ptr.(*bool) = obj.(bool) case *int32: *ptr.(*int32) = obj.(int32) case *uint32: *ptr.(*uint32) = obj.(uint32) case *int64: *ptr.(*int64) = obj.(int64) case *uint64: *ptr.(*uint64) = obj.(uint64) case *int: *ptr.(*int) = obj.(int) case *uint: *ptr.(*uint) = obj.(uint) default: return cacheError("can't handle policy data of type %T", ptr) } return nil }
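// Illustrative sketch (not part of the original source): a policy-defined
// type implementing the Cachable interface. cacheEntry and setEntry above
// short-circuit for Cachable values, so policies can store state of types
// the built-in switches don't cover. The type is hypothetical; only the
// Set/Get contract comes from the interface.
type exampleAllocations struct {
	PoolByContainer map[string]string // cache ID -> assigned pool
}

// Set copies a cached value back into the receiver.
func (a *exampleAllocations) Set(value interface{}) {
	if v, ok := value.(*exampleAllocations); ok {
		*a = *v
	}
}

// Get returns the object that should be cached.
func (a *exampleAllocations) Get() interface{} {
	return a
}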
// checkPerm checks permissions of an already existing file or directory. func (cch *cache) checkPerm(what, path string, isDir bool, p *permissions) (bool, error) { if isDir { what += " directory" } info, err := os.Stat(path) if err != nil { if !errors.Is(err, os.ErrNotExist) { return true, cacheError("failed to os.Stat() %s %q: %v", what, path, err) } return false, nil } // check expected file type if isDir { if !info.IsDir() { return true, cacheError("%s %q exists, but is not a directory", what, path) } } else { if info.Mode()&os.ModeType != 0 { return true, cacheError("%s %q exists, but is not a regular file", what, path) } } existing := info.Mode().Perm() expected := p.prefer rejected := p.reject if ((expected | rejected) &^ os.ModePerm) != 0 { cch.Panic("internal error: current permissions check only handles permission bits (rwx)") } // check that we don't have any of the rejectable permission bits set if existing&rejected != 0 { return true, cacheError("existing %s %q has disallowed permissions set: %v", what, path, existing&rejected) } // warn if permissions are less strict than the preferred defaults if (existing | expected) != expected { cch.Warn("existing %s %q has less strict permissions %v than expected %v", what, path, existing, expected) } return true, nil } // mkdirAll creates a directory, checking permissions if it already exists. func (cch *cache) mkdirAll(what, path string, p *permissions) error { exists, err := cch.checkPerm(what, path, true, p) if err != nil { return err } if exists { return nil } if err := os.MkdirAll(path, p.prefer); err != nil { return cacheError("failed to create %s directory %q: %v", what, path, err) } return nil } // snapshot is used to serialize the cache into a saveable/loadable state. type snapshot struct { Version string Pods map[string]*pod Containers map[string]*container NextID uint64 Cfg *config.RawConfig PolicyName string PolicyJSON map[string]string } // Snapshot takes a restorable snapshot of the current state of the cache. func (cch *cache) Snapshot() ([]byte, error) { s := snapshot{ Version: CacheVersion, Pods: make(map[string]*pod), Containers: make(map[string]*container), Cfg: cch.Cfg, NextID: cch.NextID, PolicyName: cch.PolicyName, PolicyJSON: cch.PolicyJSON, } for id, p := range cch.Pods { s.Pods[id] = p } for id, c := range cch.Containers { if id == c.CacheID { s.Containers[c.CacheID] = c } } for key, obj := range cch.policyData { data, err := marshalEntry(obj) if err != nil { return nil, cacheError("failed to marshal policy entry '%s': %v", key, err) } s.PolicyJSON[key] = string(data) } data, err := json.Marshal(s) if err != nil { return nil, cacheError("failed to marshal cache: %v", err) } return data, nil }
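// Illustrative sketch (not part of the original source): Snapshot and the
// Restore function below form a serialization round trip, which Save/Load
// build on for persistence. Restoring a snapshot with a matching
// CacheVersion yields an equivalent cache. The function name is hypothetical.
func exampleSnapshotRoundTrip(cch *cache) error {
	data, err := cch.Snapshot()
	if err != nil {
		return err
	}
	return cch.Restore(data)
}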
// Restore restores a previously taken snapshot of the cache. func (cch *cache) Restore(data []byte) error { s := snapshot{ Pods: make(map[string]*pod), Containers: make(map[string]*container), PolicyJSON: make(map[string]string), } if err := json.Unmarshal(data, &s); err != nil { return cacheError("failed to unmarshal snapshot data: %v", err) } if s.Version != CacheVersion { return cacheError("can't restore snapshot, version '%s' != running version %s", s.Version, CacheVersion) } cch.Pods = s.Pods cch.Containers = s.Containers cch.Cfg = s.Cfg cch.NextID = s.NextID cch.PolicyJSON = s.PolicyJSON cch.PolicyName = s.PolicyName cch.policyData = make(map[string]interface{}) for _, p := range cch.Pods { p.cache = cch p.containers = make(map[string]string) } for _, c := range cch.Containers { c.cache = cch cch.Containers[c.CacheID] = c if c.ID != "" { cch.Containers[c.ID] = c } } return nil } // Save the state of the cache. func (cch *cache) Save() error { cch.Debug("saving cache to file '%s'...", cch.filePath) data, err := cch.Snapshot() if err != nil { return cacheError("failed to save cache: %v", err) } tmpPath := cch.filePath + ".saving" if err = os.WriteFile(tmpPath, data, cacheFilePerm.prefer); err != nil { return cacheError("failed to write cache to file %q: %v", tmpPath, err) } if err := os.Rename(tmpPath, cch.filePath); err != nil { return cacheError("failed to rename %q to %q: %v", tmpPath, cch.filePath, err) } return nil } // Load loads the last saved state of the cache. func (cch *cache) Load() error { cch.Debug("loading cache from file '%s'...", cch.filePath) data, err := os.ReadFile(cch.filePath) switch { case os.IsNotExist(err): cch.Debug("no cache file '%s', nothing to restore", cch.filePath) return nil case err != nil: // check other read errors before the empty-data case, so they are not silently ignored return cacheError("failed to load cache from file '%s': %v", cch.filePath, err) case len(data) == 0: cch.Debug("empty cache file '%s', nothing to restore", cch.filePath) return nil } return cch.Restore(data) } func (cch *cache) ContainerDirectory(id string) string { c, ok := cch.Containers[id] if !ok { return "" } return filepath.Join(cch.dataDir, strings.Replace(c.CacheID, ":", "-", 1)) } func (cch *cache) createContainerDirectory(id string) error { dir := cch.ContainerDirectory(id) if dir == "" { return cacheError("failed to determine container directory path for container %s", id) } return cch.mkdirAll("container directory", dir, dataDirPerm) } func (cch *cache) removeContainerDirectory(id string) error { dir := cch.ContainerDirectory(id) if dir == "" { return cacheError("failed to delete directory for container %s", id) } return os.RemoveAll(dir) } func (cch *cache) OpenFile(id string, name string, perm os.FileMode) (*os.File, error) { dir := cch.ContainerDirectory(id) if dir == "" { return nil, cacheError("failed to determine data directory for container %s", id) } if err := cch.mkdirAll("container directory", dir, dataDirPerm); err != nil { return nil, cacheError("container %s: can't create data file %q: %v", id, name, err) } path := filepath.Join(dir, name) if _, err := cch.checkPerm("container", path, false, dataFilePerm); err != nil { return nil, err } file, err := os.OpenFile(path, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, perm) if err != nil { return nil, cacheError("container %s: can't open data file %q: %v", id, path, err) } return file, nil } func (cch *cache) WriteFile(id string, name string, perm os.FileMode, data []byte) error { file, err := cch.OpenFile(id, name, perm) if err != nil { return err } defer file.Close() _, err = file.Write(data) return err } ================================================ FILE:
pkg/cri/resource-manager/cache/cache_test.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cache import ( "fmt" "os" "strings" "testing" v1 "k8s.io/api/core/v1" criv1 "k8s.io/cri-api/pkg/apis/runtime/v1" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/kubernetes" ) var nextFakePodID = 1 var nextFakeContainerID = 1 type fakePod struct { name string uid string id string qos v1.PodQOSClass labels map[string]string annotations map[string]string podCfg *criv1.PodSandboxConfig } type fakeContainer struct { fakePod *fakePod name string id string labels map[string]string annotations map[string]string resources criv1.LinuxContainerResources } func createTmpCache() (Cache, string, error) { dir, err := os.MkdirTemp("", "cache-test") if err != nil { return nil, "", err } cch, err := NewCache(Options{CacheDir: dir}) if err != nil { return nil, "", err } return cch, dir, nil } func removeTmpCache(dir string) { if dir != "" { os.RemoveAll(dir) } } func createFakePod(cch Cache, fp *fakePod) (Pod, error) { if fp.labels == nil { fp.labels = make(map[string]string) } fp.id = fmt.Sprintf("pod%4.4d", nextFakePodID) fp.uid = fmt.Sprintf("poduid%4.4d", nextFakePodID) fp.labels[kubernetes.PodUIDLabel] = fp.uid nextFakePodID++ if string(fp.qos) == "" { fp.qos = v1.PodQOSBurstable } cgroupPath := "" if fp.qos != v1.PodQOSGuaranteed { pathClass := "kubepods-" + strings.ToLower(string(fp.qos)) cgroupPath = "/kubepods.slice/" + pathClass + ".slice/" + pathClass + "-pod" + fp.uid } else { cgroupPath = "/kubepods.slice/kubepods-pod" + strings.ReplaceAll(fp.uid, "-", "_") } req := &criv1.RunPodSandboxRequest{ Config: &criv1.PodSandboxConfig{ Metadata: &criv1.PodSandboxMetadata{ Name: fp.name, Uid: fp.uid, Namespace: "default", }, Labels: fp.labels, Annotations: fp.annotations, Linux: &criv1.LinuxPodSandboxConfig{ CgroupParent: cgroupPath, }, }, } fp.podCfg = req.Config cch.(*cache).Debug("*** => creating Pod: %+v\n", *req) p, err := cch.InsertPod(fp.id, req, nil) if err != nil { cch.(*cache).Debug("*** <= created Pod FAILED: %+v\n", err) return nil, err } cch.(*cache).Debug("*** <= created Pod: %+v\n", *p.(*pod)) return p, nil } func createFakeContainer(cch Cache, fc *fakeContainer) (Container, error) { if fc.labels == nil { fc.labels = make(map[string]string) } fc.id = fmt.Sprintf("container-id-%4.4d", nextFakeContainerID) nextFakeContainerID++ req := &criv1.CreateContainerRequest{ PodSandboxId: fc.fakePod.id, Config: &criv1.ContainerConfig{ Metadata: &criv1.ContainerMetadata{ Name: fc.name, }, Labels: fc.labels, Annotations: fc.annotations, Linux: &criv1.LinuxContainerConfig{ Resources: &fc.resources, }, }, SandboxConfig: fc.fakePod.podCfg, } cch.(*cache).Debug("*** => creating Container: %+v\n", *req) c, err := cch.InsertContainer(req) if err != nil { return nil, err } cch.(*cache).Debug("*** <= created Container: %+v\n", *c.(*container)) update := &criv1.CreateContainerResponse{ContainerId: 
fc.id} if _, err := cch.UpdateContainerID(c.GetCacheID(), update); err != nil { return nil, err } return c, nil } func TestLookupContainerByCgroup(t *testing.T) { fakePods := map[string]*fakePod{ "pod1": {name: "pod1"}, "pod2": {name: "pod2"}, "pod3": {name: "pod3"}, } fakePodContainers := map[string][]*fakeContainer{ "pod1": {{name: "container1"}, {name: "container2"}, {name: "err-container3"}}, "pod2": {{name: "err-container4"}, {name: "container5"}, {name: "err-container6"}}, "pod3": {{name: "container7"}, {name: "container8"}, {name: "container10"}}, } cch, dir, err := createTmpCache() if err != nil { t.Errorf("failed: %v", err) } defer removeTmpCache(dir) for _, fp := range fakePods { _, err := createFakePod(cch, fp) if err != nil { t.Errorf("failed to create fake pod: %v", err) } } for podName, fcs := range fakePodContainers { fp, ok := fakePods[podName] if !ok { t.Errorf("failed to find fake pod '%s'", podName) } for _, fc := range fcs { fc.fakePod = fp if _, err := createFakeContainer(cch, fc); err != nil { t.Errorf("failed to create fake container '%s.%s': %v", podName, fc.name, err) } } } for _, c := range cch.GetContainers() { p, ok := c.GetPod() if !ok { t.Errorf("failed to find Pod for Container %s", c.PrettyName()) } podCgroupDir := p.GetCgroupParentDir() path := podCgroupDir + "/container-" + c.GetID() + ".scope" cch.(*cache).Info("=> %s: testing lookup by cgroup path %s...", c.PrettyName(), path) chk, ok := cch.LookupContainerByCgroup(path) if !ok { t.Errorf("failed to look up container %s by cgroup path %s (pod parent cgroup: %s)", c.PrettyName(), path, podCgroupDir) } cch.(*cache).Info("<= %s", chk.PrettyName()) if strings.HasPrefix(c.GetName(), "err-") { path := podCgroupDir + "-another/container-" + c.GetID() + ".scope" cch.(*cache).Info("=> %s: testing lookup failure by cgroup path %s...", c.PrettyName(), path) chk, ok := cch.LookupContainerByCgroup(path) if ok { t.Errorf("look up of container %s by path %s should have failed, but gave %s", c.PrettyName(), path, chk.PrettyName()) } cch.(*cache).Info("<= OK (not found as expected)") } if chk.GetID() != c.GetID() { t.Errorf("found container %s is not the expected %s", chk.GetID(), c.GetID()) } } } func TestDefaultRDTAndBlockIOClasses(t *testing.T) { fakePods := map[string]*fakePod{ "pod1": { name: "pod1", qos: v1.PodQOSBestEffort, annotations: map[string]string{ "rdtclass." + kubernetes.ResmgrKeyNamespace + "/pod": "Pod1RDT", "rdtclass." + kubernetes.ResmgrKeyNamespace + "/container.container1": "RDT1", "blockioclass." + kubernetes.ResmgrKeyNamespace + "/container.container1": "BLKIO1", "rdtclass." + kubernetes.ResmgrKeyNamespace + "/container.container2": "RDT2", "blockioclass." + kubernetes.ResmgrKeyNamespace + "/container.container2": "BLKIO2", "rdtclass." + kubernetes.ResmgrKeyNamespace + "/container.container3": "RDT3", "blockioclass." + kubernetes.ResmgrKeyNamespace + "/container.container4": "BLKIO4", }, }, "pod2": { name: "pod2", qos: v1.PodQOSBurstable, annotations: map[string]string{ "blockioclass." + kubernetes.ResmgrKeyNamespace: "Pod2BLKIO", "rdtclass." + kubernetes.ResmgrKeyNamespace + "/container.3": "RDT3", "blockioclass." + kubernetes.ResmgrKeyNamespace + "/container.3": "BLKIO3", "rdtclass." + kubernetes.ResmgrKeyNamespace + "/container.4": "RDT4", "rdtclass." + kubernetes.ResmgrKeyNamespace + "/container.1": "RDT1", "blockioclass." 
+ kubernetes.ResmgrKeyNamespace + "/container.2": "BLKIO2", }, }, } fakePodContainers := map[string][]*fakeContainer{ "pod1": { {name: "container1"}, {name: "container2"}, {name: "container3"}, {name: "container4"}, }, } type classes struct { RDT string BlockIO string } expected := map[string]map[string]classes{ "pod1": { "container1": { RDT: "RDT1", BlockIO: "BLKIO1", }, "container2": { RDT: "RDT2", BlockIO: "BLKIO2", }, "container3": { RDT: "RDT3", BlockIO: string(fakePods["pod1"].qos), }, "container4": { RDT: "Pod1RDT", BlockIO: "BLKIO4", }, }, "pod2": { "container1": { RDT: "RDT1", BlockIO: "Pod2BLKIO", }, "container2": { RDT: string(fakePods["pod2"].qos), BlockIO: "BLKIO2", }, "container3": { RDT: "RDT3", BlockIO: "BLKIO3", }, "container4": { RDT: "RDT4", BlockIO: "Pod2BLKIO", }, }, } cch, dir, err := createTmpCache() if err != nil { t.Errorf("failed: %v", err) } defer removeTmpCache(dir) for _, fp := range fakePods { _, err := createFakePod(cch, fp) if err != nil { t.Errorf("failed to create fake pod: %v", err) } } for podName, fcs := range fakePodContainers { fp, ok := fakePods[podName] if !ok { t.Errorf("failed to find fake pod '%s'", podName) } for _, fc := range fcs { fc.fakePod = fp if _, err := createFakeContainer(cch, fc); err != nil { t.Errorf("failed to create fake container '%s.%s': %v", podName, fc.name, err) } } } for _, c := range cch.GetContainers() { pod, ok := c.GetPod() if !ok { t.Errorf("failed to find Pod for Container %s", c.PrettyName()) } exp, ok := expected[pod.GetName()][c.GetName()] if !ok { t.Errorf("failed to find expected results for Container %s", c.PrettyName()) } if c.GetRDTClass() != exp.RDT { t.Errorf("container %s: RDT class %s, expected %s", c.PrettyName(), c.GetRDTClass(), exp.RDT) } if c.GetBlockIOClass() != exp.BlockIO { t.Errorf("container %s: BlockIO class %s, expected %s", c.PrettyName(), c.GetBlockIOClass(), exp.BlockIO) } } } const ( // anything below 2 millicpus will yield 0 as an estimate minNonZeroRequest = 2 // check CPU request/limit estimate accuracy up to this many CPU cores maxCPU = (kubernetes.MaxShares / kubernetes.SharesPerCPU) * kubernetes.MilliCPUToCPU // we expect our estimates to be within 1 millicpu from the real ones expectedAccuracy = 1 ) func TestCPURequestCalculationAccuracy(t *testing.T) { for request := 0; request < maxCPU; request++ { shares := MilliCPUToShares(int64(request)) estimate := SharesToMilliCPU(int64(shares)) diff := int64(request) - estimate if diff > expectedAccuracy || diff < -expectedAccuracy { if diff < 0 { diff = -diff } if request > minNonZeroRequest { t.Errorf("CPU request %v: estimate %v, unexpected inaccuracy %v > %v", request, estimate, diff, expectedAccuracy) } else { t.Logf("CPU request %v: estimate %v, inaccuracy %v > %v (OK, this was expected)", request, estimate, diff, expectedAccuracy) } } // fail if our estimates are not accurate for full CPUs worth of millicpus if (request%1000) == 0 && diff != 0 { t.Errorf("CPU request %v != estimate %v (diff %v)", request, estimate, diff) } } } func TestCPULimitCalculationAccuracy(t *testing.T) { for limit := int64(0); limit < int64(maxCPU); limit++ { quota, period := MilliCPUToQuota(limit) estimate := QuotaToMilliCPU(quota, period) diff := limit - estimate if diff > expectedAccuracy || diff < -expectedAccuracy { if diff < 0 { diff = -diff } if quota != kubernetes.MinQuotaPeriod { t.Errorf("CPU limit %v: estimate %v, unexpected inaccuracy %v > %v", limit, estimate, diff, expectedAccuracy) } else { t.Logf("CPU limit %v: estimate %v, inaccuracy %v > %v (OK,
this was expected)", limit, estimate, diff, expectedAccuracy) } } // fail if our estimates are not accurate for full CPUs worth of millicpus if (limit%1000) == 0 && diff != 0 { t.Errorf("CPU limit %v != estimate %v (diff %v)", limit, estimate, diff) } } } ================================================ FILE: pkg/cri/resource-manager/cache/container.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cache import ( "encoding/json" "regexp" "sort" "strconv" "strings" "github.com/intel/cri-resource-manager/pkg/apis/resmgr" "github.com/intel/cri-resource-manager/pkg/cgroups" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/kubernetes" "github.com/intel/cri-resource-manager/pkg/topology" v1 "k8s.io/api/core/v1" resapi "k8s.io/apimachinery/pkg/api/resource" criv1 "k8s.io/cri-api/pkg/apis/runtime/v1" extapi "github.com/intel/cri-resource-manager/pkg/apis/resmgr/v1alpha1" ) // Create a container for a create request. func (c *container) fromCreateRequest(req *criv1.CreateContainerRequest) error { c.PodID = req.PodSandboxId pod, ok := c.cache.Pods[c.PodID] if !ok { return cacheError("can't find cached pod %s for container to create", c.PodID) } cfg := req.Config if cfg == nil { return cacheError("container of pod %s has no config", c.PodID) } meta := cfg.Metadata if meta == nil { return cacheError("container of pod %s has no request metadata", c.PodID) } podCfg := req.SandboxConfig if podCfg == nil { return cacheError("container of pod %s has no request pod config data", c.PodID) } podMeta := podCfg.Metadata if podMeta == nil { return cacheError("container of pod %s has no request pod metadata", c.PodID) } c.Name = meta.Name c.Namespace = podMeta.Namespace c.State = ContainerStateCreating c.Image = cfg.GetImage().GetImage() c.Command = cfg.Command c.Args = cfg.Args c.Labels = cfg.Labels c.Annotations = cfg.Annotations c.Env = make(map[string]string) for _, kv := range cfg.Envs { c.Env[kv.Key] = kv.Value } genHints := true if hintSetting, ok := c.GetEffectiveAnnotation(TopologyHintsKey); ok { preference, err := strconv.ParseBool(hintSetting) if err != nil { c.cache.Error("invalid annotation %q=%q: %v", TopologyHintsKey, hintSetting, err) } else { genHints = preference } } c.cache.Info("automatic topology hint generation %s for %q", map[bool]string{false: "disabled", true: "enabled"}[genHints], c.PrettyName()) c.Mounts = make(map[string]*Mount) for _, m := range cfg.Mounts { c.Mounts[m.ContainerPath] = &Mount{ Container: m.ContainerPath, Host: m.HostPath, Readonly: m.Readonly, Relabel: m.SelinuxRelabel, Propagation: MountType(m.Propagation), } if genHints { if hints := getTopologyHints(m.HostPath, m.ContainerPath, m.Readonly); len(hints) > 0 { c.TopologyHints = topology.MergeTopologyHints(c.TopologyHints, hints) } } } c.Devices = make(map[string]*Device) for _, d := range cfg.Devices { c.Devices[d.ContainerPath] = &Device{ Container: d.ContainerPath, Host: d.HostPath, 
Permissions: d.Permissions, } if genHints { if hints := getTopologyHints(d.HostPath, d.ContainerPath, strings.IndexAny(d.Permissions, "wm") == -1); len(hints) > 0 { c.TopologyHints = topology.MergeTopologyHints(c.TopologyHints, hints) } } } c.Tags = make(map[string]string) c.LinuxReq = cfg.GetLinux().GetResources() if pod.Resources != nil { if r, ok := pod.Resources.InitContainers[c.Name]; ok { c.Resources = r } else if r, ok := pod.Resources.Containers[c.Name]; ok { c.Resources = r } } if len(c.Resources.Requests) == 0 && len(c.Resources.Limits) == 0 { c.Resources = estimateComputeResources(c.LinuxReq, pod.CgroupParent) } c.TopologyHints = topology.MergeTopologyHints(c.TopologyHints, getKubeletHint(c.GetCpusetCpus(), c.GetCpusetMems())) if err := c.setDefaults(); err != nil { return err } return nil } // Create container from a container list response. func (c *container) fromListResponse(lrc *criv1.Container) error { c.PodID = lrc.PodSandboxId pod, ok := c.cache.Pods[c.PodID] if !ok { return cacheError("can't find cached pod %s for listed container", c.PodID) } meta := lrc.Metadata if meta == nil { return cacheError("listed container of pod %s has no metadata", c.PodID) } c.ID = lrc.Id c.Name = meta.Name c.Namespace = pod.Namespace c.State = ContainerState(int32(lrc.State)) c.Image = lrc.GetImage().GetImage() c.Labels = lrc.Labels c.Annotations = lrc.Annotations c.Tags = make(map[string]string) if pod.Resources != nil { if r, ok := pod.Resources.InitContainers[c.Name]; ok { c.Resources = r } else if r, ok := pod.Resources.Containers[c.Name]; ok { c.Resources = r } } if len(c.Resources.Requests) == 0 && len(c.Resources.Limits) == 0 { c.Resources = estimateComputeResources(c.LinuxReq, pod.CgroupParent) } if err := c.setDefaults(); err != nil { return err } return nil } func (c *container) setDefaults() error { class, ok := c.GetEffectiveAnnotation(RDTClassKey) if !ok { class = RDTClassPodQoS } c.SetRDTClass(class) class, ok = c.GetEffectiveAnnotation(BlockIOClassKey) if !ok { class = string(c.GetQOSClass()) } c.SetBlockIOClass(class) limit, ok := c.GetEffectiveAnnotation(ToptierLimitKey) if !ok { c.ToptierLimit = ToptierLimitUnset } else { qty, err := resapi.ParseQuantity(limit) if err != nil { return cacheError("%q: failed to parse top tier limit annotation %q (%q): %v", c.PrettyName(), ToptierLimitKey, limit, err) } c.SetToptierLimit(qty.Value()) } return nil } func (c *container) PrettyName() string { if c.prettyName != "" { return c.prettyName } if pod, ok := c.GetPod(); !ok { c.prettyName = c.PodID + ":" + c.Name } else { c.prettyName = pod.GetName() + ":" + c.Name } return c.prettyName } func (c *container) GetPod() (Pod, bool) { pod, found := c.cache.Pods[c.PodID] return pod, found } func (c *container) GetID() string { return c.ID } func (c *container) GetPodID() string { return c.PodID } func (c *container) GetCacheID() string { return c.CacheID } func (c *container) GetName() string { return c.Name } func (c *container) GetNamespace() string { return c.Namespace } func (c *container) UpdateState(state ContainerState) { c.State = state } func (c *container) GetState() ContainerState { return c.State } func (c *container) GetQOSClass() v1.PodQOSClass { var qos v1.PodQOSClass if pod, found := c.GetPod(); found { qos = pod.GetQOSClass() } return qos } func (c *container) GetImage() string { return c.Image } func (c *container) GetCommand() []string { command := make([]string, len(c.Command)) copy(command, c.Command) return command } func (c *container) GetArgs() []string { args := 
make([]string, len(c.Args)) copy(args, c.Args) return args } func keysInNamespace(m map[string]string, namespace string) []string { keys := make([]string, 0, len(m)) for key := range m { split := strings.SplitN(key, "/", 2) if len(split) == 2 && split[0] == namespace { keys = append(keys, split[1]) } else if len(split) == 1 && len(namespace) == 0 { keys = append(keys, split[0]) } } return keys } func (c *container) GetLabelKeys() []string { keys := make([]string, len(c.Labels)) idx := 0 for key := range c.Labels { keys[idx] = key idx++ } return keys } func (c *container) GetLabel(key string) (string, bool) { value, ok := c.Labels[key] return value, ok } func (c *container) GetResmgrLabelKeys() []string { return keysInNamespace(c.Labels, kubernetes.ResmgrKeyNamespace) } func (c *container) GetResmgrLabel(key string) (string, bool) { value, ok := c.Labels[kubernetes.ResmgrKey(key)] return value, ok } func (c *container) GetLabels() map[string]string { if c.Labels == nil { return nil } labels := make(map[string]string, len(c.Labels)) for key, value := range c.Labels { labels[key] = value } return labels } func (c *container) GetAnnotationKeys() []string { keys := make([]string, len(c.Annotations)) idx := 0 for key := range c.Annotations { keys[idx] = key idx++ } return keys } func (c *container) GetAnnotation(key string, objPtr interface{}) (string, bool) { jsonStr, ok := c.Annotations[key] if !ok { return "", false } if objPtr != nil { if err := json.Unmarshal([]byte(jsonStr), objPtr); err != nil { c.cache.Error("failed to unmarshal annotation %s (%s) of pod %s into %T", key, jsonStr, c.ID, objPtr) return "", false } } return jsonStr, true } func (c *container) GetResmgrAnnotationKeys() []string { return keysInNamespace(c.Annotations, kubernetes.ResmgrKeyNamespace) } func (c *container) GetResmgrAnnotation(key string, objPtr interface{}) (string, bool) { return c.GetAnnotation(kubernetes.ResmgrKey(key), objPtr) } func (c *container) GetEffectiveAnnotation(key string) (string, bool) { pod, ok := c.GetPod() if !ok { return "", false } return pod.GetEffectiveAnnotation(key, c.Name) } func (c *container) GetAnnotations() map[string]string { if c.Annotations == nil { return nil } annotations := make(map[string]string, len(c.Annotations)) for key, value := range c.Annotations { annotations[key] = value } return annotations } func (c *container) GetEnvKeys() []string { keys := make([]string, len(c.Env)) idx := 0 for key := range c.Env { keys[idx] = key idx++ } return keys } func (c *container) GetEnv(key string) (string, bool) { value, ok := c.Env[key] return value, ok } func (c *container) GetMounts() []Mount { mounts := make([]Mount, len(c.Mounts)) idx := 0 for _, m := range c.Mounts { mounts[idx] = *m idx++ } return mounts } func (c *container) GetMountByHost(path string) *Mount { for _, m := range c.Mounts { if m.Host == path { return &(*m) } } return nil } func (c *container) GetMountByContainer(path string) *Mount { m, ok := c.Mounts[path] if !ok { return nil } return &(*m) } func (c *container) GetDevices() []Device { devices := make([]Device, len(c.Devices)) idx := 0 for _, d := range c.Devices { devices[idx] = *d idx++ } return devices } func (c *container) GetDeviceByHost(path string) *Device { for _, d := range c.Devices { if d.Host == path { return &(*d) } } return nil } func (c *container) GetDeviceByContainer(path string) *Device { d, ok := c.Devices[path] if !ok { return nil } return &(*d) } func (c *container) GetResourceRequirements() v1.ResourceRequirements { if adjust, _ := 
c.getEffectiveAdjustment(); adjust != nil { if resources, ok := adjust.GetResourceRequirements(); ok { return resources } } return c.Resources } func (c *container) GetLinuxResources() *criv1.LinuxContainerResources { if c.LinuxReq == nil { return nil } return &(*c.LinuxReq) } func (c *container) setEffectiveAdjustment(name string) string { previous := c.Adjustment c.Adjustment = name return previous } func (c *container) getEffectiveAdjustment() (*extapi.AdjustmentSpec, string) { if c.Adjustment == "" { return nil, "" } if c.cache.External != nil { return c.cache.External.Adjustments[c.Adjustment], c.Adjustment } return nil, c.Adjustment } func (c *container) SetCommand(value []string) { c.Command = value c.markPending(CRI) } func (c *container) SetArgs(value []string) { c.Args = value c.markPending(CRI) } func (c *container) SetLabel(key, value string) { if c.Labels == nil { c.Labels = make(map[string]string) } c.Labels[key] = value c.markPending(CRI) } func (c *container) DeleteLabel(key string) { if _, ok := c.Labels[key]; ok { delete(c.Labels, key) c.markPending(CRI) } } func (c *container) SetAnnotation(key, value string) { if c.Annotations == nil { c.Annotations = make(map[string]string) } c.Annotations[key] = value c.markPending(CRI) } func (c *container) DeleteAnnotation(key string) { if _, ok := c.Annotations[key]; ok { delete(c.Annotations, key) c.markPending(CRI) } } func (c *container) SetEnv(key, value string) { if c.Env == nil { c.Env = make(map[string]string) } c.Env[key] = value c.markPending(CRI) } func (c *container) UnsetEnv(key string) { if _, ok := c.Env[key]; ok { delete(c.Env, key) c.markPending(CRI) } } func (c *container) InsertMount(m *Mount) { if c.Mounts == nil { c.Mounts = make(map[string]*Mount) } c.Mounts[m.Container] = m c.markPending(CRI) } func (c *container) DeleteMount(path string) { if _, ok := c.Mounts[path]; ok { delete(c.Mounts, path) c.markPending(CRI) } } func (c *container) InsertDevice(d *Device) { if c.Devices == nil { c.Devices = make(map[string]*Device) } c.Devices[d.Container] = d c.markPending(CRI) } func (c *container) DeleteDevice(path string) { if _, ok := c.Devices[path]; ok { delete(c.Devices, path) c.markPending(CRI) } } func (c *container) GetTopologyHints() topology.Hints { return c.TopologyHints } func (c *container) GetCPUPeriod() int64 { if c.LinuxReq == nil { return 0 } return c.LinuxReq.CpuPeriod } func (c *container) GetCPUQuota() int64 { if c.LinuxReq == nil { return 0 } return c.LinuxReq.CpuQuota } func (c *container) GetCPUShares() int64 { if c.LinuxReq == nil { return 0 } return c.LinuxReq.CpuShares } func (c *container) GetMemoryLimit() int64 { if c.LinuxReq == nil { return 0 } return c.LinuxReq.MemoryLimitInBytes } func (c *container) GetOomScoreAdj() int64 { if c.LinuxReq == nil { return 0 } return c.LinuxReq.OomScoreAdj } func (c *container) GetCpusetCpus() string { if c.LinuxReq == nil { return "" } return c.LinuxReq.CpusetCpus } func (c *container) GetCpusetMems() string { if c.LinuxReq == nil { return "" } return c.LinuxReq.CpusetMems } func (c *container) SetLinuxResources(req *criv1.LinuxContainerResources) { c.LinuxReq = req c.markPending(CRI) } func (c *container) SetCPUPeriod(value int64) { if c.LinuxReq == nil { c.LinuxReq = &criv1.LinuxContainerResources{} } c.LinuxReq.CpuPeriod = value c.markPending(CRI) } func (c *container) SetCPUQuota(value int64) { if c.LinuxReq == nil { c.LinuxReq = &criv1.LinuxContainerResources{} } c.LinuxReq.CpuQuota = value c.markPending(CRI) } func (c *container) SetCPUShares(value 
int64) { if c.LinuxReq == nil { c.LinuxReq = &criv1.LinuxContainerResources{} } c.LinuxReq.CpuShares = value c.markPending(CRI) } func (c *container) SetMemoryLimit(value int64) { if c.LinuxReq == nil { c.LinuxReq = &criv1.LinuxContainerResources{} } c.LinuxReq.MemoryLimitInBytes = value c.markPending(CRI) } func (c *container) SetOomScoreAdj(value int64) { if c.LinuxReq == nil { c.LinuxReq = &criv1.LinuxContainerResources{} } c.LinuxReq.OomScoreAdj = value c.markPending(CRI) } func (c *container) SetCpusetCpus(value string) { if c.LinuxReq == nil { c.LinuxReq = &criv1.LinuxContainerResources{} } c.LinuxReq.CpusetCpus = value c.markPending(CRI) } func (c *container) SetCpusetMems(value string) { if c.LinuxReq == nil { c.LinuxReq = &criv1.LinuxContainerResources{} } c.LinuxReq.CpusetMems = value c.markPending(CRI) } func getTopologyHints(hostPath, containerPath string, readOnly bool) topology.Hints { if readOnly { // if the device or path is read-only, assume it is unimportant for now // TODO: determine topology hint, but use it with low priority return topology.Hints{} } // ignore topology information for small files in /etc, service files in /var/lib/kubelet and host library mounts ignoredTopologyPaths := []string{"/.cri-resmgr", "/etc/", "/dev/termination-log", "/lib/", "/lib64/", "/usr/lib/", "/usr/lib32/", "/usr/lib64/"} for _, path := range ignoredTopologyPaths { if strings.HasPrefix(hostPath, path) || strings.HasPrefix(containerPath, path) { return topology.Hints{} } } // More complex rules for Kubelet secrets and config maps ignoredTopologyPathRegexps := []*regexp.Regexp{ // The Kubelet directory can differ, but we can detect it by the structure inside it. // For now, we can safely ignore exposed config maps and secrets for topology hints. regexp.MustCompile(`(kubelet)?/pods/[[:xdigit:]-]+/volumes/kubernetes\.io~(configmap|secret)/`), } for _, re := range ignoredTopologyPathRegexps { if re.MatchString(hostPath) || re.MatchString(containerPath) { return topology.Hints{} } } if devPath, err := topology.FindSysFsDevice(hostPath); err == nil { // errors are ignored if hints, err := topology.NewTopologyHints(devPath); err == nil && len(hints) > 0 { return hints } } return topology.Hints{} } func getKubeletHint(cpus, mems string) (ret topology.Hints) { if cpus != "" || mems != "" { ret = topology.Hints{ topology.ProviderKubelet: topology.Hint{ Provider: topology.ProviderKubelet, CPUs: cpus, NUMAs: mems}} } return } func (c *container) GetAffinity() ([]*Affinity, error) { pod, ok := c.GetPod() if !ok { // a missing pod would crash the affinity lookup below, so fail instead of just logging return nil, cacheError("internal error: can't find Pod for container %s", c.PrettyName()) } affinity, err := pod.GetContainerAffinity(c.GetName()) if err != nil { return nil, err } affinity = append(affinity, c.implicitAffinities(len(affinity) > 0)...)
c.cache.Debug("affinity for container %s:", c.PrettyName()) for _, a := range affinity { c.cache.Debug(" - %s", a.String()) } return affinity, nil } func (c *container) GetCgroupDir() string { if c.CgroupDir != "" { return c.CgroupDir } if pod, ok := c.GetPod(); ok { parent, _ := pod.GetCgroupParentDir(), pod.GetID() ID := c.GetID() c.CgroupDir = findContainerDir(parent, ID) } return c.CgroupDir } func (c *container) SetRDTClass(class string) { c.RDTClass = class c.markPending(RDT) } func (c *container) GetRDTClass() string { if adjust, _ := c.getEffectiveAdjustment(); adjust != nil { if class, ok := adjust.GetRDTClass(); ok { return class } } return c.RDTClass } func (c *container) SetBlockIOClass(class string) { c.BlockIOClass = class c.markPending(BlockIO) } func (c *container) GetBlockIOClass() string { if adjust, _ := c.getEffectiveAdjustment(); adjust != nil { if class, ok := adjust.GetBlockIOClass(); ok { return class } } return c.BlockIOClass } func (c *container) SetToptierLimit(limit int64) { c.ToptierLimit = limit c.markPending(Memory) } func (c *container) GetToptierLimit() int64 { if adjust, _ := c.getEffectiveAdjustment(); adjust != nil { if adjust.ToptierLimit != nil { return adjust.ToptierLimit.Value() } } return c.ToptierLimit } func (c *container) SetPageMigration(p *PageMigrate) { c.PageMigrate = p c.markPending(PageMigration) } func (c *container) GetPageMigration() *PageMigrate { return c.PageMigrate } func (c *container) GetProcesses() ([]string, error) { dir := c.GetCgroupDir() if dir == "" { return nil, cacheError("%s: unknown cgroup directory", c.PrettyName()) } return cgroups.Cpu.Group(dir).GetProcesses() } func (c *container) GetTasks() ([]string, error) { dir := c.GetCgroupDir() if dir == "" { return nil, cacheError("%s: unknown cgroup directory", c.PrettyName()) } return cgroups.Cpu.Group(dir).GetTasks() } func (c *container) SetCRIRequest(req interface{}) error { if c.req != nil { return cacheError("can't set pending container request: another pending") } c.req = &req return nil } func (c *container) GetCRIRequest() (interface{}, bool) { if c.req == nil { return nil, false } return *c.req, true } func (c *container) ClearCRIRequest() (interface{}, bool) { req, ok := c.GetCRIRequest() c.req = nil return req, ok } func (c *container) GetCRIEnvs() []*criv1.KeyValue { envs := make([]*criv1.KeyValue, len(c.Env), len(c.Env)) idx := 0 for k, v := range c.Env { envs[idx] = &criv1.KeyValue{ Key: k, Value: v, } idx++ } return envs } func (c *container) GetCRIMounts() []*criv1.Mount { if c.Mounts == nil { return nil } mounts := make([]*criv1.Mount, len(c.Mounts), len(c.Mounts)) idx := 0 for _, m := range c.Mounts { mounts[idx] = &criv1.Mount{ ContainerPath: m.Container, HostPath: m.Host, Readonly: m.Readonly, SelinuxRelabel: m.Relabel, Propagation: criv1.MountPropagation(m.Propagation), } idx++ } return mounts } func (c *container) GetCRIDevices() []*criv1.Device { if c.Devices == nil { return nil } devices := make([]*criv1.Device, len(c.Devices), len(c.Devices)) idx := 0 for _, d := range c.Devices { devices[idx] = &criv1.Device{ ContainerPath: d.Container, HostPath: d.Host, Permissions: d.Permissions, } idx++ } return devices } func (c *container) markPending(controllers ...string) { if c.pending == nil { c.pending = make(map[string]struct{}) } for _, ctrl := range controllers { c.pending[ctrl] = struct{}{} c.cache.markPending(c) } } func (c *container) ClearPending(controller string) { delete(c.pending, controller) if len(c.pending) == 0 { c.cache.clearPending(c) } } 
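// Illustrative sketch (not part of the original source): sorting containers
// with SortContainers (defined later in this file), putting a custom,
// hypothetical namespace comparison in front of the default QOS/memory/CPU
// ordering.
func exampleSortByNamespace(containers []Container) {
	byNamespace := func(ci, cj Container) int {
		switch ni, nj := ci.GetNamespace(), cj.GetNamespace(); {
		case ni < nj:
			return -1
		case ni > nj:
			return +1
		}
		return 0
	}
	SortContainers(containers, byNamespace, CompareQOS, CompareMemory, CompareCPU)
}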
func (c *container) GetPending() []string { if c.pending == nil { return nil } pending := make([]string, 0, len(c.pending)) for controller := range c.pending { pending = append(pending, controller) } sort.Strings(pending) return pending } func (c *container) HasPending(controller string) bool { if c.pending == nil { return false } _, pending := c.pending[controller] return pending } func (c *container) GetTag(key string) (string, bool) { value, ok := c.Tags[key] return value, ok } func (c *container) SetTag(key string, value string) (string, bool) { prev, ok := c.Tags[key] c.Tags[key] = value return prev, ok } func (c *container) DeleteTag(key string) (string, bool) { value, ok := c.Tags[key] delete(c.Tags, key) return value, ok } func (c *container) implicitAffinities(hasExplicit bool) []*Affinity { affinities := []*Affinity{} for name, generate := range c.cache.implicit { implicit := generate(c, hasExplicit) if implicit == nil { c.cache.Debug("no implicit affinity %s for container %s", name, c.PrettyName()) continue } c.cache.Debug("using implicit affinity %s for %s", name, c.PrettyName()) affinities = append(affinities, implicit) } return affinities } func (c *container) String() string { return c.PrettyName() } func (c *container) Eval(key string) interface{} { switch key { case resmgr.KeyPod: pod, ok := c.GetPod() if !ok { return cacheError("%s: failed to find pod %s", c.PrettyName(), c.PodID) } return pod case resmgr.KeyName: return c.Name case resmgr.KeyNamespace: return c.Namespace case resmgr.KeyQOSClass: return c.GetQOSClass() case resmgr.KeyLabels: return c.Labels case resmgr.KeyTags: return c.Tags case resmgr.KeyID: return c.ID default: return cacheError("%s: Container cannot evaluate %q", c.PrettyName(), key) } } // CompareContainersFn compares two containers by some arbitrary property. // It returns a negative integer, 0, or a positive integer, depending on // whether the first container is considered smaller, equal, or larger than // the second. type CompareContainersFn func(Container, Container) int // SortContainers sorts a slice of containers using the given comparison functions. // If the containers are otherwise equal they are sorted by pod and container name. // If the comparison functions are omitted, containers are compared by QoS class, // memory and CPU requests/limits. func SortContainers(containers []Container, compareFns ...CompareContainersFn) { if len(compareFns) == 0 { compareFns = CompareByQOSMemoryCPU } sort.Slice(containers, func(i, j int) bool { ci, cj := containers[i], containers[j] for _, cmpFn := range compareFns { switch diff := cmpFn(ci, cj); { case diff < 0: return true case diff > 0: return false } } // If two containers are otherwise equal they are sorted by pod and container name. if pi, ok := ci.GetPod(); ok { if pj, ok := cj.GetPod(); ok { ni, nj := pi.GetName(), pj.GetName() if ni != nj { return ni < nj } } } return ci.GetName() < cj.GetName() }) } // CompareByQOSMemoryCPU is a slice for comparing containers by QOS, memory, and CPU. var CompareByQOSMemoryCPU = []CompareContainersFn{CompareQOS, CompareMemory, CompareCPU} // CompareQOS compares containers by QOS class.
func CompareQOS(ci, cj Container) int { qosi, qosj := ci.GetQOSClass(), cj.GetQOSClass() switch { case qosi == v1.PodQOSGuaranteed && qosj != v1.PodQOSGuaranteed: return -1 case qosj == v1.PodQOSGuaranteed && qosi != v1.PodQOSGuaranteed: return +1 case qosi == v1.PodQOSBurstable && qosj == v1.PodQOSBestEffort: return -1 case qosj == v1.PodQOSBurstable && qosi == v1.PodQOSBestEffort: return +1 } return 0 } // CompareMemory compares containers by memory requests and limits. func CompareMemory(ci, cj Container) int { var reqi, limi, reqj, limj int64 resi := ci.GetResourceRequirements() if qty, ok := resi.Requests[v1.ResourceMemory]; ok { reqi = qty.Value() } if qty, ok := resi.Limits[v1.ResourceMemory]; ok { limi = qty.Value() } resj := cj.GetResourceRequirements() if qty, ok := resj.Requests[v1.ResourceMemory]; ok { reqj = qty.Value() } if qty, ok := resj.Limits[v1.ResourceMemory]; ok { limj = qty.Value() } switch diff := reqj - reqi; { case diff < 0: return -1 case diff > 0: return +1 } switch diff := limj - limi; { case diff < 0: return -1 case diff > 0: return +1 } return 0 } // CompareCPU compares containers by CPU requests and limits. func CompareCPU(ci, cj Container) int { var reqi, limi, reqj, limj int64 resi := ci.GetResourceRequirements() if qty, ok := resi.Requests[v1.ResourceCPU]; ok { reqi = qty.MilliValue() } if qty, ok := resi.Limits[v1.ResourceCPU]; ok { limi = qty.MilliValue() } resj := cj.GetResourceRequirements() if qty, ok := resj.Requests[v1.ResourceCPU]; ok { reqj = qty.MilliValue() } if qty, ok := resj.Limits[v1.ResourceCPU]; ok { limj = qty.MilliValue() } switch diff := reqj - reqi; { case diff < 0: return -1 case diff > 0: return +1 } switch diff := limj - limi; { case diff < 0: return -1 case diff > 0: return +1 } return 0 } ================================================ FILE: pkg/cri/resource-manager/cache/container_test.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package cache import ( "sort" "testing" "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" ) func TestGetKubeletHint(t *testing.T) { type T struct { name string cpus string mems string expectedLen int } cases := []T{ { name: "empty", cpus: "", mems: "", expectedLen: 0, }, { name: "cpus", cpus: "0-9", mems: "", expectedLen: 1, }, { name: "mems", cpus: "", mems: "0,1", expectedLen: 1, }, { name: "both", cpus: "0-9", mems: "0,1", expectedLen: 1, }, } for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { output := getKubeletHint(tc.cpus, tc.mems) if len(output) != tc.expectedLen { t.Errorf("expected len of hints: %d, got: %d, hints: %+v", tc.expectedLen, len(output), output) } }) } } func TestGetTopologyHints(t *testing.T) { type T struct { name string hostPath string containerPath string readOnly bool expectedLen int } cases := []T{ { name: "read-only", hostPath: "/something", containerPath: "/something", readOnly: true, }, { name: "host /etc", hostPath: "/etc/something", containerPath: "/data/something", }, { name: "container /etc", hostPath: "/var/lib/kubelet/pods/0c9bcfc4-c51b-11e9-ac9a-b8aeed7c7427/etc-hosts", containerPath: "/etc/hosts", }, { name: "ConfigMap", containerPath: "/var/lib/kube-proxy", hostPath: "/var/lib/kubelet/pods/0c9bcfc4-c51b-11e9-ac9a-b8aeed7c7427/volumes/kubernetes.io~configmap/kube-proxy", }, { name: "secret", containerPath: "/var/run/secrets/kubernetes.io/serviceaccount", hostPath: "/var/lib/kubelet/pods/0c9bcfc4-c51b-11e9-ac9a-b8aeed7c7427/volumes/kubernetes.io~secret/kube-proxy-token-d9slz", }, { name: "dev null", hostPath: "/dev/null", containerPath: "/dev/null", }, } for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { output := getTopologyHints(tc.hostPath, tc.containerPath, tc.readOnly) if len(output) != tc.expectedLen { t.Errorf("expected len of hints: %d, got: %d, hints: %+v", tc.expectedLen, len(output), output) } }) } } func TestKeysInNamespace(t *testing.T) { testMap := map[string]string{ "no-namespace": "", "my.name.space": "", "my.name.space/key-1": "", "my.name.space/key-2": "", "other.name.space/other-key": "", } tcases := []struct { name string collectionMap map[string]string namespace string expectedKeys []string }{ { name: "empty map should return nothing for empty namespace", }, { name: "empty map should return nothing", namespace: "my.name.space", }, { name: "keys with no namespace", collectionMap: testMap, expectedKeys: []string{"my.name.space", "no-namespace"}, }, { name: "keys in namespace", collectionMap: testMap, namespace: "my.name.space", expectedKeys: []string{"key-1", "key-2"}, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { keys := keysInNamespace(tc.collectionMap, tc.namespace) sort.Strings(keys) if !cmp.Equal(keys, tc.expectedKeys, cmpopts.EquateEmpty()) { t.Errorf("Expected %v, received %v", tc.expectedKeys, keys) } }) } } ================================================ FILE: pkg/cri/resource-manager/cache/error.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and // limitations under the License. package cache import ( "fmt" ) func cacheError(format string, args ...interface{}) error { return fmt.Errorf("cache: "+format, args...) } ================================================ FILE: pkg/cri/resource-manager/cache/pod.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cache import ( "encoding/json" "strconv" "strings" v1 "k8s.io/api/core/v1" criv1 "k8s.io/cri-api/pkg/apis/runtime/v1" "github.com/intel/cri-resource-manager/pkg/apis/resmgr" "github.com/intel/cri-resource-manager/pkg/cgroups" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/kubernetes" ) const ( // KeyResourceAnnotation is the annotation key our webhook uses. KeyResourceAnnotation = "intel.com/resources" ) // Create a pod from a run request. func (p *pod) fromRunRequest(req *criv1.RunPodSandboxRequest) error { cfg := req.Config if cfg == nil { return cacheError("pod %s has no config", p.ID) } meta := cfg.Metadata if meta == nil { return cacheError("pod %s has no request metadata", p.ID) } p.containers = make(map[string]string) p.UID = meta.Uid p.Name = meta.Name p.Namespace = meta.Namespace p.State = PodState(int32(PodStateReady)) p.Labels = cfg.Labels p.Annotations = cfg.Annotations p.CgroupParent = cfg.GetLinux().GetCgroupParent() if err := p.discoverQOSClass(); err != nil { p.cache.Error("%v", err) } p.parseResourceAnnotations() return nil } // Create a pod from a list response. func (p *pod) fromListResponse(pod *criv1.PodSandbox, status *PodStatus) error { meta := pod.Metadata if meta == nil { return cacheError("pod %s has no reply metadata", p.ID) } p.containers = make(map[string]string) p.UID = meta.Uid p.Name = meta.Name p.Namespace = meta.Namespace p.State = PodState(int32(pod.State)) p.Labels = pod.Labels p.Annotations = pod.Annotations if status == nil { p.cache.Error("pod %s has no associated status query data", p.ID) } else { p.CgroupParent = status.CgroupParent } if err := p.discoverQOSClass(); err != nil { p.cache.Error("%v", err) } p.parseResourceAnnotations() return nil } // Get the init containers of a pod. func (p *pod) GetInitContainers() []Container { if p.Resources == nil { return nil } containers := []Container{} for id, c := range p.cache.Containers { if id != c.CacheID { continue } if _, ok := p.Resources.InitContainers[c.ID]; ok { containers = append(containers, c) } } return containers } // Get the normal containers of a pod. func (p *pod) GetContainers() []Container { containers := []Container{} for id, c := range p.cache.Containers { if c.PodID != p.ID || id != c.CacheID { continue } if p.Resources != nil { if _, ok := p.Resources.InitContainers[c.ID]; ok { continue } } containers = append(containers, c) } return containers } // Get container pointer by its name. 
func (p *pod) getContainer(name string) *container { var found *container if id, ok := p.containers[name]; ok { return p.cache.Containers[id] } for _, c := range p.GetContainers() { cptr := c.(*container) p.containers[cptr.Name] = cptr.ID if cptr.Name == name { found = cptr } } return found } // Get container by its name. func (p *pod) GetContainer(name string) (Container, bool) { c := p.getContainer(name) return c, c != nil } // Get the id of a pod. func (p *pod) GetID() string { return p.ID } // Get the (k8s) unique id of a pod. func (p *pod) GetUID() string { return p.UID } // Get the name of a pod. func (p *pod) GetName() string { return p.Name } // Get the namespace of a pod. func (p *pod) GetNamespace() string { return p.Namespace } // Get the PodState of a pod. func (p *pod) GetState() PodState { return p.State } // Get the keys of all labels of a pod. func (p *pod) GetLabelKeys() []string { keys := make([]string, len(p.Labels)) idx := 0 for key := range p.Labels { keys[idx] = key idx++ } return keys } // Get the label for a key of a pod. func (p *pod) GetLabel(key string) (string, bool) { value, ok := p.Labels[key] return value, ok } // Get all label keys in the cri-resource-manager namespace. func (p *pod) GetResmgrLabelKeys() []string { return keysInNamespace(p.Labels, kubernetes.ResmgrKeyNamespace) } // Get the label for the given key in the cri-resource-manager namespace. func (p *pod) GetResmgrLabel(key string) (string, bool) { value, ok := p.Labels[kubernetes.ResmgrKey(key)] return value, ok } // Get the keys of all annotations of a pod. func (p *pod) GetAnnotationKeys() []string { keys := make([]string, len(p.Annotations)) idx := 0 for key := range p.Annotations { keys[idx] = key idx++ } return keys } // Get pod annotation for the given key. func (p *pod) GetAnnotation(key string) (string, bool) { value, ok := p.Annotations[key] return value, ok } // Get and decode/unmarshal pod annotation for the given key. func (p *pod) GetAnnotationObject(key string, objPtr interface{}, decode func([]byte, interface{}) error) (bool, error) { var err error value, ok := p.GetAnnotation(key) if !ok { return false, nil } // decode with decoder function, if given if decode != nil { err = decode([]byte(value), objPtr) return true, err } // decode with type-specific default decoder switch objPtr.(type) { case *string: *objPtr.(*string) = value case *bool: *objPtr.(*bool), err = strconv.ParseBool(value) case *int: var i int64 i, err = strconv.ParseInt(value, 0, 0) *objPtr.(*int) = int(i) case *uint: var i uint64 i, err = strconv.ParseUint(value, 0, 0) *objPtr.(*uint) = uint(i) case *int64: *objPtr.(*int64), err = strconv.ParseInt(value, 0, 64) case *uint64: *objPtr.(*uint64), err = strconv.ParseUint(value, 0, 64) default: err = json.Unmarshal([]byte(value), objPtr) } if err != nil { p.cache.Error("failed to decode annotation %s (%s): %v", key, value, err) } return true, err } // Get the keys of all annotation in the cri-resource-manager namespace. func (p *pod) GetResmgrAnnotationKeys() []string { return keysInNamespace(p.Annotations, kubernetes.ResmgrKeyNamespace) } // Get the value of the given annotation in the cri-resource-manager namespace. func (p *pod) GetResmgrAnnotation(key string) (string, bool) { return p.GetAnnotation(kubernetes.ResmgrKey(key)) } // Get and decode the pod annotation for the key in the cri-resource-manager namespace.. 
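//
// A usage sketch (the annotation keys here are hypothetical): plain scalar
// values decode with the strconv-based parsers above, anything else falls
// back to JSON unmarshalling:
//
//	var enabled bool
//	ok, err := pod.GetResmgrAnnotationObject("prefer-isolated-cpus", &enabled, nil)
//
//	var classes map[string]string
//	ok, err = pod.GetResmgrAnnotationObject("class-overrides", &classes, nil)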
func (p *pod) GetResmgrAnnotationObject(key string, objPtr interface{}, decode func([]byte, interface{}) error) (bool, error) { return p.GetAnnotationObject(kubernetes.ResmgrKey(key), objPtr, decode) } // Get the effective annotation for the container. func (p *pod) GetEffectiveAnnotation(key, container string) (string, bool) { if v, ok := p.Annotations[key+"/container."+container]; ok { return v, true } if v, ok := p.Annotations[key+"/pod"]; ok { return v, true } v, ok := p.Annotations[key] return v, ok } // Get the cgroup parent directory of a pod, if known. func (p *pod) GetCgroupParentDir() string { return p.CgroupParent } // discover a pod's QoS class by parsing the cgroup parent directory. func (p *pod) discoverQOSClass() error { if p.CgroupParent == "" { p.QOSClass = v1.PodQOSBestEffort return cacheError("%s: unknown cgroup parent/QoS class", p.ID) } dirs := strings.Split(p.CgroupParent[1:], "/") if len(dirs) < 1 { return cacheError("%s: failed to parse %q for QoS class", p.ID, p.CgroupParent) } // consume any potential --cgroup-root passed to kubelet if dirs[0] != "kubepods.slice" && dirs[0] != "kubepods" { dirs = dirs[1:] } if len(dirs) < 1 { return cacheError("%s: failed to parse %q for QoS class", p.ID, p.CgroupParent) } // consume potential kubepods[.slice] if dirs[0] == "kubepods.slice" || dirs[0] == "kubepods" { dirs = dirs[1:] } if len(dirs) < 1 { return cacheError("%s: failed to parse %q for QoS class", p.ID, p.CgroupParent) } // check for besteffort, burstable, or lack thereof indicating guaranteed switch dir := dirs[0]; { case dir == "kubepods-besteffort.slice" || dir == "besteffort": p.QOSClass = v1.PodQOSBestEffort return nil case dir == "kubepods-burstable.slice" || dir == "burstable": p.QOSClass = v1.PodQOSBurstable return nil case strings.HasPrefix(dir, "kubepods-pod") || strings.HasPrefix(dir, "pod"): p.QOSClass = v1.PodQOSGuaranteed return nil } return cacheError("%s: failed to parse %q for QoS class", p.ID, p.CgroupParent) } // Get the resource requirements of a pod. func (p *pod) GetPodResourceRequirements() PodResourceRequirements { if p.Resources == nil { return PodResourceRequirements{} } return *p.Resources } // Parse per container resource requirements from webhook annotations. func (p *pod) parseResourceAnnotations() { p.Resources = &PodResourceRequirements{} p.GetAnnotationObject(KeyResourceAnnotation, p.Resources, nil) } // Determine the QoS class of the pod. func (p *pod) GetQOSClass() v1.PodQOSClass { return p.QOSClass } // GetContainerAffinity returns the annotated affinity for the named container. 
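//
// Affinity and anti-affinity are read from pod annotations in the
// cri-resource-manager namespace. In the simple form the annotation maps a
// container name to the names of containers it should be co-located with,
// roughly like this (hypothetical pod annotation, assuming the
// cri-resource-manager.intel.com annotation namespace):
//
//	cri-resource-manager.intel.com/affinity: |
//	  server: [ sidecar, logger ]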
func (p *pod) GetContainerAffinity(name string) ([]*Affinity, error) { if p.Affinity != nil { return (*p.Affinity)[name], nil } affinity := &podContainerAffinity{} value, ok := p.GetResmgrAnnotation(keyAffinity) if ok { weight := DefaultWeight if !affinity.parseSimple(p, value, weight) { if err := affinity.parseFull(p, value, weight); err != nil { p.cache.Error("%v", err) return nil, err } } } value, ok = p.GetResmgrAnnotation(keyAntiAffinity) if ok { weight := -DefaultWeight if !affinity.parseSimple(p, value, weight) { if err := affinity.parseFull(p, value, weight); err != nil { p.cache.Error("%v", err) return nil, err } } } if p.cache.DebugEnabled() { p.cache.Debug("Pod container affinity for %s:", p.GetName()) for id, ca := range *affinity { p.cache.Debug(" - container %s:", id) for _, a := range ca { p.cache.Debug(" * %s", a.String()) } } } p.Affinity = affinity return (*p.Affinity)[name], nil } // ScopeExpression returns an affinity expression for defining this pod as the scope. func (p *pod) ScopeExpression() *resmgr.Expression { return &resmgr.Expression{ // Domain: LabelsDomain, Key: kubernetes.PodNameLabel, Op: resmgr.Equals, Values: []string{p.GetName()}, } } // String returns a string representation of pod. func (p *pod) String() string { return p.Name } // Eval returns the value of a key for expression evaluation. func (p *pod) Eval(key string) interface{} { switch key { case resmgr.KeyName: return p.Name case resmgr.KeyNamespace: return p.Namespace case resmgr.KeyQOSClass: return p.GetQOSClass() case resmgr.KeyLabels: return p.Labels case resmgr.KeyID: return p.ID case resmgr.KeyUID: return p.UID default: return cacheError("Pod cannot evaluate of %q", key) } } // GetProcesses returns the pids of processes in a pod. func (p *pod) GetProcesses(recursive bool) ([]string, error) { return p.getTasks(recursive, true) } // GetTasks returns the pids of threads in a pod. func (p *pod) GetTasks(recursive bool) ([]string, error) { return p.getTasks(recursive, false) } // getTasks returns the pids of processes or threads in a pod. func (p *pod) getTasks(recursive, processes bool) ([]string, error) { var pids, childPids []string var err error dir := p.GetCgroupParentDir() if dir == "" { return nil, cacheError("%s: unknown cgroup parent directory", p.Name) } if processes { pids, err = cgroups.Cpu.Group(dir).GetProcesses() } else { pids, err = cgroups.Cpu.Group(dir).GetTasks() } if err != nil { return nil, cacheError("%s: failed to read pids: %v", p.Name, err) } if !recursive { return pids, nil } for _, c := range append(p.GetInitContainers(), p.GetContainers()...) { if c.GetState() == ContainerStateRunning { if processes { childPids, err = c.GetProcesses() } else { childPids, err = c.GetTasks() } if err == nil { pids = append(pids, childPids...) continue } p.cache.Error("%s: failed to read pids of %s: %v", p.Name, c.PrettyName(), err) } } return pids, nil } // ParsePodStatus parses a PodSandboxStatusResponse into a PodStatus. 
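//
// The cgroup parent is dug out of the runtime's verbose status info, whose
// layout differs per runtime (abridged from the parsing code below):
//
//	containerd: Info["info"] -> {"config": {"linux": {"cgroup_parent": "..."}}}
//	cri-o:      Info["info"] -> {"runtimeSpec": {"annotations":
//	                             {"io.kubernetes.cri-o.CgroupParent": "..."}}}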
func ParsePodStatus(response *criv1.PodSandboxStatusResponse) (*PodStatus, error) { var name string type infoRuntimeSpec struct { Annotations map[string]string `json:"annotations"` } type infoConfig struct { Linux *struct { CgroupParent string `json:"cgroup_parent"` } `json:"linux"` } type statusInfo struct { RuntimeSpec *infoRuntimeSpec `json:"runtimeSpec"` Config *infoConfig `json:"config"` } if response.Status.Metadata != nil { name = response.Status.Metadata.Name } else { name = response.Status.Id } blob, ok := response.Info["info"] if !ok { return nil, cacheError("%s: missing info in pod status response", name) } info := statusInfo{} if err := json.Unmarshal([]byte(blob), &info); err != nil { return nil, cacheError("%s: failed to extract pod status info: %v", name, err) } ps := &PodStatus{} if info.Config != nil { // containerd // CgroupParent: Info["config"]["linux"]["cgroup_parent"] ps.CgroupParent = info.Config.Linux.CgroupParent } else if info.RuntimeSpec != nil { // cri-o // CgroupParent: Info["info"]["runtimeSpec"]["annotations"][crioCgroupParent] const ( crioCgroupParent = "io.kubernetes.cri-o.CgroupParent" ) ps.CgroupParent = info.RuntimeSpec.Annotations[crioCgroupParent] } if ps.CgroupParent == "" { return nil, cacheError("%s: failed to extract cgroup parent from pod status", name) } return ps, nil } ================================================ FILE: pkg/cri/resource-manager/cache/utils.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cache import ( "os" "path" "strconv" "strings" corev1 "k8s.io/api/core/v1" resapi "k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/util/sets" criv1 "k8s.io/cri-api/pkg/apis/runtime/v1" "github.com/intel/cri-resource-manager/pkg/cgroups" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/kubernetes" ) var ( memoryCapacity int64 SharesToMilliCPU = kubernetes.SharesToMilliCPU QuotaToMilliCPU = kubernetes.QuotaToMilliCPU MilliCPUToShares = kubernetes.MilliCPUToShares MilliCPUToQuota = kubernetes.MilliCPUToQuota ) // IsPodQOSClassName returns true if the given class is one of the Pod QOS classes. func IsPodQOSClassName(class string) bool { switch corev1.PodQOSClass(class) { case corev1.PodQOSBestEffort, corev1.PodQOSBurstable, corev1.PodQOSGuaranteed: return true } return false } // estimateComputeResources calculates resource requests/limits from a CRI request. 
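//
// The conversions use the usual kubelet arithmetic (via pkg/kubernetes),
// roughly milliCPU = shares*1000/1024 and milliCPU = quota*1000/period.
// For example (hypothetical values):
//
//	CpuShares: 2048                    -> requests.cpu = 2000m
//	CpuQuota: 50000, CpuPeriod: 100000 -> limits.cpu   = 500m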
func estimateComputeResources(lnx *criv1.LinuxContainerResources, cgroupParent string) corev1.ResourceRequirements { var qos corev1.PodQOSClass resources := corev1.ResourceRequirements{ Requests: corev1.ResourceList{}, Limits: corev1.ResourceList{}, } if lnx == nil { return resources } if cgroupParent != "" { qos = cgroupParentToQOS(cgroupParent) } // calculate CPU request if value := SharesToMilliCPU(lnx.CpuShares); value > 0 { qty := resapi.NewMilliQuantity(value, resapi.DecimalSI) resources.Requests[corev1.ResourceCPU] = *qty } // get memory limit if value := lnx.MemoryLimitInBytes; value > 0 { qty := resapi.NewQuantity(value, resapi.DecimalSI) resources.Limits[corev1.ResourceMemory] = *qty } // set or calculate CPU limit, set memory request if known if qos == corev1.PodQOSGuaranteed { resources.Limits[corev1.ResourceCPU] = resources.Requests[corev1.ResourceCPU] resources.Requests[corev1.ResourceMemory] = resources.Limits[corev1.ResourceMemory] } else { if value := QuotaToMilliCPU(lnx.CpuQuota, lnx.CpuPeriod); value > 0 { qty := resapi.NewMilliQuantity(value, resapi.DecimalSI) resources.Limits[corev1.ResourceCPU] = *qty } } return resources } // getMemoryCapacity parses memory capacity from /proc/meminfo (mimicking cAdvisor). func getMemoryCapacity() int64 { var data []byte var err error if memoryCapacity > 0 { return memoryCapacity } if data, err = os.ReadFile("/proc/meminfo"); err != nil { return -1 } for _, line := range strings.Split(string(data), "\n") { keyval := strings.Split(line, ":") if len(keyval) != 2 || keyval[0] != "MemTotal" { continue } valunit := strings.Split(strings.TrimSpace(keyval[1]), " ") if len(valunit) != 2 || valunit[1] != "kB" { return -1 } memoryCapacity, err = strconv.ParseInt(valunit[0], 10, 64) if err != nil { return -1 } memoryCapacity *= 1024 break } return memoryCapacity } // cgroupParentToQOS tries to map Pod cgroup parent to QOS class. func cgroupParentToQOS(dir string) corev1.PodQOSClass { var qos corev1.PodQOSClass // The parent directory naming scheme depends on the cgroup driver in use. // Thus, rely on substring matching split := strings.Split(strings.TrimPrefix(dir, "/"), "/") switch { case len(split) < 2: qos = corev1.PodQOSClass("") case strings.Index(split[1], strings.ToLower(string(corev1.PodQOSBurstable))) != -1: qos = corev1.PodQOSBurstable case strings.Index(split[1], strings.ToLower(string(corev1.PodQOSBestEffort))) != -1: qos = corev1.PodQOSBestEffort default: qos = corev1.PodQOSGuaranteed } return qos } // resourcesToQOS tries to map Pod container resources (from annotation) to QOS class. 
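//
// The rules mirror Kubernetes QoS classification: no CPU/memory requests or
// limits at all yields BestEffort; CPU and memory limits present for every
// container and equal to the requests yields Guaranteed; everything else is
// Burstable. For example (hypothetical per-pod totals):
//
//	requests cpu=500m,mem=1Gi; limits cpu=500m,mem=1Gi -> Guaranteed
//	requests cpu=500m;         limits mem=1Gi          -> Burstable
//	no requests, no limits                             -> BestEffort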
func resourcesToQOS(podResources *PodResourceRequirements) corev1.PodQOSClass { var qos corev1.PodQOSClass if podResources == nil { return qos } requests := corev1.ResourceList{} limits := corev1.ResourceList{} zeroQuantity := resapi.MustParse("0") isGuaranteed := true for _, resources := range podResources.Containers { // process requests for name, quantity := range resources.Requests { if !isSupportedQoSComputeResource(name) { continue } if quantity.Cmp(zeroQuantity) == 1 { delta := quantity.DeepCopy() if _, exists := requests[name]; !exists { requests[name] = delta } else { delta.Add(requests[name]) requests[name] = delta } } } // process limits qosLimitsFound := sets.NewString() for name, quantity := range resources.Limits { if !isSupportedQoSComputeResource(name) { continue } if quantity.Cmp(zeroQuantity) == 1 { qosLimitsFound.Insert(string(name)) delta := quantity.DeepCopy() if _, exists := limits[name]; !exists { limits[name] = delta } else { delta.Add(limits[name]) limits[name] = delta } } } if !qosLimitsFound.HasAll(string(corev1.ResourceMemory), string(corev1.ResourceCPU)) { isGuaranteed = false } } if len(requests) == 0 && len(limits) == 0 { return corev1.PodQOSBestEffort } // Check is requests match limits for all resources. if isGuaranteed { for name, req := range requests { if lim, exists := limits[name]; !exists || lim.Cmp(req) != 0 { isGuaranteed = false break } } } if isGuaranteed && len(requests) == len(limits) { return corev1.PodQOSGuaranteed } return corev1.PodQOSBurstable } // findContainerDir brute-force searches for a container cgroup dir. func findContainerDir(podCgroupDir, ID string) string { var dirs []string if podCgroupDir == "" { return "" } cpusetDir := cgroups.Cpuset.Path() dirs = []string{ path.Join(cpusetDir, podCgroupDir, ID), // containerd, systemd path.Join(cpusetDir, podCgroupDir, "cri-containerd-"+ID+".scope"), // containerd, cgroupfs path.Join(cpusetDir, podCgroupDir, "cri-containerd-"+ID), // crio, systemd path.Join(cpusetDir, podCgroupDir, "crio-"+ID+".scope"), // crio, cgroupfs path.Join(cpusetDir, podCgroupDir, "crio-"+ID), } for _, dir := range dirs { if info, err := os.Stat(dir); err == nil { if info.Mode().IsDir() { return strings.TrimPrefix(dir, cpusetDir) } } } return "" } func isSupportedQoSComputeResource(name corev1.ResourceName) bool { return name == corev1.ResourceCPU || name == corev1.ResourceMemory } func init() { // TODO: get rid of this eventually, use pkg/sysfs instead... getMemoryCapacity() } ================================================ FILE: pkg/cri/resource-manager/config/api/v1/api.pb.go ================================================ // //Copyright 2019 Intel Corporation // //Licensed under the Apache License, Version 2.0 (the "License"); //you may not use this file except in compliance with the License. //You may obtain a copy of the License at // //http://www.apache.org/licenses/LICENSE-2.0 // //Unless required by applicable law or agreed to in writing, software //distributed under the License is distributed on an "AS IS" BASIS, //WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //See the License for the specific language governing permissions and //limitations under the License. // Code generated by protoc-gen-go. DO NOT EDIT. 
// versions: // protoc-gen-go v1.28.0 // protoc v3.20.1 // source: pkg/cri/resource-manager/config/api/v1/api.proto package v1 import ( protoreflect "google.golang.org/protobuf/reflect/protoreflect" protoimpl "google.golang.org/protobuf/runtime/protoimpl" reflect "reflect" sync "sync" ) const ( // Verify that this generated code is sufficiently up-to-date. _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) // Verify that runtime/protoimpl is sufficiently up-to-date. _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) ) type SetConfigRequest struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields // node_name is node name used to acquire this configuration. NodeName string `protobuf:"bytes,1,opt,name=node_name,json=nodeName,proto3" json:"node_name,omitempty"` // config is the ConfigMap data. Config map[string]string `protobuf:"bytes,2,rep,name=config,proto3" json:"config,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` } func (x *SetConfigRequest) Reset() { *x = SetConfigRequest{} if protoimpl.UnsafeEnabled { mi := &file_pkg_cri_resource_manager_config_api_v1_api_proto_msgTypes[0] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *SetConfigRequest) String() string { return protoimpl.X.MessageStringOf(x) } func (*SetConfigRequest) ProtoMessage() {} func (x *SetConfigRequest) ProtoReflect() protoreflect.Message { mi := &file_pkg_cri_resource_manager_config_api_v1_api_proto_msgTypes[0] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use SetConfigRequest.ProtoReflect.Descriptor instead. func (*SetConfigRequest) Descriptor() ([]byte, []int) { return file_pkg_cri_resource_manager_config_api_v1_api_proto_rawDescGZIP(), []int{0} } func (x *SetConfigRequest) GetNodeName() string { if x != nil { return x.NodeName } return "" } func (x *SetConfigRequest) GetConfig() map[string]string { if x != nil { return x.Config } return nil } type SetConfigReply struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields // If not empty, indicate an error that happened while trying to apply new configuration. Error string `protobuf:"bytes,1,opt,name=error,proto3" json:"error,omitempty"` } func (x *SetConfigReply) Reset() { *x = SetConfigReply{} if protoimpl.UnsafeEnabled { mi := &file_pkg_cri_resource_manager_config_api_v1_api_proto_msgTypes[1] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *SetConfigReply) String() string { return protoimpl.X.MessageStringOf(x) } func (*SetConfigReply) ProtoMessage() {} func (x *SetConfigReply) ProtoReflect() protoreflect.Message { mi := &file_pkg_cri_resource_manager_config_api_v1_api_proto_msgTypes[1] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use SetConfigReply.ProtoReflect.Descriptor instead. 
func (*SetConfigReply) Descriptor() ([]byte, []int) { return file_pkg_cri_resource_manager_config_api_v1_api_proto_rawDescGZIP(), []int{1} } func (x *SetConfigReply) GetError() string { if x != nil { return x.Error } return "" } type SetAdjustmentRequest struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields // node_name is node name used to acquire this configuration. NodeName string `protobuf:"bytes,1,opt,name=node_name,json=nodeName,proto3" json:"node_name,omitempty"` // Serialized map of all adjustment CRDs, name as key, CRD as value. Adjustment string `protobuf:"bytes,2,opt,name=adjustment,proto3" json:"adjustment,omitempty"` } func (x *SetAdjustmentRequest) Reset() { *x = SetAdjustmentRequest{} if protoimpl.UnsafeEnabled { mi := &file_pkg_cri_resource_manager_config_api_v1_api_proto_msgTypes[2] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *SetAdjustmentRequest) String() string { return protoimpl.X.MessageStringOf(x) } func (*SetAdjustmentRequest) ProtoMessage() {} func (x *SetAdjustmentRequest) ProtoReflect() protoreflect.Message { mi := &file_pkg_cri_resource_manager_config_api_v1_api_proto_msgTypes[2] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use SetAdjustmentRequest.ProtoReflect.Descriptor instead. func (*SetAdjustmentRequest) Descriptor() ([]byte, []int) { return file_pkg_cri_resource_manager_config_api_v1_api_proto_rawDescGZIP(), []int{2} } func (x *SetAdjustmentRequest) GetNodeName() string { if x != nil { return x.NodeName } return "" } func (x *SetAdjustmentRequest) GetAdjustment() string { if x != nil { return x.Adjustment } return "" } type SetAdjustmentReply struct { state protoimpl.MessageState sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields // If not empty, indicates that errors happened while trying to apply the adjustments. Errors map[string]string `protobuf:"bytes,1,rep,name=errors,proto3" json:"errors,omitempty" protobuf_key:"bytes,1,opt,name=key,proto3" protobuf_val:"bytes,2,opt,name=value,proto3"` } func (x *SetAdjustmentReply) Reset() { *x = SetAdjustmentReply{} if protoimpl.UnsafeEnabled { mi := &file_pkg_cri_resource_manager_config_api_v1_api_proto_msgTypes[3] ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) ms.StoreMessageInfo(mi) } } func (x *SetAdjustmentReply) String() string { return protoimpl.X.MessageStringOf(x) } func (*SetAdjustmentReply) ProtoMessage() {} func (x *SetAdjustmentReply) ProtoReflect() protoreflect.Message { mi := &file_pkg_cri_resource_manager_config_api_v1_api_proto_msgTypes[3] if protoimpl.UnsafeEnabled && x != nil { ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) if ms.LoadMessageInfo() == nil { ms.StoreMessageInfo(mi) } return ms } return mi.MessageOf(x) } // Deprecated: Use SetAdjustmentReply.ProtoReflect.Descriptor instead. 
func (*SetAdjustmentReply) Descriptor() ([]byte, []int) { return file_pkg_cri_resource_manager_config_api_v1_api_proto_rawDescGZIP(), []int{3} } func (x *SetAdjustmentReply) GetErrors() map[string]string { if x != nil { return x.Errors } return nil } var File_pkg_cri_resource_manager_config_api_v1_api_proto protoreflect.FileDescriptor var file_pkg_cri_resource_manager_config_api_v1_api_proto_rawDesc = []byte{ 0x0a, 0x30, 0x70, 0x6b, 0x67, 0x2f, 0x63, 0x72, 0x69, 0x2f, 0x72, 0x65, 0x73, 0x6f, 0x75, 0x72, 0x63, 0x65, 0x2d, 0x6d, 0x61, 0x6e, 0x61, 0x67, 0x65, 0x72, 0x2f, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x2f, 0x61, 0x70, 0x69, 0x2f, 0x76, 0x31, 0x2f, 0x61, 0x70, 0x69, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x02, 0x76, 0x31, 0x22, 0xa4, 0x01, 0x0a, 0x10, 0x53, 0x65, 0x74, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x1b, 0x0a, 0x09, 0x6e, 0x6f, 0x64, 0x65, 0x5f, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x08, 0x6e, 0x6f, 0x64, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x12, 0x38, 0x0a, 0x06, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x18, 0x02, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x20, 0x2e, 0x76, 0x31, 0x2e, 0x53, 0x65, 0x74, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x2e, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x52, 0x06, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x1a, 0x39, 0x0a, 0x0b, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x12, 0x10, 0x0a, 0x03, 0x6b, 0x65, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x6b, 0x65, 0x79, 0x12, 0x14, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x02, 0x38, 0x01, 0x22, 0x26, 0x0a, 0x0e, 0x53, 0x65, 0x74, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x52, 0x65, 0x70, 0x6c, 0x79, 0x12, 0x14, 0x0a, 0x05, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x22, 0x53, 0x0a, 0x14, 0x53, 0x65, 0x74, 0x41, 0x64, 0x6a, 0x75, 0x73, 0x74, 0x6d, 0x65, 0x6e, 0x74, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x12, 0x1b, 0x0a, 0x09, 0x6e, 0x6f, 0x64, 0x65, 0x5f, 0x6e, 0x61, 0x6d, 0x65, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x08, 0x6e, 0x6f, 0x64, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x12, 0x1e, 0x0a, 0x0a, 0x61, 0x64, 0x6a, 0x75, 0x73, 0x74, 0x6d, 0x65, 0x6e, 0x74, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x0a, 0x61, 0x64, 0x6a, 0x75, 0x73, 0x74, 0x6d, 0x65, 0x6e, 0x74, 0x22, 0x8b, 0x01, 0x0a, 0x12, 0x53, 0x65, 0x74, 0x41, 0x64, 0x6a, 0x75, 0x73, 0x74, 0x6d, 0x65, 0x6e, 0x74, 0x52, 0x65, 0x70, 0x6c, 0x79, 0x12, 0x3a, 0x0a, 0x06, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x73, 0x18, 0x01, 0x20, 0x03, 0x28, 0x0b, 0x32, 0x22, 0x2e, 0x76, 0x31, 0x2e, 0x53, 0x65, 0x74, 0x41, 0x64, 0x6a, 0x75, 0x73, 0x74, 0x6d, 0x65, 0x6e, 0x74, 0x52, 0x65, 0x70, 0x6c, 0x79, 0x2e, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x52, 0x06, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x73, 0x1a, 0x39, 0x0a, 0x0b, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x73, 0x45, 0x6e, 0x74, 0x72, 0x79, 0x12, 0x10, 0x0a, 0x03, 0x6b, 0x65, 0x79, 0x18, 0x01, 0x20, 0x01, 0x28, 0x09, 0x52, 0x03, 0x6b, 0x65, 0x79, 0x12, 0x14, 0x0a, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x18, 0x02, 0x20, 0x01, 0x28, 0x09, 0x52, 0x05, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x02, 0x38, 0x01, 0x32, 0x86, 0x01, 0x0a, 0x06, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x12, 0x37, 0x0a, 0x09, 0x53, 0x65, 0x74, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x12, 0x14, 0x2e, 0x76, 0x31, 0x2e, 0x53, 0x65, 0x74, 0x43, 0x6f, 
0x6e, 0x66, 0x69, 0x67, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x12, 0x2e, 0x76, 0x31, 0x2e, 0x53, 0x65, 0x74, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x52, 0x65, 0x70, 0x6c, 0x79, 0x22, 0x00, 0x12, 0x43, 0x0a, 0x0d, 0x53, 0x65, 0x74, 0x41, 0x64, 0x6a, 0x75, 0x73, 0x74, 0x6d, 0x65, 0x6e, 0x74, 0x12, 0x18, 0x2e, 0x76, 0x31, 0x2e, 0x53, 0x65, 0x74, 0x41, 0x64, 0x6a, 0x75, 0x73, 0x74, 0x6d, 0x65, 0x6e, 0x74, 0x52, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x1a, 0x16, 0x2e, 0x76, 0x31, 0x2e, 0x53, 0x65, 0x74, 0x41, 0x64, 0x6a, 0x75, 0x73, 0x74, 0x6d, 0x65, 0x6e, 0x74, 0x52, 0x65, 0x70, 0x6c, 0x79, 0x22, 0x00, 0x42, 0x07, 0x5a, 0x05, 0x2e, 0x2e, 0x2f, 0x76, 0x31, 0x62, 0x06, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x33, } var ( file_pkg_cri_resource_manager_config_api_v1_api_proto_rawDescOnce sync.Once file_pkg_cri_resource_manager_config_api_v1_api_proto_rawDescData = file_pkg_cri_resource_manager_config_api_v1_api_proto_rawDesc ) func file_pkg_cri_resource_manager_config_api_v1_api_proto_rawDescGZIP() []byte { file_pkg_cri_resource_manager_config_api_v1_api_proto_rawDescOnce.Do(func() { file_pkg_cri_resource_manager_config_api_v1_api_proto_rawDescData = protoimpl.X.CompressGZIP(file_pkg_cri_resource_manager_config_api_v1_api_proto_rawDescData) }) return file_pkg_cri_resource_manager_config_api_v1_api_proto_rawDescData } var file_pkg_cri_resource_manager_config_api_v1_api_proto_msgTypes = make([]protoimpl.MessageInfo, 6) var file_pkg_cri_resource_manager_config_api_v1_api_proto_goTypes = []interface{}{ (*SetConfigRequest)(nil), // 0: v1.SetConfigRequest (*SetConfigReply)(nil), // 1: v1.SetConfigReply (*SetAdjustmentRequest)(nil), // 2: v1.SetAdjustmentRequest (*SetAdjustmentReply)(nil), // 3: v1.SetAdjustmentReply nil, // 4: v1.SetConfigRequest.ConfigEntry nil, // 5: v1.SetAdjustmentReply.ErrorsEntry } var file_pkg_cri_resource_manager_config_api_v1_api_proto_depIdxs = []int32{ 4, // 0: v1.SetConfigRequest.config:type_name -> v1.SetConfigRequest.ConfigEntry 5, // 1: v1.SetAdjustmentReply.errors:type_name -> v1.SetAdjustmentReply.ErrorsEntry 0, // 2: v1.Config.SetConfig:input_type -> v1.SetConfigRequest 2, // 3: v1.Config.SetAdjustment:input_type -> v1.SetAdjustmentRequest 1, // 4: v1.Config.SetConfig:output_type -> v1.SetConfigReply 3, // 5: v1.Config.SetAdjustment:output_type -> v1.SetAdjustmentReply 4, // [4:6] is the sub-list for method output_type 2, // [2:4] is the sub-list for method input_type 2, // [2:2] is the sub-list for extension type_name 2, // [2:2] is the sub-list for extension extendee 0, // [0:2] is the sub-list for field type_name } func init() { file_pkg_cri_resource_manager_config_api_v1_api_proto_init() } func file_pkg_cri_resource_manager_config_api_v1_api_proto_init() { if File_pkg_cri_resource_manager_config_api_v1_api_proto != nil { return } if !protoimpl.UnsafeEnabled { file_pkg_cri_resource_manager_config_api_v1_api_proto_msgTypes[0].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*SetConfigRequest); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_cri_resource_manager_config_api_v1_api_proto_msgTypes[1].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*SetConfigReply); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_cri_resource_manager_config_api_v1_api_proto_msgTypes[2].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*SetAdjustmentRequest); i { case 0: return &v.state case 1: 
return &v.sizeCache case 2: return &v.unknownFields default: return nil } } file_pkg_cri_resource_manager_config_api_v1_api_proto_msgTypes[3].Exporter = func(v interface{}, i int) interface{} { switch v := v.(*SetAdjustmentReply); i { case 0: return &v.state case 1: return &v.sizeCache case 2: return &v.unknownFields default: return nil } } } type x struct{} out := protoimpl.TypeBuilder{ File: protoimpl.DescBuilder{ GoPackagePath: reflect.TypeOf(x{}).PkgPath(), RawDescriptor: file_pkg_cri_resource_manager_config_api_v1_api_proto_rawDesc, NumEnums: 0, NumMessages: 6, NumExtensions: 0, NumServices: 1, }, GoTypes: file_pkg_cri_resource_manager_config_api_v1_api_proto_goTypes, DependencyIndexes: file_pkg_cri_resource_manager_config_api_v1_api_proto_depIdxs, MessageInfos: file_pkg_cri_resource_manager_config_api_v1_api_proto_msgTypes, }.Build() File_pkg_cri_resource_manager_config_api_v1_api_proto = out.File file_pkg_cri_resource_manager_config_api_v1_api_proto_rawDesc = nil file_pkg_cri_resource_manager_config_api_v1_api_proto_goTypes = nil file_pkg_cri_resource_manager_config_api_v1_api_proto_depIdxs = nil } ================================================ FILE: pkg/cri/resource-manager/config/api/v1/api.proto ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ syntax = "proto3"; package v1; option go_package = "../v1"; service Config{ rpc SetConfig(SetConfigRequest) returns (SetConfigReply) {} rpc SetAdjustment(SetAdjustmentRequest) returns (SetAdjustmentReply) {} } message SetConfigRequest { // node_name is node name used to acquire this configuration. string node_name = 1; // config is the ConfigMap data. map<string, string> config = 2; } message SetConfigReply { // If not empty, indicate an error that happened while trying to apply new configuration. string error = 1; } message SetAdjustmentRequest { // node_name is node name used to acquire this configuration. string node_name = 1; // Serialized map of all adjustment CRDs, name as key, CRD as value. string adjustment = 2; } message SetAdjustmentReply { // If not empty, indicates that errors happened while trying to apply the adjustments. map<string, string> errors = 1; } ================================================ FILE: pkg/cri/resource-manager/config/api/v1/api_grpc.pb.go ================================================ // Code generated by protoc-gen-go-grpc. DO NOT EDIT. // versions: // - protoc-gen-go-grpc v1.2.0 // - protoc v3.20.1 // source: pkg/cri/resource-manager/config/api/v1/api.proto package v1 import ( context "context" grpc "google.golang.org/grpc" codes "google.golang.org/grpc/codes" status "google.golang.org/grpc/status" ) // This is a compile-time assertion to ensure that this generated file // is compatible with the grpc package it is being compiled against. // Requires gRPC-Go v1.32.0 or later. const _ = grpc.SupportPackageIsVersion7 // ConfigClient is the client API for Config service.
// // For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream. type ConfigClient interface { SetConfig(ctx context.Context, in *SetConfigRequest, opts ...grpc.CallOption) (*SetConfigReply, error) SetAdjustment(ctx context.Context, in *SetAdjustmentRequest, opts ...grpc.CallOption) (*SetAdjustmentReply, error) } type configClient struct { cc grpc.ClientConnInterface } func NewConfigClient(cc grpc.ClientConnInterface) ConfigClient { return &configClient{cc} } func (c *configClient) SetConfig(ctx context.Context, in *SetConfigRequest, opts ...grpc.CallOption) (*SetConfigReply, error) { out := new(SetConfigReply) err := c.cc.Invoke(ctx, "/v1.Config/SetConfig", in, out, opts...) if err != nil { return nil, err } return out, nil } func (c *configClient) SetAdjustment(ctx context.Context, in *SetAdjustmentRequest, opts ...grpc.CallOption) (*SetAdjustmentReply, error) { out := new(SetAdjustmentReply) err := c.cc.Invoke(ctx, "/v1.Config/SetAdjustment", in, out, opts...) if err != nil { return nil, err } return out, nil } // ConfigServer is the server API for Config service. // All implementations must embed UnimplementedConfigServer // for forward compatibility type ConfigServer interface { SetConfig(context.Context, *SetConfigRequest) (*SetConfigReply, error) SetAdjustment(context.Context, *SetAdjustmentRequest) (*SetAdjustmentReply, error) mustEmbedUnimplementedConfigServer() } // UnimplementedConfigServer must be embedded to have forward compatible implementations. type UnimplementedConfigServer struct { } func (UnimplementedConfigServer) SetConfig(context.Context, *SetConfigRequest) (*SetConfigReply, error) { return nil, status.Errorf(codes.Unimplemented, "method SetConfig not implemented") } func (UnimplementedConfigServer) SetAdjustment(context.Context, *SetAdjustmentRequest) (*SetAdjustmentReply, error) { return nil, status.Errorf(codes.Unimplemented, "method SetAdjustment not implemented") } func (UnimplementedConfigServer) mustEmbedUnimplementedConfigServer() {} // UnsafeConfigServer may be embedded to opt out of forward compatibility for this service. // Use of this interface is not recommended, as added methods to ConfigServer will // result in compilation errors. 
type UnsafeConfigServer interface { mustEmbedUnimplementedConfigServer() } func RegisterConfigServer(s grpc.ServiceRegistrar, srv ConfigServer) { s.RegisterService(&Config_ServiceDesc, srv) } func _Config_SetConfig_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { in := new(SetConfigRequest) if err := dec(in); err != nil { return nil, err } if interceptor == nil { return srv.(ConfigServer).SetConfig(ctx, in) } info := &grpc.UnaryServerInfo{ Server: srv, FullMethod: "/v1.Config/SetConfig", } handler := func(ctx context.Context, req interface{}) (interface{}, error) { return srv.(ConfigServer).SetConfig(ctx, req.(*SetConfigRequest)) } return interceptor(ctx, in, info, handler) } func _Config_SetAdjustment_Handler(srv interface{}, ctx context.Context, dec func(interface{}) error, interceptor grpc.UnaryServerInterceptor) (interface{}, error) { in := new(SetAdjustmentRequest) if err := dec(in); err != nil { return nil, err } if interceptor == nil { return srv.(ConfigServer).SetAdjustment(ctx, in) } info := &grpc.UnaryServerInfo{ Server: srv, FullMethod: "/v1.Config/SetAdjustment", } handler := func(ctx context.Context, req interface{}) (interface{}, error) { return srv.(ConfigServer).SetAdjustment(ctx, req.(*SetAdjustmentRequest)) } return interceptor(ctx, in, info, handler) } // Config_ServiceDesc is the grpc.ServiceDesc for Config service. // It's only intended for direct use with grpc.RegisterService, // and not to be introspected or modified (even as a copy) var Config_ServiceDesc = grpc.ServiceDesc{ ServiceName: "v1.Config", HandlerType: (*ConfigServer)(nil), Methods: []grpc.MethodDesc{ { MethodName: "SetConfig", Handler: _Config_SetConfig_Handler, }, { MethodName: "SetAdjustment", Handler: _Config_SetAdjustment_Handler, }, }, Streams: []grpc.StreamDesc{}, Metadata: "pkg/cri/resource-manager/config/api/v1/api.proto", } ================================================ FILE: pkg/cri/resource-manager/config/config.go ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package config import ( extapi "github.com/intel/cri-resource-manager/pkg/apis/resmgr/v1alpha1" ) // RawConfig represents the resource manager config data in unparsed form, as // received from the agent. type RawConfig struct { // NodeName is the node name the agent used to acquire configuration. NodeName string // Data is the raw ConfigMap data for this node. Data map[string]string } // Adjustment represents external adjustments for this node. type Adjustment struct { // Adjustments contains all adjustment CRDs for this node. Adjustments map[string]*extapi.AdjustmentSpec } // HasIdenticalData returns true if RawConfig has identical data to the supplied one. 
func (c *RawConfig) HasIdenticalData(data map[string]string) bool { if c == nil && data == nil { return true } if c == nil || data == nil { return false } if len(c.Data) != len(data) { return false } for k, v := range c.Data { if dv, found := data[k]; !found || dv != v { return false } } for dk, dv := range data { if v, found := c.Data[dk]; !found || v != dv { return false } } return true } ================================================ FILE: pkg/cri/resource-manager/config/server.go ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package config import ( "context" "fmt" "net" "os" "path/filepath" "sync" "google.golang.org/grpc" v1 "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/config/api/v1" "github.com/intel/cri-resource-manager/pkg/log" "encoding/json" extapi "github.com/intel/cri-resource-manager/pkg/apis/resmgr/v1alpha1" ) const ( SocketDisabled = "disabled" ) // SetConfigCb is a callback function for a SetConfig request. type SetConfigCb func(*RawConfig) error // SetAdjustmentCb is a callback function for a SetAdjustment request. type SetAdjustmentCb func(*Adjustment) map[string]error // Server is the interface for our gRPC server. type Server interface { Start(string) error Stop() } // server implements Server. type server struct { v1.UnimplementedConfigServer log.Logger socket string // configured socket sync.Mutex // lock for concurrent per-request goroutines. server *grpc.Server // gRPC server instance setConfigCb SetConfigCb // configuration update notification callback setAdjustmentCb SetAdjustmentCb // external adjustment update notification callback } // NewConfigServer creates a new Server instance. func NewConfigServer(configCb SetConfigCb, adjustmentCb SetAdjustmentCb) (Server, error) { s := &server{ Logger: log.NewLogger("config-server"), setConfigCb: configCb, setAdjustmentCb: adjustmentCb, } return s, nil } // Start runs the server instance. func (s *server) Start(socket string) error { if socket == SocketDisabled || socket == "" { s.Info("config-server is disabled...") return nil } // Make sure we have a directory for the socket if err := os.MkdirAll(filepath.Dir(socket), 0700); err != nil { return serverError("failed to create directory for socket %s: %v", socket, err) } // Remove socket file if it exists if err := os.Remove(socket); err != nil && !os.IsNotExist(err) { return serverError("failed to unlink socket file: %s", err) } // Create server listening for local unix domain socket lis, err := net.Listen("unix", socket) if err != nil { return serverError("failed to listen to socket: %v", err) } serverOpts := []grpc.ServerOption{} s.server = grpc.NewServer(serverOpts...)
v1.RegisterConfigServer(s.server, s) s.Info("starting config-server at socket %s...", socket) go func() { defer lis.Close() err := s.server.Serve(lis) if err != nil { s.Fatal("config-server died: %v", err) } }() return nil } // Stop Server instance func (s *server) Stop() { if s.server != nil { s.server.Stop() s.server = nil } } // SetConfig pushes a configuration update to the server. func (s *server) SetConfig(_ context.Context, req *v1.SetConfigRequest) (*v1.SetConfigReply, error) { s.Lock() defer s.Unlock() s.Debug("SetConfig request: %+v", req) reply := &v1.SetConfigReply{} err := s.setConfigCb(&RawConfig{NodeName: req.NodeName, Data: req.Config}) if err != nil { reply.Error = fmt.Sprintf("failed to apply configuration: %v", err) } return reply, nil } // SetAdjustment pushes updated external policies to the server. func (s *server) SetAdjustment(_ context.Context, req *v1.SetAdjustmentRequest) (*v1.SetAdjustmentReply, error) { s.Lock() defer s.Unlock() s.Debug("SetAdjustment request: %+v", req) errors := map[string]error{} specs := map[string]*extapi.AdjustmentSpec{} if err := json.Unmarshal([]byte(req.Adjustment), &specs); err != nil { return nil, serverError("failed to decode SetAdjustment request: %v", err) } for name, spec := range specs { if err := spec.Verify(); err != nil { errors[name] = err } } if len(errors) == 0 { errors = s.setAdjustmentCb(&Adjustment{Adjustments: specs}) } reply := &v1.SetAdjustmentReply{Errors: make(map[string]string)} for str, err := range errors { reply.Errors[str] = err.Error() } return reply, nil } func serverError(format string, args ...interface{}) error { return fmt.Errorf(format, args...) } ================================================ FILE: pkg/cri/resource-manager/control/blockio/blockio.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package blockio import ( "errors" "fmt" "github.com/intel/cri-resource-manager/pkg/blockio" "github.com/intel/cri-resource-manager/pkg/config" "github.com/intel/cri-resource-manager/pkg/cri/client" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/control" logger "github.com/intel/cri-resource-manager/pkg/log" ) const ( // BlockIOController is the name of the block I/O controller. BlockIOController = cache.BlockIO ) // blockio encapsulates the runtime state of our block I/O enforcement/controller. type blockioctl struct { cache cache.Cache // resource manager cache idle *bool // true if we run without any classes configured } // Our logger instance. var log logger.Logger = logger.NewLogger(BlockIOController) // Our singleton block I/O controller instance. var singleton *blockioctl // getBlockIOController returns our singleton block I/O controller instance. 
func getBlockIOController() *blockioctl { if singleton == nil { singleton = &blockioctl{} } return singleton } // Start initializes the controller for enforcing decisions. func (ctl *blockioctl) Start(cache cache.Cache, _ client.Client) error { ctl.cache = cache ctl.reconfigureRunningContainers() return nil } // Stop shuts down the controller. func (ctl *blockioctl) Stop() { } // PreCreateHook is the block I/O controller pre-create hook. func (ctl *blockioctl) PreCreateHook(_ cache.Container) error { return nil } // PreStartHook is the block I/O controller pre-start hook. func (ctl *blockioctl) PreStartHook(_ cache.Container) error { return nil } // PostStartHook is the block I/O controller post-start hook. func (ctl *blockioctl) PostStartHook(c cache.Container) error { if !c.HasPending(BlockIOController) { return nil } if err := ctl.assign(c); err != nil { return err } c.ClearPending(BlockIOController) return nil } // PostUpdateHook is the block I/O controller post-update hook. func (ctl *blockioctl) PostUpdateHook(c cache.Container) error { if !c.HasPending(BlockIOController) { return nil } if err := ctl.assign(c); err != nil { return err } c.ClearPending(BlockIOController) return nil } // PostStopHook is the block I/O controller post-stop hook. func (ctl *blockioctl) PostStopHook(_ cache.Container) error { return nil } // isImplicitlyDisabled checks if we run without any classes configured. func (ctl *blockioctl) isImplicitlyDisabled() bool { if ctl.idle != nil { return *ctl.idle } idle := len(blockio.GetClasses()) == 0 if idle { log.Warn("controller implicitly disabled (no configured classes)") } ctl.idle = &idle return *ctl.idle } // assign assigns the container to the given block I/O class. func (ctl *blockioctl) assign(c cache.Container) error { class := c.GetBlockIOClass() if class == "" { return nil } if ctl.isImplicitlyDisabled() && cache.IsPodQOSClassName(class) { return nil } if err := blockio.SetContainerClass(c, class); err != nil { return blockioError("%q: failed to assign to class %q: %w", c.PrettyName(), class, err) } log.Info("%q: assigned to class %q", c.PrettyName(), class) return nil } // configNotify is the configuration callback for blockio class mappings and class definitions. func (ctl *blockioctl) configNotify(event config.Event, _ config.Source) error { ignoreErrors := (event == config.RevertEvent) err := blockio.UpdateOciConfig(ignoreErrors) if err != nil { return err } // Possible errors in reconfiguring running containers are not errors in // the updated configuration and are therefore silently ignored. ctl.reconfigureRunningContainers() // We'll re-check idleness at the next operation/request. ctl.idle = nil return nil } // reconfigureRunningContainers forces the current blockio configuration onto all containers running on the node. func (ctl *blockioctl) reconfigureRunningContainers() error { errs := []error{} if ctl.cache == nil { return nil } for _, c := range ctl.cache.GetContainers() { class := c.GetBlockIOClass() log.Debug("%q: configure blockio class %q", c.PrettyName(), class) err := blockio.SetContainerClass(c, class) if err != nil { errs = append(errs, err) } } return errors.Join(errs...) } // blockioError creates a block I/O-controller-specific formatted error message. func blockioError(format string, args ...interface{}) error { return fmt.Errorf("blockio: "+format, args...) } // init registers this controller and sets configuration change handling.
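//
// Registration follows the common controller pattern: a singleton instance
// is handed to the control package under a well-known name, and a config
// callback is attached so class definitions can be updated at runtime.
// A sketch of the same shape for a hypothetical controller:
//
//	control.Register("my-controller", "My example controller", getMyController())
//	config.GetModule("my-config-module").AddNotify(getMyController().configNotify)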
func init() { control.Register(BlockIOController, "Block I/O controller", getBlockIOController()) config.GetModule(blockio.ConfigModuleName).AddNotify(getBlockIOController().configNotify) } ================================================ FILE: pkg/cri/resource-manager/control/control.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package control import ( "fmt" "sort" "strings" "github.com/intel/cri-resource-manager/pkg/cri/client" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache" logger "github.com/intel/cri-resource-manager/pkg/log" ) // Control is the interface for triggering controller-/domain-specific post-decision actions. type Control interface { // StartStopControllers starts/stops all controllers according to configuration. StartStopControllers(cache.Cache, client.Client) error // PreCreateHooks runs the pre-create hooks of all registered controllers. RunPreCreateHooks(cache.Container) error // RunPreStartHooks runs the pre-start hooks of all registered controllers. RunPreStartHooks(cache.Container) error // RunPostStartHooks runs the post-start hooks of all registered controllers. RunPostStartHooks(cache.Container) error // RunPostUpdateHooks runs the post-update hooks of all registered controllers. RunPostUpdateHooks(cache.Container) error // RunPostStopHooks runs the post-stop hooks of all registered controllers. RunPostStopHooks(cache.Container) error } // Controller is the interface all resource controllers must implement. type Controller interface { // Start prepares the controller for resource control/decision enforcement. Start(cache.Cache, client.Client) error // Stop shuts down the controller. Stop() // PreCreateHook is the controller's pre-create hook. PreCreateHook(cache.Container) error // PreStartHook is the controller's pre-start hook. PreStartHook(cache.Container) error // PostStartHook is the controller's post-start hook. PostStartHook(cache.Container) error // PostUpdateHook is the controller's post-update hook. PostUpdateHook(cache.Container) error // PostStopHook is the controller's post-stop hook. PostStopHook(cache.Container) error } // control encapsulates our controller-agnostic runtime state. type control struct { cache cache.Cache // resource manager cache client client.Client // resource manager CRI client controllers []*controller // active controllers } // controller represents a single registered controller. type controller struct { name string // controller name description string // controller description c Controller // controller interface mode mode // controller mode running bool // whether the controller is running } // our hook names const ( precreate = "pre-create" prestart = "pre-start" poststart = "post-start" postupdate = "post-update" poststop = "post-stop" ) // All registered controllers. var controllers = make(map[string]*controller) // Our logger instance. 
var log logger.Logger = logger.NewLogger("resource-control")

// NewControl creates a new controller-agnostic instance.
func NewControl() (Control, error) {
	c := &control{}
	for _, controller := range controllers {
		c.controllers = append(c.controllers, controller)
	}
	sort.Slice(c.controllers, func(i, j int) bool {
		return strings.Compare(c.controllers[i].name, c.controllers[j].name) < 0
	})
	return c, nil
}

// StartStopControllers starts/stops all controllers according to configuration.
func (c *control) StartStopControllers(cache cache.Cache, client client.Client) error {
	c.cache = cache
	c.client = client
	log.Info("syncing controllers with configuration...")
	for _, controller := range c.controllers {
		if controller.mode == Disabled {
			if controller.running {
				controller.c.Stop()
				controller.running = false
			}
			log.Info("controller %s: disabled", controller.name)
			continue
		}
		if controller.running {
			log.Info("controller %s: running", controller.name)
			continue
		}
		err := controller.c.Start(cache, client)
		if err != nil {
			log.Error("controller %s: failed to start: %v", controller.name, err)
			controller.running = false
			switch controller.mode {
			case Required:
				return controlError("%s failed to start: %v", controller.name, err)
			case Optional, Relaxed:
				log.Warn("disabling %s, failed to start: %v", controller.name, err)
				controller.mode = Disabled
			}
		} else {
			controller.running = true
			if controller.mode == Optional {
				controller.mode = Required
			}
		}
	}
	for _, controller := range c.controllers {
		state := map[bool]string{false: "inactive", true: "running"}
		log.Info("controller %s is now %s, mode %s", controller.name, state[controller.running], controller.mode)
	}
	return nil
}

// RunPreCreateHooks runs all registered controllers' PreCreate hooks.
func (c *control) RunPreCreateHooks(container cache.Container) error {
	for _, controller := range c.controllers {
		if err := c.runhook(controller, precreate, container); err != nil {
			return err
		}
	}
	return nil
}

// RunPreStartHooks runs all registered controllers' PreStart hooks.
func (c *control) RunPreStartHooks(container cache.Container) error {
	for _, controller := range c.controllers {
		if err := c.runhook(controller, prestart, container); err != nil {
			return err
		}
	}
	return nil
}

// RunPostStartHooks runs all registered controllers' PostStart hooks.
func (c *control) RunPostStartHooks(container cache.Container) error {
	for _, controller := range c.controllers {
		if err := c.runhook(controller, poststart, container); err != nil {
			return err
		}
	}
	return nil
}

// RunPostUpdateHooks runs all registered controllers' PostUpdate hooks.
func (c *control) RunPostUpdateHooks(container cache.Container) error {
	for _, controller := range c.controllers {
		if err := c.runhook(controller, postupdate, container); err != nil {
			return err
		}
	}
	return nil
}
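// As a hedged sketch of how a new controller plugs into this package: a
// minimal implementation satisfies the Controller interface and registers
// itself from an init() function in its own package. Everything below is
// illustrative only; the "noop" name is hypothetical.
//
//	type noopctl struct{}
//
//	func (n *noopctl) Start(cache.Cache, client.Client) error { return nil }
//	func (n *noopctl) Stop()                                  {}
//	func (n *noopctl) PreCreateHook(cache.Container) error    { return nil }
//	func (n *noopctl) PreStartHook(cache.Container) error     { return nil }
//	func (n *noopctl) PostStartHook(cache.Container) error    { return nil }
//	func (n *noopctl) PostUpdateHook(cache.Container) error   { return nil }
//	func (n *noopctl) PostStopHook(cache.Container) error     { return nil }
//
//	func init() {
//		control.Register("noop", "no-op demo controller", &noopctl{})
//	}

// RunPostStopHooks runs all registered controllers' PostStop hooks.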
func (c *control) RunPostStopHooks(container cache.Container) error { for _, controller := range c.controllers { if err := c.runhook(controller, poststop, container); err != nil { return err } } return nil } // runhook executes the given container hook according to the controller settings func (c *control) runhook(controller *controller, hook string, container cache.Container) error { if controller.mode == Disabled || !controller.running { return nil } var fn func(cache.Container) error switch hook { case precreate: fn = controller.c.PreCreateHook case prestart: fn = controller.c.PreStartHook case poststart: fn = controller.c.PostStartHook case postupdate: fn = controller.c.PostUpdateHook case poststop: fn = controller.c.PostStopHook } log.Debug("running %s %s hook for container %s", controller.name, hook, container.PrettyName()) if err := fn(container); err != nil { if controller.mode == Required { return controlError("%s %s hook failed: %v", controller.name, hook, err) } log.Error("%s %s hook failed: %v", controller.name, hook, err) } return nil } // Register registers a new controller. func Register(name, description string, c Controller) error { log.Info("registering controller %s...", name) if oc, ok := controllers[name]; ok { return controlError("controller %s (%s) already registered.", oc.name, oc.description) } controllers[name] = &controller{ name: name, description: description, c: c, } return nil } // controlError returns a controller-specific formatted error. func controlError(format string, args ...interface{}) error { return fmt.Errorf("control: "+format, args...) } ================================================ FILE: pkg/cri/resource-manager/control/cpu/api.go ================================================ // Copyright 2022 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cpu import ( "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache" "github.com/intel/goresctrl/pkg/utils" ) // GetClasses returns all available CPU classes. func GetClasses() map[string]Class { return getCPUController().config.getClasses() } // Assign assigns a set of cpus to a class. // // TODO: Drop this function. Don't store cpu class in policy data but implement // controller-specific data store in cache. func Assign(c cache.Cache, class string, cpus ...int) error { // NOTE: no locking implemented anywhere around -> we don't expect multiple parallel callers // Store the class assignment. Assign cpus to a class and remove them from // other classes assignments := *getClassAssignments(c) if this, ok := assignments[class]; !ok { assignments[class] = utils.NewIDSetFromIntSlice(cpus...) } else { this.Add(cpus...) } for k, v := range assignments { if k != class { v.Del(cpus...) // Don't store empty classes, serves as a garbage collector, too if v.Size() == 0 { delete(assignments, k) } } } setClassAssignments(c, &assignments) if getCPUController().started { // We don't want to try to enforce until the controller has been fully // started. 
Enforcement of all assignments happens on Start(), anyway.
		ctl := getCPUController()
		if err := ctl.enforceCpufreq(class, cpus...); err != nil {
			log.Error("cpufreq enforcement failed: %v", err)
		}
		if err := ctl.enforceUncore(assignments, cpus...); err != nil {
			log.Error("uncore frequency enforcement failed: %v", err)
		}
	}
	return nil
}

================================================
FILE: pkg/cri/resource-manager/control/cpu/cache.go
================================================
// Copyright 2022 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package cpu

import (
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache"
	"github.com/intel/goresctrl/pkg/utils"
)

const (
	cacheKeyCPUAssignments = "CPUClassAssignments"
)

// cpuClassAssignments contains the information about how cpus are assigned to
// classes
type cpuClassAssignments map[string]utils.IDSet

// Get the state of CPU class assignments from cache
func getClassAssignments(c cache.Cache) *cpuClassAssignments {
	a := &cpuClassAssignments{}
	if !c.GetPolicyEntry(cacheKeyCPUAssignments, a) {
		log.Error("no cached state of CPU class assignments found")
	}
	return a
}

// Save the state of CPU class assignments in cache
func setClassAssignments(c cache.Cache, a *cpuClassAssignments) {
	c.SetPolicyEntry(cacheKeyCPUAssignments, cache.Cachable(a))
}

// Set the value of cached cpuClassAssignments
func (c *cpuClassAssignments) Set(value interface{}) {
	switch value.(type) {
	case cpuClassAssignments:
		*c = value.(cpuClassAssignments)
	case *cpuClassAssignments:
		cp := value.(*cpuClassAssignments)
		*c = *cp
	}
}

// Get cached cpuClassAssignments
func (c *cpuClassAssignments) Get() interface{} {
	return *c
}

================================================
FILE: pkg/cri/resource-manager/control/cpu/cpu.go
================================================
// Copyright 2022 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package cpu

import (
	"fmt"

	pkgcfg "github.com/intel/cri-resource-manager/pkg/config"
	"github.com/intel/cri-resource-manager/pkg/cri/client"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/control"
	logger "github.com/intel/cri-resource-manager/pkg/log"
	"github.com/intel/cri-resource-manager/pkg/sysfs"
	"github.com/intel/cri-resource-manager/pkg/utils/cpuset"
	"github.com/intel/goresctrl/pkg/utils"
)
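// A hedged sketch of a cpu controller configuration fragment; the keys
// mirror the json tags of the Class type defined below, but the class
// names and frequency values (kHz) are illustrative assumptions only:
//
//	cpu:
//	  classes:
//	    low-power:
//	      minFreq: 800000
//	      maxFreq: 1600000
//	    turbo:
//	      minFreq: 2000000
//	      maxFreq: 3600000
//	      uncoreMinFreq: 1200000
//	      uncoreMaxFreq: 2000000

const (
	// ConfigModuleName is the configuration section for the CPU controller.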
	ConfigModuleName = "cpu"
	// CPUController is the name of the CPU controller.
	CPUController = cache.CPU
)

// cpuctl encapsulates the runtime state of our CPU enforcement/controller.
type cpuctl struct {
	cache   cache.Cache  // resource manager cache
	system  sysfs.System // system topology
	config  *config
	started bool
}

type config struct {
	Classes map[string]Class `json:"classes"`
	// Private field for storing info if we need to care about uncore
	uncoreEnabled bool
}

type Class struct {
	MinFreq                     uint `json:"minFreq"`
	MaxFreq                     uint `json:"maxFreq"`
	EnergyPerformancePreference uint `json:"energyPerformancePreference"`
	UncoreMinFreq               uint `json:"uncoreMinFreq"`
	UncoreMaxFreq               uint `json:"uncoreMaxFreq"`
}

var log logger.Logger = logger.NewLogger(CPUController)

// Our controller singleton instance.
var singleton *cpuctl

// getCPUController returns the (singleton) CPU controller instance.
func getCPUController() *cpuctl {
	if singleton == nil {
		singleton = &cpuctl{}
		singleton.config = singleton.defaultOptions().(*config)
	}
	return singleton
}

// Start initializes the controller for enforcing decisions.
func (ctl *cpuctl) Start(cache cache.Cache, _ client.Client) error {
	sys, err := sysfs.DiscoverSystem()
	if err != nil {
		return fmt.Errorf("failed to discover system topology: %w", err)
	}
	ctl.system = sys
	ctl.cache = cache

	// DEBUG: dump the class assignments we have stored in the cache
	log.Debug("retrieved cpu class assignments from cache:\n%s", utils.DumpJSON(getClassAssignments(ctl.cache)))

	if err := ctl.configure(); err != nil {
		// Just print an error. A config update later on may be valid.
		log.Error("failed to apply initial configuration: %v", err)
	}

	// TODO: We probably could just remove this and the hooks if they are not used
	pkgcfg.GetModule(ConfigModuleName).AddNotify(getCPUController().configNotify)

	ctl.started = true
	return nil
}

// Stop shuts down the controller.
func (ctl *cpuctl) Stop() {
}

// PreCreateHook handler for the CPU controller.
func (ctl *cpuctl) PreCreateHook(_ cache.Container) error {
	return nil
}

// PreStartHook handler for the CPU controller.
func (ctl *cpuctl) PreStartHook(_ cache.Container) error {
	return nil
}

// PostStartHook handler for the CPU controller.
func (ctl *cpuctl) PostStartHook(_ cache.Container) error {
	return nil
}

// PostUpdateHook handler for the CPU controller.
func (ctl *cpuctl) PostUpdateHook(_ cache.Container) error {
	return nil
}

// PostStopHook handler for the CPU controller.
func (ctl *cpuctl) PostStopHook(_ cache.Container) error {
	return nil
}

// enforceCpufreq enforces a class-specific cpufreq configuration to a cpuset
func (ctl *cpuctl) enforceCpufreq(class string, cpus ...int) error {
	if _, ok := ctl.config.Classes[class]; !ok {
		return fmt.Errorf("non-existent cpu class %q", class)
	}

	min := int(ctl.config.Classes[class].MinFreq)
	max := int(ctl.config.Classes[class].MaxFreq)
	log.Debug("enforcing cpu frequency limits {%d, %d} from class %q on %v", min, max, class, cpus)

	if err := utils.SetCPUsScalingMinFreq(cpus, min); err != nil {
		return fmt.Errorf("Cannot set min freq %d: %w", min, err)
	}
	if err := utils.SetCPUsScalingMaxFreq(cpus, max); err != nil {
		return fmt.Errorf("Cannot set max freq %d: %w", max, err)
	}

	return nil
}

// enforceUncore enforces uncore frequency limits
func (ctl *cpuctl) enforceUncore(assignments cpuClassAssignments, affectedCPUs ...int) error {
	if !ctl.config.uncoreEnabled {
		return nil
	}

	cpus := cpuset.New(affectedCPUs...)
for _, cpuPkgID := range ctl.system.PackageIDs() { cpuPkg := ctl.system.Package(cpuPkgID) for _, cpuDieID := range cpuPkg.DieIDs() { dieCPUs := cpuPkg.DieCPUSet(cpuDieID) // Check if this die is affected by the specified cpuset if cpus.Size() == 0 || dieCPUs.Intersection(cpus).Size() > 0 { min, max, minCls, maxCls := effectiveUncoreFreqs(utils.NewIDSet(dieCPUs.List()...), ctl.config.Classes, assignments) if min == 0 && max == 0 { log.Debug("no uncore frequency limits for cpu package/die %d/%d", cpuPkgID, cpuDieID) continue } log.Debug("enforcing uncore min freq to %d (class %q), max freq to %d (class %q) on cpu package/die %d/%d", min, minCls, max, maxCls, cpuPkgID, cpuDieID) if min > 0 { if max > 0 && min > max { log.Warn("uncore frequency limit min > max (%d > %d) on cpu package/die %d/%d", min, max, cpuPkgID, cpuDieID) } if err := utils.SetUncoreMinFreq(cpuPkgID, cpuDieID, int(min)); err != nil { return err } } if max > 0 { if err := utils.SetUncoreMaxFreq(cpuPkgID, cpuDieID, int(max)); err != nil { return err } } } } } return nil } // effectiveUncoreClasses resolves the effective classes for setting the uncore // frequency limits for a cpu package/die. It has "performance preference" so // that the highest value (for both min and max) of the cpu classes effective // on the die is selected. func effectiveUncoreFreqs(cpus utils.IDSet, classes map[string]Class, assignments cpuClassAssignments) (minFreq, maxFreq uint, minCls, maxCls string) { for className, assignedCPUs := range assignments { // Check if this class is "effective" on the specified cpuset if idSetIntersects(cpus, assignedCPUs) { class := classes[className] if class.UncoreMinFreq > minFreq { minCls = className minFreq = class.UncoreMinFreq } if class.UncoreMaxFreq > maxFreq { maxCls = className maxFreq = class.UncoreMaxFreq } } } return minFreq, maxFreq, minCls, maxCls } func idSetIntersects(a, b utils.IDSet) bool { // Try to optimize the search for unbalanced idsets if len(a) < len(b) { for id := range a { if _, ok := b[id]; ok { return true } } } else { for id := range b { if _, ok := a[id]; ok { return true } } } return false } func (ctl *cpuctl) configure() error { // Re-configure CPUs that are assigned to some known class assignments := *getClassAssignments(ctl.cache) // DEBUG: dump the class assignments we have stored in the cache log.Debug("applying cpu controller configuration:\n%s", utils.DumpJSON(ctl.config)) // Sanity check uncoreAvailable := utils.UncoreFreqAvailable() for name, conf := range ctl.config.Classes { if conf.UncoreMinFreq != 0 || conf.UncoreMaxFreq != 0 { if !uncoreAvailable { return fmt.Errorf("uncore limits set in cpu class %q but uncore driver not available in the system, make sure that the intel_uncore_frequency driver is loaded", name) } ctl.config.uncoreEnabled = true break } } // Configure the system for class, cpus := range assignments { if _, ok := ctl.config.Classes[class]; ok { // Re-configure cpus (sysfs) according to new class parameters if err := ctl.enforceCpufreq(class, cpus.SortedMembers()...); err != nil { log.Error("cpufreq enforcement on re-configure failed: %v", err) } } else { // TODO: what should we really do with classes that do not exist in // the configuration anymore? Now we remember the CPUs assigned to // them. A further config update might re-introduce the class in // which case the CPUs will be reconfigured. 
log.Warn("class %q with cpus %v missing from the configuration", class, cpus) } } if err := ctl.enforceUncore(assignments); err != nil { log.Error("uncore frequency enforcement on re-configure failed: %v", err) } log.Debug("cpu controller configured") return nil } // Callback for runtime configuration notifications. func (ctl *cpuctl) configNotify(_ pkgcfg.Event, _ pkgcfg.Source) error { if !ctl.started { // We don't want to configure until the controller has been fully // started and initialized. We will configure on Start(), anyway. return nil } log.Info("configuration update, applying new config") return ctl.configure() } func (ctl *cpuctl) defaultOptions() interface{} { return &config{} } func (c *config) getClasses() map[string]Class { ret := make(map[string]Class, len(c.Classes)) for k, v := range c.Classes { ret[k] = v } return ret } // Register us as a controller. func init() { control.Register(CPUController, "CPU controller", getCPUController()) pkgcfg.Register(ConfigModuleName, "CPU control", getCPUController().config, getCPUController().defaultOptions) } ================================================ FILE: pkg/cri/resource-manager/control/cri/cri.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cri import ( "fmt" "github.com/intel/cri-resource-manager/pkg/cri/client" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/control" criv1 "k8s.io/cri-api/pkg/apis/runtime/v1" logger "github.com/intel/cri-resource-manager/pkg/log" ) const ( // CRIController is the name of this controller. CRIController = cache.CRI ) // crictl encapsulated the runtime state of our CRI enforcement/controller. type crictl struct { cache cache.Cache client client.Client } // Our logger instance. var log logger.Logger = logger.NewLogger(CRIController) // Our CRI controller singleton instance. var singleton *crictl // getCRIController returns our singleton CRI controller instance. func getCRIController() control.Controller { if singleton == nil { singleton = &crictl{} } return singleton } // Start initializes the controller for enforcing decisions. func (ctl *crictl) Start(cache cache.Cache, client client.Client) error { ctl.cache = cache ctl.client = client return nil } // Stop shuts down the controller. func (ctl *crictl) Stop() { } // PreCreateHook is the CRI controller pre-create hook. 
func (ctl *crictl) PreCreateHook(c cache.Container) error {
	if !c.HasPending(CRIController) {
		log.Debug("pre-create hook: no pending changes for %s", c.PrettyName())
		return nil
	}

	log.Debug("pre-create hook: updating %s", c.PrettyName())

	request, ok := c.GetCRIRequest()
	if !ok {
		return criError("pre-create hook: no pending CRI request")
	}
	create, ok := request.(*criv1.CreateContainerRequest)
	if !ok {
		return criError("pre-create hook: pending CRI request of wrong type (%T)", request)
	}

	create.Config.Command = c.GetCommand()
	create.Config.Args = c.GetArgs()
	create.Config.Labels = c.GetLabels()
	create.Config.Annotations = c.GetAnnotations()
	create.Config.Envs = c.GetCRIEnvs()
	create.Config.Mounts = c.GetCRIMounts()
	create.Config.Devices = c.GetCRIDevices()
	if create.Config.Linux == nil {
		create.Config.Linux = &criv1.LinuxContainerConfig{}
	}
	create.Config.Linux.Resources = c.GetLinuxResources()

	c.ClearPending(CRIController)
	return nil
}

// PreStartHook is the CRI controller pre-start hook.
func (ctl *crictl) PreStartHook(_ cache.Container) error {
	return nil
}

// PostStartHook is the CRI controller post-start hook.
func (ctl *crictl) PostStartHook(_ cache.Container) error {
	return nil
}

// PostUpdateHook is the CRI controller post-update hook.
func (ctl *crictl) PostUpdateHook(c cache.Container) error {
	var update *criv1.UpdateContainerResourcesRequest

	if !c.HasPending(CRIController) {
		log.Debug("post-update hook: no changes for %s", c.PrettyName())
		return nil
	}

	log.Debug("post-update hook: updating %s", c.PrettyName())

	resources := c.GetLinuxResources()
	if resources == nil {
		return nil
	}

	request, ok := c.GetCRIRequest()
	if !ok {
		update = &criv1.UpdateContainerResourcesRequest{
			ContainerId: c.GetID(),
		}
		c.SetCRIRequest(update)
	} else {
		if update, ok = request.(*criv1.UpdateContainerResourcesRequest); !ok {
			return criError("post-update hook: CRI request of wrong type (%T)", request)
		}
	}
	update.Linux = resources

	c.ClearPending(CRIController)
	return nil
}

// PostStopHook is the CRI controller post-stop hook.
func (ctl *crictl) PostStopHook(_ cache.Container) error {
	return nil
}

// criError creates a CRI-controller-specific formatted error message.
func criError(format string, args ...interface{}) error {
	return fmt.Errorf("cri: "+format, args...)
}

// Register us as a controller.
func init() {
	control.Register(CRIController, "CRI controller", getCRIController())
}

================================================
FILE: pkg/cri/resource-manager/control/flags.go
================================================
// Copyright 2019 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package control

import (
	"encoding/json"
	"fmt"
	"strings"

	"github.com/intel/cri-resource-manager/pkg/config"
)

// options captures our runtime configuration.
type options struct {
	Controllers map[string]mode
}

// Our runtime configuration.
var opt = defaultOptions().(*options)
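// For illustration, a configuration fragment selecting per-controller modes
// might look like the following. This is a hedged sketch: the controller
// names are examples and the exact YAML nesting for the
// "resource-manager.control" module path is an assumption, but the mode
// strings are the ones accepted by UnmarshalJSON below:
//
//	resource-manager:
//	  control:
//	    Controllers:
//	      blockio: optional
//	      rdt: required
//	      page-migrate: disabled

// mode describes how errors for the controller should be treated.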
type mode int

const (
	// Disabled controllers are stopped, hooks are not run.
	Disabled mode = iota
	// Required controllers must start, hooks must succeed.
	Required
	// Optional controllers are Disabled if they can't start, otherwise they are Required.
	Optional
	// Relaxed controllers are Disabled if they can't start, hook failures are not errors.
	Relaxed
	// Default mode is Relaxed.
	Default = Relaxed
)

// ControllerMode returns the current mode for the given controller.
func (o *options) ControllerMode(name string) mode {
	if m, ok := o.Controllers[name]; ok {
		return m
	}
	return Default
}

// configNotify is our configuration update notification callback.
func (o *options) configNotify(_ config.Event, _ config.Source) error {
	log.Info("configuration updated")
	for name, controller := range controllers {
		controller.mode = o.ControllerMode(name)
	}
	return nil
}

// String returns the string representation of a mode.
func (m mode) String() string {
	switch m {
	case Disabled:
		return "disabled"
	case Required:
		return "required"
	case Optional:
		return "optional"
	case Relaxed:
		return "relaxed"
	default:
		return fmt.Sprintf("<invalid mode %d>", m)
	}
}

// MarshalJSON is the JSON marshaller for mode.
func (m mode) MarshalJSON() ([]byte, error) {
	return json.Marshal(m.String())
}

// UnmarshalJSON is the JSON unmarshaller for mode.
func (m *mode) UnmarshalJSON(raw []byte) error {
	var str string
	if err := json.Unmarshal(raw, &str); err != nil {
		return controlError("failed to unmarshal mode: %v", err)
	}
	switch strings.ToLower(str) {
	case "disabled", "disable":
		*m = Disabled
	case "required", "mandatory":
		*m = Required
	case "optional":
		*m = Optional
	case "relaxed":
		*m = Relaxed
	default:
		return controlError("invalid mode %s", str)
	}
	return nil
}

// defaultOptions returns a new options instance, all initialized to defaults.
func defaultOptions() interface{} {
	return &options{Controllers: make(map[string]mode)}
}

// Register us for configuration handling.
func init() {
	config.Register("resource-manager.control", "Resource control.", opt, defaultOptions, config.WithNotify(opt.configNotify))
}

================================================
FILE: pkg/cri/resource-manager/control/memory/memory.go
================================================
// Copyright 2019-2020 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package memory

import (
	"fmt"
	"os"
	"strconv"

	"github.com/intel/cri-resource-manager/pkg/cgroups"
	"github.com/intel/cri-resource-manager/pkg/cri/client"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/control"
	logger "github.com/intel/cri-resource-manager/pkg/log"
)

const (
	// MemoryController is the name of the memory controller.
	MemoryController = cache.Memory
	// memoryCgroupPath is the path to the root of the memory cgroup.
	memoryCgroupPath = "/sys/fs/cgroup/memory"
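	// For a container whose cgroup dir resolves to, e.g. (hypothetical path),
	// kubepods/burstable/pod<uid>/<cid>, setToptierLimit() below ends up
	// writing the limit in bytes to
	//   /sys/fs/cgroup/memory/kubepods/burstable/pod<uid>/<cid>/memory.toptier_soft_limit_in_bytes
	// This is a sketch of the effective path, assuming a cgroup v1 layout and
	// a patched kernel that provides this control file.

	// toptierSoftLimitControl is the memory cgroup entry to set top tier soft limit.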
toptierSoftLimitControl = "memory.toptier_soft_limit_in_bytes" ) // memctl encapsulates the runtime state of our memory enforcement/controller. type memctl struct { cache cache.Cache // resource manager cache disabled bool // true, if kernel lacks the necessary cgroup controls } // Our logger instance. var log logger.Logger = logger.NewLogger(MemoryController) // Our singleton memory controller instance. var singleton *memctl // getMemoryController returns our singleton memory controller instance. func getMemoryController() *memctl { if singleton == nil { singleton = &memctl{} } return singleton } // Start initializes the controller for enforcing decisions. func (ctl *memctl) Start(cache cache.Cache, _ client.Client) error { // Let's keep this off for now so we can exercise this without a patched kernel... if !ctl.checkToptierLimitSupport() { return memctlError("cgroup top tier memory limit control not available") } ctl.cache = cache return nil } // Stop shuts down the controller. func (ctl *memctl) Stop() { } // PreCreateHook is the memory controller pre-create hook. func (ctl *memctl) PreCreateHook(_ cache.Container) error { return nil } // PreStartHook is the memory controller pre-start hook. func (ctl *memctl) PreStartHook(_ cache.Container) error { return nil } // PostStartHook is the memory controller post-start hook. func (ctl *memctl) PostStartHook(c cache.Container) error { if !c.HasPending(MemoryController) { return nil } if err := ctl.setToptierLimit(c); err != nil { return err } c.ClearPending(MemoryController) return nil } // PostUpdateHook is the memory controller post-update hook. func (ctl *memctl) PostUpdateHook(c cache.Container) error { if !c.HasPending(MemoryController) { return nil } if err := ctl.setToptierLimit(c); err != nil { return err } c.ClearPending(MemoryController) return nil } // PostStop is the memory controller post-stop hook. func (ctl *memctl) PostStopHook(_ cache.Container) error { return nil } // Check if memory cgroup controller supports top tier soft limits. func (ctl *memctl) checkToptierLimitSupport() bool { _, err := os.Stat(memoryCgroupPath + "/" + toptierSoftLimitControl) if err != nil && os.IsNotExist(err) { log.Warn("cgroup top tier memory limit control not available") ctl.disabled = true } return !ctl.disabled } // setToptierLimit sets the top tier memory (soft) limit for the container. func (ctl *memctl) setToptierLimit(c cache.Container) error { dir := c.GetCgroupDir() if dir == "" { return memctlError("%q: failed to determine cgroup directory", c.PrettyName()) } limit := strconv.FormatInt(c.GetToptierLimit(), 10) group := cgroups.Memory.Group(dir) entry := toptierSoftLimitControl if err := group.Write(entry, "%s\n", limit); err != nil { return err } log.Info("%q: memory toptier soft limit set to %v", c.PrettyName(), limit) return nil } // memctlError creates a memory I/O-controller-specific formatted error message. func memctlError(format string, args ...interface{}) error { return fmt.Errorf("memory: "+format, args...) } // init registers this controller. func init() { control.Register(MemoryController, "memory toptier controller", getMemoryController()) } ================================================ FILE: pkg/cri/resource-manager/control/page-migrate/demoter.go ================================================ // Copyright 2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package pagemigrate

import (
	"encoding/binary"
	"fmt"
	"io"
	"math/rand"
	"os"
	"strconv"
	"strings"
	"time"

	"github.com/intel/cri-resource-manager/pkg/cgroups"
	"github.com/intel/cri-resource-manager/pkg/config"
	idset "github.com/intel/goresctrl/pkg/utils"
)

// Support dynamic pushing of unused pages from DRAM to PMEM.
//
// The algorithm is (roughly) this:
//
// Find out which processes belong to the container. For every process in the
// container, find out which pages the process uses. Using move_pages(), push a
// number of pages not in the working set, which are present in DRAM, from DRAM
// to PMEM. This may need to be done many times with a delay in between,
// because the process will be "stuck" when the pages are moved. Repeat this
// process.
//
// How to figure out which pages are not part of the working set:
//
// 1. Clear soft-dirty bits on the PTEs:
//    https://www.kernel.org/doc/html/latest/admin-guide/mm/soft-dirty.html
// 2. Wait for a while.
// 3. Read out the process page maps:
//    https://www.kernel.org/doc/html/latest/admin-guide/mm/pagemap.html
//    The pages which don't have the soft-dirty bit are considered to be
//    outside of the working set.

type page struct {
	pid  int
	addr uint64
}

type addrRange struct {
	addr   uint64
	length uint64
}

type demoter struct {
	migration *migration // controller backpointer

	// Finding pages
	dirtyBitReset time.Ticker      // Ticker for resetting the dirty bits.
	dirtyBitStop  chan interface{} // Channel for stopping the ticker.

	// Moving pages
	pageMover         PageMover
	containerDemoters map[string]chan interface{} // Channels for sending pagemap updates to demoters.
	pageScanInterval  config.Duration             // How often should we scan pages.
	pageMoveInterval  config.Duration             // How often should we move pages for a container.
	maxPageMoveCount  uint                        // How many pages to move at once.
}

type pagePool struct {
	pages        map[int][]page
	longestRange uint
}

type demotion struct {
	pagePool    pagePool
	targetNodes idset.IDSet
}

func copyPagePool(p pagePool) pagePool {
	c := pagePool{
		longestRange: p.longestRange,
		pages:        make(map[int][]page, 0),
	}
	for pid, pages := range p.pages {
		c.pages[pid] = make([]page, len(pages))
		copy(c.pages[pid], pages)
	}
	return c
}

func newDemoter(m *migration) *demoter {
	return &demoter{
		migration:         m,
		containerDemoters: make(map[string]chan interface{}, 0),
		pageMover:         &linuxPageMover{},
	}
}

func (d *demoter) start() {
	if d.pageScanInterval > 0 && d.pageMoveInterval > 0 && d.maxPageMoveCount > 0 {
		log.Info("scanning pages every %s, moving max. %d pages every %s",
			d.pageScanInterval.String(), d.maxPageMoveCount, d.pageMoveInterval.String())
		d.startDirtyBitResetTimer()
	} else {
		log.Info("scanning pages is disabled")
	}
}

// Stop stops page scanning and demotion.
func (d *demoter) Stop() {
	d.stopDirtyBitResetTimer()
	d.migration.Lock()
	defer d.migration.Unlock()
	d.stopDemoters()
}
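// For reference, the soft-dirty cycle the code below implements can be
// reproduced manually from a shell (a sketch; $pid is any traced process):
//
//	echo 4 > /proc/$pid/clear_refs   # clear soft-dirty bits on all PTEs
//	sleep 60                         # let the working set get re-dirtied
//	# then read /proc/$pid/pagemap: entries with bit 55 unset were not
//	# written to during the interval and are demotion candidates
//	# (bit 63 = present, bit 56 = exclusively mapped).

// Reconfigure restarts, if necessary, page scanning and demotion with new options.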
func (d *demoter) Reconfigure() {
	if d.pageScanInterval != opt.PageScanInterval ||
		d.pageMoveInterval != opt.PageMoveInterval ||
		d.maxPageMoveCount != opt.MaxPageMoveCount {
		d.Stop()
		d.pageScanInterval = opt.PageScanInterval
		d.pageMoveInterval = opt.PageMoveInterval
		d.maxPageMoveCount = opt.MaxPageMoveCount
	}
	d.start()
}

func (d *demoter) updateDemoter(cid string, p pagePool, targetNodes idset.IDSet) {
	channel, found := d.containerDemoters[cid]
	if !found {
		channel := make(chan interface{})
		go func() {
			moveTimer := time.NewTicker(time.Duration(d.pageMoveInterval))
			moveTimerChan := moveTimer.C
			pagePool := p
			nodes := targetNodes
			count := d.maxPageMoveCount
			for {
				select {
				case msg := <-channel:
					demotion, ok := msg.(demotion)
					if ok {
						pagePool = demotion.pagePool
						nodes = demotion.targetNodes
						if p.longestRange > d.maxPageMoveCount {
							// The number of pages moved needs to be at least as large as a range in the
							// numa_maps file so that we know that all pages will be moved (even if some
							// of them were already on the PMEM node).
							// TODO: adjust the timer if we have a larger-than-usual range of pages to move.
							count = p.longestRange
						} else {
							count = d.maxPageMoveCount
						}
					} else {
						// A stop request.
						if moveTimer != nil {
							moveTimer.Stop()
						}
						return
					}
				case <-moveTimerChan:
					err := d.movePages(pagePool, count, nodes)
					if err != nil {
						log.Error("Error demoting pages: %s", err)
					}
				}
			}
		}()
		d.containerDemoters[cid] = channel
		// TODO: trigger instant update when run the first time?
	} else {
		channel <- demotion{pagePool: p, targetNodes: targetNodes}
	}
}

func (d *demoter) stopDemoter(cid string) {
	channel, found := d.containerDemoters[cid]
	if found {
		channel <- "stop"
		delete(d.containerDemoters, cid)
	}
}

func (d *demoter) stopUnusedDemoters(cs map[string]*container) {
	for id := range d.containerDemoters {
		if _, found := cs[id]; !found {
			d.stopDemoter(id)
		}
	}
}

func (d *demoter) stopDemoters() {
	for cid, channel := range d.containerDemoters {
		channel <- "stop"
		delete(d.containerDemoters, cid)
	}
}

func (d *demoter) stopDirtyBitResetTimer() {
	if d.dirtyBitStop != nil {
		close(d.dirtyBitStop)
		d.dirtyBitStop = nil
	}
}

func (d *demoter) startDirtyBitResetTimer() {
	if d.dirtyBitStop != nil {
		return
	}
	stop := make(chan interface{})
	go func() {
		dirtyBitResetTimer := time.NewTicker(time.Duration(d.pageScanInterval))
		dirtyBitResetChan := dirtyBitResetTimer.C
		for {
			select {
			case <-stop:
				if dirtyBitResetTimer != nil {
					dirtyBitResetTimer.Stop()
				}
				return
			case <-dirtyBitResetChan:
				d.scanPages()
			}
		}
	}()
	d.dirtyBitStop = stop
}

func resetDirtyBit(pid string) error {
	// Write magic value "4" to the clear_refs file. This resets the soft-dirty bits.
	path := "/proc/" + pid + "/clear_refs"
	err := os.WriteFile(path, []byte("4"), 0600)
	return err
}

// resetDirtyBit unsets soft-dirty bits for all processes in a container.
func (d *demoter) resetDirtyBit(c *container) error {
	group := cgroups.Memory.Group(c.cgroupDir)
	pids, err := group.GetProcesses()
	if err != nil {
		return err
	}
	for _, pid := range pids {
		err = resetDirtyBit(pid)
		if err != nil {
			log.Error("%s: failed to reset dirty bit for process %s: %v", c.prettyName, pid, err)
			return err
		}
	}
	return nil
}
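// The per-container demoter goroutine created in updateDemoter() above
// speaks a small ad-hoc protocol over its channel: a demotion{} value
// replaces the page pool and target nodes, while any other value (the
// literal "stop" string is what the code sends) terminates the goroutine.
// A hedged usage sketch:
//
//	d.updateDemoter(cid, pool, pmemNodes) // create or update the demoter
//	d.stopDemoter(cid)                    // send "stop", drop the channel

// scanPages scans pages of tracked containers to detect idle ones.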
func (d *demoter) scanPages() { d.migration.Lock() defer d.migration.Unlock() for _, container := range d.migration.containers { pm := container.GetPageMigration() if pm == nil { continue } dramNodes := pm.SourceNodes pmemNodes := pm.TargetNodes if dramNodes.Size() == 0 || pmemNodes.Size() == 0 { continue } // Gather the known pages which need to be moved. pagePool, err := d.getPagesForContainer(container, dramNodes) if err != nil { log.Error("failed to get pages for container %v", container.prettyName) continue } count := 0 for _, pages := range pagePool.pages { count += len(pages) } log.Debug("%d pages for (maybe) demoting for %v", count, container.prettyName) // Reset the dirty bit from all pages. d.resetDirtyBit(container) // Give the pages to the page moving goroutine. Copy the page pool so that there's no race. d.updateDemoter(container.GetCacheID(), copyPagePool(pagePool), pmemNodes.Clone()) } d.stopUnusedDemoters(d.migration.containers) } func (d *demoter) getPagesForContainer(c *container, sourceNodes idset.IDSet) (pagePool, error) { pool := pagePool{ pages: make(map[int][]page, 0), longestRange: 0, } group := cgroups.Memory.Group(c.cgroupDir) pids, err := group.GetProcesses() if err != nil { return pagePool{}, err } for _, pid := range pids { addressRanges := make([]addrRange, 0) pidNumber64, err := strconv.ParseInt(pid, 10, 32) if err != nil { log.Error("Failed to parse addr to int: %v", err) continue } pidNumber := int(pidNumber64) // Read /proc/pid/numa_maps and /proc/pid/maps numaMapsPath := "/proc/" + pid + "/numa_maps" numaMapsBytes, err := os.ReadFile(numaMapsPath) if err != nil { log.Error("Could not read numa_maps: %v", err) continue } mapsPath := "/proc/" + pid + "/maps" mapsBytes, err := os.ReadFile(mapsPath) if err != nil { log.Error("Could not read maps: %v\n", err) continue } mapsLines := strings.Split(string(mapsBytes), "\n") for _, line := range strings.Split(string(numaMapsBytes), "\n") { tokens := strings.Split(line, " ") if len(tokens) < 3 { continue } attrs := strings.Join(tokens[2:], " ") // Filter out lines which don't have "anonymous", since we are not // interested in file-mapped or shared pages. Save the interesting ranges. // TODO: consider dropping the "heap" requirement. There are often ranges // in the file which don't have any attributes indicating the memory // location. if !strings.Contains(attrs, "heap") || !strings.Contains(attrs, "anon=") { continue } // We only find out if *any* pages in the range are in a DRAM node. The // more fine-grained analysis is done later by running the move_pages() // system call twice. 
locatedOnDRAMNode := false for node := range sourceNodes { number := strconv.FormatInt(int64(node), 10) str := "N" + number + "=" if strings.Contains(attrs, str) { locatedOnDRAMNode = true break } } if !locatedOnDRAMNode { continue } for _, mapLine := range mapsLines { if strings.HasPrefix(mapLine, tokens[0]+"-") { spaceIndex := strings.Index(mapLine, " ") if spaceIndex > len(tokens[0]+"-") { endAddrStr := mapLine[len(tokens[0]+"-"):spaceIndex] startAddr, err := strconv.ParseInt(tokens[0], 16, 64) if err != nil { log.Error("Failed to parse addr to int: %v\n", err) break } endAddr, err := strconv.ParseInt(endAddrStr, 16, 64) if err != nil { log.Error("Failed to parse addr to int: %v\n", err) break } rangeLength := endAddr - startAddr addressRanges = append(addressRanges, addrRange{uint64(startAddr), uint64(rangeLength / int64(os.Getpagesize()))}) // log.Debug("found interesting page range for pid %s: %v", pid, addressRanges[len(addressRanges)-1]) break } } } } // Read /proc/pid/pagemap and process only interesting page ranges. For // every read-only page and for every page with the soft-dirty bit on, mark // them as candidates to be moved by adding them to pagePool. if len(addressRanges) > 0 { // log.Debug("Getting pages for PID %s for ranges %v", pid, addressRanges) pages := make([]page, 0) path := "/proc/" + pid + "/pagemap" pageMap, err := os.OpenFile(path, os.O_RDONLY, 0) if err != nil { // Probably the process just died? fmt.Printf("Could not read pagemaps: %v\n", err) break } for _, addressRange := range addressRanges { idx := int64(addressRange.addr / uint64(os.Getpagesize()) * 8) offset, err := pageMap.Seek(idx, io.SeekStart) if err != nil { // Maybe there was a race condition and the maps changed? log.Error("Failed to seek: %v\n", err) continue } for i := uint64(0); i < addressRange.length; i++ { bytes := make([]byte, 8) // Read exactly 8 bytes (because the file interface breaks otherwise). _, err = io.ReadAtLeast(pageMap, bytes, 8) if err != nil { // Possibly the maps changed. log.Error("Could not read data from pagemaps(%v)(page size: %d, seek offset: %d): %v\n", idx, os.Getpagesize(), offset, err) break } data := binary.LittleEndian.Uint64(bytes) // Check that the page is present (not swapped), exclusively // mapped (not used by any other process), and it has the // soft-dirty bit off. // Note: there appears to be no way to see from the pagemap entry what the NUMA node is. // We could map this back to the physical address ranges if needed. Currently this is handled // in movePages() by calling move_pages() first with an empty node array. softDirtyBit := uint64(0x1) << 55 exclusiveBit := uint64(0x1) << 56 presentBit := uint64(0x1) << 63 present := (data&presentBit == presentBit) exclusive := (data&exclusiveBit == exclusiveBit) softDirty := (data&softDirtyBit == softDirtyBit) if present && exclusive && !softDirty { // log.Debug("page a candidate for moving: 0x%08x", addressRange.addr+i*uint64(os.Getpagesize())) pages = append(pages, page{addr: addressRange.addr + i*uint64(os.Getpagesize()), pid: pidNumber}) } } } if _, found := pool.pages[pidNumber]; found { pool.pages[pidNumber] = append(pool.pages[pidNumber], pages...) } else { pool.pages[pidNumber] = pages } if uint(len(addressRanges)) > pool.longestRange { pool.longestRange = uint(len(addressRanges)) } } } return pool, nil } func pickClosestPMEMNode(targetNodes idset.IDSet) idset.ID { // TODO: analyze the topology information (and possibly the amount of free memory) and choose the "best" // PMEM node to demote the page to. 
The array targetNodes already contains only the subset of PMEM nodes
	// available in this topology subtree. Right now just pick a random node.
	nodes := targetNodes.Members()
	return nodes[rand.Intn(len(nodes))]
}

func (d *demoter) movePagesForPid(p []page, count uint, pid int, targetNodes idset.IDSet) (uint, error) {
	// We move at max count pages, but there might not be that much.
	nPages := count
	if uint(len(p)) < count {
		nPages = uint(len(p))
	}

	// Gather memory page pointers.
	pages := make([]uintptr, nPages)
	var i uint
	for i = 0; i < nPages; i++ {
		pages[i] = uintptr(p[i].addr)
	}

	// MPOL_MF_MOVE - only move pages exclusive to this process. There will be
	// permission denied errors for pages which couldn't be moved. FIXME: find
	// out if the whole move_pages() syscall failed or if just the non-exclusive
	// pages were not moved.
	flags := 1 << 1

	// Call move_pages() first with nil nodes array to find out the current nodes.
	_, currentStatus, err := d.pageMover.MovePagesSyscall(pid, nPages, pages, nil, flags)
	if err != nil {
		log.Error("Failed to find out the current status of the pages: %v.", err)
		return 0, err
	}

	dramPages := make([]uintptr, 0)
	nodes := make([]int, 0)

	// Choose a target node for every page. Drop the pages which already are on the right node from the list.
	for i, pageStatus := range currentStatus {
		if pageStatus < 0 {
			// There was an error regarding this page.
			continue
		}
		// log.Debug("page 0x%08X: old status %d", pages[i], pageStatus)
		if !targetNodes.Has(idset.ID(pageStatus)) {
			// In case of many PMEM nodes choose the one that is the closest.
			dramPages = append(dramPages, pages[i])
			nodes = append(nodes, int(pickClosestPMEMNode(targetNodes)))
		}
		// else no need to move.
	}

	// Call move_pages() to actually move the pages.
	_, _, err = d.pageMover.MovePagesSyscall(pid, uint(len(dramPages)), dramPages, nodes, flags)

	// We processed (moved or ignored) at least nPages.
	return nPages, err
}

func (d *demoter) movePages(p pagePool, count uint, targetNodes idset.IDSet) error {
	// Select pid for moving the pages so that the process with the largest number
	// of non-dirty pages gets the pages moved first.
	processedPids := make(map[int]bool, 0)
	for count > 0 {
		mostPagesPid := 0
		nPagesForPid := uint(0)
		for pid, pages := range p.pages {
			_, alreadyProcessed := processedPids[pid]
			if alreadyProcessed {
				continue
			}
			if uint(len(pages)) > nPagesForPid {
				mostPagesPid = pid
				nPagesForPid = uint(len(pages))
			}
		}
		if nPagesForPid == 0 {
			return nil
		}
		processedPids[mostPagesPid] = true
		nMovePages := nPagesForPid
		if count < nPagesForPid {
			nMovePages = count
			count = 0
		} else {
			count -= nPagesForPid
		}
		log.Debug("moving %d pages for pid %d", nMovePages, mostPagesPid)
		nPages, err := d.movePagesForPid(p.pages[mostPagesPid], nMovePages, mostPagesPid, targetNodes)
		if err != nil {
			log.Error("Failed to move pages: %v", err)
			return err
		}
		// Remove processed pages from the pagemap.
		p.pages[mostPagesPid] = p.pages[mostPagesPid][nPages:]
	}
	return nil
}
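// In essence, movePagesForPid() above drives move_pages(2) twice; the flow
// can be sketched as (illustrative pseudo-Go, error handling omitted):
//
//	// 1st call: nil nodes => only query the current node of each page
//	_, status, _ := mover.MovePagesSyscall(pid, n, pages, nil, flags)
//	// 2nd call: pages still on DRAM, each tagged with a chosen PMEM node
//	_, _, _ = mover.MovePagesSyscall(pid, uint(len(toMove)), toMove, nodes, flags)
//
// The first call is how the kernel exposes "where is this page now";
// entries with a negative status are per-page errors and are skipped.

================================================
FILE: pkg/cri/resource-manager/control/page-migrate/demoter_test.go
================================================
// Copyright 2020 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.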
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package pagemigrate import ( "fmt" idset "github.com/intel/goresctrl/pkg/utils" "testing" ) type mockPageMover struct { firstSuccess bool secondSuccess bool expectedPagesForSecondCall uint firstStatus []int } func (m *mockPageMover) MovePagesSyscall(pid int, count uint, pages []uintptr, nodes []int, flags int) (uint, []int, error) { status := make([]int, len(pages)) fmt.Printf("move_pages(): pid %d, count %d, pages %v, nodes %v, flags %d\n", pid, count, pages, nodes, flags) if nodes == nil { // First call is made without nodes if m.firstSuccess == false { return 0, m.firstStatus, fmt.Errorf("Fake error") } return 0, m.firstStatus, nil } // Second call if m.secondSuccess == false { return 0, status, fmt.Errorf("Fake error") } if uint(len(pages)) != m.expectedPagesForSecondCall { return 0, status, fmt.Errorf("Real error") } return 0, status, nil } func TestMovePages(t *testing.T) { tcases := []struct { name string pool pagePool targetNodes idset.IDSet pageCount uint expectedRemainingPageCount uint expectedError bool pageMover PageMover pid int }{ { name: "move pages (both)", pool: pagePool{ pages: map[int][]page{ 500: { { pid: 500, addr: 0xdeadbeef, }, { pid: 500, addr: 0xc0ffee, }, }, }, }, pid: 500, pageCount: 2, pageMover: &mockPageMover{ firstSuccess: true, secondSuccess: true, firstStatus: []int{0, 0}, expectedPagesForSecondCall: 2, }, targetNodes: idset.NewIDSet(1, 2), expectedError: false, expectedRemainingPageCount: 0, }, { name: "move pages (only one)", pool: pagePool{ pages: map[int][]page{ 500: { { pid: 500, addr: 0xdeadbeef, }, { pid: 500, addr: 0xc0ffee, }, }, }, }, pid: 500, pageCount: 2, pageMover: &mockPageMover{ firstSuccess: true, secondSuccess: true, firstStatus: []int{0, 2}, expectedPagesForSecondCall: 1, }, targetNodes: idset.NewIDSet(1, 2), expectedError: false, expectedRemainingPageCount: 0, }, { name: "move pages (none)", pool: pagePool{ pages: map[int][]page{ 500: { { pid: 500, addr: 0xdeadbeef, }, { pid: 500, addr: 0xc0ffee, }, }, }, }, pid: 500, pageCount: 2, pageMover: &mockPageMover{ firstSuccess: true, secondSuccess: true, firstStatus: []int{2, 1}, expectedPagesForSecondCall: 0, }, targetNodes: idset.NewIDSet(1, 2), expectedError: false, expectedRemainingPageCount: 0, }, { name: "move pages (count 1)", pool: pagePool{ pages: map[int][]page{ 500: { { pid: 500, addr: 0xdeadbeef, }, { pid: 500, addr: 0xc0ffee, }, }, }, }, pid: 500, pageCount: 1, pageMover: &mockPageMover{ firstSuccess: true, secondSuccess: true, firstStatus: []int{0}, expectedPagesForSecondCall: 1, }, targetNodes: idset.NewIDSet(1, 2), expectedError: false, expectedRemainingPageCount: 1, }, { name: "move pages (first call error)", pool: pagePool{ pages: map[int][]page{ 500: { { pid: 500, addr: 0xdeadbeef, }, { pid: 500, addr: 0xc0ffee, }, }, }, }, pid: 500, pageCount: 2, pageMover: &mockPageMover{ firstSuccess: false, secondSuccess: true, firstStatus: []int{0, 0}, expectedPagesForSecondCall: 0, }, targetNodes: idset.NewIDSet(1, 2), expectedError: true, expectedRemainingPageCount: 2, }, { name: "move pages (second call error)", pool: pagePool{ pages: map[int][]page{ 500: { { pid: 500, addr: 
0xdeadbeef, }, { pid: 500, addr: 0xc0ffee, }, }, }, }, pid: 500, pageCount: 2, pageMover: &mockPageMover{ firstSuccess: true, secondSuccess: false, firstStatus: []int{0, 0}, expectedPagesForSecondCall: 0, }, targetNodes: idset.NewIDSet(1, 2), expectedError: true, expectedRemainingPageCount: 2, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { dynamicDemoter := &demoter{ maxPageMoveCount: tc.pageCount, pageMover: tc.pageMover, } err := dynamicDemoter.movePages(tc.pool, tc.pageCount, tc.targetNodes) if err != nil { if err.Error() != "Fake error" { t.Errorf("Non-fake error: %v", err) } } if (err != nil) != tc.expectedError { t.Errorf("Unexpected error value") } if uint(len(tc.pool.pages[tc.pid])) != tc.expectedRemainingPageCount { t.Errorf("Wrong number of remaining pages: %d", len(tc.pool.pages[tc.pid])) } }) } } ================================================ FILE: pkg/cri/resource-manager/control/page-migrate/flags.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package pagemigrate import ( "github.com/intel/cri-resource-manager/pkg/config" ) // options captures our configurable controller parameters. type options struct { // PageScanInterval controls how much time we give containers to touch non-idle pages. PageScanInterval config.Duration // PageMoveInterval controls how often we trigger moving pages. PageMoveInterval config.Duration // MaxPageMoveCount controls how many pages we can move in a single go. MaxPageMoveCount uint } // Our runtime configuration. var opt = defaultOptions().(*options) // defaultOptions returns a new options instance, all initialized to defaults. func defaultOptions() interface{} { return &options{} } // Register us for configuration handling. func init() { config.Register(PageMigrationConfigPath, PageMigrationDescription, opt, defaultOptions) } ================================================ FILE: pkg/cri/resource-manager/control/page-migrate/page-migrate.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package pagemigrate import ( "fmt" "sync" "github.com/intel/cri-resource-manager/pkg/cri/client" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/control" logger "github.com/intel/cri-resource-manager/pkg/log" ) const ( // PageMigrationController is the name/domain of the page migration controller. PageMigrationController = cache.PageMigration // PageMigrationConfigPath is the configuration path for the page migration controller. PageMigrationConfigPath = "resource-manager.control." + PageMigrationController // PageMigrationDescription is the description for the page migration controller. PageMigrationDescription = "page migration controller" ) // migration implements the controller for memory page migration. type migration struct { cache cache.Cache // resource manager cache sync.Mutex // protect access from multiple goroutines containers map[string]*container // containers we migrate demoter *demoter // demoter adopted from topology-aware policy } // // The resource manager serializes access to the cache during request // processing, event processing, and configuration updates by locking // the resource-manager for each of these. Since controller hooks are // invoked either as part of processing a request or an event, access // to the cache from hooks is properly serialized. // // Page scanning or migration on the other hand happen asynchronously // from dedicated goroutines. In order to avoid having to serialize // access to the cache for these, we track and cache locally just enough // data about containers that we can perform these actions completely on // our own, without the need to access the resource manager cache at all. // // An alternative would have been to duplicate what we had originally in // the policy: // - introduce controller events akin to policy events // - have the resource-manager call controller event handlers with the // lock held // - periodically inject a controller event when we want to scan pages // - perform page scanning or demotion from the event handler with the // resource-manager lock held // // However that would have destroyed one of the goals of splitting page // scanning and migration out to a controller of its own, which was to // perform these potentially time consuming actions without blocking // concurrent processing of requests or events. // // container is the per container data we track locally. type container struct { cacheID string id string prettyName string cgroupDir string pm *cache.PageMigrate } // Our logger instance. var log = logger.NewLogger(PageMigrationController) // Our singleton page migration controller. var singleton *migration // getMigrationController returns our singleton controller instance. func getMigrationController() *migration { if singleton == nil { singleton = &migration{ containers: make(map[string]*container), } singleton.demoter = newDemoter(singleton) } return singleton } // Start prepares the controller for resource control/decision enforcement. func (m *migration) Start(cache cache.Cache, _ client.Client) error { m.cache = cache m.syncWithCache() m.demoter.Reconfigure() return nil } // Stop shuts down the controller. func (m *migration) Stop() { m.demoter.Stop() } // PreCreateHook is the controller's pre-create hook. func (m *migration) PreCreateHook(cache.Container) error { return nil } // PreStartHook is the controller's pre-start hook. 
func (m *migration) PreStartHook(cache.Container) error { return nil } // PostStartHook is the controller's post-start hook. func (m *migration) PostStartHook(cc cache.Container) error { m.Lock() defer m.Unlock() err := m.insertContainer(cc) cc.ClearPending(PageMigrationController) return err } // PostUpdateHook is the controller's post-update hook. func (m *migration) PostUpdateHook(cc cache.Container) error { m.Lock() defer m.Unlock() m.updateContainer(cc) cc.ClearPending(PageMigrationController) return nil } // PostStopHook is the controller's post-stop hook. func (m *migration) PostStopHook(cc cache.Container) error { m.Lock() defer m.Unlock() m.deleteContainer(cc) return nil } // syncWithCache synchronizes tracked containers with the cache. func (m *migration) syncWithCache() { m.Lock() defer m.Unlock() m.containers = make(map[string]*container) for _, cc := range m.cache.GetContainers() { m.insertContainer(cc) } } // insertContainer creates a local copy of the container. func (m *migration) insertContainer(cc cache.Container) error { pm := cc.GetPageMigration() if pm == nil { return nil } c := &container{ cacheID: cc.GetCacheID(), id: cc.GetID(), prettyName: cc.PrettyName(), cgroupDir: cc.GetCgroupDir(), pm: pm.Clone(), } if c.cgroupDir == "" { return migrationError("can't find cgroup dir for container %s", c.prettyName) } m.containers[c.cacheID] = c return nil } // updateContainer updates the local copy of the container. func (m *migration) updateContainer(cc cache.Container) error { pm := cc.GetPageMigration() if pm == nil { delete(m.containers, cc.GetCacheID()) return nil } c, ok := m.containers[cc.GetCacheID()] if !ok { return m.insertContainer(cc) } c.pm = pm.Clone() return nil } // deleteContainer deletes the local copy of the container. func (m *migration) deleteContainer(cc cache.Container) error { delete(m.containers, cc.GetCacheID()) return nil } // GetCacheID replicates the respective cache.Container function. func (c *container) GetCacheID() string { return c.cacheID } // GetID replicates the respective cache.Container function. func (c *container) GetID() string { return c.id } // GetCgroupDir replicates the respective cache.Container function. func (c *container) GetCgroupDir() string { return c.cgroupDir } // GetPageMigration replicates the respective cache.Container function. func (c *container) GetPageMigration() *cache.PageMigrate { return c.pm } // PrettyName replicates the respective cache.Container function. func (c *container) PrettyName() string { return c.prettyName } // init registers this controller. func init() { control.Register(PageMigrationController, "page migration controller", getMigrationController()) } // migrationError creates a controller-specific formatted error message. func migrationError(format string, args ...interface{}) error { return fmt.Errorf("page-migrate: "+format, args...) } ================================================ FILE: pkg/cri/resource-manager/control/page-migrate/page-mover.go ================================================ // Copyright 2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License.
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package pagemigrate import "C" import ( "fmt" "unsafe" "golang.org/x/sys/unix" ) type linuxPageMover struct{} // PageMover abstracts how pages are moved on a given HW/SW platform. type PageMover interface { MovePagesSyscall(pid int, count uint, pages []uintptr, nodes []int, flags int) (uint, []int, error) } func (m *linuxPageMover) MovePagesSyscall(pid int, count uint, pages []uintptr, nodes []int, flags int) (uint, []int, error) { // syscall: // long move_pages(int pid, unsigned long count, void **pages, // const int *nodes, int *status, int flags); var err error if count == 0 { return 0, []int{}, nil } // Go int is 64 bits on a 64-bit system, but C int is only guaranteed to be at least 16 bits, typically 32. cNodes := make([]C.int, len(nodes)) for i := 0; i < len(nodes); i++ { if nodes[i] < 0 || nodes[i] > 32767 { return 0, []int{}, fmt.Errorf("int value error: %d", nodes[i]) } cNodes[i] = C.int(nodes[i]) // safe downcast } cStatus := make([]C.int, len(pages)) nodesPtr := unsafe.Pointer(nil) if nodes != nil { nodesPtr = unsafe.Pointer(&cNodes[0]) } ret, _, en := unix.Syscall6(unix.SYS_MOVE_PAGES, uintptr(pid), uintptr(count), uintptr(unsafe.Pointer(&pages[0])), uintptr(nodesPtr), uintptr(unsafe.Pointer(&cStatus[0])), uintptr(flags)) if en != 0 { err = unix.Errno(en) } // log.Debug("move_pages(): pid %d, count %d, pages %v, nodes %v, flags %d: return value %d, status %d, errno %v", // pid, count, pages, nodes, flags, uint(ret), cStatus, err) status := make([]int, count) for i := uint(0); i < count; i++ { status[i] = int(cStatus[i]) } return uint(ret), status, err } ================================================ FILE: pkg/cri/resource-manager/control/rdt/rdt.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package rdt import ( "fmt" corev1 "k8s.io/api/core/v1" pkgcfg "github.com/intel/cri-resource-manager/pkg/config" "github.com/intel/cri-resource-manager/pkg/cri/client" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/control" logger "github.com/intel/cri-resource-manager/pkg/log" "github.com/intel/cri-resource-manager/pkg/metrics" "github.com/intel/goresctrl/pkg/rdt" ) const ( // ConfigModuleName is the configuration section for RDT ConfigModuleName = "rdt" // RDTController is the name of the RDT controller. RDTController = cache.RDT resctrlGroupPrefix = "cri-resmgr." ) // rdtctl encapsulates the runtime state of our RDT enforcement/controller.
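// ---------------------------------------------------------------------------
// Example (illustrative, Linux-only sketch; not part of this repository):
// move_pages(2), wrapped by MovePagesSyscall above, doubles as a query API.
// With a NULL nodes array the kernel moves nothing and only fills in the
// status array with the node each page currently resides on; pid 0 means
// the calling process.
package main

import (
	"fmt"
	"unsafe"

	"golang.org/x/sys/unix"
)

func main() {
	// Map one anonymous page and fault it in so it has a backing node.
	buf, err := unix.Mmap(-1, 0, 4096, unix.PROT_READ|unix.PROT_WRITE,
		unix.MAP_PRIVATE|unix.MAP_ANONYMOUS)
	if err != nil {
		panic(err)
	}
	defer unix.Munmap(buf)
	buf[0] = 1

	pages := []uintptr{uintptr(unsafe.Pointer(&buf[0]))}
	status := make([]int32, len(pages)) // status array of C ints

	_, _, errno := unix.Syscall6(unix.SYS_MOVE_PAGES,
		0, uintptr(len(pages)),
		uintptr(unsafe.Pointer(&pages[0])),
		0, // NULL nodes array: query-only mode, move nothing
		uintptr(unsafe.Pointer(&status[0])),
		0)
	if errno != 0 {
		fmt.Println("move_pages failed:", errno)
		return
	}
	fmt.Println("page resides on NUMA node", status[0])
}
// ---------------------------------------------------------------------------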
type rdtctl struct { cache cache.Cache // resource manager cache noQoSClasses bool // true if mapping pod qos class to rdt class is disabled mode OperatingMode // track the mode here to capture mode changes opt *config } type config struct { rdt.Config Options struct { rdt.Options Mode OperatingMode `json:"mode"` MonitoringDisabled bool `json:"monitoringDisabled"` } `json:"options"` } type OperatingMode string const ( OperatingModeDisabled OperatingMode = "Disabled" OperatingModeDiscovery OperatingMode = "Discovery" OperatingModeFull OperatingMode = "Full" ) // Our logger instance. var log logger.Logger = logger.NewLogger(RDTController) // our RDT controller singleton instance. var singleton *rdtctl // getRDTController returns our singleton RDT controller instance. func getRDTController() *rdtctl { if singleton == nil { singleton = &rdtctl{} singleton.opt = singleton.defaultOptions().(*config) } return singleton } // Start initializes the controller for enforcing decisions. func (ctl *rdtctl) Start(cache cache.Cache, _ client.Client) error { if err := rdt.Initialize(resctrlGroupPrefix); err != nil { return rdtError("failed to initialize RDT controls: %v", err) } ctl.cache = cache if err := ctl.configure(); err != nil { // Just print an error. A config update later on may be valid. log.Error("failed to apply initial configuration: %v", err) } rdt.RegisterCustomPrometheusLabels("pod_name", "container_name") err := metrics.RegisterCollector("rdt", rdt.NewCollector) if err != nil { log.Error("failed to register rdt collector: %v", err) } pkgcfg.GetModule(ConfigModuleName).AddNotify(getRDTController().configNotify) return nil } // Stop shuts down the controller. func (ctl *rdtctl) Stop() { } // PreCreateHook is the RDT controller pre-create hook. func (ctl *rdtctl) PreCreateHook(_ cache.Container) error { return nil } // PreStartHook is the RDT controller pre-start hook. func (ctl *rdtctl) PreStartHook(_ cache.Container) error { return nil } // PostStartHook is the RDT controller post-start hook. func (ctl *rdtctl) PostStartHook(c cache.Container) error { if !c.HasPending(RDTController) { return nil } if err := ctl.assign(c); err != nil { return err } c.ClearPending(RDTController) return nil } // PostUpdateHook is the RDT controller post-update hook. func (ctl *rdtctl) PostUpdateHook(c cache.Container) error { if !c.HasPending(RDTController) { return nil } if err := ctl.assign(c); err != nil { return err } c.ClearPending(RDTController) return nil } // PostStopHook is the RDT controller post-stop hook.
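// ---------------------------------------------------------------------------
// Example (illustrative sketch, not part of this repository): the config
// struct above decodes the "rdt" configuration section. The trimmed
// stand-in type below shows only the extended options and how a fragment
// such as
//
//	rdt:
//	  options:
//	    mode: Discovery
//	    monitoringDisabled: true
//
// would populate them; the real struct additionally embeds the goresctrl
// rdt.Config and rdt.Options types.
package main

import (
	"encoding/json"
	"fmt"
)

type rdtConfig struct {
	Options struct {
		Mode               string `json:"mode"`
		MonitoringDisabled bool   `json:"monitoringDisabled"`
	} `json:"options"`
}

func main() {
	raw := []byte(`{"options":{"mode":"Discovery","monitoringDisabled":true}}`)
	var cfg rdtConfig
	if err := json.Unmarshal(raw, &cfg); err != nil {
		panic(err)
	}
	fmt.Printf("mode=%s, monitoring disabled=%v\n",
		cfg.Options.Mode, cfg.Options.MonitoringDisabled)
}
// ---------------------------------------------------------------------------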
func (ctl *rdtctl) PostStopHook(c cache.Container) error { if err := ctl.stopMonitor(c); err != nil { return rdtError("%q: failed to remove monitoring group: %v", c.PrettyName(), err) } return nil } // assign assigns all processes/threads in a container to the correct class func (ctl *rdtctl) assign(c cache.Container) error { if ctl.opt.Options.Mode == OperatingModeDisabled { return nil } class := c.GetRDTClass() switch class { case "": class = rdt.RootClassName case cache.RDTClassPodQoS: if ctl.noQoSClasses { class = rdt.RootClassName } else { class = string(c.GetQOSClass()) } } err := ctl.assignClass(c, class) if err != nil && class != rdt.RootClassName { log.Warn("%v; falling back to system root class", err) return ctl.assignClass(c, rdt.RootClassName) } return err } // assignClass assigns all processes/threads in a container to the specified class func (ctl *rdtctl) assignClass(c cache.Container, class string) error { cls, ok := rdt.GetClass(class) if !ok { return rdtError("%q: unknown RDT class %q", c.PrettyName(), class) } pod, ok := c.GetPod() if !ok { return rdtError("%q: failed to get pod", c.PrettyName()) } pids, err := c.GetProcesses() if err != nil { return rdtError("%q: failed to get process list: %v", c.PrettyName(), err) } if err := cls.AddPids(pids...); err != nil { return rdtError("%q: failed to assign to class %q: %v", c.PrettyName(), class, err) } pretty := c.PrettyName() if _, ok := cls.GetMonGroup(pretty); !ok || ctl.monitoringDisabled() { ctl.stopMonitor(c) } if !ctl.monitoringDisabled() { pname, name, id := pod.GetName(), c.GetName(), c.GetID() if err := ctl.monitor(cls, pname, name, id, pretty, pids); err != nil { return err } } log.Info("%q: assigned to class %q", pretty, class) return nil } // monitor starts monitoring a container. func (ctl *rdtctl) monitor(cls rdt.CtrlGroup, pod, name, id, pretty string, pids []string) error { if !rdt.MonSupported() { return nil } annotations := map[string]string{"pod_name": pod, "container_name": name} if mg, err := cls.CreateMonGroup(id, annotations); err != nil { log.Warn("%q: failed to create monitoring group: %v", pretty, err) } else { if err := mg.AddPids(pids...); err != nil { return rdtError("%q: failed to assign to monitoring group %q: %v", pretty, cls.Name()+"/"+mg.Name(), err) } log.Info("%q: assigned to monitoring group %q", pretty, cls.Name()+"/"+mg.Name()) } return nil } // stopMonitor stops monitoring a container. 
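// ---------------------------------------------------------------------------
// Example (illustrative sketch, not part of this repository): the class
// resolution in assign() above, restated as a pure function for clarity.
// podQoSSentinel stands in for cache.RDTClassPodQoS and rootClass for
// rdt.RootClassName; both names below are placeholders.
package example

// resolveClass maps a container's requested RDT class to the effective one.
func resolveClass(requested, qosClass, rootClass, podQoSSentinel string, noQoSClasses bool) string {
	switch requested {
	case "":
		// No class requested: use the system root class.
		return rootClass
	case podQoSSentinel:
		// Map the pod QoS class to an RDT class of the same name,
		// unless QoS-class mapping is disabled.
		if noQoSClasses {
			return rootClass
		}
		return qosClass
	default:
		// An explicitly requested class is used as-is; assign() above
		// additionally falls back to the root class if the assignment fails.
		return requested
	}
}
// ---------------------------------------------------------------------------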
func (ctl *rdtctl) stopMonitor(c cache.Container) error { name := c.PrettyName() for _, cls := range rdt.GetClasses() { if mg, ok := cls.GetMonGroup(name); ok { if err := cls.DeleteMonGroup(name); err != nil { return err } log.Info("%q: removed monitoring group %q", c.PrettyName(), cls.Name()+"/"+mg.Name()) } } return nil } // stopMonitorAll removes all monitoring groups func (ctl *rdtctl) stopMonitorAll() error { for _, cls := range rdt.GetClasses() { if err := cls.DeleteMonGroups(); err != nil { return err } } return nil } func (ctl *rdtctl) assignAll(forceClass string) { // Assign all containers for _, c := range ctl.cache.GetContainers() { var err error if forceClass != "" { err = ctl.assignClass(c, forceClass) } else { err = ctl.assign(c) } if err != nil { log.Warn("failed to assign rdt class of %q: %v", c.PrettyName(), err) } } } func (ctl *rdtctl) monitoringDisabled() bool { return ctl.mode == OperatingModeDisabled || ctl.opt.Options.MonitoringDisabled } func (ctl *rdtctl) configure() error { // Apply RDT configuration, depending on the operating mode switch ctl.opt.Options.Mode { case OperatingModeDisabled: if ctl.mode != ctl.opt.Options.Mode { ctl.stopMonitorAll() // Drop all cri-resmgr specific resctrl groups by applying an empty config if err := rdt.SetConfig(&rdt.Config{}, true); err != nil { return rdtError("failed to apply empty rdt config: %v", err) } ctl.noQoSClasses = true ctl.mode = ctl.opt.Options.Mode ctl.assignAll(rdt.RootClassName) } case OperatingModeDiscovery: if ctl.mode != ctl.opt.Options.Mode { ctl.stopMonitorAll() // Drop all cri-resmgr specific resctrl groups by applying an empty config if err := rdt.SetConfig(&rdt.Config{}, true); err != nil { return rdtError("failed to apply empty rdt config: %v", err) } } // Discover existing resctrl groups, using an empty prefix if err := rdt.DiscoverClasses(""); err != nil { return rdtError("failed to discover classes from fs: %v", err) } // Disable mapping from Pod QoS to RDT class if no Pod QoS class equivalents exist ctl.noQoSClasses = true cs := []corev1.PodQOSClass{corev1.PodQOSBestEffort, corev1.PodQOSBurstable, corev1.PodQOSGuaranteed} for _, c := range cs { if _, ok := rdt.GetClass(string(c)); ok { ctl.noQoSClasses = false break } } ctl.mode = ctl.opt.Options.Mode ctl.assignAll("") case OperatingModeFull: if ctl.mode != ctl.opt.Options.Mode { ctl.stopMonitorAll() } // Copy goresctrl specific part from our extended options ctl.opt.Config.Options = ctl.opt.Options.Options if err := rdt.SetConfig(&ctl.opt.Config, true); err != nil { return err } // Disable mapping from Pod QoS to RDT class if no classes have been defined ctl.noQoSClasses = len(rdt.GetClasses()) <= 1 ctl.mode = ctl.opt.Options.Mode ctl.assignAll("") default: return rdtError("invalid mode %q", ctl.opt.Options.Mode) } log.Debug("rdt controller operating mode set to %q", ctl.mode) if ctl.opt.Options.Mode != OperatingModeDisabled { log.Debug("rdt monitoring %s", map[bool]string{true: "disabled", false: "enabled"}[ctl.monitoringDisabled()]) } return nil } // configNotify is our runtime configuration notification callback. func (ctl *rdtctl) configNotify(_ pkgcfg.Event, _ pkgcfg.Source) error { log.Info("configuration update, applying new config") return ctl.configure() } func (ctl *rdtctl) defaultOptions() interface{} { c := &config{} c.Options.Mode = OperatingModeFull return c } // GetClasses returns all available RDT classes func GetClasses() []rdt.CtrlGroup { return rdt.GetClasses() } // rdtError creates an RDT-controller-specific formatted error message.
func rdtError(format string, args ...interface{}) error { return fmt.Errorf("rdt: "+format, args...) } // Register us as a controller. func init() { control.Register(RDTController, "RDT controller", getRDTController()) pkgcfg.Register(ConfigModuleName, "RDT control", getRDTController().opt, getRDTController().defaultOptions) } ================================================ FILE: pkg/cri/resource-manager/controllers.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package resmgr import ( // List of controllers to pull in. _ "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/control/blockio" _ "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/control/cpu" _ "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/control/cri" _ "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/control/memory" _ "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/control/page-migrate" _ "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/control/rdt" ) ================================================ FILE: pkg/cri/resource-manager/error.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package resmgr import ( "fmt" ) // resmgrError creates a resource manager-specific formatted error. func resmgrError(format string, args ...interface{}) error { return fmt.Errorf("resource-manager: "+format, args...) } ================================================ FILE: pkg/cri/resource-manager/events/events.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package events // Metrics is a set of metrics-related events we might need to act upon. type Metrics struct { // Avx describes changes in container AVX512 instruction usage. 
Avx *Avx } // Avx contains data related to container AVX512 instruction usage. type Avx struct { // Updates contains containers with a change in their AVX512 instruction usage. Updates map[string]bool } // Policy is a policy-specific event to be handled by the active policy. type Policy struct { // Type is the policy-specific type of this event. Type string // Source describes where this event originated from. Source string // Data is any optional arbitrary data associated with this event. Data interface{} } const ( // ContainerStarted is delivered to policies when a StartContainer request succeeds. ContainerStarted = "container-started" ) ================================================ FILE: pkg/cri/resource-manager/events.go ================================================ // Copyright 2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package resmgr import ( "time" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/events" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/metrics" logger "github.com/intel/cri-resource-manager/pkg/log" ) // Our logger instance for events. var evtlog = logger.NewLogger("events") // setupEventProcessing sets up event and metrics processing. func (m *resmgr) setupEventProcessing() error { var err error m.events = make(chan interface{}, 8) m.stop = make(chan interface{}) options := metrics.Options{ PollInterval: opt.MetricsTimer, Events: m.events, } if m.metrics, err = metrics.NewMetrics(options); err != nil { return resmgrError("failed to create metrics (pre)processor: %v", err) } return nil } // startEventProcessing starts event and metrics processing. func (m *resmgr) startEventProcessing() error { if err := m.metrics.Start(); err != nil { return resmgrError("failed to start metrics (pre)processor: %v", err) } stop := m.stop go func() { var rebalanceTimer *time.Ticker var rebalanceChan <-chan time.Time if opt.RebalanceTimer > 0 { rebalanceTimer = time.NewTicker(opt.RebalanceTimer) rebalanceChan = rebalanceTimer.C } else { m.Info("periodic rebalancing is disabled") } for { select { case <-stop: if rebalanceTimer != nil { rebalanceTimer.Stop() } return case event := <-m.events: m.processEvent(event) case <-rebalanceChan: if err := m.RebalanceContainers(); err != nil { evtlog.Error("rebalancing failed: %v", err) } } logger.Flush() } }() return nil } // stopEventProcessing stops event and metrics processing. func (m *resmgr) stopEventProcessing() { if m.stop != nil { close(m.stop) m.metrics.Stop() m.stop = nil } } // SendEvent injects the given event to the resource manager's event processing loop. func (m *resmgr) SendEvent(event interface{}) error { if m.events == nil { return resmgrError("can't send event, no event channel") } select { case m.events <- event: return nil default: return resmgrError("can't send event of type %T, event channel full", event) } } // processEvent processes the given event.
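// ---------------------------------------------------------------------------
// Example (illustrative sketch, not part of this repository): SendEvent
// above never blocks the caller; a select with a default branch turns a
// full channel into an error instead of a stall. The same idiom in
// isolation:
package example

import "fmt"

// trySend attempts a non-blocking send on a buffered event channel.
func trySend(events chan<- interface{}, e interface{}) error {
	select {
	case events <- e:
		return nil
	default:
		return fmt.Errorf("event channel full, dropping event of type %T", e)
	}
}
// ---------------------------------------------------------------------------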
func (m *resmgr) processEvent(e interface{}) { evtlog.Debug("received event of type %T...", e) switch event := e.(type) { case string: evtlog.Debug("'%s'...", event) case *events.Metrics: m.processAvx(event.Avx) case *events.Policy: m.DeliverPolicyEvent(event) default: evtlog.Warn("event of unexpected type %T...", e) } } // processAvx processes AVX512 events. func (m *resmgr) processAvx(e *events.Avx) bool { if e == nil { return false } m.Lock() defer m.Unlock() changes := false for cgroup, active := range e.Updates { c, ok := m.resolveCgroupPath(cgroup) if !ok { continue } // XXX This is just for testing, we should effectively drive state transitions // through a low-pass filter. if active { if _, wasTagged := c.SetTag(cache.TagAVX512, "true"); !wasTagged { evtlog.Info("container %s STARTED using AVX512 instructions", c.PrettyName()) } } else { if _, wasTagged := c.DeleteTag(cache.TagAVX512); wasTagged { evtlog.Info("container %s STOPPED using AVX512 instructions", c.PrettyName()) } } } return changes } // resolveCgroupPath resolves a cgroup path to a container. func (m *resmgr) resolveCgroupPath(path string) (cache.Container, bool) { return m.cache.LookupContainerByCgroup(path) } ================================================ FILE: pkg/cri/resource-manager/flags.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package resmgr import ( "flag" "time" "github.com/intel/cri-resource-manager/pkg/cri/relay" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/sockets" "github.com/intel/cri-resource-manager/pkg/pidfile" ) // Options captures our command line parameters. type options struct { HostRoot string ImageSocket string RuntimeSocket string RelaySocket string RelayDir string AllowUntestedRuntimes bool AgentSocket string ConfigSocket string PidFile string ResctrlPath string FallbackConfig string ForceConfig string ForceConfigSignal string DisablePolicySwitch bool ResetPolicy bool ResetConfig bool MetricsTimer time.Duration RebalanceTimer time.Duration DisableUI bool } // Relay command line options. var opt = options{} const ( allowUntestedRuntimesFlag = "allow-untested-runtimes" ) // Register us for command line option processing. func init() { flag.StringVar(&opt.HostRoot, "host-root", "", "Directory prefix under which the host's sysfs, etc. 
are mounted.") flag.StringVar(&opt.RuntimeSocket, "runtime-socket", sockets.Containerd, "Unix domain socket path where CRI runtime service requests should be relayed to.") flag.StringVar(&opt.ImageSocket, "image-socket", relay.DefaultImageSocket, "CRI image service socket, defaults to the value used for --runtime-socket.") flag.StringVar(&opt.RelaySocket, "relay-socket", sockets.ResourceManagerRelay, "Unix domain socket path where the resource manager should serve requests on.") flag.StringVar(&opt.RelayDir, "relay-dir", "/var/lib/cri-resmgr", "Permanent storage directory path for the resource manager to store its state in.") flag.BoolVar(&opt.AllowUntestedRuntimes, allowUntestedRuntimesFlag, false, "Allow proxying for untested CRI runtimes. Usually this is not a good idea.") flag.StringVar(&opt.AgentSocket, "agent-socket", sockets.ResourceManagerAgent, "Local socket of the cri-resmgr agent to connect to.") flag.StringVar(&opt.ConfigSocket, "config-socket", sockets.ResourceManagerConfig, "Unix domain socket path where the resource manager listens for the cri-resmgr agent.") flag.StringVar(&opt.PidFile, "pid-file", pidfile.GetPath(), "PID file to write the daemon PID to.") flag.StringVar(&opt.FallbackConfig, "fallback-config", "", "Fallback configuration to use unless/until one is available from the cache or agent.") flag.StringVar(&opt.ForceConfig, "force-config", "", "Configuration used to override the one stored in the cache. Disables the agent.") flag.StringVar(&opt.ForceConfigSignal, "force-config-signal", "SIGHUP", "Signal used to reload forced configuration.") flag.BoolVar(&opt.ResetConfig, "reset-config", false, "Remove configuration (from the agent) stored in the cache, then exit.") flag.BoolVar(&opt.ResetPolicy, "reset-policy", false, "Reset policy data stored in the cache, then exit.") flag.BoolVar(&opt.DisablePolicySwitch, "disable-policy-switch", false, "Disable switching policies during startup.") flag.DurationVar(&opt.MetricsTimer, "metrics-interval", 0, "Interval for polling/gathering runtime metrics data. Use 0 to disable.") flag.DurationVar(&opt.RebalanceTimer, "rebalance-interval", 0, "Minimum interval between two container rebalancing attempts. Use 0 to disable.") flag.BoolVar(&opt.DisableUI, "disable-ui", false, "Disable serving container placement visualization UIs.") } ================================================ FILE: pkg/cri/resource-manager/introspect/introspect.go ================================================ // Copyright 2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package introspect import ( "encoding/json" "fmt" "net/http" "sync" xhttp "github.com/intel/cri-resource-manager/pkg/instrumentation/http" logger "github.com/intel/cri-resource-manager/pkg/log" "github.com/intel/cri-resource-manager/pkg/topology" ) // Pod describes a single pod and its containers.
type Pod struct { ID string // pod CRI ID UID string // pod kubernetes ID Name string // pod name Containers map[string]*Container // containers of this pod } // Container describes a single container. type Container struct { ID string // container CRI ID Name string // container name Command []string // command Args []string // and its arguments CPURequest int64 // CPU requested in milli-CPU (guaranteed amount) CPULimit int64 // CPU limit in milli-CPU (maximum allowed CPU) MemoryRequest int64 // memory requested in bytes MemoryLimit int64 // memory limit in bytes (maximum allowed memory) Hints TopologyHints // topology/allocation hints } // TopologyHints contain a set of allocation hints for a container. type TopologyHints topology.Hints // Assignment describes resource assignments for a single container. type Assignment struct { ContainerID string // ID of container for this assignment SharedCPUs string // shared CPUs CPUShare int // CPU share/weight for SharedCPUs ExclusiveCPUs string // exclusive CPUs Memory string // memory controllers Pool string // pool container is assigned to } // Pool describes a single (resource) pool. type Pool struct { Name string // pool name CPUs string // CPUs in this pool Memory string // memory controllers (NUMA nodes) for this pool Parent string // parent pool Children []string // child pools } // Socket describes a single physical CPU socket in the system. type Socket struct { ID int // socket ID CPUs string // CPUs in this socket } // Node describes a single NUMA node in the system. type Node struct { ID int // node ID CPUs string // CPUs with locality for this NUMA node. } // System describes the underlying HW/system. type System struct { Sockets map[int]*Socket // physical sockets in the system Nodes map[int]*Node // NUMA nodes in the system Isolated string // kernel-isolated CPUs Offlined string // CPUs offline RDTClasses []string // list of RDT classes BlockIOClasses []string // list of block I/O classes Policy string // active policy } // State is the current introspected state of the resource manager. type State struct { Pools map[string]*Pool // pools Pods map[string]*Pod // pods and containers Assignments map[string]*Assignment // resource assignments System *System // info about hardware/system Error string } // our logger instance var log = logger.NewLogger("introspect") // Server is our server for external introspection. type Server struct { sync.RWMutex // need to protect against concurrent introspection/update mux *xhttp.ServeMux // our HTTP request multiplexer state *State // introspection data data string // state as a JSON string ready bool } // Setup prepares the given HTTP request multiplexer for serving introspection. func Setup(mux *xhttp.ServeMux, state *State) (*Server, error) { s := &Server{mux: mux} if err := s.set(state); err != nil { return nil, err } mux.HandleFunc("/introspect", s.serve) return s, nil } // Set sets the current state for introspection. func (s *Server) Set(state *State) error { s.Lock() defer s.Unlock() return s.set(state) } // Start enables serving HTTP requests. func (s *Server) Start() { log.Info("starting introspection server...") s.ready = true } // Stop stops serving further HTTP requests. func (s *Server) Stop() { log.Info("stopping introspection server...") s.ready = false } // set sets the given state and encodes it as a JSON string.
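// ---------------------------------------------------------------------------
// Example (illustrative sketch, not part of this repository): the server
// above serves the cached JSON encoding of State at the /introspect path.
// A client can decode just the fields it cares about; the field names come
// from the State and Pool types above, while the address is an assumption
// that depends on how the instrumentation HTTP server is configured.
package main

import (
	"encoding/json"
	"fmt"
	"net/http"
)

type state struct {
	Pools map[string]*struct {
		Name string
		CPUs string
	}
	Error string
}

func main() {
	// The host/port here is a placeholder for the instrumentation endpoint.
	resp, err := http.Get("http://localhost:8891/introspect")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	var s state
	if err := json.NewDecoder(resp.Body).Decode(&s); err != nil {
		panic(err)
	}
	for name, pool := range s.Pools {
		fmt.Printf("pool %s: cpus %s\n", name, pool.CPUs)
	}
}
// ---------------------------------------------------------------------------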
func (s *Server) set(state *State) error { log.Debug("updating introspection data...") s.state = state data, err := json.Marshal(s.state) if err != nil { err = introspectError("failed to marshal state for introspection: %v", err) s.state = &State{Error: fmt.Sprintf("%v", err)} data, _ = json.Marshal(s.state) } s.data = string(data) return err } // serve serves a single HTTP request. func (s *Server) serve(w http.ResponseWriter, _ *http.Request) { if !s.ready { return } log.Debug("serving introspection data...") s.RLock() fmt.Fprintf(w, "%s\r\n", s.data) s.RUnlock() } // introspectError creates an introspection-specific error. func introspectError(format string, args ...interface{}) error { return fmt.Errorf("introspection: "+format, args...) } ================================================ FILE: pkg/cri/resource-manager/kubernetes/kubernetes.go ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package kubernetes const ( // ResmgrKeyNamespace is a CRI Resource Manager namespace ResmgrKeyNamespace = "cri-resource-manager.intel.com" // NamespaceSystem is the kubernetes system namespace. NamespaceSystem = "kube-system" // PodNameLabel is the key for the kubernetes pod name label. PodNameLabel = "io.kubernetes.pod.name" // PodUIDLabel is the key for the kubernetes pod UID label. PodUIDLabel = "io.kubernetes.pod.uid" // ContainerNameLabel is the key for the kubernetes container name label. ContainerNameLabel = "io.kubernetes.container.name" ) // ResmgrKey returns a full namespaced name of a resource manager specific key func ResmgrKey(name string) string { return ResmgrKeyNamespace + "/" + name } ================================================ FILE: pkg/cri/resource-manager/kubernetes/resources.go ================================================ // Copyright The NRI Plugins Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kubernetes const ( // Constants for converting back and forth between CPU requirements in // terms of milli-CPUs and kernel cgroup/scheduling parameters. // MinShares is the minimum cpu.shares accepted by cgroups. MinShares = 2 // MaxShares is the maximum cpu.shares accepted by cgroups. MaxShares = 262144 // SharesPerCPU is cpu.shares worth one full CPU. SharesPerCPU = 1024 // MilliCPUToCPU is milli-CPUs worth a full CPU.
MilliCPUToCPU = 1000 // QuotaPeriod is 100000 microseconds, or 100ms QuotaPeriod = 100000 // MinQuotaPeriod is 1000 microseconds, or 1ms MinQuotaPeriod = 1000 ) // MilliCPUToQuota converts milliCPU to CFS quota and period values. // For example, 2500 mCPU maps to quota=250000 with period=100000. // (Almost) identical to the same function in kubelet. func MilliCPUToQuota(milliCPU int64) (quota, period int64) { if milliCPU == 0 { return 0, 0 } // TODO(klihub): this is behind the CustomCPUCFSQuotaPeriod feature gate in kubelet period = int64(QuotaPeriod) quota = (milliCPU * period) / MilliCPUToCPU if quota < MinQuotaPeriod { quota = MinQuotaPeriod } return quota, period } // MilliCPUToShares converts the milliCPU to CFS shares. // For example, 500 mCPU maps to 512 shares. // Identical to the same function in kubelet. func MilliCPUToShares(milliCPU int64) uint64 { if milliCPU == 0 { return MinShares } shares := (milliCPU * SharesPerCPU) / MilliCPUToCPU if shares < MinShares { return MinShares } if shares > MaxShares { return MaxShares } return uint64(shares) } // SharesToMilliCPU converts CFS CPU shares to milli-CPUs. func SharesToMilliCPU(shares int64) int64 { if shares == MinShares { return 0 } return int64(float64(shares*MilliCPUToCPU)/float64(SharesPerCPU) + 0.5) } // QuotaToMilliCPU converts CFS quota and period to milli-CPUs. func QuotaToMilliCPU(quota, period int64) int64 { if quota == 0 || period == 0 { return 0 } return int64(float64(quota*MilliCPUToCPU)/float64(period) + 0.5) } ================================================ FILE: pkg/cri/resource-manager/metrics/avx.go ================================================ // Copyright 2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package metrics import ( model "github.com/prometheus/client_model/go" "path/filepath" "github.com/intel/cri-resource-manager/pkg/cgroups" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/events" ) func (m *Metrics) collectAvxEvents(raw map[string]*model.MetricFamily) *events.Avx { all, ok := raw["all_switch_count_per_cgroup"] if !ok { return nil } dump("all context switches", all) avx, ok := raw["avx_switch_count_per_cgroup"] if !ok { return nil } dump("AVX context switches", avx) ratio := map[string]float64{} for _, v := range avx.Metric { cgroup, err := filepath.Rel(cgroups.GetV2Dir(), v.Label[0].GetValue()) if err != nil { continue } ratio[cgroup] = v.Gauge.GetValue() } for _, v := range all.Metric { cgroup, err := filepath.Rel(cgroups.GetV2Dir(), v.Label[0].GetValue()) if err != nil { continue } ratio[cgroup] /= v.Gauge.GetValue() } usage := map[string]bool{} for cgroup, use := range ratio { active := use >= m.opts.AvxThreshold log.Debug(" %s AVX ratio = %f, active?: %v", cgroup, use, active) usage["/"+cgroup] = active } return &events.Avx{Updates: usage} } ================================================ FILE: pkg/cri/resource-manager/metrics/metrics.go ================================================ // Copyright 2020 Intel Corporation. All Rights Reserved.
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package metrics import ( "bytes" "fmt" "strings" "sync" "time" "github.com/prometheus/client_golang/prometheus" model "github.com/prometheus/client_model/go" "github.com/prometheus/common/expfmt" logger "github.com/intel/cri-resource-manager/pkg/log" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/events" "github.com/intel/cri-resource-manager/pkg/instrumentation" "github.com/intel/cri-resource-manager/pkg/metrics" // pull in all metrics collectors _ "github.com/intel/cri-resource-manager/pkg/metrics/register" ) const ( // DefaultAvxThreshold is the cutoff below which a cgroup/container is not an AVX user. DefaultAvxThreshold = float64(0.1) ) // Options describes options for metrics collection and processing. type Options struct { // PollInterval is the interval for polling raw metrics. PollInterval time.Duration // Events is the channel for delivering metrics events. Events chan interface{} // AvxThreshold is the threshold (0 - 1) for a cgroup to be considered AVX512-active AvxThreshold float64 } // Metrics implements collecting, caching and processing of raw metrics. type Metrics struct { sync.RWMutex opts Options // metrics collecting options g prometheus.Gatherer // prometheus/raw metrics gatherer stop chan interface{} // channel to stop polling goroutine raw []*model.MetricFamily // latest set of raw metrics pend []*model.MetricFamily // pending metrics for forwarding } // Our logger instance. var log = logger.NewLogger("metrics") // NewMetrics creates a new instance for metrics collecting and processing. func NewMetrics(opts Options) (*Metrics, error) { if opts.Events == nil { return nil, metricsError("invalid options, nil Event channel") } if opts.AvxThreshold == 0.0 { opts.AvxThreshold = DefaultAvxThreshold } g, err := metrics.NewMetricGatherer() if err != nil { return nil, metricsError("failed to create raw metrics gatherer: %v", err) } m := &Metrics{ opts: opts, raw: make([]*model.MetricFamily, 0), g: g, } m.poll() instrumentation.RegisterGatherer(m) return m, nil } // Start starts metrics collection and processing. func (m *Metrics) Start() error { if m.stop != nil { return nil } stop := make(chan interface{}) go func() { var pollTimer *time.Ticker var pollChan <-chan time.Time if m.opts.PollInterval > 0 { pollTimer = time.NewTicker(m.opts.PollInterval) pollChan = pollTimer.C } else { log.Info("periodic collection of metrics is disabled") } for { select { case _ = <-stop: if pollTimer != nil { pollTimer.Stop() } return case _ = <-pollChan: if err := m.poll(); err != nil { log.Error("failed to poll raw metrics: %v", err) continue } if err := m.process(); err != nil { log.Error("failed to deliver metrics event: %v", err) } } } }() m.stop = stop return nil } // Stop stops metrics collection and processing. func (m *Metrics) Stop() { if m.stop != nil { close(m.stop) m.stop = nil } } // poll does a single round of raw metrics collection. 
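// ---------------------------------------------------------------------------
// Example (illustrative sketch, not part of this repository): Start above
// uses the common ticker-plus-stop-channel shape for a poller that can be
// disabled. The skeleton in isolation; note that a nil ticker channel
// blocks forever, which is exactly what disables periodic polling:
package example

import "time"

// poll runs fn every interval until stop is closed; interval <= 0 disables
// periodic runs entirely.
func poll(interval time.Duration, fn func(), stop <-chan struct{}) {
	go func() {
		var tick <-chan time.Time
		if interval > 0 {
			t := time.NewTicker(interval)
			defer t.Stop()
			tick = t.C
		}
		for {
			select {
			case <-stop:
				return
			case <-tick:
				fn()
			}
		}
	}()
}
// ---------------------------------------------------------------------------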
func (m *Metrics) poll() error { m.Lock() defer m.Unlock() f, err := m.g.Gather() if err != nil { return metricsError("failed to poll raw metrics: %v", err) } m.raw = f m.pend = f return nil } // process processes the collected raw metrics. func (m *Metrics) process() error { raw := map[string]*model.MetricFamily{} for _, f := range m.raw { dump(" ", f) raw[*f.Name] = f } event := &events.Metrics{ Avx: m.collectAvxEvents(raw), } return m.sendEvent(event) } // sendEvent sends a metrics-based event for processing. func (m *Metrics) sendEvent(e *events.Metrics) error { select { case m.opts.Events <- e: return nil default: return metricsError("failed to deliver event %v (channel full?)", *e) } } // dump debug-dumps the given MetricFamily data func dump(prefix string, f *model.MetricFamily) { if !log.DebugEnabled() { return } buf := &bytes.Buffer{} if _, err := expfmt.MetricFamilyToText(buf, f); err != nil { return } log.DebugBlock(" <"+prefix+"> ", "%s", strings.TrimSpace(buf.String())) } // metricsError returns a new formatted error specific to metrics-processing. func metricsError(format string, args ...interface{}) error { return fmt.Errorf("metrics: "+format, args...) } ================================================ FILE: pkg/cri/resource-manager/metrics/prometheus.go ================================================ // Copyright 2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package metrics import ( model "github.com/prometheus/client_model/go" ) // Gather is our prometheus.Gatherer interface for proxying metrics. func (m *Metrics) Gather() ([]*model.MetricFamily, error) { m.Lock() pend := m.pend m.Unlock() if pend == nil { log.Debug("no data to proxy to prometheus...") } else { log.Debug("proxying data to prometheus...") } return pend, nil } ================================================ FILE: pkg/cri/resource-manager/no-test-api.go ================================================ // Copyright 2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !test // +build !test package resmgr // ResourceManagerTestAPI is dummy if we're compiling without test build flag. type ResourceManagerTestAPI interface { } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/balloons/balloons-policy.go ================================================ // Copyright 2022 Intel Corporation. All Rights Reserved. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package balloons import ( "fmt" "path/filepath" corev1 "k8s.io/api/core/v1" resapi "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" pkgcfg "github.com/intel/cri-resource-manager/pkg/config" "github.com/intel/cri-resource-manager/pkg/cpuallocator" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache" cpucontrol "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/control/cpu" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/events" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/introspect" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/kubernetes" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy" policyapi "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy" logger "github.com/intel/cri-resource-manager/pkg/log" "github.com/intel/cri-resource-manager/pkg/utils" "github.com/intel/cri-resource-manager/pkg/utils/cpuset" idset "github.com/intel/goresctrl/pkg/utils" ) const ( // PolicyName is the name used to activate this policy. PolicyName = "balloons" // PolicyDescription is a short description of this policy. PolicyDescription = "Flexible pools with per-pool CPU parameters" // PolicyPath is the path of this policy in the configuration hierarchy. PolicyPath = "policy." + PolicyName // balloonKey is a pod annotation key, the value is a pod balloon name. balloonKey = "balloon." + PolicyName + "." + kubernetes.ResmgrKeyNamespace // reservedBalloonDefName is the name in the reserved balloon definition. reservedBalloonDefName = "reserved" // defaultBalloonDefName is the name in the default balloon definition. defaultBalloonDefName = "default" // NoLimit value denotes no limit being set. NoLimit = 0 ) // balloons contains configuration and runtime attributes of the balloons policy type balloons struct { options *policyapi.BackendOptions // configuration common to all policies bpoptions BalloonsOptions // balloons-specific configuration cch cache.Cache // cri-resmgr cache allowed cpuset.CPUSet // bounding set of CPUs we're allowed to use reserved cpuset.CPUSet // system-/kube-reserved CPUs freeCpus cpuset.CPUSet // CPUs to be included in growing or new ballons cpuTree *cpuTreeNode // system CPU topology cpuTreeAllocator *cpuTreeAllocator // CPU allocator from system CPU topology reservedBalloonDef *BalloonDef // built-in definition of the reserved balloon defaultBalloonDef *BalloonDef // built-in definition of the default balloon balloons []*Balloon // balloon instances: reserved, default and user-defined cpuAllocator cpuallocator.CPUAllocator // CPU allocator used by the policy } // Balloon contains attributes of a balloon instance type Balloon struct { // Def is the definition from which this balloon instance is created. Def *BalloonDef // Instance is the index of this balloon instance, starting from // zero for every balloon definition. 
Instance int // Cpus is the set of CPUs exclusive to this balloon instance only. Cpus cpuset.CPUSet // Mems is the set of memory nodes with minimal access delay // from CPUs. Mems idset.IDSet // SharedIdleCpus is the set of idle CPUs that workloads in a // balloon are allowed to use with workloads in other balloons // that shareIdleCpus. SharedIdleCpus cpuset.CPUSet // PodIDs maps pod ID to list of container IDs. // - len(PodIDs) is the number of pods in the balloon. // - len(PodIDs[podID]) is the number of containers of podID // currently assigned to the balloon. PodIDs map[string][]string cpuTreeAllocator *cpuTreeAllocator } var log logger.Logger = logger.NewLogger("policy") // String is a stringer for a balloon. func (bln Balloon) String() string { return fmt.Sprintf("%s{Cpus:%s, Mems:%s}", bln.PrettyName(), bln.Cpus, bln.Mems) } // PrettyName returns a unique name for a balloon. func (bln Balloon) PrettyName() string { return fmt.Sprintf("%s[%d]", bln.Def.Name, bln.Instance) } // ContainerIDs returns IDs of containers assigned in a balloon. // (Using cache.Container.GetCacheID()'s) func (bln Balloon) ContainerIDs() []string { cIDs := []string{} for _, ctrIDs := range bln.PodIDs { cIDs = append(cIDs, ctrIDs...) } return cIDs } // ContainerCount returns the number of containers in a balloon. func (bln Balloon) ContainerCount() int { count := 0 for _, ctrIDs := range bln.PodIDs { count += len(ctrIDs) } return count } func (bln Balloon) AvailMilliCpus() int { return bln.Cpus.Size() * 1000 } func (bln Balloon) MaxAvailMilliCpus(freeCpus cpuset.CPUSet) int { if bln.Def.MaxCpus == NoLimit { return (bln.Cpus.Size() + freeCpus.Size()) * 1000 } return bln.Def.MaxCpus * 1000 } // CreateBalloonsPolicy creates a new policy instance. func CreateBalloonsPolicy(policyOptions *policy.BackendOptions) policy.Backend { var err error p := &balloons{ options: policyOptions, cch: policyOptions.Cache, cpuAllocator: cpuallocator.NewCPUAllocator(policyOptions.System), } log.Info("creating %s policy...", PolicyName) if p.cpuTree, err = NewCpuTreeFromSystem(); err != nil { log.Errorf("creating CPU topology tree failed: %s", err) } log.Debug("CPU topology: %s", p.cpuTree) // Handle common policy options: AvailableResources and ReservedResources. // p.allowed: CPUs available for the policy if allowed, ok := policyOptions.Available[policyapi.DomainCPU]; ok { p.allowed = allowed.(cpuset.CPUSet) } else { // Available CPUs not specified, default to all on-line CPUs. p.allowed = policyOptions.System.CPUSet().Difference(policyOptions.System.Offlined()) } // p.reserved: CPUs reserved for kube-system pods, subset of p.allowed. 
p.reserved = cpuset.New() if reserved, ok := p.options.Reserved[policyapi.DomainCPU]; ok { switch v := reserved.(type) { case cpuset.CPUSet: p.reserved = p.allowed.Intersection(v) case resapi.Quantity: reserveCnt := (int(v.MilliValue()) + 999) / 1000 cpus, err := p.cpuAllocator.AllocateCpus(&p.allowed, reserveCnt, cpuallocator.PriorityNone) if err != nil { log.Fatal("failed to allocate reserved CPUs: %s", err) } p.reserved = cpus p.allowed = p.allowed.Union(cpus) } } if p.reserved.IsEmpty() { log.Fatal("%s cannot run without reserved CPUs that are also AvailableResources", PolicyName) } // Handle policy-specific options log.Debug("creating %s configuration", PolicyName) if err := p.setConfig(balloonsOptions); err != nil { log.Fatal("failed to create %s policy: %v", PolicyName, err) } log.Debug("first effective configuration:\n%s\n", utils.DumpJSON(p.bpoptions)) pkgcfg.GetModule(PolicyPath).AddNotify(p.configNotify) return p } // Name returns the name of this policy. func (p *balloons) Name() string { return PolicyName } // Description returns the description for this policy. func (p *balloons) Description() string { return PolicyDescription } // Start prepares this policy for accepting allocation/release requests. func (p *balloons) Start(add []cache.Container, del []cache.Container) error { log.Info("%s policy started", PolicyName) // reassign all containers return p.Sync(p.cch.GetContainers(), del) } // Sync synchronizes the active policy state. func (p *balloons) Sync(add []cache.Container, del []cache.Container) error { log.Debug("synchronizing state...") for _, c := range del { p.ReleaseResources(c) } for _, c := range add { p.AllocateResources(c) } return nil } // AllocateResources is a resource allocation request for this policy. func (p *balloons) AllocateResources(c cache.Container) error { log.Debug("allocating resources for container %s (request %d mCPU, limit %d mCPU)...", c.PrettyName(), p.containerRequestedMilliCpus(c.GetCacheID()), p.containerLimitedMilliCpus(c.GetCacheID())) bln, err := p.allocateBalloon(c) if err != nil { return balloonsError("balloon allocation for container %s failed: %w", c.PrettyName(), err) } if bln == nil { return balloonsError("no suitable balloons found for container %s", c.PrettyName()) } // Resize selected balloon to fit the new container, unless it // uses the ReservedResources CPUs, which is a fixed set. reqMilliCpus := p.containerRequestedMilliCpus(c.GetCacheID()) + p.requestedMilliCpus(bln) // Even if all containers in a balloon request 0 mCPU in total // (all are BestEffort, for example), force the size of the balloon // to be enough for at least a 1 mCPU request. Otherwise the balloon's // cpuset becomes empty, which in turn would mean no CPU pinning, and // the balloon's containers would run on any CPUs. if bln.AvailMilliCpus() < max(1, reqMilliCpus) { p.resizeBalloon(bln, max(1, reqMilliCpus)) } p.assignContainer(c, bln) if log.DebugEnabled() { log.Debug(p.dumpBalloon(bln)) } return nil } // ReleaseResources is a resource release request for this policy. func (p *balloons) ReleaseResources(c cache.Container) error { log.Debug("releasing container %s...", c.PrettyName()) if bln := p.balloonByContainer(c); bln != nil { p.dismissContainer(c, bln) if log.DebugEnabled() { log.Debug(p.dumpBalloon(bln)) } if bln.ContainerCount() == 0 { // Deflate the balloon completely before // freeing it.
p.resizeBalloon(bln, 0) log.Debug("all containers removed, free balloon allocation %s", bln.PrettyName()) p.freeBalloon(bln) } else { // Make sure that the balloon will have at // least 1 CPU to run remaining containers. p.resizeBalloon(bln, max(1, p.requestedMilliCpus(bln))) } } else { log.Debug("ReleaseResources: balloon-less container %s, nothing to release", c.PrettyName()) } return nil } // UpdateResources is a resource allocation update request for this policy. func (p *balloons) UpdateResources(c cache.Container) error { log.Debug("(not) updating container %s...", c.PrettyName()) return nil } // Rebalance tries to find an optimal allocation of resources for the current containers. func (p *balloons) Rebalance() (bool, error) { log.Debug("(not) rebalancing containers...") return false, nil } // HandleEvent handles policy-specific events. func (p *balloons) HandleEvent(*events.Policy) (bool, error) { log.Debug("(not) handling event...") return false, nil } // ExportResourceData provides resource data to export for the container. func (p *balloons) ExportResourceData(c cache.Container) map[string]string { return nil } // Introspect provides data for external introspection. func (p *balloons) Introspect(*introspect.State) { return } // balloonByContainer returns a balloon that contains a container. func (p *balloons) balloonByContainer(c cache.Container) *Balloon { podID := c.GetPodID() cID := c.GetCacheID() for _, bln := range p.balloons { for _, ctrID := range bln.PodIDs[podID] { if ctrID == cID { return bln } } } return nil } // balloonsByNamespace returns balloons that contain containers in a // namespace. func (p *balloons) balloonsByNamespace(namespace string) []*Balloon { blns := []*Balloon{} for _, bln := range p.balloons { for podID, ctrIDs := range bln.PodIDs { if len(ctrIDs) == 0 { continue } pod, ok := p.cch.LookupPod(podID) if !ok { continue } if pod.GetNamespace() == namespace { blns = append(blns, bln) break } } } return blns } // balloonsByPod returns balloons that contain any container of a pod. func (p *balloons) balloonsByPod(pod cache.Pod) []*Balloon { podID := pod.GetID() blns := []*Balloon{} for _, bln := range p.balloons { if _, ok := bln.PodIDs[podID]; ok { blns = append(blns, bln) } } return blns } // balloonsByDef returns list of balloons instantiated from a balloon // definition. func (p *balloons) balloonsByDef(blnDef *BalloonDef) []*Balloon { balloons := []*Balloon{} for _, balloon := range p.balloons { if balloon.Def == blnDef { balloons = append(balloons, balloon) } } return balloons } // balloonDefByName returns a balloon definition with a name. func (p *balloons) balloonDefByName(defName string) *BalloonDef { if defName == "reserved" { return p.reservedBalloonDef } if defName == "default" { return p.defaultBalloonDef } for _, blnDef := range p.bpoptions.BalloonDefs { if blnDef.Name == defName { return blnDef } } return nil } func (p *balloons) chooseBalloonDef(c cache.Container) (*BalloonDef, error) { var blnDef *BalloonDef // BalloonDef is defined by annotation? if blnDefName, ok := c.GetEffectiveAnnotation(balloonKey); ok { blnDef = p.balloonDefByName(blnDefName) if blnDef == nil { return nil, balloonsError("no balloon for annotation %q", blnDefName) } return blnDef, nil } // BalloonDef is defined by a special namespace (kube-system + // ReservedPoolNamespaces)? if namespaceMatches(c.GetNamespace(), append(p.bpoptions.ReservedPoolNamespaces, metav1.NamespaceSystem)) { return p.balloons[0].Def, nil } // BalloonDef is defined by the namespace. 
	for _, blnDef := range append([]*BalloonDef{p.reservedBalloonDef, p.defaultBalloonDef}, p.bpoptions.BalloonDefs...) {
		if namespaceMatches(c.GetNamespace(), blnDef.Namespaces) {
			return blnDef, nil
		}
	}
	// Fallback to the default balloon.
	return p.defaultBalloonDef, nil
}

func (p *balloons) containerRequestedMilliCpus(contID string) int {
	cont, ok := p.cch.LookupContainer(contID)
	if !ok {
		return 0
	}
	reqCpu, ok := cont.GetResourceRequirements().Requests[corev1.ResourceCPU]
	if !ok {
		return 0
	}
	return int(reqCpu.MilliValue())
}

func (p *balloons) containerLimitedMilliCpus(contID string) int {
	cont, ok := p.cch.LookupContainer(contID)
	if !ok {
		return 0
	}
	limitCpu, ok := cont.GetResourceRequirements().Limits[corev1.ResourceCPU]
	if !ok {
		return 0
	}
	return int(limitCpu.MilliValue())
}

// requestedMilliCpus sums up and returns CPU requests of all
// containers assigned to a balloon.
func (p *balloons) requestedMilliCpus(bln *Balloon) int {
	cpuRequested := 0
	for _, cID := range bln.ContainerIDs() {
		cpuRequested += p.containerRequestedMilliCpus(cID)
	}
	return cpuRequested
}

// freeMilliCpus returns free CPU resources in a balloon without
// inflating the balloon.
func (p *balloons) freeMilliCpus(bln *Balloon) int {
	return bln.AvailMilliCpus() - p.requestedMilliCpus(bln)
}

// maxFreeMilliCpus returns free CPU resources in a balloon when it is
// inflated as large as possible.
func (p *balloons) maxFreeMilliCpus(bln *Balloon) int {
	return bln.MaxAvailMilliCpus(p.freeCpus) - p.requestedMilliCpus(bln)
}

// largest helps find the largest element and value in a slice.
// Input the length of the slice and a function that returns the
// magnitude of a given element in the slice as an int.
func largest(sliceLen int, valueOf func(i int) int) (int, int) {
	largestIndex := -1
	largestValue := 0
	for index := 0; index < sliceLen; index++ {
		value := valueOf(index)
		if largestIndex == -1 || value > largestValue {
			largestIndex = index
			largestValue = value
		}
	}
	return largestIndex, largestValue
}

// resetCpuClass resets CPU configurations globally. Balloons can be
// ignored here; their CPU configurations will be applied later.
func (p *balloons) resetCpuClass() error {
	// Usual inputs:
	// - p.allowed (cpuset.CPUSet): all CPUs available for this
	//   policy.
	// - p.IdleCpuClass (string): CPU class for allowed CPUs.
	//
	// Other inputs, if needed:
	// - p.reserved (cpuset.CPUSet): CPUs of ReservedResources
	//   (typically for kube-system containers).
	//
	// Note: p.useCpuClass(balloon) will be called before assigning
	// containers on the balloon, including the reserved balloon.
	//
	// TODO: don't depend on cpu controller directly
	cpucontrol.Assign(p.cch, p.bpoptions.IdleCpuClass, p.allowed.UnsortedList()...)
	log.Debugf("resetCpuClass available: %s; reserved: %s", p.allowed, p.reserved)
	return nil
}

// useCpuClass configures CPUs of a balloon.
func (p *balloons) useCpuClass(bln *Balloon) error {
	// Usual inputs:
	// - CPUs that cpuallocator has reserved for this balloon:
	//   bln.Cpus (cpuset.CPUSet).
	// - User-defined CPU configuration for CPUs of balloons of this type:
	//   bln.Def.CpuClass (string).
	// - Current configuration(?): feel free to add a data
	//   structure for this. For instance policy-global p.cpuConfs,
	//   or balloon-local bln.cpuConfs.
	//
	// Other input examples, if needed:
	// - Requested CPU resources by all containers in the balloon:
	//   p.requestedMilliCpus(bln).
	// - Free CPU resources in the balloon: p.freeMilliCpus(bln).
	// - Number of assigned containers: bln.ContainerCount().
	// - Container details: access p.cch with bln.ContainerIDs().
	// - User-defined CPU AllocatorPriority: bln.Def.AllocatorPriority.
	// - All existing balloon instances: p.balloons.
	// - CPU configurations by user: bln.Def.CpuClass (for bln in p.balloons)
	cpucontrol.Assign(p.cch, bln.Def.CpuClass, bln.Cpus.UnsortedList()...)
	log.Debugf("useCpuClass Cpus: %s; CpuClass: %s", bln.Cpus, bln.Def.CpuClass)
	return nil
}

// forgetCpuClass is called when CPUs of a balloon are released from duty.
func (p *balloons) forgetCpuClass(bln *Balloon) {
	// Use p.IdleCpuClass for bln.Cpus.
	// Usual inputs: see useCpuClass
	cpucontrol.Assign(p.cch, p.bpoptions.IdleCpuClass, bln.Cpus.UnsortedList()...)
	log.Debugf("forgetCpuClass Cpus: %s; CpuClass: %s", bln.Cpus, bln.Def.CpuClass)
}

func (p *balloons) newBalloon(blnDef *BalloonDef, confCpus bool) (*Balloon, error) {
	var cpus cpuset.CPUSet
	var err error
	blnsOfDef := p.balloonsByDef(blnDef)
	// Allowed to create a new balloon instance from blnDef?
	if blnDef.MaxBalloons > NoLimit && blnDef.MaxBalloons <= len(blnsOfDef) {
		return nil, balloonsError("cannot create new %q balloon, MaxBalloons limit (%d) reached",
			blnDef.Name, blnDef.MaxBalloons)
	}
	// Find the first unused balloon instance index.
	freeInstance := 0
	for freeInstance = 0; freeInstance < len(blnsOfDef); freeInstance++ {
		isFree := true
		for _, bln := range blnsOfDef {
			if bln.Instance == freeInstance {
				isFree = false
				break
			}
		}
		if isFree {
			break
		}
	}
	// Configure a new cpuTreeAllocator for this balloon if there
	// are type-specific allocator options, otherwise use the policy
	// default allocator.
	cpuTreeAllocator := p.cpuTreeAllocator
	if blnDef.AllocatorTopologyBalancing != nil || blnDef.PreferSpreadOnPhysicalCores != nil {
		allocatorOptions := cpuTreeAllocatorOptions{
			topologyBalancing:           p.bpoptions.AllocatorTopologyBalancing,
			preferSpreadOnPhysicalCores: p.bpoptions.PreferSpreadOnPhysicalCores,
		}
		if blnDef.AllocatorTopologyBalancing != nil {
			allocatorOptions.topologyBalancing = *blnDef.AllocatorTopologyBalancing
		}
		if blnDef.PreferSpreadOnPhysicalCores != nil {
			allocatorOptions.preferSpreadOnPhysicalCores = *blnDef.PreferSpreadOnPhysicalCores
		}
		cpuTreeAllocator = p.cpuTree.NewAllocator(allocatorOptions)
	}
	// Allocate CPUs
	if blnDef == p.reservedBalloonDef ||
		(blnDef == p.defaultBalloonDef && blnDef.MinCpus == 0 && blnDef.MaxCpus == 0) {
		// The reserved balloon uses ReservedResources CPUs.
		// So does the default balloon unless its CPU counts are tweaked.
		cpus = p.reserved
	} else {
		addFromCpus, _, err := cpuTreeAllocator.ResizeCpus(cpuset.New(), p.freeCpus, blnDef.MinCpus)
		if err != nil {
			return nil, balloonsError("failed to choose a cpuset for allocating first %d CPUs from %#s: %w",
				blnDef.MinCpus, p.freeCpus, err)
		}
		cpus, err = p.cpuAllocator.AllocateCpus(&addFromCpus, blnDef.MinCpus, blnDef.AllocatorPriority)
		if err != nil {
			return nil, balloonsError("could not allocate %d MinCpus for balloon %s[%d]: %w",
				blnDef.MinCpus, blnDef.Name, freeInstance, err)
		}
		p.freeCpus = p.freeCpus.Difference(cpus)
	}
	bln := &Balloon{
		Def:              blnDef,
		Instance:         freeInstance,
		PodIDs:           make(map[string][]string),
		Cpus:             cpus,
		SharedIdleCpus:   cpuset.New(),
		Mems:             p.closestMems(cpus),
		cpuTreeAllocator: cpuTreeAllocator,
	}
	if confCpus {
		if err = p.useCpuClass(bln); err != nil {
			log.Errorf("failed to apply CPU configuration to new balloon %s[%d] (cpus: %s): %v",
				blnDef.Name, freeInstance, cpus, err)
			return nil, err
		}
	}
	return bln, nil
}

// deleteBalloon removes an empty balloon.
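//
// The balloon's CPUs are retagged with the policy's idle CPU class
// (forgetCpuClass) and returned to freeCpus before the cpuAllocator
// bookkeeping is released.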
func (p *balloons) deleteBalloon(bln *Balloon) {
	log.Debugf("deleting balloon %s", bln)
	remainingBalloons := []*Balloon{}
	for _, b := range p.balloons {
		if b != bln {
			remainingBalloons = append(remainingBalloons, b)
		}
	}
	p.balloons = remainingBalloons
	p.forgetCpuClass(bln)
	p.freeCpus = p.freeCpus.Union(bln.Cpus)
	p.cpuAllocator.ReleaseCpus(&bln.Cpus, bln.Cpus.Size(), bln.Def.AllocatorPriority)
}

// freeBalloon clears a balloon and deletes it if allowed.
func (p *balloons) freeBalloon(bln *Balloon) {
	bln.PodIDs = make(map[string][]string)
	blnsSameDef := p.balloonsByDef(bln.Def)
	if len(blnsSameDef) > bln.Def.MinBalloons {
		p.deleteBalloon(bln)
	}
}

func (p *balloons) chooseBalloonInstance(blnDef *BalloonDef, fm FillMethod, c cache.Container) (*Balloon, error) {
	// If assigning to the reserved or the default balloon, the fill
	// method is ignored: always fill the chosen balloon.
	if blnDef == p.balloons[0].Def {
		return p.balloons[0], nil
	}
	if blnDef == p.balloons[1].Def {
		return p.balloons[1], nil
	}
	reqMilliCpus := p.containerRequestedMilliCpus(c.GetCacheID())
	// Handle fill methods that do not use existing instances of
	// balloonDef.
	switch fm {
	case FillReservedBalloon:
		return p.balloons[0], nil
	case FillDefaultBalloon:
		return p.balloons[1], nil
	case FillNewBalloon, FillNewBalloonMust:
		// Choosing an existing balloon without containers is
		// preferred over instantiating a new balloon.
		for _, bln := range p.balloonsByDef(blnDef) {
			if len(bln.PodIDs) == 0 {
				return bln, nil
			}
		}
		newBln, err := p.newBalloon(blnDef, false)
		if err != nil {
			if fm == FillNewBalloonMust {
				return nil, err
			}
			return nil, nil
		}
		// newBln may already have CPUs allocated for it. If
		// we notice that the new balloon fill method cannot
		// be used after all, collect steps to undo() new
		// balloon creation.
		undoFuncs := []func(){}
		undo := func() {
			for _, undoFunc := range undoFuncs {
				undoFunc()
			}
		}
		undoFuncs = append(undoFuncs, func() {
			p.freeCpus = p.freeCpus.Union(newBln.Cpus)
		})
		if newBln.MaxAvailMilliCpus(p.freeCpus) < reqMilliCpus {
			// The new balloon cannot be inflated to fit the
			// new container. Release its CPUs if already
			// allocated (MinCPUs > 0), and never add it
			// to the list of balloons.
			undo()
			if fm == FillNewBalloonMust {
				return nil, balloonsError("not enough CPUs to run container %s requesting %d mCPU. %s.MaxCPUs: %d mCPU, free CPUs: %d mCPU",
					c.PrettyName(), reqMilliCpus, blnDef.Name, blnDef.MaxCpus*1000, p.freeCpus.Size()*1000)
			} else {
				return nil, nil
			}
		}
		// Make the existence of the new balloon official by
		// adding it to the balloons slice.
		p.balloons = append(p.balloons, newBln)
		undoFuncs = append(undoFuncs, func() {
			p.balloons = p.balloons[:len(p.balloons)-1]
		})
		// If the new balloon already has CPUs, there is some
		// housekeeping to do.
		if newBln.Cpus.Size() > 0 {
			// Make sure CPUs in the balloon use the correct
			// CPU class.
			if err = p.useCpuClass(newBln); err != nil {
				log.Errorf("failed to apply CPU configuration to new balloon %s (cpus: %s): %s",
					newBln.PrettyName(), newBln.Cpus, err)
				undo()
				return nil, err
			}
			// Reshare idle CPUs because freeCpus have
			// changed and CPUs of the new balloon are no
			// longer idle.
			p.updatePinning(p.shareIdleCpus(p.freeCpus, newBln.Cpus)...)
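			// At this point the new balloon is fully
			// accounted for: its CPUs are out of freeCpus,
			// their CPU class is applied, and idle-CPU
			// sharing is refreshed for balloons that lost
			// formerly idle CPUs to this balloon.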
		}
		return newBln, nil
	case FillSameNamespace:
		for _, bln := range p.balloonsByNamespace(c.GetNamespace()) {
			if bln.Def == blnDef && p.maxFreeMilliCpus(bln) >= reqMilliCpus {
				return bln, nil
			}
		}
		return nil, nil
	case FillSamePod:
		if pod, ok := c.GetPod(); ok {
			for _, bln := range p.balloonsByPod(pod) {
				if p.maxFreeMilliCpus(bln) >= reqMilliCpus {
					return bln, nil
				}
			}
			return nil, nil
		} else {
			return nil, balloonsError("fill method %s failed: cannot find pod for container %s", fm, c.PrettyName())
		}
	}
	// Handle fill methods that need existing instances of
	// balloonDef, and fail if there are no instances.
	balloons := p.balloonsByDef(blnDef)
	if len(balloons) == 0 {
		return nil, nil
	}
	switch fm {
	case FillBalanced:
		// Are there balloons where the container would fit
		// without inflating the balloon?
		blnIdx, freeMilliCpus := largest(len(balloons), func(i int) int {
			return p.freeMilliCpus(balloons[i])
		})
		if freeMilliCpus >= reqMilliCpus {
			return balloons[blnIdx], nil
		}
	case FillBalancedInflate:
		// Are there balloons where the container would fit
		// after inflating the balloon?
		blnIdx, maxFreeMilliCpus := largest(len(balloons), func(i int) int {
			return p.maxFreeMilliCpus(balloons[i])
		})
		if maxFreeMilliCpus >= reqMilliCpus {
			return balloons[blnIdx], nil
		}
	default:
		return nil, balloonsError("balloon type fill method not implemented: %s", fm)
	}
	// No error, but the balloon instance remains undecided in this assign method.
	return nil, nil
}

func namespaceMatches(namespace string, patterns []string) bool {
	for _, pattern := range patterns {
		ret, err := filepath.Match(pattern, namespace)
		if err == nil && ret {
			return true
		}
	}
	return false
}

// allocateBalloon returns a balloon allocated for a container.
func (p *balloons) allocateBalloon(c cache.Container) (*Balloon, error) {
	blnDef, err := p.chooseBalloonDef(c)
	if err != nil {
		return nil, err
	}
	if blnDef == nil {
		return nil, balloonsError("no applicable balloon type found")
	}
	bln, err := p.allocateBalloonOfDef(blnDef, c)
	if err != nil {
		return nil, err
	}
	if bln == nil {
		return nil, balloonsError("no suitable balloon instance available")
	}
	return bln, nil
}

// allocateBalloonOfDef returns a balloon instantiated from a
// definition for a container.
func (p *balloons) allocateBalloonOfDef(blnDef *BalloonDef, c cache.Container) (*Balloon, error) {
	if blnDef == p.reservedBalloonDef {
		return p.balloons[0], nil
	}
	if blnDef == p.defaultBalloonDef {
		return p.balloons[1], nil
	}

	fillChain := []FillMethod{}
	if !blnDef.PreferSpreadingPods {
		fillChain = append(fillChain, FillSamePod)
	}
	if blnDef.PreferPerNamespaceBalloon {
		fillChain = append(fillChain, FillSameNamespace, FillNewBalloon)
	}
	if blnDef.PreferNewBalloons {
		fillChain = append(fillChain, FillNewBalloon, FillBalanced, FillBalancedInflate)
	} else {
		fillChain = append(fillChain, FillBalanced, FillBalancedInflate, FillNewBalloon)
	}
	for _, fillMethod := range fillChain {
		bln, err := p.chooseBalloonInstance(blnDef, fillMethod, c)
		if err != nil {
			log.Debugf("fill method %q prevents allocation: %v", fillMethod, err)
			return nil, err
		}
		if bln == nil {
			log.Debugf("fill method %q not applicable", fillMethod)
			continue
		}
		log.Debugf("fill method %q suggests balloon instance %v", fillMethod, bln)
		return bln, nil
	}
	return nil, nil
}

// dumpBalloon dumps balloon contents in detail.
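//
// Example output shape (names and values hypothetical):
//
//	Balloon fast[0]{Cpus: 2-3; Mems: 0; mCPU used: 1500; capacity: 2000; max. capacity: 6000; pods: [pod0]; conts: [pod0:cont0]}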
func (p *balloons) dumpBalloon(bln *Balloon) string { conts := []string{} pods := []string{} for podID, contIDs := range bln.PodIDs { podName := podID if pod, ok := p.cch.LookupPod(podID); ok { podName = pod.GetName() } pods = append(pods, podName) for _, contID := range contIDs { if cont, ok := p.cch.LookupContainer(contID); ok { conts = append(conts, cont.PrettyName()) } else { conts = append(conts, podName+"."+contID) } } } s := fmt.Sprintf("Balloon %s{Cpus: %s; Mems: %s; mCPU used: %d; capacity: %d; max. capacity: %d; pods: %s; conts: %s}", bln.PrettyName(), bln.Cpus, bln.Mems, p.requestedMilliCpus(bln), bln.AvailMilliCpus(), bln.MaxAvailMilliCpus(p.freeCpus), pods, conts) return s } // getPodMilliCPU returns mCPUs requested by podID. func (p *balloons) getPodMilliCPU(podID string) int64 { cpuRequested := int64(0) for _, c := range p.cch.GetContainers() { if c.GetPodID() == podID { if reqCpu, ok := c.GetResourceRequirements().Requests[corev1.ResourceCPU]; ok { cpuRequested += reqCpu.MilliValue() } } } return cpuRequested } // changesBalloons returns true if two balloons policy configurations // may lead into different balloon instances or workload assignment. func changesBalloons(opts0, opts1 *BalloonsOptions) bool { if opts0 == nil && opts1 == nil { return false } if opts0 == nil || opts1 == nil { return true } if len(opts0.BalloonDefs) != len(opts1.BalloonDefs) { return true } o0 := opts0.DeepCopy() o1 := opts1.DeepCopy() // Ignore differences in CPU class names. Every other change // potentially changes balloons or workloads. o0.IdleCpuClass = "" o1.IdleCpuClass = "" for i := range o0.BalloonDefs { o0.BalloonDefs[i].CpuClass = "" o1.BalloonDefs[i].CpuClass = "" } return utils.DumpJSON(o0) != utils.DumpJSON(o1) } // changesCpuClasses returns true if two balloons policy // configurations can lead to using different CPU classes on // corresponding balloon instances. Calling changesCpuClasses(o0, o1) // makes sense only if changesBalloons(o0, o1) has returned false. func changesCpuClasses(opts0, opts1 *BalloonsOptions) bool { if opts0 == nil && opts1 == nil { return false } if opts0 == nil || opts1 == nil { return true } if opts0.IdleCpuClass != opts1.IdleCpuClass { return true } if len(opts0.BalloonDefs) != len(opts1.BalloonDefs) { return true } for i := range opts0.BalloonDefs { if opts0.BalloonDefs[i].CpuClass != opts1.BalloonDefs[i].CpuClass { return true } } return false } // configNotify applies new configuration. func (p *balloons) configNotify(event pkgcfg.Event, source pkgcfg.Source) error { log.Info("configuration %s", event) defer log.Debug("effective configuration:\n%s\n", utils.DumpJSON(p.bpoptions)) newBalloonsOptions := balloonsOptions.DeepCopy() if !changesBalloons(&p.bpoptions, newBalloonsOptions) { if !changesCpuClasses(&p.bpoptions, newBalloonsOptions) { log.Info("no configuration changes") } else { log.Info("configuration changes only on CPU classes") // Update new CPU classes to existing balloon // definitions. The same BalloonDef instances // must be kept in use, because each Balloon // instance holds a direct reference to its // BalloonDef. for i := range p.bpoptions.BalloonDefs { p.bpoptions.BalloonDefs[i].CpuClass = newBalloonsOptions.BalloonDefs[i].CpuClass } // (Re)configures all CPUs in balloons. 
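			// First every CPU in p.allowed is retagged with
			// the (possibly new) IdleCpuClass, then each
			// balloon reapplies its own class on its CPUs.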
p.resetCpuClass() for _, bln := range p.balloons { p.useCpuClass(bln) } } return nil } if err := p.setConfig(newBalloonsOptions); err != nil { log.Error("config update failed: %v", err) return err } log.Info("config updated successfully") p.Sync(p.cch.GetContainers(), p.cch.GetContainers()) return nil } // applyBalloonDef creates user-defined balloons or reconfigures built-in // balloons according to the blnDef. Does not initialize balloon CPUs. func (p *balloons) applyBalloonDef(balloons *[]*Balloon, blnDef *BalloonDef, freeCpus *cpuset.CPUSet) error { if len(*balloons) < 2 { return balloonsError("internal error: reserved and default balloons missing, cannot apply balloon definitions") } reservedBalloon := (*balloons)[0] defaultBalloon := (*balloons)[1] // Every BalloonDef does one of the following: // 1. reconfigures the "reserved" balloon (most restricted) // 2. reconfigures the "default" balloon (somewhat restricted) // 3. defines new user-defined balloons. switch blnDef.Name { case "": // Case 0: bad name return balloonsError("undefined or empty balloon name") case reservedBalloon.Def.Name: // Case 1: reconfigure the "reserved" balloon. if blnDef.MinCpus != 0 { return balloonsError("cannot reconfigure the reserved balloon MinCpus, specified in ReservedResources CPUs") } if blnDef.MaxCpus != 0 { return balloonsError("cannot reconfigure the reserved balloon MaxCpus, specified in ReservedResources CPUs") } if blnDef.MinBalloons != 0 { return balloonsError("cannot reconfigure the reserved balloon MinBalloons") } p.reservedBalloonDef.AllocatorPriority = blnDef.AllocatorPriority p.reservedBalloonDef.CpuClass = blnDef.CpuClass p.reservedBalloonDef.Namespaces = blnDef.Namespaces case defaultBalloon.Def.Name: // Case 2: reconfigure the "default" balloon. defaultUsesReservedCpus := true if blnDef.MinCpus != 0 || blnDef.MaxCpus != 0 { defaultUsesReservedCpus = false } if blnDef.MinBalloons != 0 { return balloonsError("cannot reconfigure the default balloon MinBalloons") } p.defaultBalloonDef.MinCpus = blnDef.MinCpus p.defaultBalloonDef.MaxCpus = blnDef.MaxCpus p.defaultBalloonDef.AllocatorPriority = blnDef.AllocatorPriority p.defaultBalloonDef.CpuClass = blnDef.CpuClass p.defaultBalloonDef.Namespaces = blnDef.Namespaces if !defaultUsesReservedCpus { // Overwrite existing default balloon instance // that uses reserved CPUs with a balloon that // uses its own CPUs. newDefaultBln, err := p.newBalloon(p.defaultBalloonDef, false) if err != nil { return balloonsError("cannot create new default balloon: %w", err) } newDefaultBln.Instance = 0 (*balloons)[1] = newDefaultBln } default: // Case 3: create minimum amount (MinBalloons) of each user-defined balloons. 
	for allocPrio := cpuallocator.CPUPriority(0); allocPrio < cpuallocator.NumCPUPriorities; allocPrio++ {
		if blnDef.AllocatorPriority != allocPrio {
			continue
		}
		for blnIdx := 0; blnIdx < blnDef.MinBalloons; blnIdx++ {
			newBln, err := p.newBalloon(blnDef, false)
			if err != nil {
				return err
			}
			if newBln == nil {
				return balloonsError("failed to create balloon '%s[%d]' as required by MinBalloons=%d",
					blnDef.Name, blnIdx, blnDef.MinBalloons)
			}
			*balloons = append(*balloons, newBln)
		}
	}
	return nil
}

func (p *balloons) validateConfig(bpoptions *BalloonsOptions) error {
	for _, blnDef := range bpoptions.BalloonDefs {
		if blnDef.MaxCpus != NoLimit && blnDef.MinCpus > blnDef.MaxCpus {
			return balloonsError("MinCpus (%d) > MaxCpus (%d) in balloon type %q",
				blnDef.MinCpus, blnDef.MaxCpus, blnDef.Name)
		}
		if blnDef.MaxBalloons != NoLimit && blnDef.MinBalloons > blnDef.MaxBalloons {
			return balloonsError("MinBalloons (%d) > MaxBalloons (%d) in balloon type %q",
				blnDef.MinBalloons, blnDef.MaxBalloons, blnDef.Name)
		}
	}
	return nil
}

// setConfig takes a new balloon configuration into use.
func (p *balloons) setConfig(bpoptions *BalloonsOptions) error {
	// TODO: revert allocations (p.freeCpus) to old ones if the
	// configuration is invalid. Currently a bad configuration
	// leaves a mess in bookkeeping.
	if err := p.validateConfig(bpoptions); err != nil {
		return balloonsError("invalid configuration: %w", err)
	}

	// Create the default reserved and default balloon
	// definitions. Some properties of these definitions may be
	// altered by user configuration.
	p.reservedBalloonDef = &BalloonDef{
		Name:              reservedBalloonDefName,
		MinBalloons:       1,
		AllocatorPriority: 3,
	}
	p.defaultBalloonDef = &BalloonDef{
		Name:              defaultBalloonDefName,
		MinBalloons:       1,
		AllocatorPriority: 3,
	}
	p.balloons = []*Balloon{}
	p.freeCpus = p.allowed.Clone()
	p.freeCpus = p.freeCpus.Difference(p.reserved)
	p.cpuTreeAllocator = p.cpuTree.NewAllocator(cpuTreeAllocatorOptions{
		topologyBalancing:           bpoptions.AllocatorTopologyBalancing,
		preferSpreadOnPhysicalCores: bpoptions.PreferSpreadOnPhysicalCores,
	})
	// We can't delay taking the new configuration into use beyond this
	// point, because p.newBalloon() dereferences our options via
	// p.bpoptions, so it would end up using the old configuration.
	p.bpoptions = *bpoptions
	// Instantiate built-in reserved and default balloons.
	reservedBalloon, err := p.newBalloon(p.reservedBalloonDef, false)
	if err != nil {
		return err
	}
	p.balloons = append(p.balloons, reservedBalloon)
	defaultBalloon, err := p.newBalloon(p.defaultBalloonDef, false)
	if err != nil {
		return err
	}
	p.balloons = append(p.balloons, defaultBalloon)
	// First apply customizations to built-in balloons: "reserved"
	// and "default".
	for _, blnDef := range bpoptions.BalloonDefs {
		if blnDef.Name != reservedBalloonDefName && blnDef.Name != defaultBalloonDefName {
			continue
		}
		if err := p.applyBalloonDef(&p.balloons, blnDef, &p.freeCpus); err != nil {
			return err
		}
	}
	// Apply all user balloon definitions, skip the already customized
	// "reserved" and "default" balloons.
	for _, blnDef := range bpoptions.BalloonDefs {
		if blnDef.Name == reservedBalloonDefName || blnDef.Name == defaultBalloonDefName {
			continue
		}
		if err := p.applyBalloonDef(&p.balloons, blnDef, &p.freeCpus); err != nil {
			return err
		}
	}
	// Finish balloon instance initialization.
	log.Info("%s policy balloons:", PolicyName)
	for blnIdx, bln := range p.balloons {
		log.Info("- balloon %d: %s", blnIdx, bln)
	}
	p.updatePinning(p.shareIdleCpus(p.freeCpus, cpuset.New())...)
	// (Re)configures all CPUs in balloons.
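	// First every allowed CPU gets the policy-level IdleCpuClass,
	// then each balloon overrides the class on its own CPUs, so
	// only CPUs left in freeCpus keep the idle class.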
p.resetCpuClass() for _, bln := range p.balloons { p.useCpuClass(bln) } return nil } // closestMems returns memory node IDs good for pinning containers // that run on given CPUs func (p *balloons) closestMems(cpus cpuset.CPUSet) idset.IDSet { mems := idset.NewIDSet() sys := p.options.System for _, nodeID := range sys.NodeIDs() { if !cpus.Intersection(sys.Node(nodeID).CPUSet()).IsEmpty() { mems.Add(nodeID) } } return mems } // filterBalloons returns balloons for which the test function returns true func filterBalloons(balloons []*Balloon, test func(*Balloon) bool) (ret []*Balloon) { for _, bln := range balloons { if test(bln) { ret = append(ret, bln) } } return } // availableMilliCPU returns mCPUs available in a balloon. func (p *balloons) availableMilliCpus(balloon *Balloon) int64 { cpuAvail := int64(balloon.Cpus.Size() * 1000) cpuRequested := int64(0) for podID := range balloon.PodIDs { cpuRequested += p.getPodMilliCPU(podID) } return cpuAvail - cpuRequested } // resizeBalloon changes the CPUs allocated for a balloon, if allowed. func (p *balloons) resizeBalloon(bln *Balloon, newMilliCpus int) error { if bln.Cpus.Equals(p.reserved) { log.Debugf("not resizing %s to %d mCPU, using fixed CPUs", bln, newMilliCpus) return nil } oldCpuCount := bln.Cpus.Size() newCpuCount := (newMilliCpus + 999) / 1000 if bln.Def.MaxCpus > NoLimit && newCpuCount > bln.Def.MaxCpus { newCpuCount = bln.Def.MaxCpus } if bln.Def.MinCpus > 0 && newCpuCount < bln.Def.MinCpus { newCpuCount = bln.Def.MinCpus } log.Debugf("resize %s to fit %d mCPU", bln, newMilliCpus) log.Debugf("- change full CPUs from %d to %d", oldCpuCount, newCpuCount) log.Debugf("- freecpus: %#s", p.freeCpus) if oldCpuCount == newCpuCount { return nil } cpuCountDelta := newCpuCount - oldCpuCount p.forgetCpuClass(bln) defer p.useCpuClass(bln) if cpuCountDelta > 0 { // Inflate the balloon. addFromCpus, _, err := bln.cpuTreeAllocator.ResizeCpus(bln.Cpus, p.freeCpus, cpuCountDelta) if err != nil { return balloonsError("resize/inflate: failed to choose a cpuset for allocating additional %d CPUs: %w", cpuCountDelta, err) } log.Debugf("- allocate CPUs %d from %#s", cpuCountDelta, addFromCpus) newCpus, err := p.cpuAllocator.AllocateCpus(&addFromCpus, newCpuCount-oldCpuCount, bln.Def.AllocatorPriority) if err != nil { return balloonsError("resize/inflate: allocating %d CPUs for %s failed: %w", cpuCountDelta, bln, err) } p.freeCpus = p.freeCpus.Difference(newCpus) bln.Cpus = bln.Cpus.Union(newCpus) p.updatePinning(p.shareIdleCpus(p.freeCpus, newCpus)...) } else { // Deflate the balloon. _, removeFromCpus, err := bln.cpuTreeAllocator.ResizeCpus(bln.Cpus, p.freeCpus, cpuCountDelta) if err != nil { return balloonsError("resize/deflate: failed to choose a cpuset for releasing %d CPUs: %w", -cpuCountDelta, err) } log.Debugf("- releasing %d CPUs from cpuset %#s", -cpuCountDelta, removeFromCpus) _, err = p.cpuAllocator.ReleaseCpus(&removeFromCpus, -cpuCountDelta, bln.Def.AllocatorPriority) if err != nil { return balloonsError("resize/deflate: releasing %d CPUs from %s failed: %w", -cpuCountDelta, bln, err) } log.Debugf("- old freeCpus: %#s, old bln.Cpus: %#s, releasing: %#s", p.freeCpus, bln.Cpus, removeFromCpus) p.freeCpus = p.freeCpus.Union(removeFromCpus) bln.Cpus = bln.Cpus.Difference(removeFromCpus) p.updatePinning(p.shareIdleCpus(removeFromCpus, cpuset.New())...) 
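		// The CPUs released above become idle: shareIdleCpus
		// offers them to balloons that share idle CPUs on the
		// same topology level, and those balloons are re-pinned
		// so their containers see the wider cpuset immediately.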
} log.Debugf("- resize successful: %s, freecpus: %#s", bln, p.freeCpus) p.updatePinning(bln) return nil } func (p *balloons) updatePinning(blns ...*Balloon) { for _, bln := range blns { cpus := bln.Cpus.Union(bln.SharedIdleCpus) bln.Mems = p.closestMems(cpus) for _, cID := range bln.ContainerIDs() { if c, ok := p.cch.LookupContainer(cID); ok { p.pinCpuMem(c, cpus, bln.Mems) } } } } // shareIdleCpus adds addCpus and removes removeCpus to those balloons // that whose containers are allowed to use shared idle CPUs. Returns // balloons that will need re-pinning. func (p *balloons) shareIdleCpus(addCpus, removeCpus cpuset.CPUSet) []*Balloon { updateBalloons := map[int]struct{}{} if removeCpus.Size() > 0 { for blnIdx, bln := range p.balloons { if bln.SharedIdleCpus.Intersection(removeCpus).Size() > 0 { bln.SharedIdleCpus = bln.SharedIdleCpus.Difference(removeCpus) updateBalloons[blnIdx] = struct{}{} } } } if addCpus.Size() > 0 { for blnIdx, bln := range p.balloons { topoLevel := bln.Def.ShareIdleCpusInSame if topoLevel == CPUTopologyLevelUndefined { continue } idleCpusInTopoLevel := cpuset.New() p.cpuTree.DepthFirstWalk(func(t *cpuTreeNode) error { // Dive in correct topology level. if t.level != topoLevel { return nil } // Does the balloon include CPUs in the correct topology level? if t.cpus.Intersection(bln.Cpus).Size() > 0 { // Share idle CPUs on this level to this balloon. idleCpusInTopoLevel = idleCpusInTopoLevel.Union(t.cpus.Intersection(addCpus)) } // Do not walk deeper than the correct level. return WalkSkipChildren }) if idleCpusInTopoLevel.Size() == 0 { continue } sharedBefore := bln.SharedIdleCpus.Size() bln.SharedIdleCpus = bln.SharedIdleCpus.Union(idleCpusInTopoLevel) sharedNow := bln.SharedIdleCpus.Size() if sharedBefore != sharedNow { log.Debugf("balloon %d shares %d new idle CPU(s) in %s(s), %d in total (%s)", bln.PrettyName(), sharedNow-sharedBefore, topoLevel, bln.SharedIdleCpus.Size(), bln.SharedIdleCpus) updateBalloons[blnIdx] = struct{}{} } } } updatedBalloons := make([]*Balloon, 0, len(updateBalloons)) for blnIdx := range updateBalloons { updatedBalloons = append(updatedBalloons, p.balloons[blnIdx]) } return updatedBalloons } // assignContainer adds a container to a balloon func (p *balloons) assignContainer(c cache.Container, bln *Balloon) { log.Info("assigning container %s to balloon %s", c.PrettyName(), bln) // TODO: inflate the balloon (add CPUs / reconfigure balloons) // if necessary podID := c.GetPodID() bln.PodIDs[podID] = append(bln.PodIDs[podID], c.GetCacheID()) p.updatePinning(bln) } // dismissContainer removes a container from a balloon func (p *balloons) dismissContainer(c cache.Container, bln *Balloon) { podID := c.GetPodID() bln.PodIDs[podID] = removeString(bln.PodIDs[podID], c.GetCacheID()) if len(bln.PodIDs[podID]) == 0 { delete(bln.PodIDs, podID) } } // pinCpuMem pins container to CPUs and memory nodes if flagged func (p *balloons) pinCpuMem(c cache.Container, cpus cpuset.CPUSet, mems idset.IDSet) { if p.bpoptions.PinCPU == nil || *p.bpoptions.PinCPU { log.Debug(" - pinning %s to cpuset: %s", c.PrettyName(), cpus) c.SetCpusetCpus(cpus.String()) if reqCpu, ok := c.GetResourceRequirements().Requests[corev1.ResourceCPU]; ok { mCpu := int(reqCpu.MilliValue()) c.SetCPUShares(int64(cache.MilliCPUToShares(int64(mCpu)))) } } if p.bpoptions.PinMemory == nil || *p.bpoptions.PinMemory { log.Debug(" - pinning %s to memory %s", c.PrettyName(), mems) c.SetCpusetMems(mems.String()) } } // balloonsError formats an error from this policy. 
func balloonsError(format string, args ...interface{}) error { return fmt.Errorf(PolicyName+": "+format, args...) } // removeString returns the first occurrence of a string from string slice. func removeString(strings []string, element string) []string { for index, s := range strings { if s == element { strings[index] = strings[len(strings)-1] return strings[:len(strings)-1] } } return strings } func max(a, b int) int { if a > b { return a } return b } // Register us as a policy implementation. func init() { policy.Register(PolicyName, PolicyDescription, CreateBalloonsPolicy) } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/balloons/balloons-policy_test.go ================================================ // Copyright 2022 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package balloons import ( "testing" ) func TestChangesBalloons(t *testing.T) { tcases := []struct { name string opts1 *BalloonsOptions opts2 *BalloonsOptions expectedValue bool }{ { name: "both options are nil", expectedValue: false, }, { name: "one option is nil", opts2: &BalloonsOptions{}, expectedValue: true, }, { name: "reserved pool namespaces differ by len", opts1: &BalloonsOptions{ IdleCpuClass: "icc0", ReservedPoolNamespaces: []string{"ns0"}, }, opts2: &BalloonsOptions{ IdleCpuClass: "icc0", ReservedPoolNamespaces: []string{}, }, expectedValue: true, }, { name: "reserved pool namespaces differ by content", opts1: &BalloonsOptions{ IdleCpuClass: "icc0", ReservedPoolNamespaces: []string{"ns0"}, }, opts2: &BalloonsOptions{ IdleCpuClass: "icc0", ReservedPoolNamespaces: []string{"ns1"}, }, expectedValue: true, }, { name: "idle cpu classes differ", opts1: &BalloonsOptions{ IdleCpuClass: "icc0", ReservedPoolNamespaces: []string{"ns0"}, }, opts2: &BalloonsOptions{ IdleCpuClass: "icc1", ReservedPoolNamespaces: []string{"ns0"}, }, expectedValue: false, }, { name: "balloon defs differ", opts1: &BalloonsOptions{ IdleCpuClass: "icc0", ReservedPoolNamespaces: []string{"ns0"}, BalloonDefs: []*BalloonDef{}, }, opts2: &BalloonsOptions{ IdleCpuClass: "icc1", ReservedPoolNamespaces: []string{"ns0"}, BalloonDefs: []*BalloonDef{}, }, expectedValue: false, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { value := changesBalloons(tc.opts1, tc.opts2) if value != tc.expectedValue { t.Errorf("Expected return value %v but got %v", tc.expectedValue, value) } }) } } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/balloons/cputree.go ================================================ // Copyright 2022 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package balloons

import (
	"encoding/json"
	"errors"
	"fmt"
	"sort"
	"strings"

	system "github.com/intel/cri-resource-manager/pkg/sysfs"
	"github.com/intel/cri-resource-manager/pkg/utils/cpuset"
)

type CPUTopologyLevel int

const (
	CPUTopologyLevelUndefined CPUTopologyLevel = iota
	CPUTopologyLevelSystem
	CPUTopologyLevelPackage
	CPUTopologyLevelDie
	CPUTopologyLevelNuma
	CPUTopologyLevelCore
	CPUTopologyLevelThread
	CPUTopologyLevelCount
)

// cpuTreeNode is a node in the CPU tree.
type cpuTreeNode struct {
	name     string
	level    CPUTopologyLevel
	parent   *cpuTreeNode
	children []*cpuTreeNode
	cpus     cpuset.CPUSet // union of CPUs of child nodes
}

// cpuTreeNodeAttributes contains various attributes of a CPU tree
// node. When allocating or releasing CPUs, all CPU tree nodes in
// which allocating/releasing could be possible are stored to the same
// slice with these attributes. The attributes contain all necessary
// information for comparing which nodes are the best choices for
// allocating/releasing, thus traversing the tree is not needed in the
// comparison phase.
type cpuTreeNodeAttributes struct {
	t                *cpuTreeNode
	depth            int
	currentCpus      cpuset.CPUSet
	freeCpus         cpuset.CPUSet
	currentCpuCount  int
	currentCpuCounts []int
	freeCpuCount     int
	freeCpuCounts    []int
}

// cpuTreeAllocator allocates CPUs from the branch of a CPU tree
// where the "root" node is the topmost CPU of the branch.
type cpuTreeAllocator struct {
	options cpuTreeAllocatorOptions
	root    *cpuTreeNode
}

// cpuTreeAllocatorOptions contains parameters for the CPU allocator
// that selects CPUs from a CPU tree.
type cpuTreeAllocatorOptions struct {
	// topologyBalancing true prefers allocating from branches
	// with most free CPUs (spread allocations), while false is
	// the opposite (packed allocations).
	topologyBalancing           bool
	preferSpreadOnPhysicalCores bool
}

// String returns the topology level as a string.
func (ctl CPUTopologyLevel) String() string {
	s, ok := cpuTopologyLevelToString[ctl]
	if ok {
		return s
	}
	return fmt.Sprintf("CPUTopologyLevelUnknown(%d)", ctl)
}

// cpuTopologyLevelToString defines names for all CPU topology levels.
var cpuTopologyLevelToString = map[CPUTopologyLevel]string{
	CPUTopologyLevelUndefined: "",
	CPUTopologyLevelSystem:    "system",
	CPUTopologyLevelPackage:   "package",
	CPUTopologyLevelDie:       "die",
	CPUTopologyLevelNuma:      "numa",
	CPUTopologyLevelCore:      "core",
	CPUTopologyLevelThread:    "thread",
}

// MarshalJSON marshals a CPUTopologyLevel as a JSON string.
func (ctl CPUTopologyLevel) MarshalJSON() ([]byte, error) {
	return json.Marshal(ctl.String())
}

// UnmarshalJSON unmarshals a JSON string to a CPUTopologyLevel.
func (ctl *CPUTopologyLevel) UnmarshalJSON(data []byte) error {
	var dataString string
	if err := json.Unmarshal(data, &dataString); err != nil {
		return err
	}
	name := strings.ToLower(dataString)
	for ctlConst, ctlString := range cpuTopologyLevelToString {
		if ctlString == name {
			*ctl = ctlConst
			return nil
		}
	}
	return fmt.Errorf("invalid CPU topology level %q", name)
}

// String returns a string representation of a CPU tree node.
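//
// Leafless nodes print as their name; nodes with children print their
// children recursively, so a hypothetical two-package tree renders as
// "system[p0[...] p1[...]]".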
func (t *cpuTreeNode) String() string { if len(t.children) == 0 { return t.name } return fmt.Sprintf("%s%v", t.name, t.children) } func (t *cpuTreeNode) PrettyPrint() string { origDepth := t.Depth() lines := []string{} t.DepthFirstWalk(func(tn *cpuTreeNode) error { lines = append(lines, fmt.Sprintf("%s%s: %q cpus: %s", strings.Repeat(" ", (tn.Depth()-origDepth)*4), tn.level, tn.name, tn.cpus)) return nil }) return strings.Join(lines, "\n") } // String returns cpuTreeNodeAttributes as a string. func (tna cpuTreeNodeAttributes) String() string { return fmt.Sprintf("%s{%d,%v,%d,%d}", tna.t.name, tna.depth, tna.currentCpuCounts, tna.freeCpuCount, tna.freeCpuCounts) } // NewCpuTree returns a named CPU tree node. func NewCpuTree(name string) *cpuTreeNode { return &cpuTreeNode{ name: name, cpus: cpuset.New(), } } func (t *cpuTreeNode) CopyTree() *cpuTreeNode { newNode := t.CopyNode() newNode.children = make([]*cpuTreeNode, 0, len(t.children)) for _, child := range t.children { newNode.AddChild(child.CopyTree()) } return newNode } func (t *cpuTreeNode) CopyNode() *cpuTreeNode { newNode := cpuTreeNode{ name: t.name, level: t.level, parent: t.parent, children: t.children, cpus: t.cpus, } return &newNode } // Depth returns the distance from the root node. func (t *cpuTreeNode) Depth() int { if t.parent == nil { return 0 } return t.parent.Depth() + 1 } // AddChild adds new child node to a CPU tree node. func (t *cpuTreeNode) AddChild(child *cpuTreeNode) { child.parent = t t.children = append(t.children, child) } // AddCpus adds CPUs to a CPU tree node and all its parents. func (t *cpuTreeNode) AddCpus(cpus cpuset.CPUSet) { t.cpus = t.cpus.Union(cpus) if t.parent != nil { t.parent.AddCpus(cpus) } } // Cpus returns CPUs of a CPU tree node. func (t *cpuTreeNode) Cpus() cpuset.CPUSet { return t.cpus } // SiblingIndex returns the index of this node among its parents // children. Returns -1 for the root node, -2 if this node is not // listed among the children of its parent. func (t *cpuTreeNode) SiblingIndex() int { if t.parent == nil { return -1 } for idx, child := range t.parent.children { if child == t { return idx } } return -2 } func (t *cpuTreeNode) FindLeafWithCpu(cpu int) *cpuTreeNode { var found *cpuTreeNode t.DepthFirstWalk(func(tn *cpuTreeNode) error { if len(tn.children) > 0 { return nil } for _, cpuHere := range tn.cpus.List() { if cpu == cpuHere { found = tn return WalkStop } } return nil // not found here, no more children to search }) return found } // WalkSkipChildren error returned from a DepthFirstWalk handler // prevents walking deeper in the tree. The caller of the // DepthFirstWalk will get no error. var WalkSkipChildren error = errors.New("skip children") // WalkStop error returned from a DepthFirstWalk handler stops the // walk altogether. The caller of the DepthFirstWalk will get the // WalkStop error. var WalkStop error = errors.New("stop") // DepthFirstWalk walks through nodes in a CPU tree. Every node is // passed to the handler callback that controls next step by // returning: // - nil: continue walking to the next node // - WalkSkipChildren: continue to the next node but skip children of this node // - WalkStop: stop walking. 
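//
// A minimal usage sketch (assuming root was built with
// NewCpuTreeFromSystem; it counts NUMA nodes without descending into
// cores or threads):
//
//	numaNodes := 0
//	_ = root.DepthFirstWalk(func(tn *cpuTreeNode) error {
//		if tn.level == CPUTopologyLevelNuma {
//			numaNodes++
//			return WalkSkipChildren
//		}
//		return nil
//	})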
func (t *cpuTreeNode) DepthFirstWalk(handler func(*cpuTreeNode) error) error { if err := handler(t); err != nil { if err == WalkSkipChildren { return nil } return err } for _, child := range t.children { if err := child.DepthFirstWalk(handler); err != nil { return err } } return nil } // CpuLocations returns a slice where each element contains names of // topology elements over which a set of CPUs spans. Example: // systemNode.CpuLocations(cpuset:0,99) = [["system"],["p0", "p1"], ["p0d0", "p1d0"], ...] func (t *cpuTreeNode) CpuLocations(cpus cpuset.CPUSet) [][]string { names := make([][]string, int(CPUTopologyLevelCount)-int(t.level)) t.DepthFirstWalk(func(tn *cpuTreeNode) error { if tn.cpus.Intersection(cpus).Size() == 0 { return WalkSkipChildren } levelIndex := int(tn.level) - int(t.level) names[levelIndex] = append(names[levelIndex], tn.name) return nil }) return names } // NewCpuTreeFromSystem returns the root node of the topology tree // constructed from the underlying system. func NewCpuTreeFromSystem() (*cpuTreeNode, error) { sys, err := system.DiscoverSystem() if err != nil { return nil, err } // TODO: split deep nested loops into functions sysTree := NewCpuTree("system") sysTree.level = CPUTopologyLevelSystem for _, packageID := range sys.PackageIDs() { packageTree := NewCpuTree(fmt.Sprintf("p%d", packageID)) packageTree.level = CPUTopologyLevelPackage cpuPackage := sys.Package(packageID) sysTree.AddChild(packageTree) for _, dieID := range cpuPackage.DieIDs() { dieTree := NewCpuTree(fmt.Sprintf("p%dd%d", packageID, dieID)) dieTree.level = CPUTopologyLevelDie packageTree.AddChild(dieTree) for _, nodeID := range cpuPackage.DieNodeIDs(dieID) { nodeTree := NewCpuTree(fmt.Sprintf("p%dd%dn%d", packageID, dieID, nodeID)) nodeTree.level = CPUTopologyLevelNuma dieTree.AddChild(nodeTree) node := sys.Node(nodeID) threadsSeen := map[int]struct{}{} for _, cpuID := range node.CPUSet().List() { if _, alreadySeen := threadsSeen[cpuID]; alreadySeen { continue } cpuTree := NewCpuTree(fmt.Sprintf("p%dd%dn%dcpu%d", packageID, dieID, nodeID, cpuID)) cpuTree.level = CPUTopologyLevelCore nodeTree.AddChild(cpuTree) cpu := sys.CPU(cpuID) for _, threadID := range cpu.ThreadCPUSet().List() { threadsSeen[threadID] = struct{}{} threadTree := NewCpuTree(fmt.Sprintf("p%dd%dn%dcpu%dt%d", packageID, dieID, nodeID, cpuID, threadID)) threadTree.level = CPUTopologyLevelThread cpuTree.AddChild(threadTree) threadTree.AddCpus(cpuset.New(threadID)) } } } } } return sysTree, nil } // ToAttributedSlice returns a CPU tree node and recursively all its // child nodes in a slice that contains nodes with their attributes // for allocation/releasing comparison. 
// - currentCpus is the set of CPUs that can be freed in coming operation // - freeCpus is the set of CPUs that can be allocated in coming operation // - filter(tna) returns false if the node can be ignored func (t *cpuTreeNode) ToAttributedSlice( currentCpus, freeCpus cpuset.CPUSet, filter func(*cpuTreeNodeAttributes) bool) []cpuTreeNodeAttributes { tnas := []cpuTreeNodeAttributes{} currentCpuCounts := []int{} freeCpuCounts := []int{} t.toAttributedSlice(currentCpus, freeCpus, filter, &tnas, 0, currentCpuCounts, freeCpuCounts) return tnas } func (t *cpuTreeNode) toAttributedSlice( currentCpus, freeCpus cpuset.CPUSet, filter func(*cpuTreeNodeAttributes) bool, tnas *[]cpuTreeNodeAttributes, depth int, currentCpuCounts []int, freeCpuCounts []int) { currentCpusHere := t.cpus.Intersection(currentCpus) freeCpusHere := t.cpus.Intersection(freeCpus) currentCpuCountHere := currentCpusHere.Size() currentCpuCountsHere := make([]int, len(currentCpuCounts)+1, len(currentCpuCounts)+1) copy(currentCpuCountsHere, currentCpuCounts) currentCpuCountsHere[depth] = currentCpuCountHere freeCpuCountHere := freeCpusHere.Size() freeCpuCountsHere := make([]int, len(freeCpuCounts)+1, len(freeCpuCounts)+1) copy(freeCpuCountsHere, freeCpuCounts) freeCpuCountsHere[depth] = freeCpuCountHere tna := cpuTreeNodeAttributes{ t: t, depth: depth, currentCpus: currentCpusHere, freeCpus: freeCpusHere, currentCpuCount: currentCpuCountHere, currentCpuCounts: currentCpuCountsHere, freeCpuCount: freeCpuCountHere, freeCpuCounts: freeCpuCountsHere, } if filter != nil && !filter(&tna) { return } *tnas = append(*tnas, tna) for _, child := range t.children { child.toAttributedSlice(currentCpus, freeCpus, filter, tnas, depth+1, currentCpuCountsHere, freeCpuCountsHere) } } // SplitLevel returns the root node of a new CPU tree where all // branches of a topology level have been split into new classes. func (t *cpuTreeNode) SplitLevel(splitLevel CPUTopologyLevel, cpuClassifier func(int) int) *cpuTreeNode { newRoot := t.CopyTree() newRoot.DepthFirstWalk(func(tn *cpuTreeNode) error { // Dive into the level that will be split. if tn.level != splitLevel { return nil } // Classify CPUs to the map: class -> list of cpus classCpus := map[int][]int{} for _, cpu := range t.cpus.List() { class := cpuClassifier(cpu) classCpus[class] = append(classCpus[class], cpu) } // Clear existing children of this node. New children // will be classes whose children are masked versions // of original children of this node. origChildren := tn.children tn.children = make([]*cpuTreeNode, 0, len(classCpus)) // Add new child corresponding each class. for class, cpus := range classCpus { cpuMask := cpuset.New(cpus...) newNode := NewCpuTree(fmt.Sprintf("%sclass%d", tn.name, class)) tn.AddChild(newNode) newNode.cpus = tn.cpus.Intersection(cpuMask) newNode.level = tn.level newNode.parent = tn for _, child := range origChildren { newChild := child.CopyTree() newChild.DepthFirstWalk(func(cn *cpuTreeNode) error { cn.cpus = cn.cpus.Intersection(cpuMask) if cn.cpus.Size() == 0 && cn.parent != nil { // all cpus masked // out: cut out this // branch newSiblings := []*cpuTreeNode{} for _, child := range cn.parent.children { if child != cn { newSiblings = append(newSiblings, child) } } cn.parent.children = newSiblings return WalkSkipChildren } return nil }) newNode.AddChild(newChild) } } return WalkSkipChildren }) return newRoot } // NewAllocator returns new CPU allocator for allocating CPUs from a // CPU tree branch. 
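//
// When preferSpreadOnPhysicalCores is set, the allocator operates on
// a tree where every NUMA node has been split (SplitLevel above) into
// classes by hyperthread sibling index: class 0 holds the first
// thread of each physical core, class 1 the second, and so on, so
// allocations fill distinct physical cores before doubling up on
// hyperthreads.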
func (t *cpuTreeNode) NewAllocator(options cpuTreeAllocatorOptions) *cpuTreeAllocator { ta := &cpuTreeAllocator{ root: t, options: options, } if options.preferSpreadOnPhysicalCores { newTree := t.SplitLevel(CPUTopologyLevelNuma, // CPU classifier: class of the CPU equals to // the index in the child list of its parent // node in the tree. Expect leaf node is a // hyperthread, parent a physical core. func(cpu int) int { leaf := t.FindLeafWithCpu(cpu) if leaf == nil { log.Fatalf("SplitLevel CPU classifier: cpu %d not in tree:\n%s\n\n", cpu, t.PrettyPrint()) } return leaf.SiblingIndex() }) ta.root = newTree } return ta } // sorterAllocate implements an "is-less-than" callback that helps // sorting a slice of cpuTreeNodeAttributes. The first item in the // sorted list contains an optimal CPU tree node for allocating new // CPUs. func (ta *cpuTreeAllocator) sorterAllocate(tnas []cpuTreeNodeAttributes) func(int, int) bool { return func(i, j int) bool { if tnas[i].depth != tnas[j].depth { return tnas[i].depth > tnas[j].depth } for tdepth := 0; tdepth < len(tnas[i].currentCpuCounts); tdepth += 1 { // After this currentCpus will increase. // Maximize the maximal amount of currentCpus // as high level in the topology as possible. if tnas[i].currentCpuCounts[tdepth] != tnas[j].currentCpuCounts[tdepth] { return tnas[i].currentCpuCounts[tdepth] > tnas[j].currentCpuCounts[tdepth] } } for tdepth := 0; tdepth < len(tnas[i].freeCpuCounts); tdepth += 1 { // After this freeCpus will decrease. if tnas[i].freeCpuCounts[tdepth] != tnas[j].freeCpuCounts[tdepth] { if ta.options.topologyBalancing { // Goal: minimize maximal freeCpus in topology. return tnas[i].freeCpuCounts[tdepth] > tnas[j].freeCpuCounts[tdepth] } else { // Goal: maximize maximal freeCpus in topology. return tnas[i].freeCpuCounts[tdepth] < tnas[j].freeCpuCounts[tdepth] } } } return tnas[i].t.name < tnas[j].t.name } } // sorterRelease implements an "is-less-than" callback that helps // sorting a slice of cpuTreeNodeAttributes. The first item in the // list contains an optimal CPU tree node for releasing new CPUs. func (ta *cpuTreeAllocator) sorterRelease(tnas []cpuTreeNodeAttributes) func(int, int) bool { return func(i, j int) bool { if tnas[i].depth != tnas[j].depth { return tnas[i].depth > tnas[j].depth } for tdepth := 0; tdepth < len(tnas[i].currentCpuCounts); tdepth += 1 { // After this currentCpus will decrease. Aim // to minimize the minimal amount of // currentCpus in order to decrease // fragmentation as high level in the topology // as possible. if tnas[i].currentCpuCounts[tdepth] != tnas[j].currentCpuCounts[tdepth] { return tnas[i].currentCpuCounts[tdepth] < tnas[j].currentCpuCounts[tdepth] } } for tdepth := 0; tdepth < len(tnas[i].freeCpuCounts); tdepth += 1 { // After this freeCpus will increase. Try to // maximize minimal free CPUs for better // isolation as high level in the topology as // possible. if tnas[i].freeCpuCounts[tdepth] != tnas[j].freeCpuCounts[tdepth] { if ta.options.topologyBalancing { return tnas[i].freeCpuCounts[tdepth] < tnas[j].freeCpuCounts[tdepth] } else { return tnas[i].freeCpuCounts[tdepth] < tnas[j].freeCpuCounts[tdepth] } } } return tnas[i].t.name > tnas[j].t.name } } // ResizeCpus implements topology awareness to both adding CPUs to and // removing them from a set of CPUs. It returns CPUs from which actual // allocation or releasing of CPUs can be done. ResizeCpus does not // allocate or release CPUs. // // Parameters: // - currentCpus: a set of CPUs to/from which CPUs would be added/removed. 
// - freeCpus: a set of available CPUs.
// - delta: number of CPUs to add (if positive) or remove (if negative).
//
// Return values:
// - addFromCpus contains free CPUs from which delta CPUs can be
//   allocated. Note that the size of the set may be larger than
//   delta: there is room for other allocation logic to select from
//   these CPUs.
// - removeFromCpus contains CPUs in the currentCpus set from which
//   abs(delta) CPUs can be freed.
func (ta *cpuTreeAllocator) ResizeCpus(currentCpus, freeCpus cpuset.CPUSet, delta int) (cpuset.CPUSet, cpuset.CPUSet, error) {
	if delta > 0 {
		addFromSuperset, removeFromSuperset, err := ta.resizeCpus(currentCpus, freeCpus, delta)
		if !ta.options.preferSpreadOnPhysicalCores || addFromSuperset.Size() == delta {
			return addFromSuperset, removeFromSuperset, err
		}
		// addFromSuperset contains more CPUs (equally good
		// choices) than actually needed. In case of
		// preferSpreadOnPhysicalCores, however, selecting any
		// of these does not give an equally good
		// outcome. Therefore, in this case, construct the
		// addFrom set by adding one CPU at a time.
		addFrom := cpuset.New()
		for n := 0; n < delta; n++ {
			addSingleFrom, _, err := ta.resizeCpus(currentCpus, freeCpus, 1)
			if err != nil {
				return addFromSuperset, removeFromSuperset, err
			}
			if addSingleFrom.Size() != 1 {
				return addFromSuperset, removeFromSuperset,
					fmt.Errorf("internal error: failed to find single CPU to allocate, "+
						"currentCpus=%s freeCpus=%s expectedSingle=%s",
						currentCpus, freeCpus, addSingleFrom)
			}
			addFrom = addFrom.Union(addSingleFrom)
			if addFrom.Size() != n+1 {
				return addFromSuperset, removeFromSuperset,
					fmt.Errorf("internal error: double add the same CPU (%s) to cpuset %s on round %d",
						addSingleFrom, addFrom, n+1)
			}
			currentCpus = currentCpus.Union(addSingleFrom)
			freeCpus = freeCpus.Difference(addSingleFrom)
		}
		return addFrom, removeFromSuperset, nil
	}
	// In multi-CPU removal, remove CPUs one by one instead of
	// trying to find a single topology element from which all of
	// them could be removed.
	removeFrom := cpuset.New()
	addFrom := cpuset.New()
	for n := 0; n < -delta; n++ {
		_, removeSingleFrom, err := ta.resizeCpus(currentCpus, freeCpus, -1)
		if err != nil {
			return addFrom, removeFrom, err
		}
		// Make cheap internal error checks in order to capture
		// issues in alternative algorithms.
if removeSingleFrom.Size() != 1 { return addFrom, removeFrom, fmt.Errorf("internal error: failed to find single cpu to free, "+ "currentCpus=%s freeCpus=%s expectedSingle=%s", currentCpus, freeCpus, removeSingleFrom) } if removeFrom.Union(removeSingleFrom).Size() != n+1 { return addFrom, removeFrom, fmt.Errorf("internal error: double release of a cpu, "+ "currentCpus=%s freeCpus=%s alreadyRemoved=%s removedNow=%s", currentCpus, freeCpus, removeFrom, removeSingleFrom) } removeFrom = removeFrom.Union(removeSingleFrom) currentCpus = currentCpus.Difference(removeSingleFrom) freeCpus = freeCpus.Union(removeSingleFrom) } return addFrom, removeFrom, nil } func (ta *cpuTreeAllocator) resizeCpus(currentCpus, freeCpus cpuset.CPUSet, delta int) (cpuset.CPUSet, cpuset.CPUSet, error) { tnas := ta.root.ToAttributedSlice(currentCpus, freeCpus, func(tna *cpuTreeNodeAttributes) bool { // filter out branches with insufficient cpus if delta > 0 && tna.freeCpuCount-delta < 0 { // cannot allocate delta cpus return false } if delta < 0 && tna.currentCpuCount+delta < 0 { // cannot release delta cpus return false } return true }) // Sort based on attributes if delta > 0 { sort.Slice(tnas, ta.sorterAllocate(tnas)) } else { sort.Slice(tnas, ta.sorterRelease(tnas)) } if len(tnas) == 0 { return freeCpus, currentCpus, fmt.Errorf("not enough free CPUs") } return tnas[0].freeCpus, tnas[0].currentCpus, nil } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/balloons/cputree_test.go ================================================ // Copyright 2022 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package balloons import ( "fmt" "sort" "strings" "testing" "github.com/intel/cri-resource-manager/pkg/utils/cpuset" ) type cpuInTopology struct { packageID, dieID, numaID, coreID, threadID, cpuID int packageName, dieName, numaName, coreName, threadName, cpuName string } type cpusInTopology map[int]cpuInTopology func (cit cpuInTopology) TopoName(topoLevel string) string { switch topoLevel { case "thread": return cit.threadName case "core": return cit.coreName case "numa": return cit.numaName case "die": return cit.dieName case "package": return cit.packageName } panic("invalid topoLevel") } func (csit cpusInTopology) dumps(nameCpus map[string]cpuset.CPUSet) string { lines := []string{} names := make([]string, 0, len(nameCpus)) for name := range nameCpus { names = append(names, name) } sort.Strings(names) for cpuID := 0; cpuID < len(csit); cpuID++ { line := fmt.Sprintf("cpu%02d %s", cpuID, csit[cpuID].threadName) for _, name := range names { if nameCpus[name].Contains(cpuID) { line = fmt.Sprintf("%s %s", line, name) } } lines = append(lines, line) } return strings.Join(lines, "\n") } func newCpuTreeFromInt5(pdnct [5]int) (*cpuTreeNode, cpusInTopology) { pkgs := pdnct[0] dies := pdnct[1] numas := pdnct[2] cores := pdnct[3] threads := pdnct[4] cpuID := 0 sysTree := NewCpuTree("system") sysTree.level = CPUTopologyLevelSystem csit := cpusInTopology{} for packageID := 0; packageID < pkgs; packageID++ { packageTree := NewCpuTree(fmt.Sprintf("p%d", packageID)) packageTree.level = CPUTopologyLevelPackage sysTree.AddChild(packageTree) for dieID := 0; dieID < dies; dieID++ { dieTree := NewCpuTree(fmt.Sprintf("p%dd%d", packageID, dieID)) dieTree.level = CPUTopologyLevelDie packageTree.AddChild(dieTree) for numaID := 0; numaID < numas; numaID++ { numaTree := NewCpuTree(fmt.Sprintf("p%dd%dn%d", packageID, dieID, numaID)) numaTree.level = CPUTopologyLevelNuma dieTree.AddChild(numaTree) for coreID := 0; coreID < cores; coreID++ { coreTree := NewCpuTree(fmt.Sprintf("p%dd%dn%dc%02d", packageID, dieID, numaID, coreID)) coreTree.level = CPUTopologyLevelCore numaTree.AddChild(coreTree) for threadID := 0; threadID < threads; threadID++ { threadTree := NewCpuTree(fmt.Sprintf("p%dd%dn%dc%02dt%d", packageID, dieID, numaID, coreID, threadID)) threadTree.level = CPUTopologyLevelThread coreTree.AddChild(threadTree) threadTree.AddCpus(cpuset.New(cpuID)) csit[cpuID] = cpuInTopology{ packageID, dieID, numaID, coreID, threadID, cpuID, packageTree.name, dieTree.name, numaTree.name, coreTree.name, threadTree.name, fmt.Sprintf("cpu%d", cpuID), } cpuID += 1 } } } } } return sysTree, csit } func verifyNotOn(t *testing.T, nameContents string, cpus cpuset.CPUSet, csit cpusInTopology) { for _, cpuID := range cpus.List() { name := csit[cpuID].threadName if strings.Contains(name, nameContents) { t.Errorf("cpu%d (%s) in unexpected region %s", cpuID, name, nameContents) } } } func doVerifySame(t *testing.T, topoLevel string, cpus cpuset.CPUSet, csit cpusInTopology, inversed bool) { seenName := "" seenCpuID := -1 for _, cpuID := range cpus.List() { cit := csit[cpuID] thisName := cit.TopoName(topoLevel) thisCpuID := cit.cpuID if thisName == "" { t.Errorf("unexpected (invalid) topology level %q", topoLevel) } if seenName == "" { seenName = thisName seenCpuID = cit.cpuID continue } if (seenName != thisName && !inversed) || (seenName == thisName && inversed) { msg := "the same" if inversed { msg = "not the same" } t.Errorf("expected %s %s, got: cpu%d in %s, cpu%d in %s", msg, topoLevel, seenCpuID, seenName, thisCpuID, thisName) } } } func 
verifySame(t *testing.T, topoLevel string, cpus cpuset.CPUSet, csit cpusInTopology) { doVerifySame(t, topoLevel, cpus, csit, false) } func verifyNotSame(t *testing.T, topoLevel string, cpus cpuset.CPUSet, csit cpusInTopology) { doVerifySame(t, topoLevel, cpus, csit, true) } func (csit cpusInTopology) getElements(topoLevel string, cpus cpuset.CPUSet) []string { elts := []string{} for _, cpuID := range cpus.List() { elts = append(elts, csit[cpuID].TopoName(topoLevel)) } return elts } func (csit cpusInTopology) verifyDisjoint(t *testing.T, topoLevel string, cpusA cpuset.CPUSet, cpusB cpuset.CPUSet) { eltsA := csit.getElements(topoLevel, cpusA) eltsB := csit.getElements(topoLevel, cpusB) for _, eltA := range eltsA { for _, eltB := range eltsB { if eltA == eltB { t.Errorf("expected disjoint %ss, got %s on both cpusets %s and %s", topoLevel, eltA, cpusA, cpusB) return } } } } /* CPU ids and locations in the 2-2-2-2-2-topology for verifying current and developing future unit tests. The location in topology is in format: p/d/n/c/t topology: [5]int{2, 2, 2, 2, 2}, allocations: []int{ 0, // cpu on p0/d0/n0/c0/t0 1, // cpu on p0/d0/n0/c0/t1 2, // cpu on p0/d0/n0/c1/t0 3, // cpu on p0/d0/n0/c1/t1 4, // cpu on p0/d0/n1/c0/t0 5, // cpu on p0/d0/n1/c0/t1 6, // cpu on p0/d0/n1/c1/t0 7, // cpu on p0/d0/n1/c1/t1 8, // cpu on p0/d1/n0/c0/t0 9, // cpu on p0/d1/n0/c0/t1 10, // cpu on p0/d1/n0/c1/t0 11, // cpu on p0/d1/n0/c1/t1 12, // cpu on p0/d1/n1/c0/t0 13, // cpu on p0/d1/n1/c0/t1 14, // cpu on p0/d1/n1/c1/t0 15, // cpu on p0/d1/n1/c1/t1 16, // cpu on p1/d0/n0/c0/t0 17, // cpu on p1/d0/n0/c0/t1 18, // cpu on p1/d0/n0/c1/t0 19, // cpu on p1/d0/n0/c1/t1 20, // cpu on p1/d0/n1/c0/t0 21, // cpu on p1/d0/n1/c0/t1 22, // cpu on p1/d0/n1/c1/t0 23, // cpu on p1/d0/n1/c1/t1 24, // cpu on p1/d1/n0/c0/t0 25, // cpu on p1/d1/n0/c0/t1 26, // cpu on p1/d1/n0/c1/t0 27, // cpu on p1/d1/n0/c1/t1 28, // cpu on p1/d1/n1/c0/t0 29, // cpu on p1/d1/n1/c0/t1 30, // cpu on p1/d1/n1/c1/t0 31, // cpu on p1/d1/n1/c1/t1 }, */ func TestResizeCpus(t *testing.T) { type TopoCcids struct { topo string ccids []int } tcases := []struct { name string topology [5]int // package, die, numa, core, thread count allocatorTB bool // allocator topologyBalancing allocatorPSoPC bool // allocator preferSpreadOnPhysicalCores allocations []int deltas []int allocate bool operateOnCcid []int // which ccid (currentCpus id) will be used on call expectCurrentOnSame []string expectCurrentNotOnSame []string expectAllOnSame []string expectCurrentNotOn []string expectAddSizes []int expectDisjoint []TopoCcids // which ccids should be disjoint expectErrors []string }{ { name: "first allocations", topology: [5]int{2, 2, 2, 2, 2}, deltas: []int{0, 1, 2, 3, 4, 5, 7, 8, 9, 15, 16, 17, 31, 32}, expectAddSizes: []int{0, 1, 2, 4, 4, 8, 8, 8, 16, 16, 16, 32, 32, 32}, }, { name: "too large an allocation", topology: [5]int{2, 2, 2, 2, 2}, deltas: []int{33}, expectErrors: []string{"not enough free CPUs"}, }, { name: "spread allocations", topology: [5]int{2, 2, 2, 2, 2}, allocatorTB: true, deltas: []int{1, 1, 1, 1, 1, 1, 1, 1}, allocate: true, operateOnCcid: []int{1, 2, 3, 4, 5, 6, 7, 8}, expectDisjoint: []TopoCcids{ {}, {"package", []int{1, 2}}, {"die", []int{1, 2, 3}}, {"die", []int{1, 2, 3, 4}}, {"numa", []int{1, 2, 3, 4, 5}}, {"numa", []int{1, 2, 3, 4, 5, 6}}, {"numa", []int{1, 2, 3, 4, 5, 6, 7}}, {"numa", []int{1, 2, 3, 4, 5, 6, 7, 8}}, }, }, { name: "spread allocations2", topology: [5]int{4, 1, 4, 8, 2}, allocatorTB: true, deltas: []int{1, 3, 2, 4, 1, 4, 2, 4}, allocate: 
true, operateOnCcid: []int{1, 2, 3, 4, 5, 6, 7, 8}, expectDisjoint: []TopoCcids{ {}, {"package", []int{1, 2}}, {"package", []int{1, 2, 3}}, {"package", []int{1, 2, 3, 4}}, {"numa", []int{1, 2, 3, 4, 5}}, {"numa", []int{1, 2, 3, 4, 5, 6}}, {"numa", []int{1, 2, 3, 4, 5, 6, 7}}, {"numa", []int{1, 2, 3, 4, 5, 6, 7, 8}}, }, }, { name: "pack allocations", topology: [5]int{2, 2, 2, 2, 2}, allocatorTB: false, deltas: []int{1, 1, 1, 1}, allocate: true, operateOnCcid: []int{1, 2, 3, 4, 5}, expectAllOnSame: []string{ "", "core", "numa", "numa", "die", "die", }, }, { name: "inflate", topology: [5]int{2, 2, 2, 2, 2}, allocate: true, deltas: []int{ 1, 1, 1, 1, // cpu0..cpu3 on numaN0, dieD0 1, 3, // cpu4..cpu7 on numaN1, still dieD0 6, 1, 1, // cpu8..15 on dieD1, still packageP0 }, operateOnCcid: []int{ 1, 1, 1, 1, 1, 1, 1, 1, 1}, expectCurrentOnSame: []string{ "core", "core", "numa", "numa", "die", "die", "package", "package", "package"}, expectAddSizes: []int{ 1, 1, 1, 1, 1, 3, 8, 1, 1}, }, { name: "defragmenting single removals", topology: [5]int{2, 2, 2, 2, 2}, allocations: []int{ 0, // cpu on p0/d0/n0/c0/t0 2, // cpu on p0/d0/n0/c1/t0 3, // cpu on p0/d0/n0/c1/t1 7, // cpu on p0/d0/n1/c1/t1 10, // cpu on p0/d1/n0/c1/t0 17, // cpu on p1/d0/n0/c0/t1 18, // cpu on p1/d0/n0/c1/t0 }, allocate: true, deltas: []int{ -1, // release cpu17 or cpu18 -1, // release cpu17 or cpu18 => all on same package -1, // release cpu10 => all on same die -1, // release cpu7 => all on same numa -1, // release cpu0 => all on same core -1, // release cpu2 or cpu3 -1, // release cpu2 or cpu3 }, operateOnCcid: []int{1, 1, 1, 1, 1, 1, 1}, expectCurrentOnSame: []string{ "", "package", "die", "numa", "core", "core", "core", }, expectCurrentNotOn: []string{ "", "p1", "p0d1", "p0d0n1", "p0d0n0c00", }, }, { name: "defragmenting multi-removals", topology: [5]int{2, 2, 2, 2, 2}, allocations: []int{ 0, // cpu on p0/d0/n0/c0/t0 2, // cpu on p0/d0/n0/c1/t0 4, // cpu on p0/d0/n1/c0/t0 6, // cpu on p0/d0/n1/c1/t0 8, // cpu on p0/d1/n0/c0/t0 9, // cpu on p0/d1/n0/c0/t1 10, // cpu on p0/d1/n0/c1/t0 24, // cpu on p1/d1/n0/c0/t0 25, // cpu on p1/d1/n0/c0/t1 26, // cpu on p1/d1/n0/c1/t0 27, // cpu on p1/d1/n0/c1/t1 28, // cpu on p1/d1/n1/c0/t0 29, // cpu on p1/d1/n1/c0/t1 30, // cpu on p1/d1/n1/c1/t0 31, // cpu on p1/d1/n1/c1/t1 }, allocate: true, deltas: []int{ -2, // release from p0d1n0 -1, // release completely p0d1 -5, // release completely p0, one from p1d1nX -3, // release completely p1d1nX => all on same numa }, operateOnCcid: []int{1, 1, 1, 1}, expectCurrentOnSame: []string{ "", "", "die", "numa", }, expectCurrentNotOn: []string{ "", "p0d1", "p0", "", }, }, { name: "gentle rebalancing", topology: [5]int{2, 1, 1, 16, 2}, // 2 packages, 16 hyperthreaded cores per package => 64 cpus in total deltas: []int{ 4, 4, 14, 7, 7, 4, 4, 14, // allocate 8 sets of cpus, the last 14cpus fills package0, spills over to package1 -2, -2, -2, -2, // free a little room to package0 -1, 1, -1, 1, -1, 1, -1, 1}, // deflate/inflate the last 14cpus, see that it gradually travels to package0 operateOnCcid: []int{ 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 8, 8, 8, 8, 8, 8, 8, 8, }, allocate: true, expectCurrentOnSame: []string{ "package", "package", "package", "package", "package", "package", "package", "", "", "", "", "", "", "", "", "", "", "", "package", "package", }, }, { name: "prefer spread on physical cores", topology: [5]int{4, 1, 4, 8, 2}, allocatorTB: true, allocatorPSoPC: true, deltas: []int{ 2, 1, 4, 1, // allocate one thread from each core from the same NUMA 
3, 9, 16, // allocate three other cpusets, each should be from separate package (due to topology balancing) 3, 4, 3, // increase the size of the // original, 3+4 fits to the same // NUMA, in the last 3: first cpu // should fill the NUMA and the rest 2 // go to another NUMA on the same package. -2, 2, // release two CPUs that went to another NUMA on the same package, and put them back -10, // release 2+8 CPUs, the rest should be single threads each on their own core }, allocate: true, operateOnCcid: []int{ 1, 1, 1, 1, // allocate one thread from each core from the same NUMA by inflating all the time the same cpuset 2, 3, 4, // three new cpusets 1, 1, 1, // increase size over one NUMA 1, 1, 1, }, expectCurrentOnSame: []string{ "numa", "numa", "numa", "numa", "numa", "numa", "numa", "numa", "numa", "package", "numa", "package", "numa", }, expectCurrentNotOnSame: []string{ "core", "core", "core", "core", "core", "", "", "", "", "", "", "", "core", }, expectDisjoint: []TopoCcids{ {}, {}, {}, {}, {"package", []int{1, 2}}, {"package", []int{1, 2, 3}}, {"package", []int{1, 2, 3, 4}}, }, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { tree, csit := newCpuTreeFromInt5(tc.topology) treeA := tree.NewAllocator(cpuTreeAllocatorOptions{ topologyBalancing: tc.allocatorTB, preferSpreadOnPhysicalCores: tc.allocatorPSoPC, }) currentCpus := cpuset.New() freeCpus := tree.Cpus() if len(tc.allocations) > 0 { currentCpus = currentCpus.Union(cpuset.New(tc.allocations...)) freeCpus = freeCpus.Difference(cpuset.New(tc.allocations...)) } ccidCurrentCpus := map[int]cpuset.CPUSet{0: currentCpus} allocs := map[string]cpuset.CPUSet{"--:allo": currentCpus} for i, delta := range tc.deltas { if i < len(tc.operateOnCcid) && tc.operateOnCcid[i] > 0 { currentCpus = ccidCurrentCpus[tc.operateOnCcid[i]] } t.Logf("ResizeCpus(current=%s; free=%s; delta=%d)", currentCpus, freeCpus, delta) addFrom, removeFrom, err := treeA.ResizeCpus(currentCpus, freeCpus, delta) t.Logf("== addFrom=%s; removeFrom=%s, err=%v", addFrom, removeFrom, err) if i < len(tc.expectAddSizes) { if tc.expectAddSizes[i] != addFrom.Size() { t.Errorf("expected add size: %d, got %d", tc.expectAddSizes[i], addFrom.Size()) } } if i < len(tc.expectErrors) { if tc.expectErrors[i] == "" && err != nil { t.Errorf("expected nil error, but got %v", err) } if tc.expectErrors[i] != "" { if err == nil { t.Errorf("expected error containing %q, got nil", tc.expectErrors[i]) } else if !strings.Contains(fmt.Sprintf("%s", err), tc.expectErrors[i]) { t.Errorf("expected error containing %q, got %q", tc.expectErrors[i], err) } } } if tc.allocate { allocName := fmt.Sprintf("%02d:allo", i+1) allocs[allocName] = cpuset.New() for n, cpuID := range addFrom.List() { if n >= delta { break } freeCpus = freeCpus.Difference(cpuset.New(cpuID)) currentCpus = currentCpus.Union(cpuset.New(cpuID)) allocs[allocName] = allocs[allocName].Union(cpuset.New(cpuID)) } allocName = fmt.Sprintf("%02d:free", i+1) for n, cpuID := range removeFrom.List() { if n >= -delta { break } freeCpus = freeCpus.Union(cpuset.New(cpuID)) if i < len(tc.operateOnCcid) && tc.operateOnCcid[i] > 0 { currentCpus = currentCpus.Difference(cpuset.New(cpuID)) } allocs[allocName] = allocs[allocName].Union(cpuset.New(cpuID)) } if i < len(tc.operateOnCcid) && tc.operateOnCcid[i] > 0 { ccidCurrentCpus[tc.operateOnCcid[i]] = currentCpus } allocs["free"] = freeCpus t.Logf("=> current=%s; free=%s", currentCpus, freeCpus) if i < len(tc.expectCurrentOnSame) && tc.expectCurrentOnSame[i] != "" { verifySame(t, 
tc.expectCurrentOnSame[i], currentCpus, csit)
                }
                if i < len(tc.expectCurrentNotOnSame) && tc.expectCurrentNotOnSame[i] != "" {
                    verifyNotSame(t, tc.expectCurrentNotOnSame[i], currentCpus, csit)
                }
                if i < len(tc.expectCurrentNotOn) && tc.expectCurrentNotOn[i] != "" {
                    verifyNotOn(t, tc.expectCurrentNotOn[i], currentCpus, csit)
                }
                if i < len(tc.expectAllOnSame) && tc.expectAllOnSame[i] != "" {
                    allCpus := cpuset.New()
                    for _, cpus := range ccidCurrentCpus {
                        allCpus = allCpus.Union(cpus)
                    }
                    verifySame(t, tc.expectAllOnSame[i], allCpus, csit)
                }
                if i < len(tc.expectDisjoint) && len(tc.expectDisjoint) > 1 {
                    for first := 0; first < len(tc.expectDisjoint[i].ccids); first++ {
                        for second := first + 1; second < len(tc.expectDisjoint[i].ccids); second++ {
                            csit.verifyDisjoint(t, tc.expectDisjoint[i].topo,
                                ccidCurrentCpus[tc.expectDisjoint[i].ccids[first]],
                                ccidCurrentCpus[tc.expectDisjoint[i].ccids[second]])
                        }
                    }
                }
            }
            if t.Failed() {
                t.Logf("current and free cpus:\n%s\n", csit.dumps(allocs))
                break
            }
        }
    })
    }
}

func TestWalk(t *testing.T) {
    t.Run("single-node tree", func(t *testing.T) {
        tree := NewCpuTree("system")
        tree.level = CPUTopologyLevelSystem
        foundName := "unfound"
        foundLevel := CPUTopologyLevelUndefined
        rv := tree.DepthFirstWalk(func(tn *cpuTreeNode) error {
            foundName = tn.name
            foundLevel = tn.level
            return nil
        })
        if rv != nil {
            t.Errorf("expected no error, got %s", rv)
        }
        if foundLevel != CPUTopologyLevelSystem {
            t.Errorf("expected to find level %q, got %q", CPUTopologyLevelSystem, foundLevel)
        }
        if foundName != "system" {
            t.Errorf("expected to find name %q, got %q", "system", foundName)
        }
    })
    t.Run("fetch first core", func(t *testing.T) {
        tree, _ := newCpuTreeFromInt5([5]int{2, 2, 2, 2, 2})
        foundCount := 0
        foundName := ""
        rv := tree.DepthFirstWalk(func(tn *cpuTreeNode) error {
            foundCount += 1
            if tn.level == CPUTopologyLevelCore {
                foundName = tn.name
                return WalkStop
            }
            return nil
        })
        if rv != WalkStop {
            t.Errorf("expected WalkStop error, got %s", rv)
        }
        if foundCount != 5 {
            t.Errorf("expected to find 5 nodes, got %d", foundCount)
        }
        if foundName != "p0d0n0c00" {
            t.Errorf("expected to find p0d0n0c00, got %q", foundName)
        }
    })
    t.Run("skip children", func(t *testing.T) {
        tree, _ := newCpuTreeFromInt5([5]int{2, 2, 2, 2, 2})
        foundCount := 0
        rv := tree.DepthFirstWalk(func(tn *cpuTreeNode) error {
            foundCount += 1
            if tn.level == CPUTopologyLevelDie {
                return WalkSkipChildren
            }
            return nil
        })
        if rv != nil {
            t.Errorf("expected no error, got %s", rv)
        }
        if foundCount != 7 {
            t.Errorf("expected to find 7 nodes, got %d", foundCount)
        }
    })
}

func TestCpuLocations(t *testing.T) {
    tree, _ := newCpuTreeFromInt5([5]int{2, 2, 2, 4, 2})
    cpus := cpuset.New(0, 1, 3, 4, 16)
    systemlocations := tree.CpuLocations(cpus)
    package1locations := tree.children[1].CpuLocations(cpus)
    if len(package1locations) != 5 {
        t.Errorf("expected package1locations length 5, got %d", len(package1locations))
        return
    }
    if len(systemlocations) != 6 {
        t.Errorf("expected systemlocations length 6, got %d", len(systemlocations))
        return
    }
    if systemlocations[0][0] != "system" {
        t.Errorf("expected 'system' location, got %q", systemlocations[0][0])
        return
    }
    if systemlocations[1][0] != "p0" {
        t.Errorf("expected 'p0' location, got %q", systemlocations[1][0])
        return
    }
    if len(systemlocations[4]) != 4 {
        t.Errorf("expected len(systemlocations[4]) 4, got %d", len(systemlocations[4]))
        return
    }
}

func TestCPUTopologyLevel(t *testing.T) {
    var lvl CPUTopologyLevel
    if lvl != CPUTopologyLevelUndefined {
        t.Errorf("unexpected default initial value for lvl: %s, expected undefined", lvl)
    }
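    // A shorthand sketch of the round trip the checks below exercise (an
    // added illustration; the exact marshaled bytes are an assumption
    // based on the level names accepted below):
    //
    //     b, _ := CPUTopologyLevelThread.MarshalJSON() // presumably []byte(`"thread"`)
    //     var l CPUTopologyLevel
    //     _ = l.UnmarshalJSON(b)                       // l == CPUTopologyLevelThread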
    if err := lvl.UnmarshalJSON([]byte("\"\"")); err != nil || lvl != CPUTopologyLevelUndefined {
        t.Errorf("unexpected outcome unmarshalling topology level: \"\", error: %s, result: %s", err, lvl)
    }
    if err := lvl.UnmarshalJSON([]byte("\"system\"")); err != nil || lvl != CPUTopologyLevelSystem {
        t.Errorf("unexpected outcome unmarshalling topology level: system, error: %s, result: %s", err, lvl)
    }
    if err := lvl.UnmarshalJSON([]byte("\"NUMA\"")); err != nil || lvl != CPUTopologyLevelNuma {
        t.Errorf("unexpected outcome unmarshalling topology level: \"NUMA\", error: %s, result: %s", err, lvl)
    }
    if err := lvl.UnmarshalJSON([]byte("\"undefined\"")); err == nil {
        t.Errorf("unexpected non-error outcome unmarshalling topology level: \"undefined\", error: %s, result: %s", err, lvl)
    }
    if err := lvl.UnmarshalJSON([]byte("system")); err == nil {
        t.Errorf("unexpected non-error outcome unmarshalling topology level: system, error: %s, result: %s", err, lvl)
    }
    if err := lvl.UnmarshalJSON([]byte("0")); err == nil {
        t.Errorf("unexpected non-error outcome unmarshalling topology level: 0, error: %s, result: %s", err, lvl)
    }
    if err := lvl.UnmarshalJSON([]byte("\"4\"")); err == nil {
        t.Errorf("unexpected non-error outcome unmarshalling topology level: \"4\", error: %s, result: %s", err, lvl)
    }
    if undefBytes, err := CPUTopologyLevelUndefined.MarshalJSON(); err != nil {
        t.Errorf("unexpected error marshaling undefined: %s", err)
    } else {
        if err = lvl.UnmarshalJSON(undefBytes); err != nil || lvl != CPUTopologyLevelUndefined {
            t.Errorf("unexpected outcome unmarshaling marshaled undefined: error: %s, result: %s", err, lvl)
        }
    }
    if threadBytes, err := CPUTopologyLevelThread.MarshalJSON(); err != nil {
        t.Errorf("unexpected error marshaling thread: %s", err)
    } else {
        if err = lvl.UnmarshalJSON(threadBytes); err != nil || lvl != CPUTopologyLevelThread {
            t.Errorf("unexpected outcome unmarshaling marshaled thread: error: %s, result: %s", err, lvl)
        }
    }
}

func TestSplitLevel(t *testing.T) {
    root, _ := newCpuTreeFromInt5([5]int{2, 2, 2, 4, 2})
    newRoot := root.SplitLevel(CPUTopologyLevelNuma,
        func(cpu int) int {
            leaf := root.FindLeafWithCpu(cpu)
            if leaf == nil {
                t.Fatalf("cpu %d not in tree:\n%s\n\n", cpu, root.PrettyPrint())
            }
            return leaf.SiblingIndex()
        })
    oldc62 := root.FindLeafWithCpu(62)
    oldc63 := root.FindLeafWithCpu(63)
    if oldc62.parent != oldc63.parent {
        t.Errorf("expected: 62 and 63 are hyperthreads of the same physical core in the original tree, observed parents %s and %s", oldc62.parent, oldc63.parent)
    }
    newc62 := newRoot.FindLeafWithCpu(62)
    newc63 := newRoot.FindLeafWithCpu(63)
    if newc62.parent == newc63.parent {
        t.Errorf("expected: 62 and 63 have different parents (physical cores), but they have the same %s", newc62.parent)
    }
    if newc62.parent.parent == newc63.parent.parent {
        t.Errorf("expected: 62 and 63 have different grandparents (numa subclasses), but they have the same: %s", newc62.parent.parent)
    }
    if newc62.parent.parent.parent != newc63.parent.parent.parent {
        t.Errorf("expected: 62 and 63 have the same great-grandparents (numa), but they differ: %s and %s", newc62.parent.parent.parent, newc63.parent.parent.parent)
    }
    if t.Failed() {
        t.Logf("newRoot:\n%s\n", newRoot.PrettyPrint())
    }
}

================================================
FILE: pkg/cri/resource-manager/policy/builtin/balloons/fillmethod.go
================================================
// Copyright 2022 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package balloons

import (
    "bytes"
    "encoding/json"
    "fmt"
)

// FillMethod specifies the order in which balloon instances should be filled.
type FillMethod int

const (
    FillUnspecified FillMethod = iota
    // FillBalanced: put a container into the balloon with most
    // free CPU without changing the size of the balloon.
    FillBalanced
    // FillBalancedInflate: put a container into the balloon with
    // most free CPU when the balloon is inflated to the maximum
    // size.
    FillBalancedInflate
    // FillPacked: put a container into a balloon so that it
    // minimizes the amount of currently unused CPUs in the
    // balloon.
    FillPacked
    // FillPackedInflate: put a container into a balloon so that
    // it minimizes the amount of unused CPUs if the balloon is
    // inflated to the maximum size.
    FillPackedInflate
    // FillSameNamespace: put a container into a balloon that already
    // includes another container from the same namespace.
    FillSameNamespace
    // FillSamePod: put a container into a balloon that already
    // includes another container from the same pod.
    FillSamePod
    // FillNewBalloon: create a new balloon, if possible, and put
    // a container into it.
    FillNewBalloon
    // FillNewBalloonMust: create a new balloon for a container,
    // but refuse to run the container if the balloon cannot be
    // created.
    FillNewBalloonMust
    // FillReservedBalloon: put a container into the reserved
    // balloon.
    FillReservedBalloon
    // FillDefaultBalloon: put a container into the default
    // balloon.
    FillDefaultBalloon
)

var fillMethodNames = map[FillMethod]string{
    FillUnspecified:     "unspecified",
    FillBalanced:        "balanced",
    FillBalancedInflate: "balanced-inflate",
    FillPacked:          "packed",
    FillPackedInflate:   "packed-inflate",
    FillSameNamespace:   "same-namespace",
    FillSamePod:         "same-pod",
    FillNewBalloon:      "new-balloon",
    FillNewBalloonMust:  "new-balloon-must",
    FillDefaultBalloon:  "default-balloon",
    FillReservedBalloon: "reserved-balloon",
}

// String stringifies a FillMethod.
func (fm FillMethod) String() string {
    if fmn, ok := fillMethodNames[fm]; ok {
        return fmn
    }
    return fmt.Sprintf("#UNNAMED-FILLMETHOD(%d)", int(fm))
}

// MarshalJSON marshals a FillMethod as a quoted json string.
func (fm FillMethod) MarshalJSON() ([]byte, error) {
    buffer := bytes.NewBufferString(fmt.Sprintf("%q", fm))
    return buffer.Bytes(), nil
}

// UnmarshalJSON unmarshals a FillMethod quoted json string to the enum value.
func (fm *FillMethod) UnmarshalJSON(b []byte) error {
    var fillMethodName string
    err := json.Unmarshal(b, &fillMethodName)
    if err != nil {
        return err
    }
    for fmID, fmName := range fillMethodNames {
        if fmName == fillMethodName {
            *fm = fmID
            return nil
        }
    }
    return balloonsError("invalid fill method %q", fillMethodName)
}
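// A minimal round-trip sketch (an added illustration, not part of the
// original file; it assumes nothing beyond encoding/json and the
// FillMethod type above):
//
//     b, _ := json.Marshal(FillPacked) // b == []byte(`"packed"`)
//     var fm FillMethod
//     _ = json.Unmarshal(b, &fm)       // fm == FillPacked
//
// Unknown names fail with the "invalid fill method" error above.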
================================================
FILE: pkg/cri/resource-manager/policy/builtin/balloons/flags.go
================================================
// Copyright 2022 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package balloons

import (
    "encoding/json"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

    pkgcfg "github.com/intel/cri-resource-manager/pkg/config"
    "github.com/intel/cri-resource-manager/pkg/cpuallocator"
)

type BalloonsOptions balloonsOptionsWrapped

// BalloonsOptions contains configuration options specific to this policy.
type balloonsOptionsWrapped struct {
    // PinCPU controls pinning containers to CPUs.
    PinCPU *bool `json:"PinCPU,omitempty"`
    // PinMemory controls pinning containers to memory nodes.
    PinMemory *bool `json:"PinMemory,omitempty"`
    // IdleCpuClass controls how unused CPUs outside any balloon
    // are (re)configured.
    IdleCpuClass string `json:"IdleCPUClass,omitempty"`
    // ReservedPoolNamespaces is a list of namespace globs that
    // will be allocated to reserved CPUs.
    ReservedPoolNamespaces []string `json:"ReservedPoolNamespaces,omitempty"`
    // If AllocatorTopologyBalancing is true, balloons are
    // allocated and resized so that all topology elements
    // (packages, dies, numa nodes, cores) have roughly the same
    // amount of allocations. The default is false: balloons are
    // packed tightly to optimize power efficiency. The value set
    // here can be overridden with the balloon type specific
    // setting with the same name.
    AllocatorTopologyBalancing bool
    // PreferSpreadOnPhysicalCores prefers allocating logical CPUs
    // (possibly hyperthreads) for a balloon from separate physical CPU
    // cores. This prevents workloads in the balloon from interfering with
    // themselves as they do not compete on the resources of the same CPU
    // cores. On the other hand, it allows more interference between
    // workloads in different balloons. The default is false: balloons
    // are packed tightly to a minimum number of physical CPU cores. The
    // value set here is the default for all balloon types, but it can be
    // overridden with the balloon type specific setting with the same
    // name.
    PreferSpreadOnPhysicalCores bool `json:"PreferSpreadOnPhysicalCores,omitempty"`
    // BalloonDefs contains balloon type definitions.
    BalloonDefs []*BalloonDef `json:"BalloonTypes,omitempty"`
}

// BalloonDef contains a balloon definition.
type BalloonDef struct {
    // Name of the balloon definition.
    Name string `json:"Name"`
    // Namespaces control which namespaces are assigned into
    // balloon instances from this definition. This is used by
    // namespace assign methods.
    Namespaces []string `json:"Namespaces,omitempty"`
    // MaxCpus specifies the maximum number of CPUs exclusively
    // usable by containers in a balloon. Balloon size will not be
    // inflated larger than MaxCpus.
    MaxCpus int `json:"MaxCPUs"`
    // MinCpus specifies the minimum number of CPUs exclusively
    // usable by containers in a balloon. When a new balloon is created,
    // this will be the number of CPUs reserved for it even if a container
    // would request less.
    MinCpus int `json:"MinCPUs"`
    // AllocatorPriority (0: High, 1: Normal, 2: Low, 3: None).
    // This parameter is passed to the CPU allocator when creating or
    // resizing a balloon. At init, balloons with the highest priority
    // are allocated CPUs first.
    AllocatorPriority cpuallocator.CPUPriority `json:"AllocatorPriority"`
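    // An illustrative balloon type entry as it could appear in the policy
    // configuration (an added sketch; hypothetical name and values, the
    // keys follow the JSON tags of this struct):
    //
    //     BalloonTypes:
    //       - Name: example-type
    //         MinCPUs: 2
    //         MaxCPUs: 4
    //         AllocatorPriority: 2
    //         PreferSpreadOnPhysicalCores: true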
    // PreferSpreadOnPhysicalCores is the balloon type specific
    // variant of the policy level parameter with the same name.
    PreferSpreadOnPhysicalCores *bool `json:"PreferSpreadOnPhysicalCores,omitempty"`
    // AllocatorTopologyBalancing is the balloon type specific
    // variant of the policy level parameter with the same name.
    AllocatorTopologyBalancing *bool `json:"AllocatorTopologyBalancing,omitempty"`
    // CpuClass controls how CPUs of a balloon are (re)configured
    // whenever a balloon is created, inflated or deflated.
    CpuClass string `json:"CpuClass"`
    // MinBalloons is the number of balloon instances that always
    // exist even if they would become empty. At init this number
    // of instances will be created before assigning any
    // containers.
    MinBalloons int `json:"MinBalloons"`
    // MaxBalloons is the maximum number of balloon instances that
    // is allowed to co-exist. If reached, new balloons cannot be
    // created anymore.
    MaxBalloons int `json:"MaxBalloons"`
    // PreferSpreadingPods: containers of the same pod may be
    // placed on separate balloons. The default is false: prefer
    // placing containers of a pod to the same balloon(s).
    PreferSpreadingPods bool
    // PreferPerNamespaceBalloon: if true, containers in different
    // namespaces are preferably placed in separate balloons,
    // even if the balloon type is the same for all of them. On
    // the other hand, containers in the same namespace will be
    // placed in the same balloon instances. The default is false:
    // namespaces have no effect on placement.
    PreferPerNamespaceBalloon bool
    // PreferNewBalloons: prefer creating new balloons over adding
    // containers to existing balloons. The default is false:
    // prefer filling free capacity and possibly inflating
    // existing balloons before creating new ones.
    PreferNewBalloons bool
    // ShareIdleCpusInSame: if there are idle CPUs, that is CPUs
    // not in any balloon, in the same topology element (for
    // instance the same numa node, die or package) as any CPU in
    // the balloon, then allow workloads to run on those (shared)
    // CPUs in addition to the (dedicated) CPUs of the balloon.
    ShareIdleCpusInSame CPUTopologyLevel `json:"ShareIdleCPUsInSame,omitempty"`
}

var defaultPinCPU bool = true
var defaultPinMemory bool = true

// DeepCopy creates a deep copy of a BalloonsOptions.
func (bo *BalloonsOptions) DeepCopy() *BalloonsOptions {
    outBo := *bo
    outBo.ReservedPoolNamespaces = make([]string, len(bo.ReservedPoolNamespaces))
    copy(outBo.ReservedPoolNamespaces, bo.ReservedPoolNamespaces)
    outBo.BalloonDefs = make([]*BalloonDef, len(bo.BalloonDefs))
    for i := range bo.BalloonDefs {
        outBo.BalloonDefs[i] = bo.BalloonDefs[i].DeepCopy()
    }
    return &outBo
}

// String stringifies a BalloonDef.
func (bdef BalloonDef) String() string {
    return bdef.Name
}

// DeepCopy creates a deep copy of a BalloonDef.
func (bdef *BalloonDef) DeepCopy() *BalloonDef {
    outBdef := *bdef
    outBdef.Namespaces = make([]string, len(bdef.Namespaces))
    copy(outBdef.Namespaces, bdef.Namespaces)
    return &outBdef
}

// defaultBalloonsOptions returns a new BalloonsOptions instance, all initialized to defaults.
func defaultBalloonsOptions() interface{} {
    return &BalloonsOptions{
        ReservedPoolNamespaces: []string{metav1.NamespaceSystem},
        PinCPU:                 &defaultPinCPU,
        PinMemory:              &defaultPinMemory,
    }
}

// Our runtime configuration.
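// It is initialized to the defaults returned by defaultBalloonsOptions()
// above and registered for configuration handling in init() below.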
var balloonsOptions = defaultBalloonsOptions().(*BalloonsOptions) // UnmarshalJSON makes sure all options from previous unmarshals get // cleared before unmarshaling new data to the same address. func (bo *BalloonsOptions) UnmarshalJSON(data []byte) error { bow := balloonsOptionsWrapped{} if err := json.Unmarshal(data, &bow); err != nil { return err } *bo = BalloonsOptions(bow) return nil } // Register us for configuration handling. func init() { pkgcfg.Register(PolicyPath, PolicyDescription, balloonsOptions, defaultBalloonsOptions) } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/balloons/metrics.go ================================================ // Copyright 2022 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package balloons import ( "sort" "strconv" "strings" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy" "github.com/intel/cri-resource-manager/pkg/utils/cpuset" "github.com/prometheus/client_golang/prometheus" ) // Prometheus Metric descriptor indices and descriptor table const ( balloonsDesc = iota ) var descriptors = []*prometheus.Desc{ balloonsDesc: prometheus.NewDesc( "balloons", "CPUs", []string{ "balloon_type", "cpu_class", "cpus_min", "cpus_max", "balloon", "cpus", "cpus_count", "numas", "numas_count", "dies", "dies_count", "packages", "packages_count", "sharedidlecpus", "sharedidlecpus_count", "cpus_allowed", "cpus_allowed_count", "mems", "containers", "tot_req_millicpu", }, nil, ), } // Metrics defines the balloons-specific metrics from policy level. type Metrics struct { Balloons []*BalloonMetrics } // BalloonMetrics define metrics of a balloon instance. type BalloonMetrics struct { // Balloon type metrics DefName string CpuClass string MinCpus int MaxCpus int // Balloon instance metrics PrettyName string Cpus cpuset.CPUSet CpusCount int Numas []string NumasCount int Dies []string DiesCount int Packages []string PackagesCount int SharedIdleCpus cpuset.CPUSet SharedIdleCpusCount int CpusAllowed cpuset.CPUSet CpusAllowedCount int Mems string ContainerNames string ContainerReqMilliCpus int } // DescribeMetrics generates policy-specific prometheus metrics data // descriptors. func (p *balloons) DescribeMetrics() []*prometheus.Desc { return descriptors } // PollMetrics provides policy metrics for monitoring. 
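// The snapshot gathered here is converted into Prometheus metrics by
// CollectMetrics below: one "balloons" gauge per balloon instance, whose
// value is the balloon's CPU count, with all remaining details exported
// as labels.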
func (p *balloons) PollMetrics() policy.Metrics { policyMetrics := &Metrics{} policyMetrics.Balloons = make([]*BalloonMetrics, len(p.balloons)) for index, bln := range p.balloons { cpuLoc := p.cpuTree.CpuLocations(bln.Cpus) bm := &BalloonMetrics{} policyMetrics.Balloons[index] = bm bm.DefName = bln.Def.Name bm.CpuClass = bln.Def.CpuClass bm.MinCpus = bln.Def.MinCpus bm.MaxCpus = bln.Def.MaxCpus bm.PrettyName = bln.PrettyName() bm.Cpus = bln.Cpus bm.CpusCount = bm.Cpus.Size() if len(cpuLoc) > 3 { bm.Numas = cpuLoc[3] bm.NumasCount = len(bm.Numas) bm.Dies = cpuLoc[2] bm.DiesCount = len(bm.Dies) bm.Packages = cpuLoc[1] bm.PackagesCount = len(bm.Packages) } bm.SharedIdleCpus = bln.SharedIdleCpus bm.SharedIdleCpusCount = bm.SharedIdleCpus.Size() bm.CpusAllowed = bm.Cpus.Union(bm.SharedIdleCpus) bm.CpusAllowedCount = bm.CpusAllowed.Size() bm.Mems = bln.Mems.String() cNames := []string{} // Get container names and total requested milliCPUs. for _, containerIDs := range bln.PodIDs { for _, containerID := range containerIDs { if c, ok := p.cch.LookupContainer(containerID); ok { cNames = append(cNames, c.PrettyName()) bm.ContainerReqMilliCpus += p.containerRequestedMilliCpus(containerID) } } } sort.Strings(cNames) bm.ContainerNames = strings.Join(cNames, ",") } return policyMetrics } // CollectMetrics generates prometheus metrics from cached/polled // policy-specific metrics data. func (p *balloons) CollectMetrics(m policy.Metrics) ([]prometheus.Metric, error) { metrics, ok := m.(*Metrics) if !ok { return nil, balloonsError("type mismatch in balloons metrics") } promMetrics := make([]prometheus.Metric, len(metrics.Balloons)) for index, bm := range metrics.Balloons { promMetrics[index] = prometheus.MustNewConstMetric( descriptors[balloonsDesc], prometheus.GaugeValue, float64(bm.Cpus.Size()), bm.DefName, bm.CpuClass, strconv.Itoa(bm.MinCpus), strconv.Itoa(bm.MaxCpus), bm.PrettyName, bm.Cpus.String(), strconv.Itoa(bm.CpusCount), strings.Join(bm.Numas, ","), strconv.Itoa(bm.NumasCount), strings.Join(bm.Dies, ","), strconv.Itoa(bm.DiesCount), strings.Join(bm.Packages, ","), strconv.Itoa(bm.PackagesCount), bm.SharedIdleCpus.String(), strconv.Itoa(bm.SharedIdleCpusCount), bm.CpusAllowed.String(), strconv.Itoa(bm.CpusAllowedCount), bm.Mems, bm.ContainerNames, strconv.Itoa(bm.ContainerReqMilliCpus)) } return promMetrics, nil } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/dynamic-pools/cpu.go ================================================ package dyp import ( "bufio" "context" "io" "math" "os" "path/filepath" "runtime" "strconv" "strings" "time" "github.com/intel/cri-resource-manager/pkg/sysfs" ) type cpuTimesStat struct { cpu string `json:"cpu"` user float64 `json:"user"` system float64 `json:"system"` idle float64 `json:"idle"` nice float64 `json:"nice"` ioWait float64 `json:"iowait"` irq float64 `json:"irq"` softirq float64 `json:"softirq"` steal float64 `json:"steal"` guest float64 `json:"guest"` guestNice float64 `json:"guestNice"` } // getCpuUtilization returns the utilization of each cpu in an interval func getCpuUtilization(interval time.Duration) ([]float64, error) { ctx := context.Background() cpuTimesStat1, err := getCpuTimesStat(ctx) if err != nil { return nil, err } if err := wait(ctx, interval); err != nil { return nil, err } cpuTimesStat2, err := getCpuTimesStat(ctx) if err != nil { return nil, err } return calculateAllCpusUtilization(cpuTimesStat1, cpuTimesStat2) } func getCpuTimesStat(ctx context.Context) ([]cpuTimesStat, error) { filename 
:= filepath.Join("/", sysfs.SysRoot(), "proc", "stat")
    cpuLines, err := readCpuLines(filename)
    if err != nil || len(cpuLines) == 0 {
        return []cpuTimesStat{}, err
    }
    stat := make([]cpuTimesStat, 0, len(cpuLines))
    for _, l := range cpuLines {
        oneStat, err := parseStatLine(l)
        if err != nil {
            continue
        }
        stat = append(stat, *oneStat)
    }
    return stat, nil
}

func wait(ctx context.Context, interval time.Duration) error {
    timer := time.NewTimer(interval)
    select {
    case <-ctx.Done():
        return ctx.Err()
    case <-timer.C:
        return nil
    }
}

func calculateAllCpusUtilization(cts1, cts2 []cpuTimesStat) ([]float64, error) {
    if len(cts1) != len(cts2) {
        return nil, dynamicPoolsError("received two CPU counts: %d != %d", len(cts1), len(cts2))
    }
    allCpusUtilization := make([]float64, len(cts1))
    for i := 0; i < len(cts1); i++ {
        allCpusUtilization[i] = calculateOneCpuUtilization(cts1[i], cts2[i])
    }
    return allCpusUtilization, nil
}

// readCpuLines skips the first line indicating the total CPU utilization.
func readCpuLines(filename string) ([]string, error) {
    f, err := os.Open(filename)
    if err != nil {
        return nil, err
    }
    defer f.Close()
    var statLines []string
    reader := bufio.NewReader(f)
    for {
        line, _, err := reader.ReadLine()
        if err == io.EOF {
            break
        }
        statLines = append(statLines, string(line))
    }
    var cpuLines []string
    if len(statLines) < 2 {
        return nil, nil
    }
    for _, line := range statLines[1:] {
        if !strings.HasPrefix(line, "cpu") {
            break
        }
        cpuLines = append(cpuLines, line)
    }
    return cpuLines, nil
}

// parseStatLine parses a cpuLine into a cpuTimesStat.
func parseStatLine(cpuLine string) (*cpuTimesStat, error) {
    values := strings.Fields(cpuLine)
    if len(values) < 8 {
        return nil, dynamicPoolsError("stat line does not contain cpu info")
    }
    cpu := values[0]
    user, err := strconv.ParseFloat(values[1], 64)
    if err != nil {
        return nil, err
    }
    nice, err := strconv.ParseFloat(values[2], 64)
    if err != nil {
        return nil, err
    }
    system, err := strconv.ParseFloat(values[3], 64)
    if err != nil {
        return nil, err
    }
    idle, err := strconv.ParseFloat(values[4], 64)
    if err != nil {
        return nil, err
    }
    ioWait, err := strconv.ParseFloat(values[5], 64)
    if err != nil {
        return nil, err
    }
    irq, err := strconv.ParseFloat(values[6], 64)
    if err != nil {
        return nil, err
    }
    softirq, err := strconv.ParseFloat(values[7], 64)
    if err != nil {
        return nil, err
    }
    cts := &cpuTimesStat{
        cpu:     cpu,
        user:    user,
        nice:    nice,
        system:  system,
        idle:    idle,
        ioWait:  ioWait,
        irq:     irq,
        softirq: softirq,
    }
    if len(values) > 8 { // Linux >= 2.6.11
        steal, err := strconv.ParseFloat(values[8], 64)
        if err != nil {
            return nil, err
        }
        cts.steal = steal
    }
    if len(values) > 9 { // Linux >= 2.6.24
        guest, err := strconv.ParseFloat(values[9], 64)
        if err != nil {
            return nil, err
        }
        cts.guest = guest
    }
    if len(values) > 10 { // Linux >= 3.2.0
        guestNice, err := strconv.ParseFloat(values[10], 64)
        if err != nil {
            return nil, err
        }
        cts.guestNice = guestNice
    }
    return cts, nil
}

// calculateOneCpuUtilization returns the utilization of one cpu in an interval.
func calculateOneCpuUtilization(cts1, cts2 cpuTimesStat) float64 {
    cts1Total, cts1Busy := getBusyTime(cts1)
    cts2Total, cts2Busy := getBusyTime(cts2)
    if cts2Busy <= cts1Busy {
        return 0
    }
    if cts2Total <= cts1Total {
        return 100
    }
    return math.Min(100, math.Max(0, (cts2Busy-cts1Busy)/(cts2Total-cts1Total)*100))
}
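// Worked example for calculateOneCpuUtilization above (illustrative
// numbers only): if busy time grew by 50 ticks while total time grew by
// 200 ticks between the two samples, the utilization is
// min(100, max(0, 50/200*100)) = 25%.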
func getBusyTime(cts cpuTimesStat) (float64, float64) {
    total := cts.user + cts.system + cts.idle + cts.nice + cts.ioWait + cts.irq + cts.softirq + cts.steal + cts.guest + cts.guestNice
    if runtime.GOOS == "linux" {
        total -= cts.guest     // Linux 2.6.24+
        total -= cts.guestNice // Linux 3.2.0+
    }
    busy := total - cts.idle - cts.ioWait
    return total, busy
}

================================================
FILE: pkg/cri/resource-manager/policy/builtin/dynamic-pools/dyp.go
================================================
// Copyright 2022 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package dyp

import (
    "fmt"
    "path/filepath"
    "time"

    corev1 "k8s.io/api/core/v1"
    resapi "k8s.io/apimachinery/pkg/api/resource"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

    pkgcfg "github.com/intel/cri-resource-manager/pkg/config"
    "github.com/intel/cri-resource-manager/pkg/cpuallocator"
    "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache"
    cpucontrol "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/control/cpu"
    "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/events"
    "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/introspect"
    "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/kubernetes"
    "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy"
    policyapi "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy"
    logger "github.com/intel/cri-resource-manager/pkg/log"
    "github.com/intel/cri-resource-manager/pkg/utils"
    "github.com/intel/cri-resource-manager/pkg/utils/cpuset"
    idset "github.com/intel/goresctrl/pkg/utils"
)

const (
    // PolicyName is the name used to activate this policy.
    PolicyName = "dynamic-pools"
    // PolicyDescription is a short description of this policy.
    PolicyDescription = "The cpuset of the dynamic pools can be dynamically changed based on workload."
    // PolicyPath is the path of this policy in the configuration hierarchy.
    PolicyPath = "policy." + PolicyName
    // dynamicPoolKey is a pod annotation key, the value is a pod dynamicPool name.
    dynamicPoolKey = "dynamic-pool." + PolicyName + "." + kubernetes.ResmgrKeyNamespace
    // reservedDynamicPoolDefName is the name of the reserved dynamicPool definition.
    reservedDynamicPoolDefName = "reserved"
    // sharedDynamicPoolDefName is the name of the shared dynamicPool definition.
    sharedDynamicPoolDefName = "shared"
)

// dynamicPools contains configuration and runtime attributes of the dynamic-pools policy.
type dynamicPools struct {
    options                *policyapi.BackendOptions // configuration common to all policies
    dpoptions              DynamicPoolsOptions       // dynamicPool-specific configuration
    cch                    cache.Cache               // cri-resmgr cache
    allowed                cpuset.CPUSet             // bounding set of CPUs we're allowed to use
    reserved               cpuset.CPUSet             // system-/kube-reserved CPUs
    freeCpus               cpuset.CPUSet             // CPUs to be included in growing dynamicPools
    reservedDynamicPoolDef *DynamicPoolDef           // built-in definition of the reserved dynamicPool
    sharedDynamicPoolDef   *DynamicPoolDef           // built-in definition of the shared dynamicPool
    dynamicPools           []*DynamicPool            // dynamicPool instances: reserved, shared and user-defined
    cpuAllocator           cpuallocator.CPUAllocator // CPU allocator used by the policy
}

// DynamicPool contains attributes of a dynamicPool.
type DynamicPool struct {
    // Def is the definition from which this dynamicPool is created.
    Def *DynamicPoolDef
    // Cpus is the set of CPUs exclusive to this dynamicPool only.
    Cpus cpuset.CPUSet
    // Mems is the set of memory nodes with minimal access delay from CPUs.
    Mems idset.IDSet
    // PodIDs maps pod ID to list of container IDs.
    // - len(PodIDs) is the number of pods in the dynamicPool.
    // - len(PodIDs[podID]) is the number of containers of podID currently assigned to the dynamicPool.
    PodIDs map[string][]string
}

var log logger.Logger = logger.NewLogger("policy")

// String is a stringer for a dynamicPool.
func (dp DynamicPool) String() string {
    return fmt.Sprintf("%s{Cpus:%s, Mems:%s}", dp.PrettyName(), dp.Cpus, dp.Mems)
}

// PrettyName returns a unique name for a dynamicPool.
func (dp DynamicPool) PrettyName() string {
    return dp.Def.Name
}

// ContainerIDs returns IDs of containers assigned in a dynamicPool.
// (Using cache.Container.GetCacheID()'s)
func (dp DynamicPool) ContainerIDs() []string {
    cIDs := []string{}
    for _, ctrIDs := range dp.PodIDs {
        cIDs = append(cIDs, ctrIDs...)
    }
    return cIDs
}

// ContainerCount returns the number of containers in a dynamicPool.
func (dp DynamicPool) ContainerCount() int {
    count := 0
    for _, ctrIDs := range dp.PodIDs {
        count += len(ctrIDs)
    }
    return count
}

// AvailMilliCpus returns the CPU capacity of a dynamicPool in milli-CPUs.
func (dp DynamicPool) AvailMilliCpus() int {
    return dp.Cpus.Size() * 1000
}

// updateRealCpuUsed returns cpu utilization of a dynamicPool.
func (dp *DynamicPool) updateRealCpuUsed(cpuInfo []float64) (float64, error) {
    if dp.Cpus.Size() == 0 {
        log.Debug("dynamic pool %s cpuset is empty", dp.Def.Name)
        return 0, nil
    }
    cpus := dp.Cpus.UnsortedList()
    var sum float64
    for i := 0; i < len(cpus); i++ {
        sum += cpuInfo[cpus[i]]
    }
    log.Debug("dynamic pool %s cpuset: %s, cpu utilization: %v", dp.Def.Name, dp.Cpus, sum)
    return sum, nil
}

// calculateAllPoolWeights returns weights of all dynamicPools and the sum of weights.
// Use dynamicPool's cpu utilization as its weight.
func (p *dynamicPools) calculateAllPoolWeights() (map[*DynamicPool]float64, float64, error) {
    cpuInfo, _ := getCpuUtilization(time.Second)
    weight := make(map[*DynamicPool]float64)
    sumWeight := 0.0
    for _, dp := range p.dynamicPools {
        if dp.Def.Name == reservedDynamicPoolDefName {
            continue
        }
        // If there is no container in a dynamic pool, there is no need to
        // calculate its weight, that is, there is no need to allocate CPUs to it.
        if dp.ContainerCount() == 0 {
            weight[dp] = 0.0
        } else {
            realCpuUsed, err := dp.updateRealCpuUsed(cpuInfo)
            if err != nil {
                return weight, sumWeight, dynamicPoolsError("the actual cpu usage of the dynamic pool %s cannot be obtained: %w", dp.PrettyName(), err)
            }
            weight[dp] = realCpuUsed
            sumWeight += weight[dp]
        }
        log.Debug("dynamic pool: %s, weight: %v", dp, weight[dp])
    }
    log.Debug("sum weight: %v", sumWeight)
    return weight, sumWeight, nil
}

// calculateAllPoolRequests returns the sum of the requests of containers in each dynamicPool and the remaining free CPUs.
// remainFree = allowed cpu - reserved cpu - sum(requests of containers in each dynamicPool)
func (p *dynamicPools) calculateAllPoolRequests() (map[*DynamicPool]int, int) {
    requestCpu := make(map[*DynamicPool]int)
    remainFree := p.allowed.Difference(p.reserved).Size()
    for _, dp := range p.dynamicPools {
        if dp.Def.Name == reservedDynamicPoolDefName {
            continue
        }
        requestCpu[dp] = (p.requestedMinMilliCpus(dp) + 999) / 1000
        remainFree -= requestCpu[dp]
        log.Debug("dynamic pool %s request cpu %d", dp, requestCpu[dp])
    }
    log.Debug("sum of remaining free CPUs: %d", remainFree)
    return requestCpu, remainFree
}

func (p *dynamicPools) containerPinPool(dp *DynamicPool) {
    dp.Mems = p.closestMems(dp.Cpus)
    for _, cID := range dp.ContainerIDs() {
        if c, ok := p.cch.LookupContainer(cID); ok {
            p.pinCpuMem(c, dp.Cpus, dp.Mems)
        }
    }
}

// calculatePoolCpuset returns the cpus that dynamic pools need to allocate.
func (p *dynamicPools) calculatePoolCpuset(requestCpu map[*DynamicPool]int, remainFree int, weight map[*DynamicPool]float64, sumWeight float64) map[*DynamicPool]int {
    usedCpu := 0
    // If there are containers in the shared dynamic pool, allocate at least one CPU to it,
    // otherwise there is no need to allocate a CPU to it.
    // Ensure that there is at least one cpu in the shared dynamicPool.
    for _, dp := range p.dynamicPools {
        if dp.Def.Name == sharedDynamicPoolDefName && dp.ContainerCount() > 0 && sumWeight != 0 {
            addCpu := int(float64(remainFree) * weight[dp] / sumWeight)
            if requestCpu[dp]+addCpu < 1 {
                requestCpu[dp] = 1
                remainFree -= 1
            }
        }
    }
    for _, dp := range p.dynamicPools {
        if dp.Def.Name == reservedDynamicPoolDefName {
            requestCpu[dp] = dp.Cpus.Size()
        }
        if sumWeight != 0 {
            addCpu := int(float64(remainFree) * weight[dp] / sumWeight)
            requestCpu[dp] += addCpu
            usedCpu += addCpu
        }
        log.Info("dynamic pool %s needs %d CPUs, remaining free CPUs: %d", dp, requestCpu[dp], remainFree-usedCpu)
    }
    if usedCpu < remainFree {
        // If there are still free CPUs left, give them to the dynamicPool
        // with the highest CPU utilization.
        tmp := p.dynamicPools[1] // start from the shared dynamicPool
        for _, dp := range p.dynamicPools {
            if dp.Def.Name == reservedDynamicPoolDefName {
                continue
            }
            if weight[dp] > weight[tmp] {
                tmp = dp
            }
        }
        requestCpu[tmp] += (remainFree - usedCpu)
        log.Info("dynamic pool %s needs %d CPUs, remaining free CPUs: %d", tmp, requestCpu[tmp], 0)
    }
    return requestCpu
}

// isNeedReallocate returns whether the cpus need to be reallocated.
func (p *dynamicPools) isNeedReallocate(newPoolCpu map[*DynamicPool]int) bool {
    for _, dp := range p.dynamicPools {
        if dp.Def.Name == reservedDynamicPoolDefName {
            continue
        }
        if dp.Cpus.Size() != newPoolCpu[dp] {
            return true
        }
    }
    return false
}

// updatePoolCpuset updates the cpuset of the dynamicPools.
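// The flow below: recompute per-pool CPU requests and utilization-based
// weights, derive new pool sizes, and only if some non-reserved pool's
// size would actually change, release those pools' CPUs into freeCpus and
// re-allocate them, re-pinning the affected containers.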
func (p *dynamicPools) updatePoolCpuset() error {
    requestCpu, remainFree := p.calculateAllPoolRequests()
    weight, sumWeight, err := p.calculateAllPoolWeights()
    if err != nil {
        return err
    }
    if remainFree >= 1 {
        requestCpu = p.calculatePoolCpuset(requestCpu, remainFree, weight, sumWeight)
    }
    // If the number of newly allocated CPUs is the same as the number of
    // existing CPUs in the pool, there is no need to re-allocate.
    if !p.isNeedReallocate(requestCpu) {
        log.Info("The number of CPUs required by the pools is the same as the number of CPUs already in the pools, so there is no need to reallocate.")
        for _, dp := range p.dynamicPools {
            p.containerPinPool(dp)
        }
        return nil
    }
    for _, dp := range p.dynamicPools {
        if dp.Def.Name == reservedDynamicPoolDefName {
            continue
        }
        if dp.Cpus.Size() == 0 {
            continue
        }
        oldCpus := dp.Cpus.Clone()
        keptCpus, err := p.cpuAllocator.ReleaseCpus(&oldCpus, dp.Cpus.Size(), dp.Def.AllocatorPriority)
        if err != nil || keptCpus.Size() != 0 {
            return dynamicPoolsError("releasing %d CPUs from %s failed: %w (kept: %s)", dp.Cpus.Size(), dp, err, keptCpus)
        }
        p.freeCpus = p.freeCpus.Union(dp.Cpus)
    }
    for _, dp := range p.dynamicPools {
        if dp.Def.Name == reservedDynamicPoolDefName {
            continue
        }
        newCpus, err := p.cpuAllocator.AllocateCpus(&p.freeCpus, requestCpu[dp], dp.Def.AllocatorPriority)
        if err != nil {
            return dynamicPoolsError("allocating %d CPUs for %s failed: %w", requestCpu[dp], dp, err)
        }
        dp.Cpus = newCpus
        log.Debugf("resize successful for dynamic pool %s, new Cpus: %s", dp.PrettyName(), dp.Cpus)
        p.containerPinPool(dp)
        p.useCpuClass(dp)
    }
    return nil
}

// CreateDynamicPoolsPolicy creates a new policy instance.
func CreateDynamicPoolsPolicy(policyOptions *policy.BackendOptions) policy.Backend {
    p := &dynamicPools{
        options:      policyOptions,
        cch:          policyOptions.Cache,
        cpuAllocator: cpuallocator.NewCPUAllocator(policyOptions.System),
    }
    log.Info("creating %s policy...", PolicyName)
    // Handle common policy options: AvailableResources and ReservedResources.
    // p.allowed: CPUs available for the policy.
    if allowed, ok := policyOptions.Available[policyapi.DomainCPU]; ok {
        p.allowed = allowed.(cpuset.CPUSet)
    } else {
        // Available CPUs not specified, default to all on-line CPUs.
        p.allowed = policyOptions.System.CPUSet().Difference(policyOptions.System.Offlined())
    }
    // p.reserved: CPUs reserved for kube-system pods, subset of p.allowed.
    p.reserved = cpuset.New()
    if reserved, ok := p.options.Reserved[policyapi.DomainCPU]; ok {
        switch v := reserved.(type) {
        case cpuset.CPUSet:
            p.reserved = p.allowed.Intersection(v)
        case resapi.Quantity:
            reserveCnt := (int(v.MilliValue()) + 999) / 1000
            cpus, err := p.cpuAllocator.AllocateCpus(&p.allowed, reserveCnt, cpuallocator.PriorityNone)
            if err != nil {
                log.Fatal("failed to allocate reserved CPUs: %s", err)
            }
            p.reserved = cpus
            p.allowed = p.allowed.Union(cpus)
        }
    }
    if p.reserved.IsEmpty() {
        log.Fatal("%s cannot run without reserved CPUs that are also AvailableResources", PolicyName)
    }
    // Handle policy-specific options.
    log.Debug("creating %s configuration", PolicyName)
    if err := p.setConfig(dynamicPoolsOptions); err != nil {
        log.Fatal("failed to create %s policy: %v", PolicyName, err)
    }
    pkgcfg.GetModule(PolicyPath).AddNotify(p.configNotify)
    return p
}

// Name returns the name of this policy.
func (p *dynamicPools) Name() string {
    return PolicyName
}

// Description returns the description for this policy.
func (p *dynamicPools) Description() string {
    return PolicyDescription
}

// Start prepares this policy for accepting allocation/release requests.
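// In practice it simply delegates to Sync with all currently cached
// containers.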
func (p *dynamicPools) Start(add []cache.Container, del []cache.Container) error {
    log.Info("%s policy started", PolicyName)
    return p.Sync(p.cch.GetContainers(), nil)
}

// Sync synchronizes the active policy state.
func (p *dynamicPools) Sync(add []cache.Container, del []cache.Container) error {
    log.Debug("synchronizing state...")
    for _, c := range del {
        p.ReleaseResources(c)
    }
    for _, c := range add {
        p.AllocateResources(c)
    }
    return nil
}

// AllocateResources is a resource allocation request for this policy.
func (p *dynamicPools) AllocateResources(c cache.Container) error {
    log.Debug("allocating resources for container %s...", c.PrettyName())
    dp, err := p.allocateDynamicPool(c)
    if err != nil {
        return dynamicPoolsError("dynamicPool allocation for container %s failed: %w", c.PrettyName(), err)
    }
    if dp == nil {
        return dynamicPoolsError("no suitable dynamicPools found for container %s", c.PrettyName())
    }
    log.Info("assigning container %s to dynamicPool %s", c.PrettyName(), dp)
    podID := c.GetPodID()
    dp.PodIDs[podID] = append(dp.PodIDs[podID], c.GetCacheID())
    if dp.Cpus.Equals(p.reserved) {
        p.assignContainer(c, dp)
        log.Debugf("dynamic pool is the reserved pool, skipping updatePoolCpuset")
    } else {
        p.updatePoolCpuset()
    }
    if log.DebugEnabled() {
        log.Debug(p.dumpDynamicPool(dp))
    }
    return nil
}

// ReleaseResources is a resource release request for this policy.
func (p *dynamicPools) ReleaseResources(c cache.Container) error {
    log.Debug("releasing container %s...", c.PrettyName())
    dp := p.dynamicPoolByContainer(c)
    if dp == nil {
        log.Debug("ReleaseResources: dynamicPool-less container %s, nothing to release", c.PrettyName())
        return nil
    }
    p.dismissContainer(c, dp)
    if dp.Cpus.Equals(p.reserved) {
        log.Debugf("dynamic pool is the reserved pool, skipping updatePoolCpuset")
    } else {
        p.updatePoolCpuset()
    }
    if log.DebugEnabled() {
        log.Debug(p.dumpDynamicPool(dp))
    }
    return nil
}

// UpdateResources is a resource allocation update request for this policy.
func (p *dynamicPools) UpdateResources(c cache.Container) error {
    log.Debug("(not) updating container %s...", c.PrettyName())
    return nil
}

// Rebalance tries to find an optimal allocation of resources for the current containers.
func (p *dynamicPools) Rebalance() (bool, error) {
    log.Debug("rebalancing containers...")
    err := p.updatePoolCpuset()
    return true, err
}

// HandleEvent handles policy-specific events.
func (p *dynamicPools) HandleEvent(*events.Policy) (bool, error) {
    log.Debug("(not) handling event...")
    return false, nil
}

// ExportResourceData provides resource data to export for the container.
func (p *dynamicPools) ExportResourceData(c cache.Container) map[string]string {
    return nil
}

// Introspect provides data for external introspection.
func (p *dynamicPools) Introspect(*introspect.State) {
    return
}

// dynamicPoolByContainer returns the dynamicPool that contains a container.
func (p *dynamicPools) dynamicPoolByContainer(c cache.Container) *DynamicPool {
    podID := c.GetPodID()
    cID := c.GetCacheID()
    for _, dp := range p.dynamicPools {
        for _, ctrID := range dp.PodIDs[podID] {
            if ctrID == cID {
                return dp
            }
        }
    }
    return nil
}

// dynamicPoolByDef returns the dynamicPool instantiated from a dynamicPool definition.
func (p *dynamicPools) dynamicPoolByDef(dpDef *DynamicPoolDef) *DynamicPool {
    for _, dp := range p.dynamicPools {
        if dp.Def == dpDef {
            return dp
        }
    }
    return nil
}

// dynamicPoolDefByName returns a dynamicPool definition with a name.
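// The built-in "reserved" and "shared" definitions take precedence over
// user-defined definitions with the same name.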
func (p *dynamicPools) dynamicPoolDefByName(defName string) *DynamicPoolDef { if defName == reservedDynamicPoolDefName { return p.reservedDynamicPoolDef } if defName == sharedDynamicPoolDefName { return p.sharedDynamicPoolDef } for _, dpDef := range p.dpoptions.DynamicPoolDefs { if dpDef.Name == defName { return dpDef } } return nil } // chooseDynamicPoolDef returns the dynamicPoolDef selected by the container func (p *dynamicPools) chooseDynamicPoolDef(c cache.Container) (*DynamicPoolDef, error) { var dpDef *DynamicPoolDef // If the requests and limits of container are 0, they are assigned to the shared dynamicPool. if !namespaceMatches(c.GetNamespace(), append(p.dpoptions.ReservedPoolNamespaces, metav1.NamespaceSystem)) && p.containerRequestedMilliCpus(c.GetCacheID()) == 0 && p.containerLimitedMilliCpus(c.GetCacheID()) == 0 { return p.sharedDynamicPoolDef, nil } // DynamicPoolDef is defined by annotation? if dpDefName, ok := c.GetEffectiveAnnotation(dynamicPoolKey); ok { dpDef = p.dynamicPoolDefByName(dpDefName) if dpDef == nil { return nil, dynamicPoolsError("no dynamicPool for annotation %q", dpDefName) } return dpDef, nil } // DynamicPoolDef is defined by a special namespace (kube-system + // ReservedPoolNamespaces)? if namespaceMatches(c.GetNamespace(), append(p.dpoptions.ReservedPoolNamespaces, metav1.NamespaceSystem)) { return p.dynamicPools[0].Def, nil } // DynamicPoolDef is defined by the namespace? for _, dpDef := range append([]*DynamicPoolDef{p.reservedDynamicPoolDef, p.sharedDynamicPoolDef}, p.dpoptions.DynamicPoolDefs...) { if namespaceMatches(c.GetNamespace(), dpDef.Namespaces) { return dpDef, nil } } // Fallback to the shared dynamicPool. return p.sharedDynamicPoolDef, nil } func (p *dynamicPools) containerRequestedMilliCpus(contID string) int { cont, ok := p.cch.LookupContainer(contID) if !ok { return 0 } reqCpu, ok := cont.GetResourceRequirements().Requests[corev1.ResourceCPU] if !ok { return 0 } return int(reqCpu.MilliValue()) } func (p *dynamicPools) containerLimitedMilliCpus(contID string) int { cont, ok := p.cch.LookupContainer(contID) if !ok { return 0 } limitCpu, ok := cont.GetResourceRequirements().Limits[corev1.ResourceCPU] if !ok { return 0 } return int(limitCpu.MilliValue()) } // requestedMaxMilliCpus sums up and returns CPU limits of all // containers assigned to a dynamicPool. func (p *dynamicPools) requestedMaxMilliCpus(dp *DynamicPool) int { cpuRequested := 0 for _, cID := range dp.ContainerIDs() { cpuRequested += p.containerLimitedMilliCpus(cID) } return cpuRequested } // requestedMinMilliCpus sums up and returns CPU requests of all // containers assigned to a dynamicPool. func (p *dynamicPools) requestedMinMilliCpus(dp *DynamicPool) int { cpuRequested := 0 for _, cID := range dp.ContainerIDs() { cpuRequested += p.containerRequestedMilliCpus(cID) } return cpuRequested } // useCpuClass configures CPUs of a dynamicPool. func (p *dynamicPools) useCpuClass(dp *DynamicPool) error { // Usual inputs: // - CPUs that cpuallocator has reserved for this dynamicPool: // dp.Cpus (cpuset.CPUSet). // - User-defined CPU configuration for CPUs of dynamicPool of this type: // dp.Def.CpuClass (string). // - Current configuration(?): feel free to add data // structure for this. For instance policy-global p.cpuConfs, // or dynamicPool-local dp.cpuConfs. // // Other input examples, if needed: // - Requested CPU resources by all containers in the dynamicPool: // p.requestedMilliCpus(dp). // - Free CPU resources in the dynamicPool: p.freeMilliCpus(dp). 
// useCpuClass configures the CPUs of a dynamicPool.
func (p *dynamicPools) useCpuClass(dp *DynamicPool) error {
	// Usual inputs:
	// - CPUs that cpuallocator has reserved for this dynamicPool:
	//   dp.Cpus (cpuset.CPUSet).
	// - User-defined CPU configuration for CPUs of dynamicPools of this type:
	//   dp.Def.CpuClass (string).
	// - Current configuration(?): feel free to add a data
	//   structure for this. For instance policy-global p.cpuConfs,
	//   or dynamicPool-local dp.cpuConfs.
	//
	// Other input examples, if needed:
	// - Requested CPU resources by all containers in the dynamicPool:
	//   p.requestedMinMilliCpus(dp) / p.requestedMaxMilliCpus(dp).
	// - Free CPU resources in the dynamicPool: p.freeMilliCpus(dp).
	// - Number of assigned containers: dp.ContainerCount().
	// - Container details: access p.cch with dp.ContainerIDs().
	// - User-defined CPU AllocatorPriority: dp.Def.AllocatorPriority.
	// - All existing dynamicPool instances: p.dynamicPools.
	// - CPU configurations by user: dp.Def.CpuClass (for dp in p.dynamicPools).
	cpucontrol.Assign(p.cch, dp.Def.CpuClass, dp.Cpus.UnsortedList()...)
	log.Debugf("useCpuClass Cpus: %s; CpuClass: %s", dp.Cpus, dp.Def.CpuClass)
	return nil
}

func (p *dynamicPools) newDynamicPool(dpDef *DynamicPoolDef, confCpus bool) (*DynamicPool, error) {
	var cpus cpuset.CPUSet
	var err error
	if dpDef == p.reservedDynamicPoolDef {
		cpus = p.reserved
	} else {
		cpus, err = p.cpuAllocator.AllocateCpus(&p.freeCpus, 0, dpDef.AllocatorPriority)
		if err != nil {
			return nil, dynamicPoolsError("could not allocate Cpus for dynamicPool %s: %w", dpDef.Name, err)
		}
	}
	dp := &DynamicPool{
		Def:    dpDef,
		PodIDs: make(map[string][]string),
		Cpus:   cpus,
		Mems:   p.closestMems(cpus),
	}
	if confCpus {
		if err = p.useCpuClass(dp); err != nil {
			log.Errorf("failed to apply CPU configuration to new dynamicPool %s (cpus: %s): %v", dpDef.Name, cpus, err)
			return nil, err
		}
	}
	return dp, nil
}

func namespaceMatches(namespace string, patterns []string) bool {
	for _, pattern := range patterns {
		ret, err := filepath.Match(pattern, namespace)
		if err == nil && ret {
			return true
		}
	}
	return false
}

// allocateDynamicPool returns the dynamicPool allocated for a container.
func (p *dynamicPools) allocateDynamicPool(c cache.Container) (*DynamicPool, error) {
	dpDef, err := p.chooseDynamicPoolDef(c)
	if err != nil {
		return nil, err
	}
	if dpDef == nil {
		return nil, dynamicPoolsError("no applicable dynamicPool type found")
	}
	dynamicPool := p.dynamicPoolByDef(dpDef)
	if dynamicPool == nil {
		return nil, dynamicPoolsError("no suitable dynamicPool instance available")
	}
	return dynamicPool, nil
}

// dumpDynamicPool dumps dynamicPool contents in detail.
func (p *dynamicPools) dumpDynamicPool(dp *DynamicPool) string {
	conts := []string{}
	pods := []string{}
	for podID, contIDs := range dp.PodIDs {
		podName := podID
		if pod, ok := p.cch.LookupPod(podID); ok {
			podName = pod.GetName()
		}
		pods = append(pods, podName)
		for _, contID := range contIDs {
			if cont, ok := p.cch.LookupContainer(contID); ok {
				conts = append(conts, cont.PrettyName())
			} else {
				conts = append(conts, podName+"."+contID)
			}
		}
	}
	s := fmt.Sprintf("DynamicPool %s{Cpus: %s; Mems: %s; mCPU requests: %d; mCPU limits: %d; capacity: %d; pods: %s; conts: %s}",
		dp.PrettyName(), dp.Cpus, dp.Mems,
		p.requestedMinMilliCpus(dp), p.requestedMaxMilliCpus(dp), dp.AvailMilliCpus(),
		pods, conts)
	return s
}

// changesDynamicPools returns true if two dynamicPools policy configurations
// may lead to different dynamicPools or workload assignments.
func changesDynamicPools(opts0, opts1 *DynamicPoolsOptions) bool {
	if opts0 == nil && opts1 == nil {
		return false
	}
	if opts0 == nil || opts1 == nil {
		return true
	}
	if len(opts0.DynamicPoolDefs) != len(opts1.DynamicPoolDefs) {
		return true
	}
	o0 := opts0.DeepCopy()
	o1 := opts1.DeepCopy()
	// Ignore differences in CPU class names. Every other change
	// potentially changes dynamicPools or workloads.
	for i := range o0.DynamicPoolDefs {
		o0.DynamicPoolDefs[i].CpuClass = ""
		o1.DynamicPoolDefs[i].CpuClass = ""
	}
	return utils.DumpJSON(o0) != utils.DumpJSON(o1)
}
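// Example of the intended split between the two change detectors (a
// reading of the functions above and below, not from the source): if two
// configurations differ only in a definition's CpuClass, then
// changesDynamicPools returns false (CpuClass is masked out before the
// JSON comparison) while changesCpuClasses returns true, so configNotify
// only reapplies CPU classes; adding, removing or otherwise editing a
// DynamicPoolDef makes changesDynamicPools return true and forces a full
// setConfig.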
// changesCpuClasses returns true if two dynamicPools policy
// configurations can lead to using different CPU classes on
// corresponding dynamicPool instances. Calling changesCpuClasses(o0, o1)
// makes sense only if changesDynamicPools(o0, o1) has returned false.
func changesCpuClasses(opts0, opts1 *DynamicPoolsOptions) bool {
	if opts0 == nil && opts1 == nil {
		return false
	}
	if opts0 == nil || opts1 == nil {
		return true
	}
	if len(opts0.DynamicPoolDefs) != len(opts1.DynamicPoolDefs) {
		return true
	}
	for i := range opts0.DynamicPoolDefs {
		if opts0.DynamicPoolDefs[i].CpuClass != opts1.DynamicPoolDefs[i].CpuClass {
			return true
		}
	}
	return false
}

// configNotify applies a new configuration.
func (p *dynamicPools) configNotify(event pkgcfg.Event, source pkgcfg.Source) error {
	log.Info("configuration %s", event)
	defer log.Debug("effective configuration:\n%s\n", utils.DumpJSON(p.dpoptions))
	newDynamicPoolsOptions := dynamicPoolsOptions.DeepCopy()
	if !changesDynamicPools(&p.dpoptions, newDynamicPoolsOptions) {
		if !changesCpuClasses(&p.dpoptions, newDynamicPoolsOptions) {
			log.Info("no configuration changes")
		} else {
			log.Info("configuration changes only CPU classes")
			// Update the new CPU classes in the existing DynamicPool
			// definitions. The same DynamicPoolDef instances
			// must be kept in use, because each dynamicPool
			// instance holds a direct reference to its
			// DynamicPoolDef.
			for i := range p.dpoptions.DynamicPoolDefs {
				p.dpoptions.DynamicPoolDefs[i].CpuClass = newDynamicPoolsOptions.DynamicPoolDefs[i].CpuClass
			}
			// (Re)configure all CPUs in the dynamicPools.
			for _, dp := range p.dynamicPools {
				p.useCpuClass(dp)
			}
		}
		return nil
	}
	if err := p.setConfig(newDynamicPoolsOptions); err != nil {
		log.Error("config update failed: %v", err)
		return err
	}
	log.Info("config updated successfully")
	p.Sync(p.cch.GetContainers(), p.cch.GetContainers())
	return nil
}

// applyDynamicPoolDef creates user-defined dynamicPools or reconfigures built-in
// dynamicPools according to dpDef. It does not initialize dynamicPool CPUs.
func (p *dynamicPools) applyDynamicPoolDef(dynamicPools *[]*DynamicPool, dpDef *DynamicPoolDef) error {
	if len(*dynamicPools) < 2 {
		return dynamicPoolsError("internal error: reserved and shared dynamicPools missing, cannot apply dynamicPool definitions")
	}
	reservedDynamicPool := (*dynamicPools)[0]
	sharedDynamicPool := (*dynamicPools)[1]
	// Every dynamicPoolDef does one of the following:
	// 1. reconfigures the "reserved" dynamicPool (most restricted)
	// 2. reconfigures the "shared" dynamicPool (somewhat restricted)
	// 3. defines a new user-defined dynamicPool.
	switch dpDef.Name {
	case "":
		// Case 0: bad name.
		return dynamicPoolsError("undefined or empty dynamicPool name")
	case reservedDynamicPool.Def.Name:
		// Case 1: reconfigure the "reserved" dynamicPool.
		p.reservedDynamicPoolDef.AllocatorPriority = dpDef.AllocatorPriority
		p.reservedDynamicPoolDef.CpuClass = dpDef.CpuClass
		p.reservedDynamicPoolDef.Namespaces = dpDef.Namespaces
	case sharedDynamicPool.Def.Name:
		// Case 2: reconfigure the "shared" dynamicPool.
		p.sharedDynamicPoolDef.AllocatorPriority = dpDef.AllocatorPriority
		p.sharedDynamicPoolDef.CpuClass = dpDef.CpuClass
		p.sharedDynamicPoolDef.Namespaces = dpDef.Namespaces
	default:
		// Case 3: create a user-defined dynamicPool without CPUs.
		newdp, err := p.newDynamicPool(dpDef, false)
		if err != nil {
			return err
		}
		*dynamicPools = append(*dynamicPools, newdp)
	}
	return nil
}
// setConfig takes a new dynamicPools configuration into use.
func (p *dynamicPools) setConfig(dpoptions *DynamicPoolsOptions) error {
	// Create the default reserved and shared dynamicPool
	// definitions. Some properties of these definitions may be
	// altered by user configuration.
	p.reservedDynamicPoolDef = &DynamicPoolDef{
		Name:              reservedDynamicPoolDefName,
		AllocatorPriority: 3,
	}
	p.sharedDynamicPoolDef = &DynamicPoolDef{
		Name:              sharedDynamicPoolDefName,
		AllocatorPriority: 3,
	}
	p.dynamicPools = []*DynamicPool{}
	p.freeCpus = p.allowed.Clone()
	p.freeCpus = p.freeCpus.Difference(p.reserved)
	// Instantiate the built-in reserved and shared dynamicPools.
	reservedDynamicPool, err := p.newDynamicPool(p.reservedDynamicPoolDef, false)
	if err != nil {
		return err
	}
	p.dynamicPools = append(p.dynamicPools, reservedDynamicPool)
	sharedDynamicPool, err := p.newDynamicPool(p.sharedDynamicPoolDef, false)
	if err != nil {
		return err
	}
	p.dynamicPools = append(p.dynamicPools, sharedDynamicPool)
	// First apply customizations to the built-in dynamicPools:
	// "reserved" and "shared".
	for _, dpDef := range dpoptions.DynamicPoolDefs {
		if dpDef.Name != reservedDynamicPoolDefName && dpDef.Name != sharedDynamicPoolDefName {
			continue
		}
		if err := p.applyDynamicPoolDef(&p.dynamicPools, dpDef); err != nil {
			return err
		}
	}
	// Then apply all user dynamicPool definitions, skipping the already
	// customized "reserved" and "shared" dynamicPools.
	for _, dpDef := range dpoptions.DynamicPoolDefs {
		if dpDef.Name == reservedDynamicPoolDefName || dpDef.Name == sharedDynamicPoolDefName {
			continue
		}
		if err := p.applyDynamicPoolDef(&p.dynamicPools, dpDef); err != nil {
			return err
		}
	}
	// Finish dynamicPool initialization.
	log.Info("%s policy dynamicPools:", PolicyName)
	for dpIdx, dp := range p.dynamicPools {
		log.Info("- dynamicPool %d: %s", dpIdx, dp)
	}
	// No errors in dynamicPool creation, take the new configuration into use.
	p.dpoptions = *dpoptions
	// (Re)configure all CPUs in the dynamicPools.
	for _, dp := range p.dynamicPools {
		p.useCpuClass(dp)
	}
	return nil
}

// closestMems returns memory node IDs good for pinning containers
// that run on the given CPUs.
func (p *dynamicPools) closestMems(cpus cpuset.CPUSet) idset.IDSet {
	mems := idset.NewIDSet()
	sys := p.options.System
	for _, nodeID := range sys.NodeIDs() {
		if !cpus.Intersection(sys.Node(nodeID).CPUSet()).IsEmpty() {
			mems.Add(nodeID)
		}
	}
	return mems
}
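// Worked example of closestMems (the two-node topology is assumed, not
// from the source): if NUMA node 0 has CPUs 0-3 and node 1 has CPUs 4-7,
// then closestMems(cpuset.New(2, 5)) returns {0, 1} because the pool's
// CPUs intersect both nodes, while closestMems(cpuset.New(0, 1))
// returns {0}.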
// assignContainer adds a container to a dynamicPool.
func (p *dynamicPools) assignContainer(c cache.Container, dp *DynamicPool) {
	log.Info("assigning container %s to dynamicPool %s", c.PrettyName(), dp)
	podID := c.GetPodID()
	dp.PodIDs[podID] = append(dp.PodIDs[podID], c.GetCacheID())
	p.pinCpuMem(c, dp.Cpus, dp.Mems)
}

// dismissContainer removes a container from a dynamicPool.
func (p *dynamicPools) dismissContainer(c cache.Container, dp *DynamicPool) {
	podID := c.GetPodID()
	dp.PodIDs[podID] = removeString(dp.PodIDs[podID], c.GetCacheID())
	if len(dp.PodIDs[podID]) == 0 {
		delete(dp.PodIDs, podID)
	}
}

// pinCpuMem pins a container to CPUs and memory nodes, if so configured.
func (p *dynamicPools) pinCpuMem(c cache.Container, cpus cpuset.CPUSet, mems idset.IDSet) {
	if p.dpoptions.PinCPU == nil || *p.dpoptions.PinCPU {
		log.Debug(" - pinning %s to cpuset: %s", c.PrettyName(), cpus)
		c.SetCpusetCpus(cpus.String())
		if reqCpu, ok := c.GetResourceRequirements().Requests[corev1.ResourceCPU]; ok {
			mCpu := int(reqCpu.MilliValue())
			c.SetCPUShares(int64(cache.MilliCPUToShares(int64(mCpu))))
		}
	}
	if p.dpoptions.PinMemory == nil || *p.dpoptions.PinMemory {
		log.Debug(" - pinning %s to memory %s", c.PrettyName(), mems)
		c.SetCpusetMems(mems.String())
	}
}

// dynamicPoolsError formats an error from this policy.
func dynamicPoolsError(format string, args ...interface{}) error {
	return fmt.Errorf(PolicyName+": "+format, args...)
}

// removeString removes the first occurrence of a string from a string slice.
func removeString(strings []string, element string) []string {
	for index, s := range strings {
		if s == element {
			strings[index] = strings[len(strings)-1]
			return strings[:len(strings)-1]
		}
	}
	return strings
}

// Register us as a policy implementation.
func init() {
	policy.Register(PolicyName, PolicyDescription, CreateDynamicPoolsPolicy)
}

================================================
FILE: pkg/cri/resource-manager/policy/builtin/dynamic-pools/dyp_test.go
================================================
// Copyright 2022 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package dyp

import (
	"testing"

	"github.com/intel/cri-resource-manager/pkg/utils/cpuset"
)

func TestChangesDynamicPools(t *testing.T) {
	tcases := []struct {
		name          string
		opts1         *DynamicPoolsOptions
		opts2         *DynamicPoolsOptions
		expectedValue bool
	}{
		{
			name:          "both options are nil",
			expectedValue: false,
		},
		{
			name:          "one option is nil",
			opts2:         &DynamicPoolsOptions{},
			expectedValue: true,
		},
		{
			name: "reserved pool namespaces differ by len",
			opts1: &DynamicPoolsOptions{
				ReservedPoolNamespaces: []string{"ns0"},
			},
			opts2: &DynamicPoolsOptions{
				ReservedPoolNamespaces: []string{},
			},
			expectedValue: true,
		},
		{
			name: "reserved pool namespaces differ by content",
			opts1: &DynamicPoolsOptions{
				ReservedPoolNamespaces: []string{"ns0"},
			},
			opts2: &DynamicPoolsOptions{
				ReservedPoolNamespaces: []string{"ns1"},
			},
			expectedValue: true,
		},
		{
			name: "dynamic-pool defs differ",
			opts1: &DynamicPoolsOptions{
				ReservedPoolNamespaces: []string{"ns0"},
				DynamicPoolDefs:        []*DynamicPoolDef{},
			},
			opts2: &DynamicPoolsOptions{
				ReservedPoolNamespaces: []string{"ns1"},
				DynamicPoolDefs:        []*DynamicPoolDef{},
			},
			expectedValue: true,
		},
	}
	for _, tc := range tcases {
		t.Run(tc.name, func(t *testing.T) {
			value := changesDynamicPools(tc.opts1, tc.opts2)
			if value != tc.expectedValue {
				t.Errorf("Expected return value %v but got %v", tc.expectedValue, value)
			}
		})
	}
}
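// TestRemoveString is an illustrative sketch added here (not part of the
// original test suite): removeString drops the first occurrence of an
// element by swapping the last element into its place, so ordering is
// not preserved.
func TestRemoveString(t *testing.T) {
	got := removeString([]string{"a", "b", "c", "d"}, "b")
	want := []string{"a", "d", "c"}
	if len(got) != len(want) {
		t.Fatalf("expected %v but got %v", want, got)
	}
	for i := range want {
		if got[i] != want[i] {
			t.Errorf("expected %v but got %v", want, got)
			break
		}
	}
}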
t.Errorf("Expected return value %v but got %v", tc.expectedValue, value) } }) } } func TestCalculatePoolCpuset(t *testing.T) { p := &dynamicPools{ allowed: cpuset.New(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13), reserved: cpuset.New(1, 2), dynamicPools: []*DynamicPool{ { Def: &DynamicPoolDef{ Name: reservedDynamicPoolDefName, }, Cpus: cpuset.New(1, 2), }, { Def: &DynamicPoolDef{ Name: sharedDynamicPoolDefName, }, Cpus: cpuset.New(3, 4, 5, 6), }, { Def: &DynamicPoolDef{ Name: "poo1", }, Cpus: cpuset.New(7, 8, 9, 10, 11, 12, 13), }, { Def: &DynamicPoolDef{ Name: "poo2", }, Cpus: cpuset.New(0), }, }, } tcases := []struct { name string requestCpu map[*DynamicPool]int remainFree int weight map[*DynamicPool]float64 sumWeight float64 expectedValue map[*DynamicPool]int }{ { name: "The requests and weight of the dynamic pools are both nil", requestCpu: map[*DynamicPool]int{}, remainFree: 12, weight: map[*DynamicPool]float64{}, sumWeight: 0.0, expectedValue: map[*DynamicPool]int{ p.dynamicPools[0]: 2, p.dynamicPools[1]: 12, p.dynamicPools[2]: 0, p.dynamicPools[3]: 0, }, }, { name: "The requests of the dynamic pools is not nil, and the requests of the shared dynamic pools is 0", requestCpu: map[*DynamicPool]int{ p.dynamicPools[0]: 1, p.dynamicPools[1]: 0, p.dynamicPools[2]: 2, p.dynamicPools[3]: 2, }, remainFree: 8, weight: map[*DynamicPool]float64{}, sumWeight: 0.0, expectedValue: map[*DynamicPool]int{ p.dynamicPools[0]: 2, p.dynamicPools[1]: 8, p.dynamicPools[2]: 2, p.dynamicPools[3]: 2, }, }, { name: "The requests of the dynamic pools is not nil, and the requests of the shared dynamic pools is not 0", requestCpu: map[*DynamicPool]int{ p.dynamicPools[0]: 1, p.dynamicPools[1]: 2, p.dynamicPools[2]: 2, p.dynamicPools[3]: 2, }, remainFree: 6, weight: map[*DynamicPool]float64{}, sumWeight: 0.0, expectedValue: map[*DynamicPool]int{ p.dynamicPools[0]: 2, p.dynamicPools[1]: 8, p.dynamicPools[2]: 2, p.dynamicPools[3]: 2, }, }, { name: "The weight of the dynamic pools is not nil, and the weight of the shared dynamic pools is not 0", requestCpu: map[*DynamicPool]int{}, remainFree: 12, weight: map[*DynamicPool]float64{ p.dynamicPools[0]: 10.0, p.dynamicPools[1]: 100.0, p.dynamicPools[2]: 200.0, p.dynamicPools[3]: 100.0, }, sumWeight: 400.0, expectedValue: map[*DynamicPool]int{ p.dynamicPools[0]: 2, p.dynamicPools[1]: 3, p.dynamicPools[2]: 6, p.dynamicPools[3]: 3, }, }, { name: "The weight of the dynamic pools is not nil, and the weight of the shared dynamic pools is 0", requestCpu: map[*DynamicPool]int{}, remainFree: 12, weight: map[*DynamicPool]float64{ p.dynamicPools[0]: 10.0, p.dynamicPools[1]: 0.0, p.dynamicPools[2]: 200.0, p.dynamicPools[3]: 100.0, }, sumWeight: 300.0, expectedValue: map[*DynamicPool]int{ p.dynamicPools[0]: 2, p.dynamicPools[1]: 0, p.dynamicPools[2]: 8, p.dynamicPools[3]: 4, }, }, { name: "The requests and weight of the dynamic pools are not nil, and the requests of the shared dynamic pools is 0", requestCpu: map[*DynamicPool]int{ p.dynamicPools[0]: 1, p.dynamicPools[1]: 0, p.dynamicPools[2]: 2, p.dynamicPools[3]: 2, }, remainFree: 8, weight: map[*DynamicPool]float64{ p.dynamicPools[0]: 10.0, p.dynamicPools[1]: 100.0, p.dynamicPools[2]: 200.0, p.dynamicPools[3]: 100.0, }, sumWeight: 400.0, expectedValue: map[*DynamicPool]int{ p.dynamicPools[0]: 2, p.dynamicPools[1]: 2, p.dynamicPools[2]: 6, p.dynamicPools[3]: 4, }, }, { name: "The requests and weight of the dynamic pools are not nil, and the weight of the shared dynamic pools is 0", requestCpu: map[*DynamicPool]int{ p.dynamicPools[0]: 
================================================
FILE: pkg/cri/resource-manager/policy/builtin/dynamic-pools/flags.go
================================================
// Copyright 2022 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package dyp

import (
	"encoding/json"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	pkgcfg "github.com/intel/cri-resource-manager/pkg/config"
	"github.com/intel/cri-resource-manager/pkg/cpuallocator"
)

type DynamicPoolsOptions dynamicPoolsOptionsWrapped

// dynamicPoolsOptionsWrapped contains configuration options specific to this policy.
type dynamicPoolsOptionsWrapped struct {
	// PinCPU controls pinning containers to CPUs.
	PinCPU *bool `json:"PinCPU,omitempty"`
	// PinMemory controls pinning containers to memory nodes.
	PinMemory *bool `json:"PinMemory,omitempty"`
	// ReservedPoolNamespaces is a list of namespace globs that
	// will be allocated to reserved CPUs.
	ReservedPoolNamespaces []string `json:"ReservedPoolNamespaces,omitempty"`
	// DynamicPoolDefs contains dynamicPool type definitions.
	DynamicPoolDefs []*DynamicPoolDef `json:"DynamicPoolTypes,omitempty"`
}
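// A sample configuration sketch in terms of the JSON/YAML tags above and
// in DynamicPoolDef below; the pool names, namespace globs and the exact
// policy section key are illustrative assumptions, not from the source.
// Namespace entries are globs matched with filepath.Match (see
// namespaceMatches in dynamic-pools-policy.go):
//
//   policy:
//     dynamic-pools:
//       PinCPU: true
//       PinMemory: true
//       ReservedPoolNamespaces: ["monitoring-*"]
//       DynamicPoolTypes:
//         - Name: fast
//           Namespaces: ["team-a-*"]
//           CpuClass: turbo
//           AllocatorPriority: 0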
// DynamicPoolDef contains a dynamicPool definition.
type DynamicPoolDef struct {
	// Name of the dynamicPool definition.
	Name       string   `json:"Name"`
	Namespaces []string `json:"Namespaces,omitempty"`
	CpuClass   string   `json:"CpuClass"`
	// AllocatorPriority (0: High, 1: Normal, 2: Low, 3: None).
	// This parameter is passed to the CPU allocator when creating or
	// resizing a dynamicPool. At init, dynamicPools with the highest
	// priority get their CPUs allocated first.
	AllocatorPriority cpuallocator.CPUPriority `json:"AllocatorPriority"`
}

var defaultPinCPU bool = true
var defaultPinMemory bool = true

// DeepCopy creates a deep copy of a DynamicPoolsOptions.
func (dpo *DynamicPoolsOptions) DeepCopy() *DynamicPoolsOptions {
	outDpo := *dpo
	outDpo.ReservedPoolNamespaces = make([]string, len(dpo.ReservedPoolNamespaces))
	copy(outDpo.ReservedPoolNamespaces, dpo.ReservedPoolNamespaces)
	outDpo.DynamicPoolDefs = make([]*DynamicPoolDef, len(dpo.DynamicPoolDefs))
	for i := range dpo.DynamicPoolDefs {
		outDpo.DynamicPoolDefs[i] = dpo.DynamicPoolDefs[i].DeepCopy()
	}
	return &outDpo
}

// String stringifies a DynamicPoolDef.
func (dpDef DynamicPoolDef) String() string {
	return dpDef.Name
}

// DeepCopy creates a deep copy of a DynamicPoolDef.
func (bdef *DynamicPoolDef) DeepCopy() *DynamicPoolDef {
	outBdef := *bdef
	outBdef.Namespaces = make([]string, len(bdef.Namespaces))
	copy(outBdef.Namespaces, bdef.Namespaces)
	return &outBdef
}

// defaultDynamicPoolsOptions returns a new DynamicPoolsOptions instance, all initialized to defaults.
func defaultDynamicPoolsOptions() interface{} {
	return &DynamicPoolsOptions{
		ReservedPoolNamespaces: []string{metav1.NamespaceSystem},
		PinCPU:                 &defaultPinCPU,
		PinMemory:              &defaultPinMemory,
	}
}

// Our runtime configuration.
var dynamicPoolsOptions = defaultDynamicPoolsOptions().(*DynamicPoolsOptions)

// UnmarshalJSON makes sure all options from previous unmarshals get
// cleared before unmarshaling new data to the same address.
func (bo *DynamicPoolsOptions) UnmarshalJSON(data []byte) error {
	bow := dynamicPoolsOptionsWrapped{}
	if err := json.Unmarshal(data, &bow); err != nil {
		return err
	}
	*bo = DynamicPoolsOptions(bow)
	return nil
}

// Register us for configuration handling.
func init() {
	pkgcfg.Register(PolicyPath, PolicyDescription, dynamicPoolsOptions, defaultDynamicPoolsOptions)
}

================================================
FILE: pkg/cri/resource-manager/policy/builtin/dynamic-pools/metrics.go
================================================
// Copyright 2022 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package dyp

import (
	"sort"
	"strconv"
	"strings"

	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy"
	"github.com/intel/cri-resource-manager/pkg/utils/cpuset"
	"github.com/prometheus/client_golang/prometheus"
)

// Prometheus metric descriptor indices and the descriptor table.
const (
	dynamicPoolsDesc = iota
)

var descriptors = []*prometheus.Desc{
	dynamicPoolsDesc: prometheus.NewDesc(
		"DynamicPools",
		"CPUs",
		[]string{
			"dynamicPool_type",
			"cpu_class",
			"dynamicPool",
			"cpus",
			"mems",
			"containers",
			"tot_req_millicpu",
			"tot_limit_millicpu",
		},
		nil,
	),
}
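// With the descriptor above, CollectMetrics (later in this file) emits
// one gauge per dynamicPool whose value is the number of CPUs in the
// pool. A scraped sample would look roughly like the following; all
// label values here are illustrative, not from the source:
//
//   DynamicPools{dynamicPool_type="shared",cpu_class="",dynamicPool="shared",
//     cpus="3-6",mems="0",containers="default/pod0:cont0",
//     tot_req_millicpu="500",tot_limit_millicpu="1000"} 4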
// Metrics defines the dynamicPools-specific metrics from the policy level.
type Metrics struct {
	DynamicPools []*DynamicPoolMetrics
}

// DynamicPoolMetrics defines the metrics of a dynamicPool instance.
type DynamicPoolMetrics struct {
	// DynamicPool type metrics.
	DefName  string
	CpuClass string
	// DynamicPool instance metrics.
	PrettyName              string
	Cpus                    cpuset.CPUSet
	Mems                    string
	ContainerNames          string
	ContainerReqMilliCpus   int
	ContainerLimitMilliCpus int
}

// DescribeMetrics generates policy-specific prometheus metrics data
// descriptors.
func (p *dynamicPools) DescribeMetrics() []*prometheus.Desc {
	return descriptors
}

// PollMetrics provides policy metrics for monitoring.
func (p *dynamicPools) PollMetrics() policy.Metrics {
	policyMetrics := &Metrics{}
	policyMetrics.DynamicPools = make([]*DynamicPoolMetrics, len(p.dynamicPools))
	for index, dp := range p.dynamicPools {
		dm := &DynamicPoolMetrics{}
		policyMetrics.DynamicPools[index] = dm
		dm.DefName = dp.Def.Name
		dm.CpuClass = dp.Def.CpuClass
		dm.PrettyName = dp.PrettyName()
		dm.Cpus = dp.Cpus
		dm.Mems = dp.Mems.String()
		cNames := []string{}
		// Collect container names, total requested milliCPUs and total limit milliCPUs.
		for _, containerIDs := range dp.PodIDs {
			for _, containerID := range containerIDs {
				if c, ok := p.cch.LookupContainer(containerID); ok {
					cNames = append(cNames, c.PrettyName())
					dm.ContainerReqMilliCpus += p.containerRequestedMilliCpus(containerID)
					dm.ContainerLimitMilliCpus += p.containerLimitedMilliCpus(containerID)
				}
			}
		}
		sort.Strings(cNames)
		dm.ContainerNames = strings.Join(cNames, ",")
	}
	return policyMetrics
}

// CollectMetrics generates prometheus metrics from cached/polled
// policy-specific metrics data.
func (p *dynamicPools) CollectMetrics(m policy.Metrics) ([]prometheus.Metric, error) {
	metrics, ok := m.(*Metrics)
	if !ok {
		return nil, dynamicPoolsError("type mismatch in dynamicPools metrics")
	}
	promMetrics := make([]prometheus.Metric, len(metrics.DynamicPools))
	for index, dm := range metrics.DynamicPools {
		promMetrics[index] = prometheus.MustNewConstMetric(
			descriptors[dynamicPoolsDesc],
			prometheus.GaugeValue,
			float64(dm.Cpus.Size()),
			dm.DefName,
			dm.CpuClass,
			dm.PrettyName,
			dm.Cpus.String(),
			dm.Mems,
			dm.ContainerNames,
			strconv.Itoa(dm.ContainerReqMilliCpus),
			strconv.Itoa(dm.ContainerLimitMilliCpus))
	}
	return promMetrics, nil
}

================================================
FILE: pkg/cri/resource-manager/policy/builtin/none/none-policy.go
================================================
// Copyright 2019 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package none

import (
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/events"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/introspect"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy"
	logger "github.com/intel/cri-resource-manager/pkg/log"
	"github.com/prometheus/client_golang/prometheus"
)

const (
	// PolicyName is the name used to activate this policy implementation.
	PolicyName = policy.NonePolicy
	// PolicyDescription is a short description of this policy.
	PolicyDescription = "A no-op policy, doing pretty much nothing."
)
type none struct {
	logger.Logger
	cch cache.Cache
}

var _ policy.Backend = &none{}

// CreateNonePolicy creates a new policy instance.
func CreateNonePolicy(opts *policy.BackendOptions) policy.Backend {
	n := &none{Logger: logger.NewLogger(PolicyName)}
	n.Info("creating policy...")
	return n
}

// Name returns the name of this policy.
func (n *none) Name() string {
	return PolicyName
}

// Description returns the description for this policy.
func (n *none) Description() string {
	return PolicyDescription
}

// Start prepares this policy for accepting allocation/release requests.
func (n *none) Start(add []cache.Container, del []cache.Container) error {
	n.Debug("got started...")
	return nil
}

// Sync synchronizes the active policy state.
func (n *none) Sync(add []cache.Container, del []cache.Container) error {
	n.Debug("(not) synchronizing policy state")
	return nil
}

// AllocateResources is a resource allocation request for this policy.
func (n *none) AllocateResources(c cache.Container) error {
	n.Debug("(not) allocating container %s...", c.PrettyName())
	return nil
}

// ReleaseResources is a resource release request for this policy.
func (n *none) ReleaseResources(c cache.Container) error {
	n.Debug("(not) releasing container %s...", c.PrettyName())
	return nil
}

// UpdateResources is a resource allocation update request for this policy.
func (n *none) UpdateResources(c cache.Container) error {
	n.Debug("(not) updating container %s...", c.PrettyName())
	return nil
}

// Rebalance tries to find an optimal allocation of resources for the current containers.
func (n *none) Rebalance() (bool, error) {
	n.Debug("(not) rebalancing containers...")
	return false, nil
}

// HandleEvent handles policy-specific events.
func (n *none) HandleEvent(*events.Policy) (bool, error) {
	n.Debug("(not) handling event...")
	return false, nil
}

// ExportResourceData provides resource data to export for the container.
func (n *none) ExportResourceData(c cache.Container) map[string]string {
	return nil
}

// Introspect provides data for external introspection.
func (n *none) Introspect(*introspect.State) {
}

// PollMetrics provides policy metrics for monitoring.
func (n *none) PollMetrics() policy.Metrics {
	return nil
}

// DescribeMetrics generates policy-specific prometheus metrics data descriptors.
func (n *none) DescribeMetrics() []*prometheus.Desc {
	return nil
}

// CollectMetrics generates prometheus metrics from cached/polled policy-specific metrics data.
func (n *none) CollectMetrics(policy.Metrics) ([]prometheus.Metric, error) {
	return nil, nil
}

// Register us as a policy implementation.
func init() {
	policy.Register(PolicyName, PolicyDescription, CreateNonePolicy)
}

================================================
FILE: pkg/cri/resource-manager/policy/builtin/podpools/flags.go
================================================
// Copyright 2020-2021 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package podpools

import (
	"bytes"
	"encoding/json"
	"fmt"

	pkgcfg "github.com/intel/cri-resource-manager/pkg/config"
)

// PodpoolsOptions contains configuration options specific to this policy.
type PodpoolsOptions struct {
	// PinCPU controls pinning containers to CPUs.
	PinCPU bool `json:"PinCPU,omitempty"`
	// PinMemory controls pinning containers to memory nodes.
	PinMemory bool `json:"PinMemory,omitempty"`
	// PoolDefs contains pool definitions.
	PoolDefs []*PoolDef `json:"Pools,omitempty"`
}

// PoolDef contains a pool definition.
type PoolDef struct {
	// Name is the name of the pool, or the name prefix of
	// multi-instance pools.
	Name string `json:"Name"`
	// CPU specifies the number of CPUs exclusively usable by
	// pods in the pool.
	CPU string `json:"CPU"`
	// MaxPods specifies the maximum number of pods assigned to
	// the pool. 0 (the default) means unlimited. -1 means no
	// pods.
	MaxPods int `json:"MaxPods"`
	// Instances specifies the number of multi-instance pools,
	// either directly or as CPU (count/percentage) reserved for
	// instances. The default is 1.
	Instances string `json:"Instances,omitempty"`
	// FillOrder specifies how multi-instance pools are filled.
	FillOrder FillOrder `json:"FillOrder"`
	// For the future: when enabling dynamic (on-demand) pool
	// instantiation, consider different ways of handling the case
	// of MaxPods>1, FillOrder==Balanced. Creating underloaded
	// pool instances will consume CPUs from other pool instances,
	// in a bad case causing workload migrations between memory
	// controllers when rearranging pool load is needed for
	// creation of new pools.
}

// FillOrder specifies the order in which pool instances should be filled.
type FillOrder int

const (
	FillBalanced FillOrder = iota
	FillPacked
	FillFirstFree
)

var fillOrderNames = map[FillOrder]string{
	FillBalanced:  "Balanced",
	FillPacked:    "Packed",
	FillFirstFree: "FirstFree",
}

// String stringifies a FillOrder.
func (fo FillOrder) String() string {
	if fon, ok := fillOrderNames[fo]; ok {
		return fon
	}
	return fmt.Sprintf("#UNNAMED-FILLORDER(%d)", int(fo))
}

// MarshalJSON marshals a FillOrder as a quoted json string.
func (fo FillOrder) MarshalJSON() ([]byte, error) {
	buffer := bytes.NewBufferString(fmt.Sprintf("%q", fo))
	return buffer.Bytes(), nil
}

// UnmarshalJSON unmarshals a FillOrder quoted json string to the enum value.
func (fo *FillOrder) UnmarshalJSON(b []byte) error {
	var fillOrderName string
	err := json.Unmarshal(b, &fillOrderName)
	if err != nil {
		return err
	}
	for foID, foName := range fillOrderNames {
		if foName == fillOrderName {
			*fo = foID
			return nil
		}
	}
	return podpoolsError("invalid fill order %q", fillOrderName)
}
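// A sample pool configuration sketch in terms of the fields above; the
// pool name and values are illustrative assumptions, not from the source:
//
//   policy:
//     podpools:
//       Pools:
//         - Name: dualcpu
//           CPU: "2"
//           MaxPods: 1
//           Instances: 50%
//           FillOrder: Packed
//
// With 16 non-reserved CPUs, "Instances: 50%" would yield
// floor(16*50/100/2) = 4 pool instances of 2 CPUs each (see
// parseInstancesCPUs in podpools-policy.go for the exact parsing rules).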
// defaultPodpoolsOptions returns a new PodpoolsOptions instance, all initialized to defaults.
func defaultPodpoolsOptions() interface{} {
	return &PodpoolsOptions{
		PinCPU:    true,
		PinMemory: true,
	}
}

// Our runtime configuration.
var podpoolsOptions = defaultPodpoolsOptions().(*PodpoolsOptions)

// Register us for configuration handling.
func init() {
	pkgcfg.Register(PolicyPath, PolicyDescription, podpoolsOptions, defaultPodpoolsOptions)
}

================================================
FILE: pkg/cri/resource-manager/policy/builtin/podpools/metrics.go
================================================
// Copyright 2020-2021 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package podpools

import (
	"fmt"
	"sort"
	"strconv"

	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy"
	"github.com/intel/cri-resource-manager/pkg/procstats"
	"github.com/intel/cri-resource-manager/pkg/sysfs"
	"github.com/intel/cri-resource-manager/pkg/utils/cpuset"
	"github.com/prometheus/client_golang/prometheus"
)

// Metrics defines the podpools-specific metrics from the policy level.
type Metrics struct {
	PoolMetrics map[string]*PoolMetrics
}

// PoolMetrics defines the podpools-specific metrics from the pool level.
type PoolMetrics struct {
	DefName        string
	PrettyName     string
	CPUs           cpuset.CPUSet
	CPUIds         []int
	MilliCPUs      string
	Memory         string
	ContainerNames string
	PodNames       string
}

// Prometheus metric descriptor indices and the descriptor table.
const (
	cpuUsageDesc = iota
	poolCPUUsageDesc
)

var descriptors = []*prometheus.Desc{
	cpuUsageDesc: prometheus.NewDesc(
		"cpu_usage",
		"CPU usage per logical processor",
		[]string{
			"cpu",
		},
		nil,
	),
	poolCPUUsageDesc: prometheus.NewDesc(
		"pool_cpu_usage",
		"CPU usage for a given pool",
		[]string{
			"policy",
			"pretty_name",
			"def_name",
			"CPUs",
			"memory",
			"pool_size",
			"pod_name",
			"container_name",
		},
		nil,
	),
}

var cpuTimeStat *procstats.CPUTimeStat

// DescribeMetrics generates policy-specific prometheus metrics data descriptors.
func (p *podpools) DescribeMetrics() []*prometheus.Desc {
	return descriptors
}
// PollMetrics provides policy metrics for monitoring.
func (p *podpools) PollMetrics() policy.Metrics {
	if len(p.pools) == 0 {
		log.Error("failed to poll metrics: no pools")
		return nil
	}
	policyMetrics := &Metrics{}
	policyMetrics.PoolMetrics = make(map[string]*PoolMetrics, len(p.pools))
	for _, pool := range p.pools {
		pm := &PoolMetrics{
			DefName:    pool.Def.Name,
			PrettyName: pool.PrettyName(),
			CPUs:       pool.CPUs,
			CPUIds:     pool.CPUs.List(),
			MilliCPUs:  strconv.Itoa(pool.CPUs.Size() * 1000),
			Memory:     pool.Mems.String(),
		}
		policyMetrics.PoolMetrics[pool.PrettyName()] = pm
		if len(pool.PodIDs) > 0 {
			podIds := make([]string, 0, len(pool.PodIDs))
			for podId := range pool.PodIDs {
				podIds = append(podIds, podId)
			}
			sort.Strings(podIds)
			for _, podId := range podIds {
				for _, containerId := range pool.PodIDs[podId] {
					if container, ok := p.cch.LookupContainer(containerId); ok {
						containerName := container.PrettyName()
						if pm.ContainerNames == "" {
							pm.ContainerNames = containerName
						} else {
							pm.ContainerNames = fmt.Sprintf("%s,%s", pm.ContainerNames, containerName)
						}
					}
				}
				if pod, ok := p.cch.LookupPod(podId); ok {
					podName := pod.GetName()
					if pm.PodNames == "" {
						pm.PodNames = podName
					} else {
						pm.PodNames = fmt.Sprintf("%s,%s", pm.PodNames, podName)
					}
				}
			}
		}
	}
	return policyMetrics
}

// CollectMetrics generates prometheus metrics from cached/polled policy-specific metrics data.
func (p *podpools) CollectMetrics(m policy.Metrics) ([]prometheus.Metric, error) {
	metrics, ok := m.(*Metrics)
	if !ok {
		return nil, fmt.Errorf("type mismatch in podpools metrics")
	}
	if cpuTimeStat == nil {
		initSys, err := sysfs.DiscoverSystem()
		if err != nil {
			return nil, err
		}
		cpuCount := len(initSys.CPUIDs())
		cpuTimeStat = &procstats.CPUTimeStat{
			PrevIdleTime:       make([]uint64, cpuCount),
			PrevTotalTime:      make([]uint64, cpuCount),
			CurIdleTime:        make([]uint64, cpuCount),
			CurTotalTime:       make([]uint64, cpuCount),
			DeltaIdleTime:      make([]uint64, cpuCount),
			DeltaTotalTime:     make([]uint64, cpuCount),
			CPUUsage:           make([]float64, cpuCount),
			IsGetCPUUsageBegin: false,
		}
	}
	err := cpuTimeStat.GetCPUTimeStat()
	if err != nil {
		return nil, err
	}
	cpuMetrics, err := updateCPUUsageMetrics()
	if err != nil {
		return nil, err
	}
	poolCPUMetrics, err := updatePoolCPUUsageMetrics(metrics)
	if err != nil {
		return nil, err
	}
	return append(cpuMetrics, poolCPUMetrics...), nil
}
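// Worked example of the pool usage computed below in
// updatePoolCPUUsageMetrics (the numbers are illustrative): for a 4-CPU
// pool with all CPUs online, a summed delta idle time of 2000 and a
// summed delta total time of 4000 give (1 - 2000/4000) * 100 * 4 = 200,
// i.e. the pool is consuming two CPUs' worth of time.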
// updateCPUUsageMetrics collects the CPU usage per logical processor.
func updateCPUUsageMetrics() ([]prometheus.Metric, error) {
	cpuTimeStat.RLock()
	defer cpuTimeStat.RUnlock()
	sys, err := sysfs.DiscoverSystem()
	if err != nil {
		return nil, err
	}
	onlined := sys.CPUSet().Difference(sys.Offlined())
	onlinedUsage := make([]prometheus.Metric, onlined.Size())
	for i, j := range onlined.List() {
		onlinedUsage[i] = prometheus.MustNewConstMetric(
			descriptors[cpuUsageDesc],
			prometheus.GaugeValue,
			cpuTimeStat.CPUUsage[j],
			strconv.Itoa(j),
		)
	}
	return onlinedUsage, nil
}

// updatePoolCPUUsageMetrics collects the CPU usage of pools defined by the podpools policy.
func updatePoolCPUUsageMetrics(ppm *Metrics) ([]prometheus.Metric, error) {
	if ppm == nil {
		return nil, fmt.Errorf("podpools metrics needed to compute pool CPU usage are missing")
	}
	// Sort the pool metrics.
	poolNames := make([]string, 0, len(ppm.PoolMetrics))
	for poolName := range ppm.PoolMetrics {
		poolNames = append(poolNames, poolName)
	}
	sort.Strings(poolNames)
	// Calculate the CPU usage of each pool and send it to prometheus.
	poolCPUUsageMetrics := make([]prometheus.Metric, len(poolNames))
	poolCPUUsageList := make(map[string]float64, len(poolNames))
	cpuTimeStat.RLock()
	defer cpuTimeStat.RUnlock()
	for index, poolName := range poolNames {
		poolDeltaIdleTime := uint64(0)
		poolDeltaTotalTime := uint64(0)
		for _, cpuId := range ppm.PoolMetrics[poolName].CPUIds {
			poolDeltaIdleTime += cpuTimeStat.DeltaIdleTime[cpuId]
			poolDeltaTotalTime += cpuTimeStat.DeltaTotalTime[cpuId]
		}
		poolCPUUsageList[poolName] = 0.0
		if poolDeltaTotalTime != 0 {
			sys, err := sysfs.DiscoverSystem()
			if err != nil {
				return nil, err
			}
			poolCPUOnlined := ppm.PoolMetrics[poolName].CPUs.Difference(sys.Offlined())
			poolCPUUsageList[poolName] = (1.0 - float64(poolDeltaIdleTime)/float64(poolDeltaTotalTime)) * 100.0 * float64(len(poolCPUOnlined.List()))
		}
		poolCPUUsageMetrics[index] = prometheus.MustNewConstMetric(
			descriptors[poolCPUUsageDesc],
			prometheus.GaugeValue,
			poolCPUUsageList[poolName],
			PolicyName,
			poolName,
			ppm.PoolMetrics[poolName].DefName,
			ppm.PoolMetrics[poolName].CPUs.String(),
			ppm.PoolMetrics[poolName].Memory,
			ppm.PoolMetrics[poolName].MilliCPUs,
			ppm.PoolMetrics[poolName].PodNames,
			ppm.PoolMetrics[poolName].ContainerNames,
		)
	}
	return poolCPUUsageMetrics, nil
}

================================================
FILE: pkg/cri/resource-manager/policy/builtin/podpools/podpools-policy.go
================================================
// Copyright 2020-2021 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package podpools

import (
	"fmt"
	"sort"
	"strconv"
	"strings"

	corev1 "k8s.io/api/core/v1"
	resapi "k8s.io/apimachinery/pkg/api/resource"

	pkgcfg "github.com/intel/cri-resource-manager/pkg/config"
	"github.com/intel/cri-resource-manager/pkg/cpuallocator"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/events"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/introspect"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/kubernetes"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy"
	policyapi "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy"
	logger "github.com/intel/cri-resource-manager/pkg/log"
	"github.com/intel/cri-resource-manager/pkg/utils"
	"github.com/intel/cri-resource-manager/pkg/utils/cpuset"
	idset "github.com/intel/goresctrl/pkg/utils"
)

const (
	// PolicyName is the name used to activate this policy.
	PolicyName = "podpools"
	// PolicyDescription is a short description of this policy.
	PolicyDescription = "Pod-granularity workload placement"
	// PolicyPath is the path of this policy in the configuration hierarchy.
	PolicyPath = "policy." + PolicyName
	// podpoolKey is a pod annotation key, the value is a pod pool name.
	podpoolKey = "pool." + PolicyName + "." + kubernetes.ResmgrKeyNamespace
	// reservedPoolDefName is the name of the reserved pool definition.
	reservedPoolDefName = "reserved"
	// defaultPoolDefName is the name of the default pool definition.
	defaultPoolDefName = "default"
	// podMilliCPUErrorMargin is the maximum error in requested vs
	// allocated mCPUs per pod. For instance, a 10 mCPU error margin
	// allows an error of magnitude +-0.5 mCPU/container for up to 20
	// containers/pod.
	podMilliCPUErrorMargin = int64(10)
)

// podpools contains configuration and runtime attributes of the podpools policy.
type podpools struct {
	options         *policyapi.BackendOptions // configuration common to all policies
	ppoptions       PodpoolsOptions           // podpools-specific configuration
	cch             cache.Cache               // cri-resmgr cache
	allowed         cpuset.CPUSet             // bounding set of CPUs we're allowed to use
	reserved        cpuset.CPUSet             // system-/kube-reserved CPUs
	reservedPoolDef *PoolDef                  // built-in definition of the reserved pool
	defaultPoolDef  *PoolDef                  // built-in definition of the default pool
	pools           []*Pool                   // pools for pods: reserved, default and user-defined
	podMaxMilliCPU  map[string]int64          // maximum total MilliCPUs requested by containers of pods in pools
	cpuAllocator    cpuallocator.CPUAllocator // CPU allocator used by the policy
}

// Pool contains the attributes of a pool instance.
type Pool struct {
	// Def is the definition from which this pool instance is created.
	Def *PoolDef
	// Instance is the index of this pool instance, starting from
	// zero for every pool definition.
	Instance int
	// CPUs is the set of CPUs exclusive to this pool instance only.
	CPUs cpuset.CPUSet
	// Mems is the set of memory nodes with minimal access delay
	// from CPUs.
	Mems idset.IDSet
	// PodIDs maps pod ID to a list of container IDs.
	// - len(PodIDs) is the number of pods in the pool.
	// - len(PodIDs[podID]) is the number of containers of podID
	//   currently assigned to the pool.
	// - Def.MaxPods - len(PodIDs) is the free pod capacity.
	PodIDs map[string][]string
}

var log logger.Logger = logger.NewLogger("policy")
// String is a stringer for a pool.
func (pool Pool) String() string {
	podCount := len(pool.PodIDs)
	contCount := 0
	for _, contIDs := range pool.PodIDs {
		contCount += len(contIDs)
	}
	s := fmt.Sprintf("%s{cpus:%s, mems:%s, pods:%d/%d, containers:%d}",
		pool.PrettyName(), pool.CPUs, pool.Mems,
		podCount, pool.Def.MaxPods, contCount)
	return s
}

// PrettyName returns a unique name for a pool.
func (pool Pool) PrettyName() string {
	return fmt.Sprintf("%s[%d]", pool.Def.Name, pool.Instance)
}

// CreatePodpoolsPolicy creates a new policy instance.
func CreatePodpoolsPolicy(policyOptions *policy.BackendOptions) policy.Backend {
	p := &podpools{
		options: policyOptions,
		cch:     policyOptions.Cache,
		reservedPoolDef: &PoolDef{
			Name:    reservedPoolDefName,
			MaxPods: 0,
		},
		defaultPoolDef: &PoolDef{
			Name:    defaultPoolDefName,
			MaxPods: 0,
		},
		podMaxMilliCPU: make(map[string]int64),
		cpuAllocator:   cpuallocator.NewCPUAllocator(policyOptions.System),
	}
	log.Info("creating %s policy...", PolicyName)
	// Handle common policy options: AvailableResources and ReservedResources.
	// p.allowed: CPUs available to the policy.
	if allowed, ok := policyOptions.Available[policyapi.DomainCPU]; ok {
		p.allowed = allowed.(cpuset.CPUSet)
	} else {
		// Available CPUs not specified, default to all on-line CPUs.
		p.allowed = policyOptions.System.CPUSet().Difference(policyOptions.System.Offlined())
	}
	// p.reserved: CPUs reserved for kube-system pods, a subset of p.allowed.
	p.reserved = cpuset.New()
	if reserved, ok := p.options.Reserved[policyapi.DomainCPU]; ok {
		switch v := reserved.(type) {
		case cpuset.CPUSet:
			p.reserved = p.allowed.Intersection(v)
		case resapi.Quantity:
			reserveCnt := (int(v.MilliValue()) + 999) / 1000
			cpus, err := p.cpuAllocator.AllocateCpus(&p.allowed, reserveCnt, cpuallocator.PriorityNone)
			if err != nil {
				log.Fatal("failed to allocate reserved CPUs: %s", err)
			}
			p.reserved = cpus
			p.allowed = p.allowed.Union(cpus)
		}
	}
	if p.reserved.IsEmpty() {
		log.Fatal("%s cannot run without reserved CPUs that are also AvailableResources", PolicyName)
	}
	// Handle policy-specific options.
	log.Debug("creating %s configuration", PolicyName)
	if err := p.setConfig(podpoolsOptions); err != nil {
		log.Fatal("failed to create %s policy: %v", PolicyName, err)
	}
	pkgcfg.GetModule(PolicyPath).AddNotify(p.configNotify)
	return p
}
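// Note on the ReservedResources CPU quantity handled above (an
// illustration, the quantity is made up): a milli-CPU quantity is
// rounded up to full CPUs, so "cpu: 750m" reserves
// (750+999)/1000 = 1 CPU, allocated out of the allowed set.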
return podpoolsError("cannot find CPUs to run container %s - no default or reserved CPUs available", c.PrettyName()) } return nil } // ReleaseResources is a resource release request for this policy. func (p *podpools) ReleaseResources(c cache.Container) error { log.Debug("releasing container %s...", c.PrettyName()) pod, ok := c.GetPod() if !ok { return podpoolsError("cannot find pod of container %s from the cache", c.PrettyName()) } if pool := p.allocatedPool(pod); pool != nil { p.dismissContainer(c, pool) if log.DebugEnabled() { log.Debug(p.dumpPool(pool)) } if p.containersInPool(pod, pool) == 0 { log.Debug("all containers removed, free pool allocation %s for pod %q", pool.PrettyName(), pod.GetName()) p.validatePodCPU(pod, pool) p.freePool(pod, pool) } } else { log.Debug("ReleaseResources: pool-less container %s, nothing to release", c.PrettyName()) } return nil } // UpdateResources is a resource allocation update request for this policy. func (p *podpools) UpdateResources(c cache.Container) error { log.Debug("(not) updating container %s...", c.PrettyName()) return nil } // Rebalance tries to find an optimal allocation of resources for the current containers. func (p *podpools) Rebalance() (bool, error) { log.Debug("(not) rebalancing containers...") return false, nil } // HandleEvent handles policy-specific events. func (p *podpools) HandleEvent(*events.Policy) (bool, error) { log.Debug("(not) handling event...") return false, nil } // ExportResourceData provides resource data to export for the container. func (p *podpools) ExportResourceData(c cache.Container) map[string]string { return nil } // Introspect provides data for external introspection. func (p *podpools) Introspect(*introspect.State) { return } // allocatedPool returns a pool already allocated for a pod. func (p *podpools) allocatedPool(pod cache.Pod) *Pool { podID := pod.GetID() pools := filterPools(p.pools, func(pl *Pool) bool { _, ok := pl.PodIDs[podID]; return ok }) if len(pools) == 0 { return nil } return pools[0] } // allocatePool returns a pool allocated for a pod. func (p *podpools) allocatePool(pod cache.Pod) *Pool { if pool := p.allocatedPool(pod); pool != nil { return pool } poolDef := p.getPoolDef(pod) if poolDef == nil { return nil } // Try to find a suitable pool and allocate it for the pod. pools := filterPools(p.pools, func(pl *Pool) bool { return poolDef.Name == pl.Def.Name && (pl.Def.MaxPods > len(pl.PodIDs) || pl.Def.MaxPods == 0) }) // Sort pools according to pool type fill order so that the // first pool in the list is the preferred one. switch poolDef.FillOrder { case FillBalanced: sort.Slice(pools, func(i, j int) bool { return len(pools[i].PodIDs) < len(pools[j].PodIDs) }) case FillPacked: sort.Slice(pools, func(i, j int) bool { return len(pools[i].PodIDs) > len(pools[j].PodIDs) }) case FillFirstFree: // FirstFree is already the first of the pools list. } if len(pools) == 0 { log.Error("cannot find free %q pool for pod %q, falling back to %q", poolDef.Name, pod.GetName(), defaultPoolDefName) pools = []*Pool{p.pools[1]} } // Found a suitable pool. Allocate it for the pod. podID := pod.GetID() pool := pools[0] pool.PodIDs[podID] = []string{} log.Debug("allocated pool %s[%d] for pod %q", pool.Def.Name, pool.Instance, pod.GetName()) return pool } // containersInPool returns the number of containers of a pod in a pool. func (p *podpools) containersInPool(pod cache.Pod, pool *Pool) int { if cnts, ok := pool.PodIDs[pod.GetID()]; ok { return len(cnts) } return 0 } // dumpPool dumps pool contents in detail. 
// containersInPool returns the number of containers of a pod in a pool.
func (p *podpools) containersInPool(pod cache.Pod, pool *Pool) int {
	if cnts, ok := pool.PodIDs[pod.GetID()]; ok {
		return len(cnts)
	}
	return 0
}

// dumpPool dumps pool contents in detail.
func (p *podpools) dumpPool(pool *Pool) string {
	conts := []string{}
	pods := []string{}
	for podID, contIDs := range pool.PodIDs {
		podName := podID
		if pod, ok := p.cch.LookupPod(podID); ok {
			podName = pod.GetName()
		}
		pods = append(pods, fmt.Sprintf("%s (mCPU: %d, max=%d)", podName, p.getPodMilliCPU(podID), p.podMaxMilliCPU[podID]))
		for _, contID := range contIDs {
			if cont, ok := p.cch.LookupContainer(contID); ok {
				conts = append(conts, cont.PrettyName())
			} else {
				conts = append(conts, podName+":"+contID)
			}
		}
	}
	s := fmt.Sprintf("Pool{Def.Name: %q, Instance: %d, CPUs: %s, Mems: %s, Def.MaxPods: %d, pods: %v, containers:%v}",
		pool.Def.Name, pool.Instance, pool.CPUs, pool.Mems, pool.Def.MaxPods, pods, conts)
	return s
}

// freePool removes an empty pod from a pool.
func (p *podpools) freePool(pod cache.Pod, pool *Pool) {
	podID := pod.GetID()
	delete(pool.PodIDs, podID)
	delete(p.podMaxMilliCPU, podID)
}

// trackPodCPU keeps track of a pod's CPU requests.
func (p *podpools) trackPodCPU(pod cache.Pod, pool *Pool) {
	// As we do not have direct information on the total CPU resources
	// requested by a pod, we gather the information indirectly by
	// tracking the sum of the requested CPUs of its running
	// containers. This enables reacting to misalignment between
	// CPU resources per pod in a pool and the CPU resource requests
	// visible to the kube-scheduler.
	podID := pod.GetID()
	current := p.getPodMilliCPU(podID)
	if max, ok := p.podMaxMilliCPU[podID]; ok {
		if max < current {
			p.podMaxMilliCPU[podID] = current
		}
	} else {
		p.podMaxMilliCPU[podID] = current
	}
	// Check for overbooking.
	if cpuAvail := p.availableMilliCPUs(pool); cpuAvail < 0 {
		log.Error("overbooked pool %q, cpuset:%s: %dm / %dm CPUs used, %d mCPU available",
			pool.PrettyName(), pool.CPUs,
			pool.CPUs.Size()*1000-int(cpuAvail), pool.CPUs.Size()*1000, cpuAvail)
	}
}

// validatePodCPU compares max CPU requests against pool CPU capacity per pod.
func (p *podpools) validatePodCPU(pod cache.Pod, pool *Pool) {
	// Log a pod configuration error if a pool has a fixed amount of
	// CPUs per pod but the pod failed to request the correct
	// amount.
	podID := pod.GetID()
	if podmCPU, ok := p.podMaxMilliCPU[podID]; ok {
		if pool.Def.MaxPods > 0 {
			poolmCPUperPod := int64(pool.CPUs.Size() * 1000 / pool.Def.MaxPods)
			mCPUerr := podmCPU - poolmCPUperPod
			// Allow rounding errors (up and down) when
			// comparing the sum of containers' CPU usages
			// against milli-CPUs allocated per pod in its
			// pool.
			if mCPUerr < -podMilliCPUErrorMargin || mCPUerr > podMilliCPUErrorMargin {
				podName := ""
				if pod, ok := p.cch.LookupPod(podID); ok {
					podName = pod.GetName()
				}
				log.Error("bad CPU requests: pod %q requested %d mCPUs, but in pool %q pods must request %d mCPUs.",
					podName, podmCPU, pool.Def.Name, poolmCPUperPod)
			}
		}
	}
}
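// Worked example of the margin check above (the numbers are
// illustrative): in a pool with 4 CPUs and MaxPods: 2, each pod should
// request 4*1000/2 = 2000 mCPU. A pod whose containers sum to 1995 mCPU
// is within the +-10 mCPU margin, while 1900 mCPU triggers the error
// log.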
// getPodMilliCPU returns the mCPUs requested by podID.
func (p *podpools) getPodMilliCPU(podID string) int64 {
	cpuRequested := int64(0)
	for _, c := range p.cch.GetContainers() {
		if c.GetPodID() == podID {
			if reqCpu, ok := c.GetResourceRequirements().Requests[corev1.ResourceCPU]; ok {
				cpuRequested += reqCpu.MilliValue()
			}
		}
	}
	return cpuRequested
}

// configNotify applies a new configuration.
func (p *podpools) configNotify(event pkgcfg.Event, source pkgcfg.Source) error {
	log.Info("configuration %s", event)
	if err := p.setConfig(podpoolsOptions); err != nil {
		log.Error("config update failed: %v", err)
		return err
	}
	log.Info("config updated successfully")
	p.Sync(p.cch.GetContainers(), nil)
	return nil
}

// getPoolDefName returns the name of the pool definition of a pod.
func (p *podpools) getPoolDefName(pod cache.Pod) string {
	if poolDefName, ok := pod.GetEffectiveAnnotation(podpoolKey, ""); ok {
		return poolDefName
	}
	if pod.GetNamespace() == "kube-system" {
		return reservedPoolDefName
	}
	return defaultPoolDefName
}

// getPoolDef returns the pool definition of a pod.
func (p *podpools) getPoolDef(pod cache.Pod) *PoolDef {
	poolDefName := p.getPoolDefName(pod)
	if poolDefName == reservedPoolDefName {
		return p.reservedPoolDef
	}
	if poolDefName == defaultPoolDefName {
		return p.defaultPoolDef
	}
	for _, poolDef := range p.ppoptions.PoolDefs {
		if poolDef.Name == poolDefName {
			return poolDef
		}
	}
	log.Error("pod %q pool %q does not match any pool definition, falling back to %q",
		pod.GetName(), poolDefName, p.defaultPoolDef.Name)
	return p.defaultPoolDef
}

// applyPoolDef creates user-defined pools or reconfigures built-in
// pools according to the poolDef.
func (p *podpools) applyPoolDef(pools *[]*Pool, poolDef *PoolDef, freeCpus *cpuset.CPUSet, nonReservedCpuCount int) error {
	if len(*pools) < 2 {
		return podpoolsError("internal error: reserved and default pools missing, cannot apply pool definitions")
	}
	reservedPool := (*pools)[0]
	defaultPool := (*pools)[1]
	// Every PoolDef does one of the following:
	// 1. reconfigures the "reserved" pool (most restricted)
	// 2. reconfigures the "default" pool (somewhat restricted)
	// 3. defines new user-defined pools.
	switch poolDef.Name {
	case "":
		// Case 0: bad name.
		return podpoolsError("undefined or empty pool name")
	case reservedPool.Def.Name:
		// Case 1: reconfigure the "reserved" pool.
		// Forbid redefinition of CPU and Instances.
		if poolDef.CPU != "" || poolDef.Instances != "" {
			poolCount, cpusPerPool, err := parseInstancesCPUs(poolDef.Instances, poolDef.CPU, nonReservedCpuCount)
			if err != nil {
				return podpoolsError("pool %q: %w", poolDef.Name, err)
			}
			if poolCount != 1 {
				return podpoolsError("pool %q: cannot change the number of instances", poolDef.Name)
			}
			if cpusPerPool != reservedPool.CPUs.Size() {
				return podpoolsError("pool %q: number of CPUs conflicts with ReservedResources CPUs", poolDef.Name)
			}
		}
		reservedPool.Def.MaxPods = poolDef.MaxPods
	case defaultPool.Def.Name:
		// Case 2: reconfigure the "default" pool.
		// Allow redefinition of CPU but not Instances.
		if poolDef.CPU != "" || poolDef.Instances != "" {
			poolCount, cpusPerPool, err := parseInstancesCPUs(poolDef.Instances, poolDef.CPU, nonReservedCpuCount)
			if err != nil {
				return podpoolsError("pool %q: %w", poolDef.Name, err)
			}
			if poolCount != 1 {
				return podpoolsError("pool %q: cannot change the number of instances", poolDef.Name)
			}
			cpus, err := p.cpuAllocator.AllocateCpus(freeCpus, cpusPerPool, cpuallocator.PriorityNormal)
			if err != nil {
				return podpoolsError("could not allocate %d CPUs for pool %q: %w", cpusPerPool, poolDef.Name, err)
			}
			defaultPool.CPUs = cpus
		}
		defaultPool.Def.MaxPods = poolDef.MaxPods
	default:
		// Case 3: create new user-defined pool(s).
poolCount, cpusPerPool, err := parseInstancesCPUs(poolDef.Instances, poolDef.CPU, nonReservedCpuCount) if err != nil { return podpoolsError("pool %q: %w", poolDef.Name, err) } if poolCount == 0 { return podpoolsError("pool %q: insufficient CPUs to create any instances", poolDef.Name) } if poolCount > 1 && poolDef.FillOrder == FillPacked && poolDef.MaxPods == 0 { return podpoolsError("pool %q: %d pool(s) unreachable due to unlimited pod capacity and FillOrder: %s", poolDef.Name, poolCount-1, poolDef.FillOrder) } log.Debug("allocating %d out of %d non-reserved CPUs for %d %q pools", poolCount*cpusPerPool, nonReservedCpuCount, poolCount, poolDef.Name) for poolIndex := 0; poolIndex < poolCount; poolIndex++ { if cpusPerPool > freeCpus.Size() { return podpoolsError("insufficient CPUs when trying to allocate %d CPUs for pool %s[%d]", cpusPerPool, poolDef.Name, poolIndex) } cpus, err := p.cpuAllocator.AllocateCpus(freeCpus, cpusPerPool, cpuallocator.PriorityNormal) if err != nil { return podpoolsError("could not allocate %d CPUs for instance %d of pool %q: %w", cpusPerPool, poolIndex, poolDef.Name, err) } pool := Pool{ Def: poolDef, Instance: poolIndex, CPUs: cpus, } *pools = append(*pools, &pool) } } return nil } // setConfig takes new pool configuration into use. func (p *podpools) setConfig(ppoptions *PodpoolsOptions) error { // Instantiate pools for pods. pools := []*Pool{} // Built-in reserved pool. reservedPool := Pool{ Def: p.reservedPoolDef, CPUs: p.reserved, } pools = append(pools, &reservedPool) // Built-in default pool. // The default pool will use reserved CPUs by default. If CPUs // are left over after constructing user-defined pools, those // will be used as the default pool instead. defaultPool := Pool{ Def: p.defaultPoolDef, CPUs: reservedPool.CPUs, } pools = append(pools, &defaultPool) // Apply pool definitions from configuration. freeCpus := p.allowed.Clone() freeCpus = freeCpus.Difference(p.reserved) nonReservedCpuCount := freeCpus.Size() userPoolDefs := 0 // First apply customizations to built-in pools: "reserved" // and "default". for _, poolDef := range ppoptions.PoolDefs { if poolDef.Name != reservedPoolDefName && poolDef.Name != defaultPoolDefName { continue } if err := p.applyPoolDef(&pools, poolDef, &freeCpus, nonReservedCpuCount); err != nil { return err } } // Update nonReservedCpuCount: if the default pool is customized // with its own CPUs, do not count those CPUs in the // "Instances: 100%" syntax of user-defined pools. nonReservedCpuCount = freeCpus.Size() // Apply all user pool definitions, skip "reserved" and "default". for _, poolDef := range ppoptions.PoolDefs { if poolDef.Name == reservedPoolDefName || poolDef.Name == defaultPoolDefName { continue } if err := p.applyPoolDef(&pools, poolDef, &freeCpus, nonReservedCpuCount); err != nil { return err } userPoolDefs += 1 } // Check if there are unallocated CPUs. if freeCpus.Size() > 0 { if defaultPool.CPUs.Intersection(reservedPool.CPUs).IsEmpty() { // User has reallocated "default" pool CPUs log.Debug("%d unused CPUs are added to the default pool.", freeCpus.Size()) defaultPool.CPUs = defaultPool.CPUs.Union(freeCpus) } else { log.Debug("%d unused CPUs are used as the default pool.", freeCpus.Size()) defaultPool.CPUs = freeCpus } } // Finish pool instance initialization.
log.Info("%s policy pools:", PolicyName) for index, pool := range pools { pool.Mems = p.closestMems(pool.CPUs) pool.PodIDs = make(map[string][]string) log.Info("- pool %d: %s", index, pool) } // No errors in pool creation, take new configuration into use. log.Debug("new %s configuration:\n%s", PolicyName, utils.DumpJSON(ppoptions)) p.pools = pools p.ppoptions = *ppoptions // Warning on multiple user-defined pools. if userPoolDefs > 1 { log.Warn("Multiple (%d) user-defined pool definitions on the node. kube-scheduler does not know which of the pools has CPUs left for new workloads, and may overbook pools on the node.", userPoolDefs) } return nil } // closestMems returns memory node IDs good for pinning containers // that run on given CPUs func (p *podpools) closestMems(cpus cpuset.CPUSet) idset.IDSet { mems := idset.NewIDSet() sys := p.options.System for _, nodeID := range sys.NodeIDs() { if !cpus.Intersection(sys.Node(nodeID).CPUSet()).IsEmpty() { mems.Add(nodeID) } } return mems } // filterPools returns pools for which the test function returns true func filterPools(pools []*Pool, test func(*Pool) bool) (ret []*Pool) { for _, pool := range pools { if test(pool) { ret = append(ret, pool) } } return } // parseInstancesCPUs parses the number of pool instances and the // number of CPUs per pool instance from PoolDef Instances and CPUs // fields. func parseInstancesCPUs(is string, cs string, freeCpus int) (int, int, error) { if cs == "" { return 0, 0, podpoolsError("missing CPUs") } c64, err := strconv.ParseInt(cs, 0, 32) if err != nil || c64 <= 0 { return 0, 0, podpoolsError("invalid CPUs per pool: %q, integer > 1 expected", cs) } cpusPerPool := int(c64) // Supported Instances specifications: // 0. Instances is an empty string. // Create 1 instance. // 1. Instances: N % // Use at most N % of freeCpus for all PoolDef instances. // The number of instances is floor(freeCpus * N/100 / cpusPerPool). // 2. Instances: N CPUs // Use at most N CPUs for all PoolDef instances. // The number of instances is floor(N / cpusPerPool). // 3. Instances: N // Create N instances from PoolDef. var instances int switch { case is == "": instances = 1 case strings.HasSuffix(is, "%"): tis := strings.TrimSpace(strings.TrimSuffix(is, "%")) i64, err := strconv.ParseInt(tis, 0, 32) if err != nil || i64 < 0 { return 0, 0, podpoolsError("invalid Instances: %q", is) } instances = freeCpus * int(i64) / 100 / cpusPerPool case strings.HasSuffix(strings.ToLower(is), "cpu"): // All these are equivalent: N(cpu|cpus|CPU|CPUs|CPUS) for any N > 0. // Handling "CPU" suffix is an alias for "CPUs". is = strings.TrimSpace(strings.TrimSuffix(strings.ToLower(is), "cpu")) + "cpus" fallthrough case strings.HasSuffix(strings.ToLower(is), "cpus"): tis := strings.TrimSpace(strings.TrimSuffix(strings.ToLower(is), "cpus")) i64, err := strconv.ParseInt(tis, 0, 32) if err != nil || i64 < 0 { return 0, 0, podpoolsError("invalid Instances: %q", is) } if i64 > int64(freeCpus) { return 0, 0, podpoolsError("insufficient CPUs: %d required for instances but %d is available", i64, freeCpus) } instances = int(i64) / cpusPerPool default: i64, err := strconv.ParseInt(is, 0, 32) if err != nil || i64 < 0 { return 0, 0, podpoolsError("invalid Instances: %q", is) } instances = int(i64) } return instances, cpusPerPool, nil } // availableMilliCPU returns mCPUs available in a pool. 
func (p *podpools) availableMilliCPUs(pool *Pool) int64 { cpuAvail := int64(pool.CPUs.Size() * 1000) cpuRequested := int64(0) for podID := range pool.PodIDs { cpuRequested += p.getPodMilliCPU(podID) } return cpuAvail - cpuRequested } // assignContainer adds a container to a pool. func (p *podpools) assignContainer(c cache.Container, pool *Pool) { log.Info("assigning container %s to pool %s", c.PrettyName(), pool) podID := c.GetPodID() pool.PodIDs[podID] = append(pool.PodIDs[podID], c.GetCacheID()) p.pinCpuMem(c, pool.CPUs, pool.Mems) } // dismissContainer removes a container from a pool. func (p *podpools) dismissContainer(c cache.Container, pool *Pool) { podID := c.GetPodID() pool.PodIDs[podID] = removeString(pool.PodIDs[podID], c.GetCacheID()) } // pinCpuMem pins a container to CPUs and memory nodes, if so configured. func (p *podpools) pinCpuMem(c cache.Container, cpus cpuset.CPUSet, mems idset.IDSet) { if p.ppoptions.PinCPU { log.Debug(" - pinning to cpuset: %s", cpus) c.SetCpusetCpus(cpus.String()) if reqCpu, ok := c.GetResourceRequirements().Requests[corev1.ResourceCPU]; ok { mCpu := int(reqCpu.MilliValue()) c.SetCPUShares(int64(cache.MilliCPUToShares(int64(mCpu)))) } } if p.ppoptions.PinMemory { log.Debug(" - pinning to memory %s", mems) c.SetCpusetMems(mems.String()) } } // podpoolsError formats an error from this policy. func podpoolsError(format string, args ...interface{}) error { return fmt.Errorf(PolicyName+": "+format, args...) } // removeString removes the first occurrence of a string from a string slice. func removeString(strings []string, element string) []string { for index, s := range strings { if s == element { strings[index] = strings[len(strings)-1] return strings[:len(strings)-1] } } return strings } // Register us as a policy implementation. func init() { policy.Register(PolicyName, PolicyDescription, CreatePodpoolsPolicy) } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/podpools/podpools-policy_test.go ================================================ // Copyright 2020-2021 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package podpools import ( "fmt" "strings" "testing" "github.com/intel/cri-resource-manager/pkg/cpuallocator" "github.com/intel/cri-resource-manager/pkg/utils/cpuset" ) func validateError(t *testing.T, expectedError string, err error) bool { if expectedError != "" { if err == nil { t.Errorf("Expected error containing %q, did not get any error", expectedError) return false } else if !strings.Contains(err.Error(), expectedError) { t.Errorf("Expected error containing %q, but got %q", expectedError, err.Error()) return false } } else { if err != nil { t.Errorf("Unexpected error %s", err) return false } } return true } func assertEqualPools(t *testing.T, expectedPool, gotPool Pool) bool { if expectedPool.String() != gotPool.String() { // Compares Def.Name, .Instance, .CPUs, .Mems, Def.MaxPods // and assigned pods/containers.
t.Errorf("expected pool %s, got %s", expectedPool, gotPool) return false } if expectedPool.Def.Instances != gotPool.Def.Instances { t.Errorf("pools %s: PoolDef.Instances differ: expected %q, got %q", expectedPool, expectedPool.Def.Instances, gotPool.Def.Instances) return false } if expectedPool.Def.FillOrder != gotPool.Def.FillOrder { t.Errorf("pools %s: PoolDef.FillOrder differ: expected %s, got %s", expectedPool, expectedPool.Def.FillOrder, gotPool.Def.FillOrder) return false } return true } type mockCpuAllocator struct{} func (mca *mockCpuAllocator) AllocateCpus(from *cpuset.CPUSet, cnt int, dontcare cpuallocator.CPUPriority) (cpuset.CPUSet, error) { switch { case from.Size() < cnt: return cpuset.New(), fmt.Errorf("cpuset %s does not have %d CPUs", from, cnt) case from.Size() == cnt: result := from.Clone() *from = cpuset.New() return result, nil default: result := cpuset.New() for _, cpu := range from.List() { if result.Size() >= cnt { break } result = result.Union(cpuset.New(cpu)) } *from = from.Difference(result) return result, nil } } func (mca *mockCpuAllocator) ReleaseCpus(*cpuset.CPUSet, int, cpuallocator.CPUPriority) (cpuset.CPUSet, error) { return cpuset.New(), nil } func TestApplyPoolDef(t *testing.T) { reservedCpus1 := cpuset.CPUSet{} reservedPoolDef := PoolDef{ Name: reservedPoolDefName, } defaultPoolDef := PoolDef{ Name: defaultPoolDefName, } reservedPool := Pool{ Def: &reservedPoolDef, CPUs: reservedCpus1, } defaultPool := Pool{ Def: &defaultPoolDef, CPUs: reservedCpus1, } normalPoolsAtStart := []Pool{reservedPool, defaultPool} singlecpuSingleInstance := PoolDef{ Name: "singlecpu", CPU: "1", } quadcpuDualInstance := PoolDef{ Name: "quadcpu", CPU: "4", Instances: "8 CPUs", } quadcpuMultiInstance := PoolDef{ Name: "quadcpu", CPU: "4", Instances: "100%", } tcases := []struct { name string pools *[]Pool poolDef PoolDef freeCpus string // example: "0-2" expectedFreeCpus string // "": no check, "-": assert empty expectedError string // "": error is not allowed, otherwise expected error substring expectedPools *[]Pool }{ // negative tests { name: "call apply without built-in pools", pools: &([]Pool{}), poolDef: singlecpuSingleInstance, freeCpus: "0-3", expectedError: "pools missing", }, { name: "bad reserved CPUs", poolDef: PoolDef{ Name: "reserved", CPU: "two", }, expectedError: "invalid CPUs", }, { name: "bad reserved Instances", poolDef: PoolDef{ Name: "reserved", CPU: "1", Instances: "0x", }, expectedError: "invalid Instances", }, { name: "bad default CPUs", poolDef: PoolDef{ Name: "default", CPU: "2500m", }, freeCpus: "0-8", expectedError: "invalid CPUs", }, { name: "bad default Instances", poolDef: PoolDef{ Name: "default", CPU: "0xf", Instances: "100 % CPUs", }, freeCpus: "0-95", expectedError: "invalid Instances", }, { name: "bad user-defined CPUs", poolDef: PoolDef{ Name: "mypool", }, freeCpus: "0-8", expectedError: "missing CPUs", }, { name: "too many CPUs on user-defined Instances", poolDef: PoolDef{ Name: "user pool", CPU: "1", Instances: "100 CPUs", }, freeCpus: "0-95", expectedError: "insufficient CPUs", }, { name: "unnamed pool", poolDef: PoolDef{ CPU: "1", MaxPods: 1, }, freeCpus: "0-3", expectedError: "undefined or empty pool name", }, { name: "unreachable pools", poolDef: PoolDef{ Name: "unlimited capacity", CPU: "3", MaxPods: 0, FillOrder: FillPacked, Instances: "3", }, freeCpus: "0-95", expectedError: "2 pool(s) unreachable", }, // redefine the reserved pool { name: "redefine reserved CPUs", poolDef: PoolDef{ Name: "reserved", CPU: "2", }, freeCpus: "0-3", 
expectedError: "conflicting ReservedResources CPUs", }, { name: "redefine reserved instances", poolDef: PoolDef{ Name: "reserved", CPU: "1", Instances: "2", }, freeCpus: "0-3", expectedError: "cannot change the number of instances", }, { name: "redefine reserved MaxPods", poolDef: PoolDef{ Name: "reserved", MaxPods: 42, }, freeCpus: "0-3", expectedPools: &[]Pool{ { Def: &PoolDef{ Name: reservedPoolDefName, MaxPods: 42, }, CPUs: reservedPool.CPUs, }, defaultPool, }, }, // redefine the default pool { name: "redefine default CPUs", poolDef: PoolDef{ Name: "default", CPU: "2", }, freeCpus: "0-3", expectedFreeCpus: "2-3", expectedPools: &[]Pool{ reservedPool, { Def: &PoolDef{ Name: defaultPoolDefName, }, CPUs: cpuset.MustParse("0-1"), }, }, }, { name: "redefine default instances", poolDef: PoolDef{ Name: "default", CPU: "1", Instances: "2", }, freeCpus: "0-3", expectedError: "cannot change the number of instances", }, { name: "redefine default MaxPods", poolDef: PoolDef{ Name: "default", MaxPods: 52, }, freeCpus: "0-3", expectedPools: &[]Pool{ reservedPool, { Def: &PoolDef{ Name: defaultPoolDefName, MaxPods: 52, }, CPUs: defaultPool.CPUs, }, }, }, // user-defined pools { name: "use one CPUs - insufficient", poolDef: singlecpuSingleInstance, expectedError: "insufficient CPUs", }, { name: "use one CPU", freeCpus: "0-3", poolDef: singlecpuSingleInstance, expectedFreeCpus: "1-3", expectedPools: &[]Pool{ reservedPool, defaultPool, { Def: &singlecpuSingleInstance, Instance: 0, CPUs: cpuset.MustParse("0"), }, }, }, { name: "use the only CPU", freeCpus: "0", poolDef: singlecpuSingleInstance, expectedFreeCpus: "-", }, { name: "use 2x4 CPUs - insufficient", freeCpus: "0-6", poolDef: quadcpuDualInstance, expectedError: "insufficient CPUs", }, { name: "use 2x4 CPUs - consume all", freeCpus: "0-7", poolDef: quadcpuDualInstance, expectedFreeCpus: "-", }, { name: "use 2x4 CPUs - CPUs left", freeCpus: "0-8", poolDef: quadcpuDualInstance, expectedFreeCpus: "8", }, { name: "use all cpus - but insufficient", freeCpus: "0-2", poolDef: quadcpuMultiInstance, expectedError: "insufficient CPUs", }, { name: "use all cpus - partial", freeCpus: "0-6", poolDef: quadcpuMultiInstance, expectedFreeCpus: "4-6", expectedPools: &[]Pool{ reservedPool, defaultPool, { Def: &quadcpuMultiInstance, Instance: 0, CPUs: cpuset.MustParse("0-3"), }, }, }, { name: "use all cpus - every single one", freeCpus: "0-7", poolDef: quadcpuMultiInstance, expectedFreeCpus: "-", expectedPools: &[]Pool{ reservedPool, defaultPool, { Def: &quadcpuMultiInstance, Instance: 0, CPUs: cpuset.MustParse("0-3"), }, { Def: &quadcpuMultiInstance, Instance: 1, CPUs: cpuset.MustParse("4-7"), }, }, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { // Tests should not change original pools/pooldefs/freeCpus // Create copies before calling the function. 
pools := []*Pool{} if tc.pools == nil { tc.pools = &normalPoolsAtStart } for i := range *tc.pools { copyOfPool := (*tc.pools)[i] pools = append(pools, &copyOfPool) } freeCpus := cpuset.New() if tc.freeCpus != "" { freeCpus = cpuset.MustParse(tc.freeCpus) } p := &podpools{ cpuAllocator: &mockCpuAllocator{}, } err := p.applyPoolDef(&pools, &tc.poolDef, &freeCpus, freeCpus.Size()) if ok := validateError(t, tc.expectedError, err); ok { // check freeCpus modified by applyPoolDef if tc.expectedFreeCpus != "" { expectedFreeCpus := cpuset.New() if tc.expectedFreeCpus != "-" { expectedFreeCpus = cpuset.MustParse(tc.expectedFreeCpus) } if expectedFreeCpus.Size() != freeCpus.Size() { t.Errorf("unexpected number of free CPUs left, expected %d, got %d", expectedFreeCpus.Size(), freeCpus.Size()) } } // check pools modified by applyPoolDef if tc.expectedPools != nil { if len(pools) != len(*tc.expectedPools) { t.Errorf("unexpected number of new pools, expected %d, got %d", len(*tc.expectedPools), len(pools)) return } for i := 0; i < len(pools); i++ { if !assertEqualPools(t, (*tc.expectedPools)[i], *pools[i]) { return } } } } }) } } func TestParseInstancesCPUs(t *testing.T) { tcases := []struct { name string instances string cpus string freeCpus int expectedInstances int expectedCPUs int expectedError string }{ { name: "empty CPUs", expectedError: "missing CPUs", }, { name: "bad CPUs", cpus: "55%", expectedError: ">= 1 expected", }, { name: "zero CPUs", cpus: "0", expectedError: ">= 1 expected", }, { name: "negative CPUs", cpus: "-1", expectedError: ">= 1 expected", }, { name: "42 CPUs, empty instances defaults to 1", cpus: "42", expectedCPUs: 42, expectedInstances: 1, }, { name: "instances: 0", instances: "0", cpus: "2", freeCpus: 100, expectedInstances: 0, expectedCPUs: 2, }, { name: "instances: N", instances: "10", cpus: "2", freeCpus: 100, expectedInstances: 10, expectedCPUs: 2, }, { name: "instances: N CPUs", instances: "10 CPUs", cpus: "2", freeCpus: 100, expectedInstances: 10 / 2, expectedCPUs: 2, }, { name: "instances: 1 CPUS", instances: "1 CPUS", cpus: "1", freeCpus: 1, expectedInstances: 1, expectedCPUs: 1, }, { name: "instances: 1 cpu", instances: "1 cpu", cpus: "1", freeCpus: 2, expectedInstances: 1, expectedCPUs: 1, }, { name: "instances: 8cpu", instances: "8cpu", cpus: "2", freeCpus: 9, expectedInstances: 4, expectedCPUs: 2, }, { name: "instances: N %", instances: "90 %", cpus: "2", freeCpus: 10, expectedInstances: 4, // 10 * (90/100) / 2 expectedCPUs: 2, }, { name: "instances: N%", instances: "90%", cpus: "90", freeCpus: 100, expectedInstances: 1, expectedCPUs: 90, }, { name: "instances: N %, not enough for any pools", instances: "10 %", cpus: "2", freeCpus: 10, expectedInstances: 0, // 10 * (10/100) / 2 expectedCPUs: 2, }, { name: "instances: -N", instances: "-10", cpus: "2", expectedError: "invalid Instances", }, { name: "instances: -N CPUs", instances: "-10 CPUs", cpus: "2", expectedError: "invalid Instances", }, { name: "instances: N CPUs CPU", instances: "2 CPUs CPU", cpus: "2", expectedError: "invalid Instances", }, { name: "instances: -N %", instances: "-10 %", cpus: "2", expectedError: "invalid Instances", }, { name: "instances: N CPUs, N < cpus", instances: "3 CPUs", cpus: "4", expectedError: "insufficient CPUs", }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { instances, cpus, err := parseInstancesCPUs(tc.instances, tc.cpus, tc.freeCpus) if ok := validateError(t, tc.expectedError, err); ok { if instances != tc.expectedInstances || cpus != tc.expectedCPUs {
t.Errorf("Expected (instances, cpus) (%v, %v), but got (%v, %v)", tc.expectedInstances, tc.expectedCPUs, instances, cpus) } } }) } } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/static/flags.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package static import ( "github.com/intel/cri-resource-manager/pkg/config" "sigs.k8s.io/yaml" ) // Options captures our configurable policy parameters. type options struct { // Relax exclusive isolated CPU allocation criteria RelaxedIsolation bool `json:"RelaxedIsolation"` // Control whether containers are assigned to RDT classes by this policy. Rdt Tristate `json:"Rdt"` } // Tristate is boolean-like value with 3 states: on, off, automatically-determined. type Tristate int const ( // TristateOff is unconditional boolean false TristateOff = iota // TristateOn is unconditional boolean true TristateOn // TristateAuto indicates boolean value should be inferred using other data. TristateAuto ) // Our runtime configuration. var opt = defaultOptions().(*options) // UnmarshalJSON implements the unmarshaller function for "encoding/json" func (t *Tristate) UnmarshalJSON(data []byte) error { var value interface{} if err := yaml.Unmarshal(data, &value); err != nil { return policyError("invalid Tristate value '%s': %v", string(data), err) } switch value.(type) { case bool: *t = map[bool]Tristate{false: TristateOff, true: TristateOn}[value.(bool)] return nil case string: if value.(string) == "auto" { *t = TristateAuto return nil } } return policyError("invalid Tristate value %v of type %T", value, value) } // MarshalJSON implements the marshaller function for "encoding/json" func (t Tristate) MarshalJSON() ([]byte, error) { switch t { case TristateOff: return []byte("false"), nil case TristateOn: return []byte("true"), nil case TristateAuto: return []byte("\"auto\""), nil } return nil, policyError("invalid tristate value %v", t) } // String returns the value of Tristate as a string func (t *Tristate) String() string { switch *t { case TristateOff: return "false" case TristateOn: return "true" } return "auto" } // defaultOptions returns a new options instance, all initialized to defaults. func defaultOptions() interface{} { return &options{Rdt: TristateAuto} } // Register us for configuration handling. func init() { config.Register(PolicyPath, PolicyDescription, opt, defaultOptions) } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/static/static-policy.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package static import ( "fmt" "strconv" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" "github.com/intel/cri-resource-manager/pkg/config" logger "github.com/intel/cri-resource-manager/pkg/log" "github.com/prometheus/client_golang/prometheus" "github.com/intel/cri-resource-manager/pkg/cpuallocator" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/events" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/introspect" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/kubernetes" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy" "github.com/intel/cri-resource-manager/pkg/sysfs" "github.com/intel/cri-resource-manager/pkg/utils/cpuset" idset "github.com/intel/goresctrl/pkg/utils" ) const ( // PolicyName is the name used to activate this policy implementation. PolicyName = "static" // PolicyDescription is a short description of this policy. PolicyDescription = "A reimplementation of the static CPU Manager policy." // PolicyPath is the path of this policy in the configuration hierarchy. PolicyPath = "policy." + PolicyName ) type static struct { logger.Logger available policy.ConstraintSet // resource availability constraints reserved policy.ConstraintSet // system/kube-reservation constraints reservedCpus cpuset.CPUSet // CPUs reserved for system- and kube-tasks availableCpus cpuset.CPUSet // CPUs free usable by this policy isolatedCpus cpuset.CPUSet // available CPUs isolated from normal scheduling sys sysfs.System // system/topology information numHT int // number of hyperthreads per core state cache.Cache // policy/state cache cpuAllocator cpuallocator.CPUAllocator // CPU allocator used by the policy } // Make sure static implements the policy backend interface. var _ policy.Backend = &static{} const ( // keyPreferIsolated is the annotation used to mark pods preferring isolated CPUs. keyPreferIsolated = "prefer-isolated-cpus" ) // NewStaticPolicy creates a new policy instance. func NewStaticPolicy(opts *policy.BackendOptions) policy.Backend { s := &static{ Logger: logger.NewLogger(PolicyName), state: opts.Cache, sys: opts.System, available: opts.Available, reserved: opts.Reserved, cpuAllocator: cpuallocator.NewCPUAllocator(opts.System), } s.Info("creating policy...") s.numHT = s.sys.CPU(idset.ID(0)).ThreadCPUSet().Size() if err := s.checkConstraints(); err != nil { s.Fatal("cannot start with given constraints: %v", err) } config.GetModule(PolicyPath).AddNotify(s.configNotify) return s } // Name returns the name of this policy. func (s *static) Name() string { return PolicyName } // Description returns the description for this policy. func (s *static) Description() string { return PolicyDescription } // Start prepares this policy for accepting allocation/release requests. 
func (s *static) Start(add []cache.Container, del []cache.Container) error { s.Debug("starting up...") if err := s.allocateReserved(); err != nil { return policyError("failed to allocate reserved CPUs: %v", err) } s.Info("using reserved CPUs: %s", s.reservedCpus.String()) s.Info("using available CPUs: %s", s.availableCpus.String()) if err := s.validateState(s.state); err != nil { return policyError("failed to start with given cache/state: %v", err) } s.validateAssignments() return s.Sync(add, del) } // Sync synchronizes the active policy state. func (s *static) Sync(add []cache.Container, del []cache.Container) error { s.Debug("synchronizing state...") for _, c := range del { s.ReleaseResources(c) } for _, c := range add { s.AllocateResources(c) } return nil } // AllocateResources is a resource allocation request for this policy. func (s *static) AllocateResources(c cache.Container) error { s.Info("allocating resources for container %s...", c.PrettyName()) container := c containerID := c.GetCacheID() pod, found := c.GetPod() if !found { return policyError("can't find pod for container %s", containerID) } err := s.AddContainer(pod, container, containerID) return err } // ReleaseResources is a resource release request for this policy. func (s *static) ReleaseResources(c cache.Container) error { s.Info("releasing resources of container %s...", c.PrettyName()) containerID := c.GetCacheID() err := s.RemoveContainer(containerID) return err } // UpdateResources is a resource allocation update request for this policy. func (s *static) UpdateResources(c cache.Container) error { s.Debug("(not) updating container %s...", c.PrettyName()) return nil } // Rebalance tries to find an optimal allocation of resources for the current containers. func (s *static) Rebalance() (bool, error) { s.Debug("(not) rebalancing containers...") return false, nil } // HandleEvent handles policy-specific events. func (s *static) HandleEvent(*events.Policy) (bool, error) { s.Debug("(not) handling event...") return false, nil } // ExportResourceData provides resource data to export for the container. func (s *static) ExportResourceData(c cache.Container) map[string]string { data := map[string]string{} if cset, ok := s.GetCPUSet(c.GetCacheID()); !ok { cset = s.GetDefaultCPUSet() data[policy.ExportSharedCPUs] = cset.String() } else { isolated := cset.Intersection(s.sys.Isolated()).String() if isolated != "" { data[policy.ExportIsolatedCPUs] = isolated } exclusive := cset.Difference(s.sys.Isolated()).String() if exclusive != "" { data[policy.ExportExclusiveCPUs] = exclusive } } return data } // Introspect provides data for external introspection. func (s *static) Introspect(*introspect.State) { return } // DescribeMetrics generates policy-specific prometheus metrics data descriptors. func (p *static) DescribeMetrics() []*prometheus.Desc { return nil } // PollMetrics provides policy metrics for monitoring. func (p *static) PollMetrics() policy.Metrics { return nil } // CollectMetrics generates prometheus metrics from cached/polled policy-specific metrics data.
func (p *static) CollectMetrics(policy.Metrics) ([]prometheus.Metric, error) { return nil, nil } func (s *static) configNotify(event config.Event, source config.Source) error { s.Info("configuration %s", event) if opt.RelaxedIsolation { s.Info("isolated exclusive CPUs: globally preferred (all pods)") } else { s.Info("isolated exclusive CPUs: per-pod (by annotation '%s')", kubernetes.ResmgrKey(keyPreferIsolated)) } s.Info("rdt support set to %v", opt.Rdt) return nil } // assignableCPUs returns the set of unassigned CPUs minus the reserved set. func (s *static) assignableCPUs(numCPUs int) cpuset.CPUSet { cset := s.GetDefaultCPUSet().Difference(s.reservedCpus) if cset.Size() < numCPUs && s.isolatedCpus.Size() > 0 { s.Warn("not enough non-isolated CPUs (%d) left for request (%d)", cset.Size(), numCPUs) cset = cset.Union(s.isolatedCpus) } return cset } // AddContainer is the CPU Manager static policy AddContainer function. func (s *static) AddContainer(pod cache.Pod, container cache.Container, containerID string) error { if numCPUs := s.guaranteedCPUs(pod, container); numCPUs != 0 { s.Info("[cpumanager] static policy: AddContainer (pod: %s, container: %s, container id: %s)", pod.GetName(), container.GetName(), containerID) // container belongs in an exclusively allocated pool if _, ok := s.GetCPUSet(containerID); ok { s.Info("[cpumanager] static policy: container already present in state, skipping (container: %s, container id: %s)", container.GetName(), containerID) return nil } cpuset, err := s.allocateCPUs(numCPUs, containerID) if err != nil { s.Error("[cpumanager] unable to allocate %d CPUs (container id: %s, error: %v)", numCPUs, containerID, err) return err } s.Debug("setting cpuset of %s to allocated %s", containerID, cpuset) s.SetCPUSet(containerID, cpuset) } // container belongs in the shared pool (nothing to do; use default cpuset) return nil } // RemoveContainer is the CPU Manager static policy RemoveContainer function. func (s *static) RemoveContainer(containerID string) error { s.Info("[cpumanager] static policy: RemoveContainer (container id: %s)", containerID) if toRelease, ok := s.GetCPUSet(containerID); ok { s.Delete(containerID) isolated := toRelease.Intersection(s.sys.Isolated()) ordinary := toRelease.Difference(isolated) // Mutate the shared pool, adding released cpus. s.SetDefaultCPUSet(s.GetDefaultCPUSet().Union(ordinary)) s.isolatedCpus = s.isolatedCpus.Union(isolated) } return nil } // Notes: // By default we assume workloads are not isolation-aware. We // only allocate isolated CPUs exclusively to containers if // // - we globally prefer isolated exclusive CPUs, or // - the pod prefers isolated exclusive CPUs, or // - the container asks for a single hyperthread's worth of CPU // cpuPreference checks if isolated CPUs should be tried and are preferred for an allocation. func (s *static) cpuPreference(containerID string, numCPUs int) (bool, bool) { var try, prefer bool // Check if we prefer isolated CPUs (globally or for this container's pod).
if opt.RelaxedIsolation { prefer = true } else { if c, ok := s.state.LookupContainer(containerID); ok { p, found := c.GetPod() if !found { s.Warn("can't find pod for container %s", c.GetID()) return false, false } if value, ok := p.GetResmgrAnnotation(keyPreferIsolated); ok { if isolated, err := strconv.ParseBool(value); isolated { prefer = true } else { if err != nil { s.Error("invalid annotation '%s' on container %s, expecting boolean: %v", keyPreferIsolated, c.PrettyName(), err) } } } } } // Try isolated cpus when explicitly preferred, or when a single HT worth of CPU is requested. if prefer || (numCPUs == 1 && s.isolatedCpus.Size() >= 1) { try = true } return try, prefer } // allocateOrdinaryCPUs tries to take a number of non-isolated CPUs. func (s *static) allocateOrdinaryCPUs(numCPUs int) (cpuset.CPUSet, error) { assignable := s.assignableCPUs(numCPUs) result, err := s.takeByTopology(assignable, numCPUs, cpuallocator.PriorityHigh) if err != nil { return cpuset.New(), err } s.Info("allocated %d ordinary CPUs: %s", numCPUs, result.String()) return result, nil } // allocateIsolatedCPUs tries to take a number of isolated CPUs, falling back to ordinary ones. func (s *static) allocateIsolatedCPUs(numCPUs int, prefer bool) (cpuset.CPUSet, error) { result, err := s.takeByTopology(s.isolatedCpus, numCPUs, cpuallocator.PriorityHigh) switch { case err != nil: s.Info("falling back to %d ordinary CPUs", numCPUs) return s.allocateOrdinaryCPUs(numCPUs) case numCPUs == 1 || prefer: s.Info("allocated %d isolated CPUs: %s", numCPUs, result.String()) return result, nil default: s.Info("falling back to %d ordinary CPUs", numCPUs) return s.allocateOrdinaryCPUs(numCPUs) } } // allocateCPUs allocates the requested number of CPUs. func (s *static) allocateCPUs(numCPUs int, containerID string) (cpuset.CPUSet, error) { var result cpuset.CPUSet var err error s.Info("[cpumanager] allocateCpus: (numCPUs: %d)", numCPUs) if try, prefer := s.cpuPreference(containerID, numCPUs); !try { result, err = s.allocateOrdinaryCPUs(numCPUs) } else { result, err = s.allocateIsolatedCPUs(numCPUs, prefer) } if err != nil { return result, err } // Remove allocated CPUs from the shared and/or isolated CPUSet. s.SetDefaultCPUSet(s.GetDefaultCPUSet().Difference(result)) s.isolatedCpus = s.isolatedCpus.Difference(result) s.Info("[cpumanager] allocateCPUs: returning \"%v\"", result) return result, nil } func (s *static) guaranteedCPUs(pod cache.Pod, container cache.Container) int { qos := pod.GetQOSClass() s.Debug("* QoS class for pod %s (%s) is %s", pod.GetID(), pod.GetName(), qos) if qos != corev1.PodQOSGuaranteed { return 0 } cpuQuantity := container.GetResourceRequirements().Requests[corev1.ResourceCPU] if cpuQuantity.Value()*1000 != cpuQuantity.MilliValue() { return 0 } // Safe downcast to do for all systems with < 2.1 billion CPUs. // Per the language spec, `int` is guaranteed to be at least 32 bits wide. // https://golang.org/ref/spec#Numeric_types return int(cpuQuantity.Value()) }
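// Editorial example, not part of the original file: which containers
// guaranteedCPUs above treats as candidates for exclusive CPUs. Only a
// container of a Guaranteed-QoS pod with an integer CPU request qualifies:
//
//	requests: cpu: 2     in a Guaranteed pod -> guaranteedCPUs == 2 (exclusive)
//	requests: cpu: 1500m in a Guaranteed pod -> 0 (stays in the shared pool)
//	requests: cpu: 2     in a Burstable pod  -> 0 (stays in the shared pool)
//
// Check our allocation constraints.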
func (s *static) checkConstraints() error { online := s.sys.CPUSet().Difference(s.sys.Offlined()) isolated := s.sys.Isolated().Intersection(online) online = online.Difference(isolated) cpus, ok := s.available[policy.DomainCPU] if !ok { s.availableCpus = online } else { switch cpus.(type) { case cpuset.CPUSet: s.availableCpus = cpus.(cpuset.CPUSet).Intersection(online) default: return policyError("invalid type for available CPU set: %T", cpus) } } s.isolatedCpus = isolated s.Info("system isolated CPUs: %s", s.isolatedCpus) return nil } // Allocate the requested reserved cpus. func (s *static) allocateReserved() error { var err error var reserved cpuset.CPUSet cpus, ok := s.reserved[policy.DomainCPU] if !ok { return policyError("static policy cannot start without reserved CPUs") } switch cpus.(type) { case cpuset.CPUSet: reserved = cpus.(cpuset.CPUSet) if !reserved.Intersection(s.availableCpus).Equals(reserved) { return policyError("some reserved CPUs (%s) are unavailable", reserved.Difference(s.availableCpus).String()) } case resource.Quantity: qty := cpus.(resource.Quantity) count := (int(qty.MilliValue()) + 999) / 1000 from := s.availableCpus.Clone() if reserved, err = s.takeByTopology(from, count, cpuallocator.PriorityNormal); err != nil { return policyError("failed to reserve %d CPUs: %v", count, err) } } s.reservedCpus = reserved return nil } // Validate the cache/state supplied for starting. func (s *static) validateState(state cache.Cache) error { s.state = state tmpAssignments := s.GetCPUAssignments() tmpDefaultCPUset := s.GetDefaultCPUSet() allCPUs := s.availableCpus.Clone() isolated := s.isolatedCpus.Clone() // Default cpuset cannot be empty when assignments exist if tmpDefaultCPUset.IsEmpty() { if len(tmpAssignments) != 0 { return fmt.Errorf("default cpuset cannot be empty") } // state is empty, initialize it s.SetDefaultCPUSet(allCPUs) return nil } // State has already been initialized from file (is not empty) // 1. Check if the reserved cpuset is not part of default cpuset because: // - kube/system reserved have changed (increased) - may lead to some containers not being able to start // - user tampered with the state file if !s.reservedCpus.Intersection(tmpDefaultCPUset).Equals(s.reservedCpus) { return fmt.Errorf("not all reserved cpus: \"%s\" are present in defaultCpuSet: \"%s\"", s.reservedCpus.String(), tmpDefaultCPUset.String()) } // 2. Check if state for static policy is consistent for cID, cset := range tmpAssignments { // None of the cpus in the DEFAULT cset should be in s.assignments if !tmpDefaultCPUset.Intersection(cset).IsEmpty() { return fmt.Errorf("container id: %s cpuset: \"%s\" overlaps with default cpuset \"%s\"", cID, cset.String(), tmpDefaultCPUset.String()) } // Remove any potentially taken isolated CPUs from the available isolated set. s.isolatedCpus = s.isolatedCpus.Difference(cset) } s.Info("available (unallocated) isolated CPUs: %s", s.isolatedCpus) // 3. It's possible that the set of available CPUs has changed since // the state was written. This can happen, for example, when a CPU is // offlined while kubelet is not running. If this happens, the CPU // manager will run into trouble when it later tries to assign // non-existent CPUs to containers. Validate that the topology that // was received during CPU manager startup matches the set of CPUs // stored in the state.
totalKnownCPUs := tmpDefaultCPUset.Clone() for _, cset := range tmpAssignments { totalKnownCPUs = totalKnownCPUs.Union(cset) } if !totalKnownCPUs.Equals(allCPUs) { if totalKnownCPUs.IsSubsetOf(allCPUs.Union(isolated)) { return nil } return fmt.Errorf("current available CPUs \"%s\" are not a superset of CPUs in state \"%s\"", allCPUs.Union(isolated).String(), totalKnownCPUs.String()) } return nil } // Topology-aware-like allocation wrapper. func (s *static) takeByTopology(available cpuset.CPUSet, numCPUs int, preferredPrio cpuallocator.CPUPriority) (cpuset.CPUSet, error) { from := &available return s.cpuAllocator.AllocateCpus(from, numCPUs, preferredPrio) } // Validate static assignments, purge stale ones. func (s *static) validateAssignments() { // Instead of relying/waiting for an external reconciliation loop to // clean up stale container/assignments, we do it ourselves upon startup. ca := s.GetCPUAssignments() for id, cset := range ca { if _, ok := s.state.LookupContainer(id); !ok { s.Info("Removing stale assignment of container %s (cpus %s)", id, cset.String()) s.RemoveContainer(id) } } } // policyError creates a policy-specific formatted error func policyError(format string, args ...interface{}) error { return fmt.Errorf(PolicyName+": "+format, args...) } // // Kubelet CPU Manager / policy_static.go adaptation // // A set of rudimentary functions to get policy_static.go up and running // with small enough changes that the code (above) remains recognisable // for those who are already familiar with the original. These functions // basically implement a CPU Manager state-like interface on top of our // cache. // // ContainerCPUAssignments tracks assigned CPU sets per container id. type ContainerCPUAssignments map[string]cpuset.CPUSet // // Cache keys for storing the default cpuset (one for containers // without exclusive allocations) and static assignments (cpusets // for containers with exclusive allocations). const ( keyAssignments = "CPUAssignments" keyDefaultCPUs = "DefaultCPUSet" ) // GetCPUAssignments gets the current CPU assignments from our state. func (s *static) GetCPUAssignments() ContainerCPUAssignments { var ca map[string]cpuset.CPUSet if !s.state.GetPolicyEntry(keyAssignments, &ca) { s.Error("no cached CPU assignments") } if ca == nil { ca = make(map[string]cpuset.CPUSet) s.state.SetPolicyEntry(keyAssignments, ca) } return ca } // SetCPUAssignments sets the current CPU assignments in our state. func (s *static) SetCPUAssignments(ca ContainerCPUAssignments) { s.state.SetPolicyEntry(keyAssignments, map[string]cpuset.CPUSet(ca)) } // GetDefaultCPUSet gets the current default CPUSet from our state. func (s *static) GetDefaultCPUSet() cpuset.CPUSet { var cset cpuset.CPUSet if !s.state.GetPolicyEntry(keyDefaultCPUs, &cset) { s.Error("no cached default CPU set") } return cset } // SetDefaultCPUSet sets the current default CPUSet in our state. func (s *static) SetDefaultCPUSet(cset cpuset.CPUSet) { s.state.SetPolicyEntry(keyDefaultCPUs, cset) // update cpuset for containers with default assignment ca := s.GetCPUAssignments() for _, id := range s.state.GetContainerCacheIds() { if _, ok := ca[id]; !ok { s.SetCpusetCpus(id, cset.String()) } } } // GetCPUSet gets the CPUSet for a container from our state. func (s *static) GetCPUSet(containerID string) (cpuset.CPUSet, bool) { ca := s.GetCPUAssignments() cset, ok := ca[containerID] return cset.Clone(), ok } // SetCPUSet sets the CPUSet for a container in our state.
func (s *static) SetCPUSet(containerID string, cset cpuset.CPUSet) { ca := s.GetCPUAssignments() ca[containerID] = cset s.SetCPUAssignments(ca) s.SetCpusetCpus(containerID, cset.String()) } // Delete deletes the given container from our state. func (s *static) Delete(containerID string) { s.Debug("deleting container %s from assignments", containerID) ca := s.GetCPUAssignments() delete(ca, containerID) s.SetCPUAssignments(ca) } // SetCpusetCpus updates cpuset.cpus for a container. func (s *static) SetCpusetCpus(id, value string) error { c, ok := s.state.LookupContainer(id) if !ok { return policyError("can't find container '%s'", id) } c.SetCpusetCpus(value) s.Info("container %s: CpusetCpus set to %s", c.PrettyName(), value) return nil } // Register us as a policy implementation. func init() { policy.Register(PolicyName, PolicyDescription, NewStaticPolicy) } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/static-plus/static-plus-policy.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package staticplus import ( "encoding/json" "fmt" "strconv" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" logger "github.com/intel/cri-resource-manager/pkg/log" "github.com/prometheus/client_golang/prometheus" "github.com/intel/cri-resource-manager/pkg/cpuallocator" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/events" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/introspect" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy" "github.com/intel/cri-resource-manager/pkg/sysfs" "github.com/intel/cri-resource-manager/pkg/utils/cpuset" ) const ( // PolicyName is the name used to activate this policy implementation. PolicyName = "static-plus" // PolicyDescription is a short description of this policy. PolicyDescription = "A simple policy supporting exclusive/pinned and shared allocations." // Cache key for storing container resource allocations. keyAllocations = "allocations" // Cache key for storing the shared pool. keySharedPool = "shared-pool" // keyPreferIsolated is the annotation used to mark pods preferring isolated CPUs. keyPreferIsolated = "prefer-isolated-cpus" ) // Assignment tracks resource assignments for a single container. type Assignment struct { exclusive cpuset.CPUSet // exclusively allocated cpus shared int // milli-cpus to allocate from shared cpus } // Allocations tracks all resource allocations by the static+ policy. type Allocations map[string]*Assignment // static-plus policy runtime state.
type staticplus struct { logger.Logger offline cpuset.CPUSet // offlined cpus available cpuset.CPUSet // bounding set of cpus available for us reserved cpuset.CPUSet // pool (primarily) for system-/kube-tasks isolated cpuset.CPUSet // primary pool for exclusive allocations allocations Allocations // container cpu allocations sys sysfs.System // system/topology information cache cache.Cache // system state/cache shared cpuset.CPUSet // pool for fractional and shared allocations cpuAllocator cpuallocator.CPUAllocator // CPU allocator used by the policy } // Make sure staticplus implements the policy backend interface. var _ policy.Backend = &staticplus{} // CreateStaticPlusPolicy creates a new policy instance. func CreateStaticPlusPolicy(opts *policy.BackendOptions) policy.Backend { p := &staticplus{ Logger: logger.NewLogger(PolicyName), cache: opts.Cache, sys: opts.System, cpuAllocator: cpuallocator.NewCPUAllocator(opts.System), } p.Info("creating policy...") if err := p.setupPools(opts.Available, opts.Reserved); err != nil { p.Fatal("failed to set up cpu pools: %v", err) } p.dumpPools() return p } // Name returns the name of this policy. func (p *staticplus) Name() string { return PolicyName } // Description returns the description for this policy. func (p *staticplus) Description() string { return PolicyDescription } // Start prepares this policy for accepting allocation/release requests. func (p *staticplus) Start(add []cache.Container, del []cache.Container) error { if err := p.restoreCache(); err != nil { return policyError("failed to start: %v", err) } if err := p.updatePools(); err != nil { return policyError("failed to start: %v", err) } return p.Sync(add, del) } // Sync synchronizes the state of this policy. func (p *staticplus) Sync(add []cache.Container, del []cache.Container) error { p.Debug("synchronizing state...") for _, c := range del { p.ReleaseResources(c) } for _, c := range add { p.AllocateResources(c) } return nil } // AllocateResources allocates resources for the given container. func (p *staticplus) AllocateResources(c cache.Container) error { var a *Assignment id := c.GetCacheID() p.Debug("allocating container %s...", id) if _, ok := p.allocations[id]; ok { return nil } a, err := p.assignCpus(c) if err != nil { return err } return p.addAssignment(c, a) } // ReleaseResources releases resources assigned to the given container. func (p *staticplus) ReleaseResources(c cache.Container) error { id := c.GetCacheID() p.Debug("releasing container %s...", id) a, ok := p.allocations[id] if !ok { return nil } return p.delAssignment(a, id) } // UpdateResources is a resource allocation update request for this policy. func (p *staticplus) UpdateResources(c cache.Container) error { p.Debug("(not) updating container %s...", c.PrettyName()) return nil } // Rebalance tries to find an optimal allocation of resources for the current containers. func (p *staticplus) Rebalance() (bool, error) { p.Debug("(not) rebalancing containers...") return false, nil } // HandleEvent handles policy-specific events. func (p *staticplus) HandleEvent(*events.Policy) (bool, error) { p.Debug("(not) handling event...") return false, nil } // ExportResourceData provides resource data to export for the container. func (p *staticplus) ExportResourceData(c cache.Container) map[string]string { a, ok := p.allocations[c.GetCacheID()] if !ok { // Hmm...
p.Warn("can't find allocation for container %s", c.PrettyName()) return nil } data := map[string]string{} if a.shared != 0 { data[policy.ExportSharedCPUs] = p.shared.String() } if a != nil && !a.exclusive.IsEmpty() { isolated := a.exclusive.Intersection(p.sys.Isolated()).String() if isolated != "" { data[policy.ExportIsolatedCPUs] = isolated } exclusive := a.exclusive.Difference(p.sys.Isolated()).String() if exclusive != "" { data[policy.ExportExclusiveCPUs] = exclusive } } return data } // Introspect provides data for external introspection. func (p *staticplus) Introspect(*introspect.State) { return } // DescribeMetrics generates policy-specific prometheus metrics data descriptors. func (p *staticplus) DescribeMetrics() []*prometheus.Desc { return nil } // PollMetrics provides policy metrics for monitoring. func (p *staticplus) PollMetrics() policy.Metrics { return nil } // CollectMetrics generates prometheus metrics from cached/polled policy-specific metrics data. func (p *staticplus) CollectMetrics(policy.Metrics) ([]prometheus.Metric, error) { return nil, nil } // policyError creates a formatted policy-specific error. func policyError(format string, args ...interface{}) error { return fmt.Errorf(PolicyName+": "+format, args...) } // setupPools sets up the pools we allocate resources from. func (p *staticplus) setupPools(available, reserved policy.ConstraintSet) error { // Set up three disjoint CPU pools for allocating CPU to containers. These // three pools are: // // 1) reserved pool: kube- and system-tasks // Pods in the kube-system namespace are assigned to this pool. The // size of this pool is the requested reservation rounded up to the // closest integer. Any unused fractional part of this pool is used // as a shared pool if the shared pool ever gets fully allocated. // // 2) isolated pool: primary exclusive allocations // Exclusive CPU allocations are primarily done from this pool. Pods // that request at least 1 full CPU get their exclusive (integer) // CPU shares allocated from this pool unless the pool has already // been exhausted (in which case we try to slice off exclusive CPUs // from the shared pool). // // 3) shared pool: shared allocations, secondary exclusive allocations // Shared CPU allocations are served from this pool. Pods fractional // CPU shares are allocated from this pool. If the isolated pool has // been exhausted exclusive allocations are sliced off from this // pool. If this pool has been fully allocated, shared allocations // are oversubscribed to the reserved pool. 
p.offline = p.sys.Offlined() cpus, ok := available[policy.DomainCPU] if !ok { p.available = p.sys.CPUSet().Difference(p.offline) } else { p.available = cpus.(cpuset.CPUSet).Difference(p.offline) } p.isolated = p.sys.Isolated().Intersection(p.available) p.available = p.available.Difference(p.isolated) cpus, ok = reserved[policy.DomainCPU] if !ok { return policyError("cannot start without any reserved CPUs") } switch cpus.(type) { case cpuset.CPUSet: p.reserved = cpus.(cpuset.CPUSet).Intersection(p.available) if !p.reserved.Equals(cpus.(cpuset.CPUSet)) { return policyError("part of the reserved CPUs (%s) are not available: %s", cpus.(cpuset.CPUSet).String(), cpus.(cpuset.CPUSet).Difference(p.available)) } p.available = p.available.Difference(p.reserved) case resource.Quantity: var err error qty := cpus.(resource.Quantity) count := (int(qty.MilliValue()) + 999) / 1000 if count < 2 && p.available.Contains(0) { p.reserved = cpuset.New(0) p.available = p.available.Difference(p.reserved) } else { p.reserved, err = p.takeCPUs(&p.available, nil, count, cpuallocator.PriorityNormal) if err != nil { return policyError("failed to reserve %d CPUs from %s: %v", count, p.available.String(), err) } } } p.shared = p.available return nil } // restoreCache restores saved policy state from the cache. func (p *staticplus) restoreCache() error { if !p.cache.GetPolicyEntry(keySharedPool, &p.shared) { p.Warn("initializing empty policy state...") p.shared = p.available p.allocations = make(Allocations) p.cache.SetPolicyEntry(keySharedPool, &p.shared) p.cache.SetPolicyEntry(keyAllocations, cache.Cachable(&cachedAllocations{a: p.allocations})) } else { p.Info("restoring cached policy state...") ca := cachedAllocations{} if !p.cache.GetPolicyEntry(keyAllocations, &ca) { return policyError("failed to restore state from cache, no allocations") } p.allocations = ca.a } p.dumpPools() p.dumpAllocations() return nil } // requestedCpus calculates the exclusive and shared cpu allocations for a container. func (p *staticplus) requestedCpus(c cache.Container) (int, int) { cpuReq, ok := c.GetResourceRequirements().Requests[corev1.ResourceCPU] if !ok { return 0, 0 } full := int(cpuReq.MilliValue()) / 1000 part := int(cpuReq.MilliValue()) - 1000*full return full, part } // optOutFromIsolation checks if a container opts out from isolated CPUs. func (p *staticplus) optOutFromIsolation(c cache.Container) bool { preferIsolated := true if pod, found := c.GetPod(); !found { p.Warn("can't find pod for container %s", c.PrettyName()) } else { if value, ok := pod.GetResmgrAnnotation(keyPreferIsolated); ok { if isolated, err := strconv.ParseBool(value); !isolated { if err != nil { p.Error("invalid annotation '%s' on container %s, expecting boolean: %v", keyPreferIsolated, c.PrettyName(), err) } else { p.Info("container %s is opted-out from isolation", c.PrettyName()) } preferIsolated = false } else { p.Info("container %s explicitly opted-in for isolation", c.PrettyName()) } } else { p.Info("container %s goes with default isolation", c.PrettyName()) } } return !preferIsolated }
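// Editorial example, not part of the original file: how requestedCpus above
// splits a container's CPU request into an exclusive candidate part (full
// CPUs) and a shared part (leftover milli-CPUs):
//
//	requests: cpu: 2500m -> full = 2, part = 500 (2 exclusive CPUs + 500 mCPU shared)
//	requests: cpu: 750m  -> full = 0, part = 750 (shared pool only)
//
// assignCpus allocates cpus for a container.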
func (p *staticplus) assignCpus(c cache.Container) (*Assignment, error) { full, part := p.requestedCpus(c) // system containers always share (the reserved) cpus if c.GetNamespace() == metav1.NamespaceSystem { return &Assignment{shared: 1000*full + part}, nil } // assign to the shared pool if less than a single cpu was requested if full == 0 { return &Assignment{shared: part}, nil } // if there is capacity in the isolated pool, slice cpus off from it if p.isolated.Size() >= full && !p.optOutFromIsolation(c) { cpus, err := p.takeCPUs(&p.isolated, nil, full, cpuallocator.PriorityHigh) if err != nil { return nil, policyError("failed to allocate %d isolated CPUs: %v", full, err) } return &Assignment{exclusive: cpus, shared: part}, nil } // otherwise, try to slice off cpus from the shared pool if p.shared.Size() >= full { cpus, err := p.takeCPUs(&p.shared, nil, full, cpuallocator.PriorityHigh) if err != nil { return nil, policyError("failed to allocate %d exclusive CPUs: %v", full, err) } return &Assignment{exclusive: cpus, shared: part}, nil } // we're screwed, not enough cpu in either isolated or shared pool return nil, policyError("failed to allocate %d exclusive CPUs: %s", full, "not enough capacity") } // addAssignment updates container allocations for a newly added container assignment. func (p *staticplus) addAssignment(c cache.Container, a *Assignment) error { switch { // always assign system containers to the reserved pool case c.GetNamespace() == metav1.NamespaceSystem: c.SetCpusetCpus(p.reserved.String()) c.SetCPUShares(int64(MilliCPUToShares(a.shared))) p.Info("system container %s allocated (%d mCPU) to reserved pool %s", c.PrettyName(), a.shared, p.reserved.String()) // for shared-only assignments, it's enough to update the container case a.exclusive.IsEmpty(): c.SetCpusetCpus(p.shared.String()) c.SetCPUShares(int64(MilliCPUToShares(a.shared))) p.Info("container %s allocated (%d mCPU) to shared pool %s", c.PrettyName(), a.shared, p.shared.String()) // isolated, sliced-off exclusive, or mixed allocation default: var kind string var isolated bool if isolated = !a.exclusive.Intersection(p.sys.Isolated()).IsEmpty(); isolated { kind = "isolated" } else { kind = "exclusive" } if a.shared != 0 { c.SetCpusetCpus(a.exclusive.Union(p.shared).String()) c.SetCPUShares(int64(MilliCPUToShares(a.shared))) p.Info("container %s allocated to %s (%s) and shared (%d mCPU) pool %s", c.PrettyName(), kind, a.exclusive.String(), a.shared, p.shared.String()) } else { c.SetCpusetCpus(a.exclusive.String()) c.SetCPUShares(int64(MilliCPUToShares(1000 * a.exclusive.Size()))) p.Info("container %s allocated to %s CPUs %s", c.PrettyName(), kind, a.exclusive.String()) } // for sliced-off exclusive we might need to update other containers shared allocations if !a.exclusive.IsEmpty() && a.exclusive.Intersection(p.sys.Isolated()).IsEmpty() { if err := p.updateSharedAllocations(); err != nil { return err } } } p.allocations[c.GetCacheID()] = a p.cache.SetPolicyEntry(keySharedPool, p.shared) p.cache.SetPolicyEntry(keyAllocations, cache.Cachable(&cachedAllocations{a: p.allocations})) return nil } // delAssignment updates container allocations for a deleted container assignment. func (p *staticplus) delAssignment(a *Assignment, id string) error { delete(p.allocations, id) switch { // for shared-only allocations there is not much to do... 
case a.exclusive.IsEmpty(): p.Info("freed shared-only (%d mCPU) allocations of container %s", a.shared, id) // for isolated exclusive cpus, return them to the pool case !a.exclusive.Intersection(p.sys.Isolated()).IsEmpty(): p.isolated = p.isolated.Union(a.exclusive) p.Info("freed isolated allocations (%s) of container %s", a.exclusive.String(), id) // for cpus sliced off the shared pool, return them and update others default: p.shared = p.shared.Union(a.exclusive) p.Info("freed exclusive allocations (%s) of container %s", a.exclusive.String(), id) if err := p.updateSharedAllocations(); err != nil { return err } } p.cache.SetPolicyEntry(keySharedPool, p.shared) p.cache.SetPolicyEntry(keyAllocations, cache.Cachable(&cachedAllocations{a: p.allocations})) return nil } // updateSharedAllocations updates containers with shared allocations. func (p *staticplus) updateSharedAllocations() error { avail := 1000 * p.shared.Size() for id, ca := range p.allocations { cac, ok := p.cache.LookupContainer(id) if !ok { p.Warn("can't find allocated container %s", id) // remove and recalculate shared CPUs p.delAssignment(ca, id) return p.updateSharedAllocations() } if !ca.exclusive.Intersection(p.sys.Isolated()).IsEmpty() && ca.shared == 0 { continue } cset := p.shared.Union(ca.exclusive) if avail <= 0 { cset = cset.Union(p.reserved) p.Warn("out of free shared (%s) capacity, using reserved pool (%s) as well", p.shared.String(), p.reserved.String()) } if cac.GetCpusetCpus() != cset.String() { cac.SetCpusetCpus(cset.String()) p.Info("container %s reallocated to exclusive (%s) and shared (%d mCPU) pool %s", cac.PrettyName(), ca.exclusive.String(), ca.shared, cset.String()) } avail -= ca.shared } if avail < 0 { p.Warn("not enough free capacity in shared pool (%s): lacking %d mCPU", p.shared.String(), -avail) } else { p.Info("free shared (%s) capacity left: %d mCPU", p.shared.String(), avail) } return nil } // updatePools updates the pools according to the current assignments. func (p *staticplus) updatePools() error { for id, ca := range p.allocations { if ca.exclusive.IsEmpty() { continue } isolated := ca.exclusive.Intersection(p.sys.Isolated()) excshare := ca.exclusive.Difference(isolated) if !isolated.IsEmpty() && !excshare.IsEmpty() { return policyError("container %s has exclusive isolated (%s) and shareable (%s) cpus", id, isolated.String(), excshare.String()) } p.isolated = p.isolated.Difference(isolated) p.shared = p.shared.Difference(excshare) } if err := p.updateSharedAllocations(); err != nil { return err } p.cache.SetPolicyEntry(keySharedPool, p.shared) p.cache.SetPolicyEntry(keyAllocations, cache.Cachable(&cachedAllocations{a: p.allocations})) return nil } // dumpPools dumps the current state of pools. func (p *staticplus) dumpPools() { p.Info("current CPU pools:") offline := p.offline.String() if offline == "" { offline = "<none>" } isolated := p.isolated.String() if isolated == "" { isolated = "<none>" } p.Info(" offline: %s", offline) p.Info(" reserved: %s", p.reserved.String()) p.Info(" shared: %s", p.shared.String()) p.Info(" isolated: %s", isolated) } // dumpAllocations dumps the current allocations. func (p *staticplus) dumpAllocations() { p.Info("container CPU allocations:") switch { case p.allocations == nil: p.Info(" <none>") case len(p.allocations) == 0: p.Info(" <none>") default: for id, ca := range p.allocations { e := ca.exclusive.String() if e == "" { e = "<none>" } p.Info(" %s: exclusive: %s, shared: %d milli-cpu", id, e, ca.shared) } } }
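// NOTE (editor's illustration, not part of the original source): a minimal
// sketch of the shared-pool bookkeeping done by updateSharedAllocations
// above, assuming each shared CPU provides 1000 mCPU of capacity. A negative
// result corresponds to the "not enough free capacity" warning. The function
// name is hypothetical.
func exampleSharedCapacity(sharedPoolSize int, sharedMilliRequests []int) int {
	avail := 1000 * sharedPoolSize // total shared capacity in millicores
	for _, m := range sharedMilliRequests {
		avail -= m // each shared assignment draws capacity down
	}
	return avail // < 0 means the shared pool is oversubscribed
}

// Take up to cnt CPUs from a given CPU set to another.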
func (p *staticplus) takeCPUs(from, to *cpuset.CPUSet, cnt int, preferredPrio cpuallocator.CPUPriority) (cpuset.CPUSet, error) { cset, err := p.cpuAllocator.AllocateCpus(from, cnt, preferredPrio) if err != nil { return cset, err } if to != nil { *to = to.Union(cset) } return cset, err } // // Cachable data types for storing private static-plus policy data in the cache. // // CachedAllocations implements Cache.Cachable boilerplate for Allocations. type CachedAllocations interface { cache.Cachable } type cachedAllocations struct { a Allocations } var _ cache.Cachable = &cachedAllocations{} var _ json.Marshaler = &cachedAllocations{} var _ json.Unmarshaler = &cachedAllocations{} func (ca *cachedAllocations) Get() interface{} { return *ca } func (ca *cachedAllocations) Set(value interface{}) { switch value.(type) { case cachedAllocations: ca.a = value.(cachedAllocations).a case *cachedAllocations: ca.a = value.(*cachedAllocations).a } } type marshallableAssignment struct { Exclusive string Shared int } func (ca *cachedAllocations) MarshalJSON() ([]byte, error) { dst := make(map[string]*marshallableAssignment) for id, r := range ca.a { dst[id] = &marshallableAssignment{ Exclusive: r.exclusive.String(), Shared: r.shared, } } return json.Marshal(dst) } func (ca *cachedAllocations) UnmarshalJSON(data []byte) error { var err error dst := make(map[string]*marshallableAssignment) if err = json.Unmarshal(data, &dst); err != nil { return err } ca.a = make(map[string]*Assignment) for id, r := range dst { if r == nil { continue } cset, err := cpuset.Parse(r.Exclusive) if err != nil { return policyError("failed to unmarshal cpuset '%s': %v", r.Exclusive, err) } ca.a[id] = &Assignment{ exclusive: cset, shared: r.Shared, } } return nil } // Functions for calculating CFS cpu.shares and cpu.cfs_quota_us. // // Notes: These functions are taken almost verbatim from the kubelet // code (from k8s.io/kubernetes/pkg/kubelet/cm/helpers_linux.go). // Since these are exported there, we could try to import them, set the // related feature gates (kubefeatures.CPUCFSQuotaPeriod) for ourselves // into the desired positions (disabled most probably for now) and use // the imported code. const ( MinShares = 2 SharesPerCPU = 1024 MilliCPUToCPU = 1000 // 100000 is equivalent to 100ms QuotaPeriod = 100000 MinQuotaPeriod = 1000 ) // MilliCPUToQuota converts milliCPU to CFS quota and period values. func MilliCPUToQuota(milliCPU int64, period int64) (quota int64) { // CFS quota is measured in two values: // - cfs_period_us=100ms (the amount of time to measure usage across given by period) // - cfs_quota=20ms (the amount of cpu time allowed to be used across a period) // so in the above example, you are limited to 20% of a single CPU // for multi-cpu environments, you just scale equivalent amounts // see https://www.kernel.org/doc/Documentation/scheduler/sched-bwc.txt for details if milliCPU == 0 { return } if true /*!utilfeature.DefaultFeatureGate.Enabled(kubefeatures.CPUCFSQuotaPeriod)*/ { period = QuotaPeriod } // we then convert your milliCPU to a value normalized over a period quota = (milliCPU * period) / MilliCPUToCPU // quota needs to be a minimum of 1ms. if quota < MinQuotaPeriod { quota = MinQuotaPeriod } return } // MilliCPUToShares converts the milliCPU to CFS shares. func MilliCPUToShares(milliCPU int) int64 { if milliCPU == 0 { // Docker converts zero milliCPU to unset, which maps to kernel default // for unset: 1024. Return 2 here to really match kernel default for // zero milliCPU.
return MinShares } // Conceptually (milliCPU / milliCPUToCPU) * sharesPerCPU, but factored to improve rounding. shares := (milliCPU * SharesPerCPU) / MilliCPUToCPU if shares < MinShares { return MinShares } return int64(shares) } // Register us as a policy implementation. func init() { policy.Register(PolicyName, PolicyDescription, CreateStaticPlusPolicy) } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/static-pools/config.go ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package stp import ( "fmt" "os" "path" "regexp" "strconv" "strings" "sigs.k8s.io/yaml" pkgcfg "github.com/intel/cri-resource-manager/pkg/config" ) // config captures our runtime configurable parameters. type config struct { // Pools defines our set of pools in use. Pools pools `json:"pools,omitempty"` // ConfDirPath is the filesystem path to the legacy configuration directory structure. ConfDirPath string // ConfFilePath is the filesystem path to the legacy configuration file. ConfFilePath string // LabelNode controls whether backwards-compatible CMK node label is created. LabelNode bool // TaintNode controls whether backwards-compatible CMK node taint is created. TaintNode bool } type pools map[string]poolConfig type cpuList struct { Socket uint64 Cpuset string // TODO: might want to use cpuset from kubelet containers map[string]struct{} } // STP policy runtime configuration with its defaults. var conf = defaultConfig().(*config)
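// NOTE (editor's illustration, not part of the original source): a minimal
// usage sketch for parseConfData, defined later in this file, assuming the
// pools schema declared above. Key matching is effectively case-insensitive
// because sigs.k8s.io/yaml round-trips the YAML through encoding/json. The
// function name and the pool layout shown are hypothetical.
func exampleParsePools() (pools, error) {
	raw := []byte(`
pools:
  shared:
    exclusive: false
    cpuLists:
      - socket: 0
        cpuset: "0-3"
`)
	return parseConfData(raw)
}

// defaultConfig returns a new conf instance, all initialized to defaults.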
func defaultConfig() interface{} { return &config{ Pools: make(pools), ConfDirPath: "/etc/cmk", } } func (c *cpuList) addContainer(id string) { if c.containers == nil { c.containers = make(map[string]struct{}) } c.containers[id] = struct{}{} } func (c *cpuList) removeContainer(id string) { if c.containers == nil { return } delete(c.containers, id) } func (c *cpuList) getContainers() []string { if c.containers == nil { return []string{} } ret := make([]string, len(c.containers)) i := 0 for k := range c.containers { ret[i] = k i++ } return ret } type poolConfig struct { Exclusive bool `json:"exclusive"` // Per-socket cpu lists CPULists []*cpuList `json:"cpuLists"` } func (p *poolConfig) cpuSet() string { cpuset := "" delim := "" for _, cl := range p.CPULists { cpuset += delim + cl.Cpuset delim = "," } return cpuset } var ( cpusetValidationRe = regexp.MustCompile(`^(([\d]+)|([\d]+-[\d]+))(,(([\d]+)|([\d]+-[\d]+)))*$`) ) func parseConfData(raw []byte) (pools, error) { conf := &struct { Pools pools }{} err := yaml.Unmarshal(raw, &conf) if err != nil { return nil, stpError("Failed to parse config file: %v", err) } return conf.Pools, nil } func readConfFile(filepath string) (pools, error) { // Read config data data, err := os.ReadFile(filepath) if err != nil { return nil, stpError("Failed to read config file: %v", err) } return parseConfData(data) } func readConfDir(confDir string) (pools, error) { conf := pools{} // List pools in the pools configuration directory poolsDir := path.Join(confDir, "pools") pools, err := os.ReadDir(poolsDir) if err != nil { return nil, stpError("Failed to list pools config directory %s: %v", poolsDir, err) } // Read pool configurations for _, pool := range pools { poolConf, err := readPoolConfDir(path.Join(poolsDir, pool.Name())) if err != nil { return nil, stpError("Failed to read pool configuration: %v", err) } conf[pool.Name()] = poolConf } return conf, nil } // Read configuration of one pool from original CMK configuration directory tree func readPoolConfDir(poolDir string) (poolConfig, error) { conf := poolConfig{Exclusive: false, CPULists: []*cpuList{}} // Read pool's exclusivity flag exclusive, err := os.ReadFile(path.Join(poolDir, "exclusive")) if err != nil { return conf, fmt.Errorf("Failed to read pool exclusive setting in %s: %v", poolDir, err) } if len(exclusive) == 1 && exclusive[0] == '1' { conf.Exclusive = true } // Read socket configurations (per-socket cpu lists) files, err := os.ReadDir(poolDir) if err != nil { return conf, fmt.Errorf("Failed to list pool config directory %s: %v", poolDir, err) } for _, file := range files { if !file.IsDir() { // Skip non-directory files (e.g. 'exclusive' file) continue } socketPath := path.Join(poolDir, file.Name()) socketCPULists, err := readSocketConfDir(socketPath) if err != nil { return conf, fmt.Errorf("Failed to list pool socket config: %s", err) } conf.CPULists = append(conf.CPULists, socketCPULists...)
} return conf, nil } // Read configuration (cpu lists) of a socket of one pool in original CMK // configuration directory tree func readSocketConfDir(socketDir string) ([]*cpuList, error) { // Get socket number from the name of the directory socketNum, err := strconv.ParseUint(path.Base(socketDir), 10, 32) if err != nil { return nil, fmt.Errorf("Invalid socket id %s: %v", socketDir, err) } // Socket directory contains a set of subdirectories, one per cpu list cpuListDirs, err := os.ReadDir(socketDir) if err != nil { return nil, fmt.Errorf("Failed to list socket directory %s: %v", socketDir, err) } conf := make([]*cpuList, len(cpuListDirs)) for i, cpuListDir := range cpuListDirs { // Validate that the cpulist conforms to cpuset formatting if err := validateCPUList(cpuListDir.Name()); err != nil { return nil, fmt.Errorf("Invalid cpu list in %s: %v", socketDir, err) } conf[i] = &cpuList{Socket: socketNum, Cpuset: cpuListDir.Name(), containers: map[string]struct{}{}} } return conf, nil } func validateCPUList(name string) error { // NOTE: A naive implementation, we only check that it "looks right", we don't // check that the actual numbers make sense, i.e. that numbers are in // ascending order if !cpusetValidationRe.MatchString(name) { return fmt.Errorf("%q does not look like a cpuset", name) } return nil } // Convert cpu list configuration directory name into a cpuList func parseCPUListName(name string) ([]uint, error) { // The name should be a list of cpu ids (non-negative integers) separated by commas cpuListMembers := strings.Split(name, ",") cpus := make([]uint, len(cpuListMembers)) // Convert cpu ids to a list of integers for i, cpuStr := range cpuListMembers { cpu, err := strconv.ParseUint(cpuStr, 10, 32) if err != nil { return cpus, fmt.Errorf("Invalid cpu id in %s: %v", name, err) } cpus[i] = uint(cpu) } return cpus, nil } // Register us for command line option processing and configuration management. func init() { pkgcfg.Register(PolicyPath, PolicyDescription, conf, defaultConfig) } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/static-pools/node.go ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/ package stp import ( "strconv" "time" core_v1 "k8s.io/api/core/v1" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/agent" "github.com/intel/cri-resource-manager/pkg/log" ) const ( exclusiveCoreResourceName = "cmk.intel.com/exclusive-cores" cmkLegacyNodeLabelName = "cmk.intel.com/cmk-node" ) type nodeUpdater struct { log.Logger agent agent.Interface conf chan config } func newNodeUpdater(agent agent.Interface) *nodeUpdater { return &nodeUpdater{ Logger: log.NewLogger("static-pools-nu"), agent: agent, conf: make(chan config, 1), } } func (u *nodeUpdater) start() error { u.Info("starting node updater") if u.agent == nil || u.agent.IsDisabled() { return stpError("cri-resmgr-agent connection required") } go func() { var pending *config var retry <-chan time.Time for { select { case c := <-u.conf: pending = &c retry = time.After(0) case _ = <-retry: if pending != nil { err := u.updateNode(pending, -1) if err != nil { u.Info("node update failed: %v", err) retry = time.After(5 * time.Second) } else { u.Info("node successfully updated") pending = nil retry = nil } } else { u.Panic("BUG: node update with nil config requested") } } } }() return nil } func (u *nodeUpdater) update(c config) { // Pop possibly pending value from the channel select { case <-u.conf: default: } u.conf <- c } // Update Node object with STP/CMK-specific things func (u *nodeUpdater) updateNode(conf *config, opTimeout time.Duration) error { // Count total number of cpu lists of all exclusive pools numExclusiveCPULists := 0 for _, pool := range conf.Pools { if pool.Exclusive { numExclusiveCPULists += len(pool.CPULists) } } // Update extended resources resources := map[string]string{ exclusiveCoreResourceName: strconv.Itoa(numExclusiveCPULists)} u.Info("updating node capacity (extended resources)") if err := u.agent.UpdateNodeCapacity(resources, opTimeout); err != nil { return err } // Manage legacy node label if conf.LabelNode { u.Info("creating CMK node label") err := u.agent.SetLabels(map[string]string{cmkLegacyNodeLabelName: "true"}, opTimeout) if err != nil { return stpError("failed to update legacy node label: %v", err) } } else { u.Info("removing CMK node label") err := u.agent.RemoveLabels([]string{cmkLegacyNodeLabelName}, opTimeout) if err != nil { return stpError("failed to update legacy node label: %v", err) } } // Manage legacy node taint nodeTaints, err := u.agent.GetTaints(opTimeout) if err != nil { return stpError("failed to fetch node taints: %v", err) } legacyTaint := core_v1.Taint{ Key: "cmk", Value: "true", Effect: core_v1.TaintEffectNoSchedule, } cmkTaints := []core_v1.Taint{legacyTaint} _, tainted := u.agent.FindTaintIndex(nodeTaints, &legacyTaint) if !tainted && conf.TaintNode { u.Info("creating CMK node taint") if err := u.agent.SetTaints(cmkTaints, opTimeout); err != nil { return stpError("failed to set legacy node taint: %v", err) } } if tainted && !conf.TaintNode { u.Debug("removing CMK node taint") if err := u.agent.RemoveTaints(cmkTaints, opTimeout); err != nil { return stpError("failed to clear legacy node taint: %v", err) } } return nil } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/static-pools/stp-policy.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package stp import ( "flag" "fmt" "io" "math/rand" "strconv" "strings" pkgcfg "github.com/intel/cri-resource-manager/pkg/config" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/events" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/introspect" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy" logger "github.com/intel/cri-resource-manager/pkg/log" "github.com/intel/cri-resource-manager/pkg/utils" "github.com/prometheus/client_golang/prometheus" ) const ( // PolicyName is the name used to activate this policy implementation. PolicyName = "static-pools" // PolicyDescription is a short description of this policy. PolicyDescription = "A reimplementation of CMK (CPU Manager for Kubernetes)." // PolicyPath is the path of this policy in the configuration hierarchy. PolicyPath = "policy." + PolicyName // StpEnvPool is the name of the env variable for selecting STP pool of a container StpEnvPool = "STP_POOL" // StpEnvSocketID is the name of the env variable for selecting cpu socket of a container StpEnvSocketID = "STP_SOCKET_ID" // StpEnvNoAffinity is the name of the env variable for switching off cpuset enforcement StpEnvNoAffinity = "STP_NO_AFFINITY" // CmkEnvAssigned is the name of the env variable that the original CMK // sets to communicate the selected cpuset to the workload. We use the same // environment variable for compatibility. CmkEnvAssigned = "CMK_CPUS_ASSIGNED" // CmkEnvInfra is the name of the env variable that the original CMK sets // to communicate all CPUs of the infra pool to the workload. We use the // same environment variable for compatibility. CmkEnvInfra = "CMK_CPUS_INFRA" // CmkEnvShared is the name of the env variable that the original CMK sets // to communicate all CPUs of the shared pool to the workload. We use the // same environment variable for compatibility. CmkEnvShared = "CMK_CPUS_SHARED" // CmkEnvNumCores is the name of the env used in the original CMK to select // the number of exclusive CPUs, deprecated here CmkEnvNumCores = "CMK_NUM_CORES" // CmkPoolInfra is the hardcoded name of the 'infra' pool CmkPoolInfra = "infra" // CmkPoolShared is the hardcoded name of the 'shared' pool CmkPoolShared = "shared" ) type stp struct { logger.Logger conf *config // STP policy configuration nodeUpdater *nodeUpdater // node updater thread state cache.Cache // state cache } var _ policy.Backend = &stp{} // // Policy backend implementation // // CreateStpPolicy creates a new policy instance. func CreateStpPolicy(opts *policy.BackendOptions) policy.Backend { stp := &stp{ Logger: logger.NewLogger(PolicyName), state: opts.Cache, nodeUpdater: newNodeUpdater(opts.AgentCli), } stp.Info("creating policy...") pkgcfg.GetModule(PolicyPath).AddNotify(stp.configNotify) return stp } // Name returns the name of this policy. func (stp *stp) Name() string { return PolicyName } // Description returns the description for this policy. func (stp *stp) Description() string { return PolicyDescription }
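// NOTE (editor's illustration, not part of the original source): the STP_*
// environment variables above are the runtime interface of this policy. A
// hypothetical pod spec fragment selecting a pool named "exclusive" on
// socket 0 might look like:
//
//   env:
//   - name: STP_POOL
//     value: exclusive
//   - name: STP_SOCKET_ID
//     value: "0"
//
// STP_NO_AFFINITY only needs to be present to take effect; its value is
// ignored (see AllocateResources below).

// Start prepares this policy for accepting allocation/release requests.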
func (stp *stp) Start(add []cache.Container, del []cache.Container) error { if err := stp.nodeUpdater.start(); err != nil { return err } if stp.conf == nil { if err := stp.setConfig(conf); err != nil { return err } } if err := stp.initializeState(); err != nil { return err } stp.Debug("retrieved stp container states from cache:\n%s", utils.DumpJSON(*stp.getContainerRegistry())) if err := stp.Sync(add, del); err != nil { return err } stp.Debug("preparing for making decisions...") return nil } // Sync synchronizes the state of this policy. func (stp *stp) Sync(add []cache.Container, del []cache.Container) error { stp.Debug("synchronizing state...") for _, c := range del { stp.ReleaseResources(c) } for _, c := range add { stp.AllocateResources(c) } return nil } // AllocateResources is a resource allocation request for this policy. func (stp *stp) AllocateResources(c cache.Container) error { containerID := c.GetCacheID() stp.Debug("allocating resources for container %s...", containerID) cs := stpContainerStatus{Socket: -1} // Default pool name poolName := CmkPoolShared // Get resource requests stp.Debug("RESOURCE REQUESTS: %s", c.GetResourceRequirements().Requests) requestedCPUs, ok := c.GetResourceRequirements().Requests[exclusiveCoreResourceName] if ok { nCPUs, _ := requestedCPUs.AsInt64() cs.NExclusiveCPUs = nCPUs } // Parse container command line. Backwards compatibility for old CMK // workloads cmkArgs := stp.parseContainerCmdline(c.GetCommand(), c.GetArgs()) if cmkArgs != nil { poolName = cmkArgs.Pool cs.Socket = cmkArgs.SocketID cs.NoAffinity = cmkArgs.NoAffinity // Overwrite container commandline c.SetCommand(cmkArgs.Command) c.SetArgs([]string{}) stp.Debug("parsed options from container command line: %v", cmkArgs) } // Get STP options from container env envVal, ok := c.GetEnv(StpEnvSocketID) if ok { socketID, err := strconv.ParseInt(envVal, 10, 32) if err != nil { stp.Warn("unable to parse socket id from %q: %v", StpEnvSocketID, err) } else { cs.Socket = socketID } } envVal, ok = c.GetEnv(StpEnvPool) if ok { poolName = envVal } _, ok = c.GetEnv(StpEnvNoAffinity) if ok { // We do not care about the value of the env variable here cs.NoAffinity = true } // Force socket to -1 if pool is not "socket aware" if poolName == CmkPoolInfra { cs.Socket = -1 } // Get pool configuration if _, ok := stp.conf.Pools[poolName]; !ok { return stpError("non-existent pool %q", poolName) } cs.Pool = poolName // Allocate (CPU) resources for the container err := stp.allocateStpResources(c, cs) if err != nil { return err } return nil } // ReleaseResources is a resource release request for this policy. func (stp *stp) ReleaseResources(c cache.Container) error { stp.Debug("releasing resources of container %s...", c.PrettyName()) stp.releaseStpResources(c.GetCacheID()) return nil } // UpdateResources is a resource allocation update request for this policy. func (stp *stp) UpdateResources(c cache.Container) error { stp.Debug("updating resource allocations of container %s...", c.PrettyName()) return nil } // Rebalance tries to find an optimal allocation of resources for the current containers. func (stp *stp) Rebalance() (bool, error) { stp.Debug("(not) rebalancing containers...") return false, nil } // HandleEvent handles policy-specific events. func (stp *stp) HandleEvent(*events.Policy) (bool, error) { stp.Debug("(not) handling event...") return false, nil } // ExportResourceData provides resource data to export for the container. 
func (stp *stp) ExportResourceData(c cache.Container) map[string]string { return nil } // Introspect provides data for external introspection. func (stp *stp) Introspect(*introspect.State) { return } // DescribeMetrics generates policy-specific prometheus metrics data descriptors. func (p *stp) DescribeMetrics() []*prometheus.Desc { return nil } // PollMetrics provides policy metrics for monitoring. func (p *stp) PollMetrics() policy.Metrics { return nil } // CollectMetrics generates prometheus metrics from cached/polled policy-specific metrics data. func (p *stp) CollectMetrics(policy.Metrics) ([]prometheus.Metric, error) { return nil, nil } func (stp *stp) configNotify(event pkgcfg.Event, source pkgcfg.Source) error { stp.Info("configuration %s", event) if err := stp.setConfig(conf); err != nil { return err } stp.Info("config updated successfully") return nil } func (stp *stp) setConfig(cfg *config) error { // Read legacy pools configuration if the given config has no pools configured if cfg.Pools == nil || len(cfg.Pools) == 0 { if len(cfg.ConfDirPath) > 0 { stp.Debug("Reading legacy configuration directory tree %q", cfg.ConfDirPath) p, err := readConfDir(cfg.ConfDirPath) if err != nil { stp.Warn("failed to read configuration directory: %v", err) } else { cfg.Pools = p } } if len(cfg.ConfFilePath) > 0 { stp.Debug("Reading legacy configuration file %q", cfg.ConfFilePath) p, err := readConfFile(cfg.ConfFilePath) if err != nil { stp.Warn("failed to read configuration file: %v", err) } else { if len(cfg.Pools) > 0 { stp.Info("Overriding pool configuration from %q with configuration from %q", cfg.ConfDirPath, cfg.ConfFilePath) } cfg.Pools = p } } } if err := stp.verifyConfig(cfg); err != nil { return err } stp.conf = cfg stp.Debug("policy configuration:\n%s", utils.DumpJSON(stp.conf)) stp.nodeUpdater.update(*stp.conf) return nil } // // Helper functions for STP policy backend // func stpError(format string, args ...interface{}) error { return fmt.Errorf(PolicyName+": "+format, args...)
} func (stp *stp) initializeState() error { ccr := stp.getContainerRegistry() for id := range *ccr { // Remove orphaned containers if _, ok := stp.state.LookupContainer(id); !ok { stp.Info("removing orphaned container %s from policy cache", id) stp.releaseStpResources(id) } } return stp.verifyConfig(stp.conf) } // Verify configuration against the existing set of containers func (stp *stp) verifyConfig(cfg *config) error { // Sanity check for config if cfg == nil || cfg.Pools == nil || len(cfg.Pools) == 0 { return stpError("invalid config, no pools configured") } // Loop through all existing containers ccr := stp.getContainerRegistry() for id, cs := range *ccr { // Check that pool for container exists pool, ok := cfg.Pools[cs.Pool] if !ok { return stpError("invalid stp configuration: pool %q for container %q not found", cs.Pool, id) } // Check that pool exclusivity is compatible with container configuration if pool.Exclusive && cs.NExclusiveCPUs < 1 { return stpError("invalid stp configuration: container %q with no exclusive CPUs set to run in exclusive pool %q", id, cs.Pool) } else if !pool.Exclusive && cs.NExclusiveCPUs > 0 { return stpError("invalid stp configuration: container %q with exclusive CPUs set to run in non-exclusive pool %q", id, cs.Pool) } // Check that cpu lists (cpuset) of container can be satisfied by the pool // NOTE: we do not try to do any migration to possibly free cpu lists // if the originally allocated cpu lists are not available // TODO: for non-exclusive pools it might be feasible just to alter the // cpuset (i.e. reconcile new cpu list using the existing pool/socket // spec for container) in case cpu lists do not match exactly for _, cCpuset := range cs.Cpusets { for i, pClist := range pool.CPULists { if cCpuset == pClist.Cpuset { pool.CPULists[i].addContainer(id) break } if i == len(pool.CPULists)-1 { return stpError("invalid stp configuration: cpu list %q configured for container %q not found in pool %q", cCpuset, id, cs.Pool) } } } } return nil } type cmkLegacyArgs struct { Pool string SocketID int64 Command []string NoAffinity bool } // parseContainerCmdline tries to parse the pool name and socket id parameters // from container command line func (stp *stp) parseContainerCmdline(cmd, args []string) *cmkLegacyArgs { // NOTE: This is a naive implementation and not foolproof. E.g. args could be // defined through env variables cmdLine := append(cmd, args...) stp.Debug("Parsing container command line %v\n", cmdLine) cmkArgs := parseCmkCmdline(cmdLine) // If we didn't find cmk arguments, try to parse each argument separately // in case cmk was invoked like 'bash -c "cmk isolate ..."'
// NOTE: We do a somewhat naive strings.Fields() here, there is room for // improvement by using go-shellquote or similar if cmkArgs == nil { for _, arg := range cmdLine { cmkArgs = parseCmkCmdline(strings.Fields(arg)) if cmkArgs != nil { break } } } return cmkArgs } func parseCmkCmdline(args []string) *cmkLegacyArgs { parsedArgs := cmkLegacyArgs{} // Create parser cmkCmd := flag.NewFlagSet("cmk-legacy", flag.ContinueOnError) cmkCmd.SetOutput(io.Discard) cmkCmd.StringVar(&parsedArgs.Pool, "pool", "", "pool to use") cmkCmd.Int64Var(&parsedArgs.SocketID, "socket-id", -1, "socket id to use") cmkCmd.BoolVar(&parsedArgs.NoAffinity, "no-affinity", false, "Do not set cpu affinity before forking the child command") // Args that we're not really interested in _ = cmkCmd.String("conf-dir", "", "CMK configuration directory") if len(args) > 1 && args[0] == "cmk" && args[1] == "isolate" { err := cmkCmd.Parse(args[2:]) // Parse out (i.e. ignore) all unknown args for err != nil { err = cmkCmd.Parse(cmkCmd.Args()) } // Pool needs to be defined if parsedArgs.Pool != "" { parsedArgs.Command = cmkCmd.Args() return &parsedArgs } } return nil } func (stp *stp) allocateStpResources(c cache.Container, cs stpContainerStatus) error { var CPULists [](*cpuList) // Get pool configuration for this container pool, ok := stp.conf.Pools[cs.Pool] if !ok { return stpError("BUG: pool %q not found", cs.Pool) } availableCPULists := getAvailableCPULists(cs.Socket, &pool) if pool.Exclusive { if cs.NExclusiveCPUs < 1 { return stpError("exclusive pool specified but the number of exclusive CPUs requested is 0") } // Check the possible deprecated CMK_NUM_CORES setting. Print a warning // if this does not match what was requested through extended resources envNumCores, ok := c.GetEnv(CmkEnvNumCores) if ok { iNumCores, err := strconv.ParseInt(envNumCores, 10, 64) if err != nil || iNumCores != cs.NExclusiveCPUs { stp.Warn("Ignoring deprecated env variable setting, %s=%q does "+ "not match the number of cores (%d) from resource request", CmkEnvNumCores, envNumCores, cs.NExclusiveCPUs) } } if int64(len(availableCPULists)) < cs.NExclusiveCPUs { if cs.Socket < 0 { return stpError("not enough free cpu lists in pool %q", cs.Pool) } return stpError("not enough free cpu lists in pool %q with socket id %d", cs.Pool, cs.Socket) } CPULists = availableCPULists[0:cs.NExclusiveCPUs] } else { /* NOTE (from CMK): This allocation algorithm is probably an oversimplification, however for known use cases the non-exclusive pools should never have more than one cpu list anyhow. If that ceases to hold in the future, we could explore population or load-based spreading. Keeping it simple for now.
*/ if len(availableCPULists) == 0 { return stpError("no available cpu lists in pool %q with socket id %d", cs.Pool, cs.Socket) } i := rand.Int31n(int32(len(availableCPULists))) CPULists = availableCPULists[i : i+1] } containerID := c.GetCacheID() cpuset := "" sep := "" for _, cl := range CPULists { cl.addContainer(containerID) cpuset += sep + cl.Cpuset sep = "," cs.Cpusets = append(cs.Cpusets, cl.Cpuset) } // Commit our changes containers := stp.getContainerRegistry() (*containers)[containerID] = cs stp.setContainerRegistry(containers) if cs.NoAffinity { stp.Info("not setting cpuset for container %q as --no-affinity was specified", containerID) } else { stp.Info("setting cpuset of container %q to %q", containerID, cpuset) c.SetCpusetCpus(cpuset) } c.SetEnv(CmkEnvAssigned, cpuset) // Advertise CPUs belonging to the infra pool pool, ok = stp.conf.Pools[CmkPoolInfra] if ok { c.SetEnv(CmkEnvInfra, pool.cpuSet()) } // Advertise CPUs belonging to the shared pool pool, ok = stp.conf.Pools[CmkPoolShared] if ok { c.SetEnv(CmkEnvShared, pool.cpuSet()) } return nil } // getAvailableCPULists constructs a list of available cpu lists that satisfy // the possible socket constraint func getAvailableCPULists(socket int64, pool *poolConfig) [](*cpuList) { availableCPULists := make([](*cpuList), 0, len(pool.CPULists)) for _, c := range pool.CPULists { if socket < 0 || socket == int64(c.Socket) { if pool.Exclusive && len(c.getContainers()) > 0 { continue } availableCPULists = append(availableCPULists, c) } } return availableCPULists } func (stp *stp) releaseStpResources(containerID string) error { ccr := *stp.getContainerRegistry() if cs, ok := ccr[containerID]; ok { pool, ok := stp.conf.Pools[cs.Pool] if !ok { return stpError("BUG: pool %q for container %q not found", cs.Pool, containerID) } for _, clist := range pool.CPULists { clist.removeContainer(containerID) } delete(ccr, containerID) // Commit our changes to stp cache stp.setContainerRegistry(&ccr) } return nil } // // Handling of cached data // const ( cacheKeyContainerRegistry = "ContainerRegistry" ) type stpContainerStatus struct { Pool string // pool configuration Socket int64 // physical socket id NExclusiveCPUs int64 // number of exclusive cpus Cpusets []string // cpusets (cpu lists) assigned to this container NoAffinity bool // disable cpuset enforcing } // stpContainerCache contains STP-specific data of containers type stpContainerCache map[string]stpContainerStatus // Set the value of cached cachableContainerRegistry object func (c *stpContainerCache) Set(value interface{}) { switch value.(type) { case stpContainerCache: *c = value.(stpContainerCache) case *stpContainerCache: cp := value.(*stpContainerCache) *c = *cp } } // Get the cached cachableContainerRegistry object func (c *stpContainerCache) Get() interface{} { return *c } // getContainerRegistry gets the current state of our container registry func (stp *stp) getContainerRegistry() *stpContainerCache { ccr := &stpContainerCache{} if !stp.state.GetPolicyEntry(cacheKeyContainerRegistry, ccr) { stp.Error("no cached container registry found") } return ccr } // setContainerRegistry caches the state of our container registry func (stp *stp) setContainerRegistry(ccr *stpContainerCache) { stp.state.SetPolicyEntry(cacheKeyContainerRegistry, cache.Cachable(ccr)) }
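// NOTE (editor's illustration, not part of the original source): a minimal
// usage sketch for getAvailableCPULists above. A socket argument of -1
// disables socket filtering, and for exclusive pools only cpu lists with no
// containers assigned are considered free. The function name is hypothetical.
func exampleCountFreeCPULists(pool *poolConfig) int {
	anySocket := int64(-1) // no socket constraint
	return len(getAvailableCPULists(anySocket, pool))
}

// Register us as a policy implementation.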
func init() { policy.Register(PolicyName, PolicyDescription, CreateStpPolicy) } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/static-pools/stp-policy_test.go ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package stp import ( "encoding/json" "testing" "github.com/google/go-cmp/cmp" logger "github.com/intel/cri-resource-manager/pkg/log" ) func TestParseContainerCmdline(t *testing.T) { stp := &stp{Logger: logger.NewLogger(PolicyName + "-test")} // 1. empty command line should return a nil pointer args := stp.parseContainerCmdline([]string{}, []string{}) if args != nil { t.Errorf("Expected <nil> but got %v", *args) } // 2. case where cmk isolate command is in container "Command" args = stp.parseContainerCmdline([]string{"cmk", "isolate", "--pool", "foo", "--socket-id=2", "--conf-dir=/etc", "cmd", "-arg"}, []string{}) expected := cmkLegacyArgs{Pool: "foo", SocketID: 2, Command: []string{"cmd", "-arg"}} if args == nil || !cmp.Equal(expected, *args) { t.Errorf("Expected %v but got %v", expected, *args) } // 3. we should ignore unknown cmk options args = stp.parseContainerCmdline([]string{"cmk", "isolate", "--invalid-1=inv1", "--pool", "foo", "--invalid-2=inv2", "cmd", "--arg"}, []string{}) expected = cmkLegacyArgs{Pool: "foo", SocketID: -1, Command: []string{"cmd", "--arg"}} if args == nil || !cmp.Equal(expected, *args) { t.Errorf("Expected %v but got %v", expected, *args) } // 4. --pool should be defined in cmk options args = stp.parseContainerCmdline([]string{"cmk", "isolate", "--socket-id=2", "cmd", "--arg"}, []string{}) if args != nil { t.Errorf("Expected <nil> but got %v", *args) } // 5. parsing from container "Args" args = stp.parseContainerCmdline([]string{"bash"}, []string{"-c", "cmk isolate --pool=foo --socket-id=2 cmd --arg"}) expected = cmkLegacyArgs{Pool: "foo", SocketID: 2, Command: []string{"cmd", "--arg"}} if args == nil || !cmp.Equal(expected, *args) { t.Errorf("Expected %v but got %v", expected, *args) } // 6.
Only _cmk_ isolate should be accepted args = stp.parseContainerCmdline([]string{"bash"}, []string{"-c", "dmk isolate --pool=foo cmd --arg"}) if args != nil { t.Errorf("Expected <nil> but got %v", *args) } } func TestCachableData(t *testing.T) { ccr := &stpContainerCache{"id1": stpContainerStatus{Pool: "p", Socket: 1}} // Test JSON marshalling of cached data data, err := json.Marshal(ccr) if err != nil { t.Errorf("JSON marshal failed: %v", err) } expected := []byte(`{"id1":{"Pool":"p","Socket":1,"NExclusiveCPUs":0,"Cpusets":null,"NoAffinity":false}}`) if !cmp.Equal(expected, data) { t.Errorf("Expected %s but got %s", expected, data) } // Test JSON unmarshalling of cached data ccr2 := &stpContainerCache{} err = json.Unmarshal(data, ccr2) if err != nil { t.Errorf("JSON unmarshal failed: %v", err) } if !cmp.Equal(*ccr, *ccr2) { t.Errorf("Expected %v but got %v", *ccr, *ccr2) } } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/topology-aware/affinity.go ================================================ // Copyright Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package topologyaware import ( "github.com/intel/cri-resource-manager/pkg/apis/resmgr" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache" ) // Calculate pool affinities for the given container. func (p *policy) calculatePoolAffinities(container cache.Container) (map[int]int32, error) { log.Debug("=> calculating pool affinities...") affinities, err := p.calculateContainerAffinity(container) if err != nil { return nil, err } result := make(map[int]int32, len(p.nodes)) for id, w := range affinities { grant, ok := p.allocations.grants[id] if !ok { continue } node := grant.GetCPUNode() result[node.NodeID()] += w // TODO: calculate affinity for memory here too? } return result, nil } // Calculate affinity of this container (against all other containers). func (p *policy) calculateContainerAffinity(container cache.Container) (map[string]int32, error) { log.Debug("* calculating affinity for container %s...", container.PrettyName()) ca, err := container.GetAffinity() if err != nil { return nil, err } result := make(map[string]int32) for _, a := range ca { for id, w := range p.cache.EvaluateAffinity(a) { result[id] += w } } // self-affinity does not make sense, so remove any delete(result, container.GetCacheID()) log.Debug(" => affinity: %v", result) return result, nil }
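// NOTE (editor's illustration, not part of the original source): a minimal
// sketch of how the per-container weights from calculateContainerAffinity
// above fold into the per-pool scores of calculatePoolAffinities: each
// container with a CPU grant contributes its weight to the pool the grant
// lives in. The function name and parameters are hypothetical.
func exampleFoldAffinities(weights map[string]int32, poolOfContainer map[string]int) map[int]int32 {
	result := make(map[int]int32, len(poolOfContainer))
	for id, w := range weights {
		if pool, ok := poolOfContainer[id]; ok { // skip containers without a grant
			result[pool] += w
		}
	}
	return result
}

// Register our policy-specific implicit affinities with the Cache.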
func (p *policy) registerImplicitAffinities() error { affinities := []struct { name string disabled bool affinity cache.ImplicitAffinity }{ { name: "AVX512-pull/push", affinity: func(c cache.Container, hasExplicit bool) *cache.Affinity { _, tagged := c.GetTag(cache.TagAVX512) if tagged { return cache.GlobalAffinity("tags/"+cache.TagAVX512, 5) } return cache.GlobalAntiAffinity("tags/"+cache.TagAVX512, 5) }, }, { name: "colocate-pods", disabled: !opt.ColocatePods, affinity: func(c cache.Container, hasExplicit bool) *cache.Affinity { if hasExplicit { return nil } pod, ok := c.GetPod() if !ok { log.Error("failed to inject pod-colocation affinity, can't find pod") return nil } return &cache.Affinity{ Scope: pod.ScopeExpression(), Match: &resmgr.Expression{ Op: resmgr.AlwaysTrue, }, Weight: 10, } }, }, { name: "colocate-namespaces", disabled: !opt.ColocateNamespaces, affinity: func(c cache.Container, hasExplicit bool) *cache.Affinity { if hasExplicit { return nil } return &cache.Affinity{ Scope: &resmgr.Expression{ Op: resmgr.AlwaysTrue, }, Match: &resmgr.Expression{ Key: resmgr.KeyNamespace, Op: resmgr.Equals, Values: []string{ c.GetNamespace(), }, }, Weight: 10, } }, }, } enabled := map[string]cache.ImplicitAffinity{} for _, a := range affinities { if a.disabled { log.Info("implicit affinity %s is disabled", a.name) continue } enabled[PolicyName+":"+a.name] = a.affinity } if err := p.cache.AddImplicitAffinities(enabled); err != nil { return policyError("failed to register implicit affinities: %v", err) } return nil } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/topology-aware/cache.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package topologyaware import ( "encoding/json" "time" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache" "github.com/intel/cri-resource-manager/pkg/utils/cpuset" idset "github.com/intel/goresctrl/pkg/utils" ) const ( keyAllocations = "allocations" keyConfig = "config" ) func (p *policy) saveAllocations() { p.cache.SetPolicyEntry(keyAllocations, cache.Cachable(&p.allocations)) p.cache.Save() } func (p *policy) restoreAllocations(allocations *allocations) error { savedAllocations := allocations.clone() p.allocations = p.newAllocations() // // Try to reinstate all grants with the exact same resource assignments // as saved. If that fails, release and try to reallocate all corresponding // containers with pool hints pointing to the currently assigned pools. If // this fails too, save the original allocations unchanged to the cache and // return an error. 
// if err := p.reinstateGrants(allocations.grants); err != nil { log.Error("failed to reinstate grants verbatim: %v", err) containers, poolHints := allocations.getContainerPoolHints() if err := p.reallocateResources(containers, poolHints); err != nil { p.allocations = savedAllocations p.saveAllocations() // undo any potential changes in saved cache return err } } return nil } // reinstateGrants tries to restore the given grants exactly as saved. func (p *policy) reinstateGrants(grants map[string]Grant) error { for id, grant := range grants { c := grant.GetContainer() pool := grant.GetCPUNode() supply := pool.FreeSupply() if err := supply.Reserve(grant); err != nil { return policyError("failed to update pool %q with CPU grant of %q: %v", pool.Name(), c.PrettyName(), err) } log.Info("updated pool %q with reinstated CPU grant of %q", pool.Name(), c.PrettyName()) pool = grant.GetMemoryNode() if err := supply.ReserveMemory(grant); err != nil { grant.GetCPUNode().FreeSupply().ReleaseCPU(grant) return policyError("failed to update pool %q with extra memory of %q: %v", pool.Name(), c.PrettyName(), err) } log.Info("updated pool %q with reinstated memory reservation of %q", pool.Name(), c.PrettyName()) p.allocations.grants[id] = grant p.applyGrant(grant) } p.updateSharedAllocations(nil) return nil } type cachedGrant struct { Exclusive string Part int CPUType cpuClass Container string Pool string MemoryPool string MemType memoryType Memset idset.IDSet MemoryLimit memoryMap ColdStart time.Duration } func newCachedGrant(cg Grant) *cachedGrant { ccg := &cachedGrant{} ccg.Exclusive = cg.ExclusiveCPUs().String() ccg.Part = cg.CPUPortion() ccg.CPUType = cg.CPUType() ccg.Container = cg.GetContainer().GetCacheID() ccg.Pool = cg.GetCPUNode().Name() ccg.MemoryPool = cg.GetMemoryNode().Name() ccg.MemType = cg.MemoryType() ccg.Memset = cg.Memset().Clone() ccg.MemoryLimit = make(memoryMap) for key, value := range cg.MemLimit() { ccg.MemoryLimit[key] = value } ccg.ColdStart = cg.ColdStart() return ccg } func (ccg *cachedGrant) ToGrant(policy *policy) (Grant, error) { node, ok := policy.nodes[ccg.Pool] if !ok { return nil, policyError("cache error: failed to restore %v, unknown pool/node", *ccg) } container, ok := policy.cache.LookupContainer(ccg.Container) if !ok { return nil, policyError("cache error: failed to restore %v, unknown container", *ccg) } g := newGrant( node, container, ccg.CPUType, cpuset.MustParse(ccg.Exclusive), ccg.Part, ccg.MemType, ccg.MemoryLimit, ccg.ColdStart, ) if g.Memset().String() != ccg.Memset.String() { log.Error("cache error: mismatch in stored/recalculated memset: %s != %s", ccg.Memset, g.Memset()) } return g, nil } func (cg *grant) MarshalJSON() ([]byte, error) { return json.Marshal(newCachedGrant(cg)) } func (cg *grant) UnmarshalJSON(data []byte) error { ccg := cachedGrant{} if err := json.Unmarshal(data, &ccg); err != nil { return policyError("failed to restore grant: %v", err) } cg.exclusive = cpuset.MustParse(ccg.Exclusive) return nil } func (a *allocations) MarshalJSON() ([]byte, error) { cgrants := make(map[string]*cachedGrant) for id, cg := range a.grants { cgrants[id] = newCachedGrant(cg) } return json.Marshal(cgrants) } func (a *allocations) UnmarshalJSON(data []byte) error { var err error cgrants := make(map[string]*cachedGrant) if err := json.Unmarshal(data, &cgrants); err != nil { return policyError("failed to restore allocations: %v", err) } a.grants = make(map[string]Grant, 32) for id, ccg := range cgrants { a.grants[id], err = ccg.ToGrant(a.policy) if err != nil {
log.Error("removing unresolvable cached grant %v: %v", *ccg, err) delete(a.grants, id) } else { log.Debug("resolved cached grant: %v", a.grants[id].String()) } } return nil } func (a *allocations) Get() interface{} { return a } func (a *allocations) Set(value interface{}) { var from *allocations switch value.(type) { case allocations: v := value.(allocations) from = &v case *allocations: from = value.(*allocations) } a.grants = make(map[string]Grant, 32) for id, cg := range from.grants { a.grants[id] = cg } } func (a *allocations) Dump(logfn func(format string, args ...interface{}), prefix string) { for _, cg := range a.grants { logfn(prefix+"%s", cg) } } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/topology-aware/cache_test.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package topologyaware import ( "bytes" "testing" "github.com/intel/cri-resource-manager/pkg/utils/cpuset" ) func TestToGrant(t *testing.T) { tcases := []struct { name string policy *policy cgrant *cachedGrant expectedError bool }{ { name: "unknown node", cgrant: &cachedGrant{}, policy: &policy{ nodes: map[string]Node{ "node1": &node{}, }, }, expectedError: true, }, { name: "known node but failed lookup", cgrant: &cachedGrant{ Pool: "node1", }, policy: &policy{ nodes: map[string]Node{ "node1": &node{}, }, cache: &mockCache{}, }, expectedError: true, }, { name: "known node", cgrant: &cachedGrant{ Pool: "node1", }, policy: &policy{ nodes: map[string]Node{ "node1": &node{}, }, cache: &mockCache{ returnValue2ForLookupContainer: true, }, }, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { _, err := tc.cgrant.ToGrant(tc.policy) if tc.expectedError && err == nil { t.Errorf("Expected error, but got success") } if !tc.expectedError && err != nil { t.Errorf("Unexpected error: %+v", err) } }) } } func TestAllocationMarshalling(t *testing.T) { tcases := []struct { name string data []byte expectedUnmarshallingError bool expectedMarshallingError bool }{ { name: "non-zero Exclusive", data: []byte(`{"key1":{"Exclusive":"1","Part":1,"CPUType":0,"Container":"1","Pool":"testnode","MemoryPool":"testnode","MemType":"DRAM,PMEM,HBM","Memset":"","MemoryLimit":{},"ColdStart":0}}`), }, { name: "zero Exclusive", data: []byte(`{"key1":{"Exclusive":"","Part":1,"CPUType":0,"Container":"1","Pool":"testnode","MemoryPool":"testnode","MemType":"DRAM,PMEM,HBM","Memset":"","MemoryLimit":{},"ColdStart":0}}`), }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { alloc := &allocations{ policy: &policy{ nodes: map[string]Node{ "testnode": &virtualnode{ node: node{ name: "testnode", kind: UnknownNode, noderes: newSupply(&node{}, cpuset.New(), cpuset.New(), cpuset.New(), 0, 0, createMemoryMap(0, 0, 0), createMemoryMap(0, 0, 0)), freeres: newSupply(&node{}, cpuset.New(), cpuset.New(), cpuset.New(), 0, 0, createMemoryMap(0, 0, 0), createMemoryMap(0, 0, 0)), }, }, }, cache:
&mockCache{ returnValue1ForLookupContainer: &mockContainer{ returnValueForGetCacheID: "1", }, returnValue2ForLookupContainer: true, }, }, } unmarshallingErr := alloc.UnmarshalJSON(tc.data) if tc.expectedUnmarshallingError && unmarshallingErr == nil { t.Errorf("Expected unmarshalling error, but got success") } if !tc.expectedUnmarshallingError && unmarshallingErr != nil { t.Errorf("Unexpected unmarshalling error: %+v", unmarshallingErr) } out, marshallingErr := alloc.MarshalJSON() if !bytes.Equal(out, tc.data) { t.Errorf("Expected\n%q\nBut got\n%q", tc.data, out) } if tc.expectedMarshallingError && marshallingErr == nil { t.Errorf("Expected marshalling error, but got success") } if !tc.expectedMarshallingError && marshallingErr != nil { t.Errorf("Unexpected marshalling error: %+v", marshallingErr) } }) } } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/topology-aware/coldstart.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package topologyaware import ( "time" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/events" ) // trigger cold start for the container if necessary. func (p *policy) triggerColdStart(c cache.Container) error { log.Info("coldstart: triggering coldstart for %s...", c.PrettyName()) g, ok := p.allocations.grants[c.GetCacheID()] if !ok { log.Warn("coldstart: no grant found, nothing to do...") return nil } coldStart := g.ColdStart() if coldStart <= 0 { log.Info("coldstart: no coldstart, nothing to do...") return nil } // Start a timer to restore the grant memset to full. Store the // timer so that we can release it if the grant is destroyed before // the timer elapses. duration := coldStart timer := time.AfterFunc(duration, func() { e := &events.Policy{ Type: ColdStartDone, Source: PolicyName, Data: c.GetID(), } if err := p.options.SendEvent(e); err != nil { // we should retry this later, the channel is probably full... log.Error("Ouch... we should retry this later.") } }) g.AddTimer(timer) return nil } // finish an ongoing coldstart for the container. func (p *policy) finishColdStart(c cache.Container) (bool, error) { g, ok := p.allocations.grants[c.GetCacheID()] if !ok { log.Warn("coldstart: no grant found, nothing to do...") return false, policyError("coldstart: no grant found for %s", c.PrettyName()) } log.Info("restoring memset to grant %v", g) g.RestoreMemset() g.ClearTimer() return true, nil } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/topology-aware/coldstart_test.go ================================================ // Copyright 2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License.
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package topologyaware import ( "fmt" "sync" "testing" "time" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/events" policyapi "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy" system "github.com/intel/cri-resource-manager/pkg/sysfs" idset "github.com/intel/goresctrl/pkg/utils" ) var globalPolicy *policy var mutex sync.Mutex func sendEvent(param interface{}) error { // Simulate event synchronization in the upper levels. mutex.Lock() defer mutex.Unlock() fmt.Printf("Event received: %v\n", param) event := param.(*events.Policy) globalPolicy.HandleEvent(event) return nil } func TestColdStart(t *testing.T) { // The idea with cold start is that the workload is first allocated only the PMEM node. Only when the timer expires // (or some other event is triggered) is the DRAM node added to the memset. This causes the initial // memory allocations to be made from PMEM only. tcases := []struct { name string numaNodes []system.Node req Request affinities map[int]int32 container cache.Container expectedColdStartTimeout time.Duration expectedDRAMNodeID int expectedPMEMNodeID int expectedDRAMSystemNodeID idset.ID expectedPMEMSystemNodeID idset.ID }{ { name: "three node cold start", numaNodes: []system.Node{ &mockSystemNode{id: 1, memFree: 10000, memTotal: 10000, memType: system.MemoryTypeDRAM, distance: []int{5, 5, 1}}, &mockSystemNode{id: 2, memFree: 50000, memTotal: 50000, memType: system.MemoryTypePMEM, distance: []int{5, 1, 5}}, }, container: &mockContainer{ name: "demo-coldstart-container", returnValueForGetCacheID: "1234", pod: &mockPod{ coldStartTimeout: 1000 * time.Millisecond, returnValue1FotGetResmgrAnnotation: "demo-coldstart-container: pmem,dram", returnValue2FotGetResmgrAnnotation: true, coldStartContainerName: "demo-coldstart-container", }, }, expectedColdStartTimeout: 1000 * time.Millisecond, expectedDRAMNodeID: 101, expectedDRAMSystemNodeID: idset.ID(1), expectedPMEMSystemNodeID: idset.ID(2), expectedPMEMNodeID: 102, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { policy := &policy{ sys: &mockSystem{ nodes: tc.numaNodes, }, cache: &mockCache{ returnValue1ForLookupContainer: tc.container, returnValue2ForLookupContainer: true, }, allocations: allocations{ grants: make(map[string]Grant, 0), }, options: &policyapi.BackendOptions{}, } policy.allocations.policy = policy policy.options.SendEvent = sendEvent if err := policy.buildPoolsByTopology(); err != nil { t.Errorf("failed to build topology pool") } grant, err := policy.allocatePool(tc.container, "") if err != nil { panic(err) } if grant.ColdStart() != tc.expectedColdStartTimeout { t.Errorf("Expected coldstart value '%v', but got '%v'", tc.expectedColdStartTimeout, grant.ColdStart()) } policy.allocations.grants[tc.container.GetCacheID()] = grant mems := grant.Memset() if len(mems) != 1 || mems.Members()[0] != tc.expectedPMEMSystemNodeID { t.Errorf("Expected one memory controller %v, got: %v", tc.expectedPMEMSystemNodeID, mems) } if grant.MemoryType()&memoryDRAM != 0 { // FIXME: should we report only the limited memory
types or the granted types // while the cold start is going on? // t.Errorf("No DRAM was expected before coldstart timer: %v", grant.MemoryType()) } globalPolicy = policy policy.options.SendEvent(&events.Policy{ Type: events.ContainerStarted, Data: tc.container, }) time.Sleep(tc.expectedColdStartTimeout * 2) newMems := grant.Memset() if len(newMems) != 2 { t.Errorf("Expected two memory controllers, got %d: %v", len(newMems), newMems) } if !newMems.Has(tc.expectedPMEMSystemNodeID) || !newMems.Has(tc.expectedDRAMSystemNodeID) { t.Errorf("Didn't get all expected system nodes in mems, got: %v", newMems) } }) } } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/topology-aware/error.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package topologyaware import ( "fmt" ) // policyError creates a formatted policy-specific error. func policyError(format string, args ...interface{}) error { return fmt.Errorf(PolicyName+": "+format, args...) } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/topology-aware/flags.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package topologyaware import ( config "github.com/intel/cri-resource-manager/pkg/config" ) // Options captures our configurable policy parameters. type options struct { // PinCPU controls CPU pinning in this policy. PinCPU bool // PinMemory controls memory pinning in this policy. PinMemory bool // PreferIsolated controls whether isolated CPUs are preferred for isolated allocations. PreferIsolated bool `json:"PreferIsolatedCPUs"` // PreferShared controls whether shared CPU allocation is always preferred by default. PreferShared bool `json:"PreferSharedCPUs"` // ReservedPoolNamespaces is a list of namespace globs that will be allocated to reserved CPUs ReservedPoolNamespaces []string `json:"ReservedPoolNamespaces,omitempty"` // ColocatePods causes all containers in a pod to have affinity for each other. ColocatePods bool `json:"ColocatePods"` // ColocateNamespaces causes all containers in a namespace to have affinity for each other. ColocateNamespaces bool `json:"ColocateNamespaces"` } // Our runtime configuration. 
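To illustrate how the JSON tags above shape external configuration, here is a minimal standalone sketch (illustrative only, not from the repository) that unmarshals a configuration fragment into a trimmed copy of the options struct; the real plumbing goes through config.Register just below.

package main

import (
	"encoding/json"
	"fmt"
)

// trimmed copy of the policy options struct, for illustration only
type options struct {
	PinCPU                 bool
	PinMemory              bool
	PreferIsolated         bool     `json:"PreferIsolatedCPUs"`
	PreferShared           bool     `json:"PreferSharedCPUs"`
	ReservedPoolNamespaces []string `json:"ReservedPoolNamespaces,omitempty"`
}

func main() {
	// a configuration fragment using the external (tagged) key names
	raw := []byte(`{"PreferIsolatedCPUs": false, "ReservedPoolNamespaces": ["kube-system", "monitoring-*"]}`)
	o := options{PinCPU: true, PinMemory: true, PreferIsolated: true}
	if err := json.Unmarshal(raw, &o); err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", o) // PreferIsolated overridden via its PreferIsolatedCPUs alias
}

// Our runtime configuration.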
var opt = defaultOptions().(*options)
var aliasOpt = defaultOptions().(*options)

// defaultOptions returns a new options instance, all initialized to defaults.
func defaultOptions() interface{} {
	return &options{
		PinCPU:                 true,
		PinMemory:              true,
		PreferIsolated:         true,
		PreferShared:           false,
		ReservedPoolNamespaces: []string{"kube-system"},
	}
}

// Register us for configuration handling.
func init() {
	config.Register(PolicyPath, PolicyDescription, opt, defaultOptions)
	config.Register(AliasPath, PolicyDescription, aliasOpt, defaultOptions)
}

================================================
FILE: pkg/cri/resource-manager/policy/builtin/topology-aware/hint.go
================================================
// Copyright 2019 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package topologyaware

import (
	"strconv"
	"strings"

	system "github.com/intel/cri-resource-manager/pkg/sysfs"
	"github.com/intel/cri-resource-manager/pkg/topology"
	"github.com/intel/cri-resource-manager/pkg/utils/cpuset"
	idset "github.com/intel/goresctrl/pkg/utils"
)

// Calculate the hint score of the given hint and CPUSet.
func cpuHintScore(hint topology.Hint, CPUs cpuset.CPUSet) float64 {
	hCPUs, err := cpuset.Parse(hint.CPUs)
	if err != nil {
		log.Warn("invalid hint CPUs '%s' from %s", hint.CPUs, hint.Provider)
		return 0.0
	}
	common := hCPUs.Intersection(CPUs)
	return float64(common.Size()) / float64(hCPUs.Size())
}

// Calculate the NUMA node score of the given hint and NUMA node.
func numaHintScore(hint topology.Hint, sysIDs ...idset.ID) float64 {
	for _, idstr := range strings.Split(hint.NUMAs, ",") {
		hID, err := strconv.ParseInt(idstr, 0, 0)
		if err != nil {
			log.Warn("invalid hint NUMA node %s from %s", idstr, hint.Provider)
			return 0.0
		}
		for _, id := range sysIDs {
			if hID == int64(id) {
				return 1.0
			}
		}
	}
	return 0.0
}

// Calculate the die score of the given hint and die.
func dieHintScore(hint topology.Hint, sysID idset.ID, socket system.CPUPackage) float64 {
	numaNodes := idset.NewIDSet(socket.DieNodeIDs(sysID)...)
	for _, idstr := range strings.Split(hint.NUMAs, ",") {
		hID, err := strconv.ParseInt(idstr, 0, 0)
		if err != nil {
			log.Warn("invalid hint NUMA node %s from %s", idstr, hint.Provider)
			return 0.0
		}
		if numaNodes.Has(idset.ID(hID)) {
			return 1.0
		}
	}
	return 0.0
}

// Calculate the socket score of the given hint and socket.
func socketHintScore(hint topology.Hint, sysID idset.ID) float64 {
	for _, idstr := range strings.Split(hint.Sockets, ",") {
		id, err := strconv.ParseInt(idstr, 0, 0)
		if err != nil {
			log.Warn("invalid hint socket '%s' from %s", idstr, hint.Provider)
			return 0.0
		}
		if id == int64(sysID) {
			return 1.0
		}
	}
	return 0.0
}
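For a feel of the arithmetic in cpuHintScore, here is a minimal standalone sketch (invented helper, not from the repository) of the same fraction computation over plain int slices instead of cpusets.

package main

import "fmt"

// fractionInSet mirrors the arithmetic of cpuHintScore above: the score is
// the fraction of the hinted CPUs that fall within the candidate cpuset.
func fractionInSet(hinted, pool []int) float64 {
	in := map[int]bool{}
	for _, c := range pool {
		in[c] = true
	}
	common := 0
	for _, c := range hinted {
		if in[c] {
			common++
		}
	}
	// note: like cpuHintScore, this divides by the hint size, so an empty
	// hint would yield NaN; the disabled case in hint_test.go tracks that
	return float64(common) / float64(len(hinted))
}

func main() {
	// a hint for CPUs 1 and 2 scored against a pool that only owns CPU 1
	fmt.Println(fractionInSet([]int{1, 2}, []int{1})) // 0.5
}

// return the cpuset for the CPU, NUMA or socket hints, preferred in this particular order.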
func (cs *supply) hintCpus(h topology.Hint) cpuset.CPUSet { var cpus cpuset.CPUSet switch { case h.CPUs != "": cpus = cpuset.MustParse(h.CPUs) case h.NUMAs != "": for _, idstr := range strings.Split(h.NUMAs, ",") { if id, err := strconv.ParseInt(idstr, 0, 0); err == nil { if node := cs.node.System().Node(idset.ID(id)); node != nil { cpus = cpus.Union(node.CPUSet()) } } } case h.Sockets != "": for _, idstr := range strings.Split(h.Sockets, ",") { if id, err := strconv.ParseInt(idstr, 0, 0); err == nil { if pkg := cs.node.System().Package(idset.ID(id)); pkg != nil { cpus = cpus.Union(pkg.CPUSet()) } } } } return cpus } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/topology-aware/hint_test.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package topologyaware import ( "testing" "github.com/intel/cri-resource-manager/pkg/topology" "github.com/intel/cri-resource-manager/pkg/utils/cpuset" idset "github.com/intel/goresctrl/pkg/utils" ) func TestCpuHintScore(t *testing.T) { tcases := []struct { name string expected float64 hint topology.Hint cpus cpuset.CPUSet disabled bool // TODO(rojkov): remove this field when the code is fixed. }{ { name: "handle zero cpu size gracefully", disabled: true, }, { name: "handle unparsable cpu size gracefully", hint: topology.Hint{ CPUs: "unparsable", }, }, { name: "non-zero cpu size hint and empty CPUs", hint: topology.Hint{ CPUs: "1", }, }, { name: "hint corresponding to given CPU", hint: topology.Hint{ CPUs: "1,2", }, cpus: cpuset.New(1), expected: 0.5, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { if tc.disabled { t.Skipf("The case '%s' is skipped", tc.name) } actual := cpuHintScore(tc.hint, tc.cpus) if actual != tc.expected { t.Errorf("Expected %f, but got %f", tc.expected, actual) } }) } } func TestNumaHintScore(t *testing.T) { tcases := []struct { name string expected float64 hint topology.Hint ids []idset.ID }{ { name: "handle unparsable NUMAs gracefully", hint: topology.Hint{ NUMAs: "unparsable", }, }, { name: "non-zero NUMA hint and empty NUMAs", hint: topology.Hint{ NUMAs: "1", }, }, { name: "hint corresponding to a given ID", ids: []idset.ID{1}, hint: topology.Hint{ NUMAs: "1,2", }, expected: 1.0, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { actual := numaHintScore(tc.hint, tc.ids...) 
if actual != tc.expected { t.Errorf("Expected %f, but got %f", tc.expected, actual) } }) } } func TestSocketHintScore(t *testing.T) { tcases := []struct { name string expected float64 hint topology.Hint id idset.ID }{ { name: "handle unparsable Sockets gracefully", hint: topology.Hint{ Sockets: "unparsable", }, }, { name: "non-zero Sockets hint and empty Sockets", hint: topology.Hint{ Sockets: "1", }, }, { name: "hint corresponding to a given ID", id: 1, hint: topology.Hint{ Sockets: "1,2", }, expected: 1.0, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { actual := socketHintScore(tc.hint, tc.id) if actual != tc.expected { t.Errorf("Expected %f, but got %f", tc.expected, actual) } }) } } func TestHintCpus(t *testing.T) { tcases := []struct { name string supply *supply hint topology.Hint expected cpuset.CPUSet }{ { name: "handle unparsable Sockets gracefully", supply: &supply{}, hint: topology.Hint{ Sockets: "unparsable", }, }, { name: "non-zero Sockets hint and empty system.Package", supply: &supply{ node: &node{ policy: &policy{ sys: &mockSystem{}, }, }, }, hint: topology.Hint{ Sockets: "1", }, }, { name: "handle unparsable NUMAs gracefully", supply: &supply{}, hint: topology.Hint{ NUMAs: "unparsable", }, }, { name: "non-zero NUMAs hint and empty system.Node", supply: &supply{ node: &node{ policy: &policy{ sys: &mockSystem{}, }, }, }, hint: topology.Hint{ NUMAs: "1", }, }, // TODO(rojkov): add tests for non-empty system.Package's (can't be done while system.Package is closed struct) { name: "non-zero CPUs hint", supply: &supply{}, hint: topology.Hint{ CPUs: "1", }, expected: cpuset.New(1), }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { actual := tc.supply.hintCpus(tc.hint) if tc.expected.IsEmpty() && actual.IsEmpty() { return } if !tc.expected.Equals(actual) { t.Errorf("Expected %+v, but got %+v", tc.expected, actual) } }) } } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/topology-aware/logging.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package topologyaware import ( "fmt" logger "github.com/intel/cri-resource-manager/pkg/log" ) // Create our logger instance. var log logger.Logger = logger.NewLogger("policy") // indent produces an indentation string for the given level. const ( IndentDepth = 4 ) func indent(prefix string, level ...int) string { if len(level) < 1 { return prefix } depth := level[0] * IndentDepth return prefix + fmt.Sprintf("%*.*s", depth, depth, "") } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/topology-aware/mocks_test.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package topologyaware import ( "os" "time" "github.com/intel/cri-resource-manager/pkg/apis/resmgr" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/config" system "github.com/intel/cri-resource-manager/pkg/sysfs" "github.com/intel/cri-resource-manager/pkg/topology" "github.com/intel/cri-resource-manager/pkg/utils/cpuset" "github.com/intel/goresctrl/pkg/sst" idset "github.com/intel/goresctrl/pkg/utils" v1 "k8s.io/api/core/v1" criv1 "k8s.io/cri-api/pkg/apis/runtime/v1" ) type mockSystemNode struct { id idset.ID // node id memFree uint64 memTotal uint64 memType system.MemoryType distance []int } func (fake *mockSystemNode) MemoryInfo() (*system.MemInfo, error) { return &system.MemInfo{MemFree: fake.memFree, MemTotal: fake.memTotal}, nil } func (fake *mockSystemNode) PackageID() idset.ID { return 0 } func (fake *mockSystemNode) DieID() idset.ID { return 0 } func (fake *mockSystemNode) ID() idset.ID { return fake.id } func (fake *mockSystemNode) GetMemoryType() system.MemoryType { return fake.memType } func (fake *mockSystemNode) HasNormalMemory() bool { return true } func (fake *mockSystemNode) CPUSet() cpuset.CPUSet { return cpuset.New() } func (fake *mockSystemNode) Distance() []int { if len(fake.distance) == 0 { return []int{0} } return fake.distance } func (fake *mockSystemNode) DistanceFrom(id idset.ID) int { return 0 } type mockCPUPackage struct { } func (p *mockCPUPackage) ID() idset.ID { return idset.ID(0) } func (p *mockCPUPackage) CPUSet() cpuset.CPUSet { return cpuset.New() } func (p *mockCPUPackage) NodeIDs() []idset.ID { return []idset.ID{} } func (p *mockCPUPackage) DieIDs() []idset.ID { return []idset.ID{0} } func (p *mockCPUPackage) DieCPUSet(idset.ID) cpuset.CPUSet { return cpuset.New() } func (p *mockCPUPackage) DieNodeIDs(idset.ID) []idset.ID { return []idset.ID{} } func (p *mockCPUPackage) SstInfo() *sst.SstPackageInfo { return &sst.SstPackageInfo{} } type mockCPU struct { isolated cpuset.CPUSet online cpuset.CPUSet id idset.ID node mockSystemNode pkg mockCPUPackage } func (c *mockCPU) BaseFrequency() uint64 { return 0 } func (c *mockCPU) EPP() system.EPP { return system.EPPUnknown } func (c *mockCPU) ID() idset.ID { return idset.ID(0) } func (c *mockCPU) PackageID() idset.ID { return c.pkg.ID() } func (c *mockCPU) DieID() idset.ID { return idset.ID(0) } func (c *mockCPU) NodeID() idset.ID { return c.node.ID() } func (c *mockCPU) CoreID() idset.ID { return c.id } func (c *mockCPU) ThreadCPUSet() cpuset.CPUSet { return cpuset.New() } func (c *mockCPU) FrequencyRange() system.CPUFreq { return system.CPUFreq{} } func (c *mockCPU) Online() bool { return true } func (c *mockCPU) Isolated() bool { return false } func (c *mockCPU) SetFrequencyLimits(min, max uint64) error { return nil } func (c *mockCPU) SstClos() int { return -1 } type mockSystem struct { isolatedCPU int nodes []system.Node cpuCount int packageCount int socketCount int } func (fake *mockSystem) Node(id idset.ID) system.Node { for _, node := range fake.nodes { if node.ID() == id { return node } } return &mockSystemNode{} 
} func (fake *mockSystem) CPU(idset.ID) system.CPU { return &mockCPU{} } func (fake *mockSystem) CPUCount() int { if fake.cpuCount == 0 { return 1 } return fake.cpuCount } func (fake *mockSystem) Discover() error { return nil } func (fake *mockSystem) Package(idset.ID) system.CPUPackage { return &mockCPUPackage{} } func (fake *mockSystem) Offlined() cpuset.CPUSet { return cpuset.New() } func (fake *mockSystem) Isolated() cpuset.CPUSet { if fake.isolatedCPU > 0 { return cpuset.New(fake.isolatedCPU) } return cpuset.New() } func (fake *mockSystem) CPUSet() cpuset.CPUSet { return cpuset.New() } func (fake *mockSystem) CPUIDs() []idset.ID { return []idset.ID{} } func (fake *mockSystem) PackageCount() int { if fake.packageCount == 0 { return 1 } return fake.packageCount } func (fake *mockSystem) SocketCount() int { if fake.socketCount == 0 { return 1 } return fake.socketCount } func (fake *mockSystem) NUMANodeCount() int { return len(fake.nodes) } func (fake *mockSystem) ThreadCount() int { if fake.cpuCount == 0 { return 1 } return fake.cpuCount } func (fake *mockSystem) PackageIDs() []idset.ID { ids := make([]idset.ID, len(fake.nodes)) for i, node := range fake.nodes { ids[i] = node.PackageID() } return ids } func (fake *mockSystem) NodeIDs() []idset.ID { ids := make([]idset.ID, len(fake.nodes)) for i, node := range fake.nodes { ids[i] = node.ID() } return ids } func (fake *mockSystem) SetCPUFrequencyLimits(min, max uint64, cpus idset.IDSet) error { return nil } func (fake *mockSystem) SetCpusOnline(online bool, cpus idset.IDSet) (idset.IDSet, error) { return idset.NewIDSet(), nil } func (fake *mockSystem) NodeDistance(idset.ID, idset.ID) int { return 10 } type mockContainer struct { name string namespace string returnValueForGetResourceRequirements v1.ResourceRequirements returnValueForGetCacheID string returnValueForGetID string memoryLimit int64 cpuset cpuset.CPUSet returnValueForQOSClass v1.PodQOSClass pod cache.Pod } func (m *mockContainer) PrettyName() string { return m.name } func (m *mockContainer) GetPod() (cache.Pod, bool) { if m.pod == nil { return &mockPod{}, false } return m.pod, true } func (m *mockContainer) GetID() string { return m.returnValueForGetID } func (m *mockContainer) GetPodID() string { panic("unimplemented") } func (m *mockContainer) GetCacheID() string { if len(m.returnValueForGetCacheID) == 0 { return "0" } return m.returnValueForGetCacheID } func (m *mockContainer) GetName() string { return m.name } func (m *mockContainer) GetNamespace() string { return m.namespace } func (m *mockContainer) UpdateState(cache.ContainerState) { panic("unimplemented") } func (m *mockContainer) GetState() cache.ContainerState { panic("unimplemented") } func (m *mockContainer) GetQOSClass() v1.PodQOSClass { if len(m.returnValueForQOSClass) == 0 { return v1.PodQOSGuaranteed } return m.returnValueForQOSClass } func (m *mockContainer) GetImage() string { panic("unimplemented") } func (m *mockContainer) GetCommand() []string { panic("unimplemented") } func (m *mockContainer) GetArgs() []string { panic("unimplemented") } func (m *mockContainer) GetLabelKeys() []string { panic("unimplemented") } func (m *mockContainer) GetLabel(string) (string, bool) { panic("unimplemented") } func (m *mockContainer) GetLabels() map[string]string { panic("unimplemented") } func (m *mockContainer) GetResmgrLabelKeys() []string { panic("unimplemented") } func (m *mockContainer) GetResmgrLabel(string) (string, bool) { panic("unimplemented") } func (m *mockContainer) GetAnnotationKeys() []string { 
panic("unimplemented") } func (m *mockContainer) GetAnnotation(string, interface{}) (string, bool) { panic("unimplemented") } func (m *mockContainer) GetResmgrAnnotationKeys() []string { panic("unimplemented") } func (m *mockContainer) GetResmgrAnnotation(string, interface{}) (string, bool) { panic("unimplemented") } func (m *mockContainer) GetEffectiveAnnotation(key string) (string, bool) { pod, ok := m.GetPod() if !ok { return "", false } return pod.GetEffectiveAnnotation(key, m.name) } func (m *mockContainer) GetAnnotations() map[string]string { panic("unimplemented") } func (m *mockContainer) GetEnvKeys() []string { panic("unimplemented") } func (m *mockContainer) GetEnv(string) (string, bool) { panic("unimplemented") } func (m *mockContainer) GetMounts() []cache.Mount { panic("unimplemented") } func (m *mockContainer) GetMountByHost(string) *cache.Mount { panic("unimplemented") } func (m *mockContainer) GetMountByContainer(string) *cache.Mount { panic("unimplemented") } func (m *mockContainer) GetDevices() []cache.Device { panic("unimplemented") } func (m *mockContainer) GetDeviceByHost(string) *cache.Device { panic("unimplemented") } func (m *mockContainer) GetDeviceByContainer(string) *cache.Device { panic("unimplemented") } func (m *mockContainer) GetResourceRequirements() v1.ResourceRequirements { return m.returnValueForGetResourceRequirements } func (m *mockContainer) GetLinuxResources() *criv1.LinuxContainerResources { panic("unimplemented") } func (m *mockContainer) SetCommand([]string) { panic("unimplemented") } func (m *mockContainer) SetArgs([]string) { panic("unimplemented") } func (m *mockContainer) SetLabel(string, string) { panic("unimplemented") } func (m *mockContainer) DeleteLabel(string) { panic("unimplemented") } func (m *mockContainer) SetAnnotation(string, string) { panic("unimplemented") } func (m *mockContainer) DeleteAnnotation(string) { panic("unimplemented") } func (m *mockContainer) SetEnv(string, string) { panic("unimplemented") } func (m *mockContainer) UnsetEnv(string) { panic("unimplemented") } func (m *mockContainer) InsertMount(*cache.Mount) { panic("unimplemented") } func (m *mockContainer) DeleteMount(string) { panic("unimplemented") } func (m *mockContainer) InsertDevice(*cache.Device) { panic("unimplemented") } func (m *mockContainer) DeleteDevice(string) { panic("unimplemented") } func (m *mockContainer) GetTopologyHints() topology.Hints { return topology.Hints{} } func (m *mockContainer) GetCPUPeriod() int64 { panic("unimplemented") } func (m *mockContainer) GetCPUQuota() int64 { panic("unimplemented") } func (m *mockContainer) GetCPUShares() int64 { panic("unimplemented") } func (m *mockContainer) GetMemoryLimit() int64 { return m.memoryLimit } func (m *mockContainer) GetOomScoreAdj() int64 { panic("unimplemented") } func (m *mockContainer) GetCpusetCpus() string { return m.cpuset.String() } func (m *mockContainer) GetCpusetMems() string { panic("unimplemented") } func (m *mockContainer) SetLinuxResources(*criv1.LinuxContainerResources) { panic("unimplemented") } func (m *mockContainer) SetCPUPeriod(int64) { panic("unimplemented") } func (m *mockContainer) SetCPUQuota(int64) { panic("unimplemented") } func (m *mockContainer) SetCPUShares(int64) { } func (m *mockContainer) SetMemoryLimit(int64) { panic("unimplemented") } func (m *mockContainer) SetOomScoreAdj(int64) { panic("unimplemented") } func (m *mockContainer) SetCpusetCpus(string) { } func (m *mockContainer) SetCpusetMems(string) { } func (m *mockContainer) 
UpdateCriCreateRequest(*criv1.CreateContainerRequest) error { panic("unimplemented") } func (m *mockContainer) CriUpdateRequest() (*criv1.UpdateContainerResourcesRequest, error) { panic("unimplemented") } func (m *mockContainer) GetAffinity() ([]*cache.Affinity, error) { return nil, nil } func (m *mockContainer) SetRDTClass(string) { panic("unimplemented") } func (m *mockContainer) GetRDTClass() string { panic("unimplemented") } func (m *mockContainer) SetBlockIOClass(string) { panic("unimplemented") } func (m *mockContainer) GetBlockIOClass() string { panic("unimplemented") } func (m *mockContainer) SetToptierLimit(int64) { panic("unimplemented") } func (m *mockContainer) GetToptierLimit() int64 { panic("unimplemented") } func (m *mockContainer) SetPageMigration(*cache.PageMigrate) { return } func (m *mockContainer) GetPageMigration() *cache.PageMigrate { return nil } func (m *mockContainer) SetCRIRequest(req interface{}) error { panic("unimplemented") } func (m *mockContainer) GetCRIRequest() (interface{}, bool) { panic("unimplemented") } func (m *mockContainer) ClearCRIRequest() (interface{}, bool) { panic("unimplemented") } func (m *mockContainer) GetCRIEnvs() []*criv1.KeyValue { panic("unimplemented") } func (m *mockContainer) GetCRIMounts() []*criv1.Mount { panic("unimplemented") } func (m *mockContainer) GetCRIDevices() []*criv1.Device { panic("unimplemented") } func (m *mockContainer) GetPending() []string { panic("unimplemented") } func (m *mockContainer) HasPending(string) bool { panic("unimplemented") } func (m *mockContainer) ClearPending(string) { panic("unimplemented") } func (m *mockContainer) GetTag(string) (string, bool) { panic("unimplemented") } func (m *mockContainer) SetTag(string, string) (string, bool) { panic("unimplemented") } func (m *mockContainer) DeleteTag(string) (string, bool) { panic("unimplemented") } func (m *mockContainer) String() string { return "mockContainer" } func (m *mockContainer) Eval(string) interface{} { panic("unimplemented") } func (m *mockContainer) GetProcesses() ([]string, error) { panic("unimplemented") } func (m *mockContainer) GetTasks() ([]string, error) { panic("unimplemented") } func (m *mockContainer) GetCgroupDir() string { panic("unimplemented") } type mockPod struct { name string returnValueFotGetQOSClass v1.PodQOSClass returnValue1FotGetResmgrAnnotation string returnValue2FotGetResmgrAnnotation bool coldStartTimeout time.Duration coldStartContainerName string annotations map[string]string } func (m *mockPod) GetInitContainers() []cache.Container { panic("unimplemented") } func (m *mockPod) GetContainers() []cache.Container { panic("unimplemented") } func (m *mockPod) GetContainer(string) (cache.Container, bool) { panic("unimplemented") } func (m *mockPod) GetID() string { panic("unimplemented") } func (m *mockPod) GetUID() string { panic("unimplemented") } func (m *mockPod) GetName() string { return m.name } func (m *mockPod) GetNamespace() string { panic("unimplemented") } func (m *mockPod) GetState() cache.PodState { panic("unimplemented") } func (m *mockPod) GetQOSClass() v1.PodQOSClass { return m.returnValueFotGetQOSClass } func (m *mockPod) GetLabelKeys() []string { panic("unimplemented") } func (m *mockPod) GetLabel(string) (string, bool) { panic("unimplemented") } func (m *mockPod) GetResmgrLabelKeys() []string { panic("unimplemented") } func (m *mockPod) GetResmgrLabel(string) (string, bool) { panic("unimplemented") } func (m *mockPod) GetAnnotationKeys() []string { panic("unimplemented") } func (m *mockPod) 
GetAnnotation(string) (string, bool) { panic("unimplemented") } func (m *mockPod) GetAnnotationObject(string, interface{}, func([]byte, interface{}) error) (bool, error) { panic("unimplemented") } func (m *mockPod) GetResmgrAnnotationKeys() []string { panic("unimplemented") } func (m *mockPod) GetResmgrAnnotation(key string) (string, bool) { if key == keyColdStartPreference && len(m.coldStartContainerName) > 0 { return m.coldStartContainerName + ": { duration: " + m.coldStartTimeout.String() + " }", true } return m.returnValue1FotGetResmgrAnnotation, m.returnValue2FotGetResmgrAnnotation } func (m *mockPod) GetResmgrAnnotationObject(string, interface{}, func([]byte, interface{}) error) (bool, error) { panic("unimplemented") } func (m *mockPod) GetEffectiveAnnotation(key, container string) (string, bool) { if v, ok := m.annotations[key+"/container."+container]; ok { return v, true } if v, ok := m.annotations[key+"/pod"]; ok { return v, true } v, ok := m.annotations[key] return v, ok } func (m *mockPod) GetCgroupParentDir() string { panic("unimplemented") } func (m *mockPod) GetPodResourceRequirements() cache.PodResourceRequirements { panic("unimplemented") } func (m *mockPod) GetContainerAffinity(string) ([]*cache.Affinity, error) { panic("unimplemented") } func (m *mockPod) ScopeExpression() *resmgr.Expression { panic("unimplemented") } func (m *mockPod) String() string { return "mockPod" } func (m *mockPod) Eval(string) interface{} { panic("unimplemented") } func (m *mockPod) GetProcesses(bool) ([]string, error) { panic("unimplemented") } func (m *mockPod) GetTasks(bool) ([]string, error) { panic("unimplemented") } type mockCache struct { returnValueForGetPolicyEntry bool returnValue1ForLookupContainer cache.Container returnValue2ForLookupContainer bool } func (m *mockCache) InsertPod(string, interface{}, *cache.PodStatus) (cache.Pod, error) { panic("unimplemented") } func (m *mockCache) DeletePod(string) cache.Pod { panic("unimplemented") } func (m *mockCache) LookupPod(string) (cache.Pod, bool) { panic("unimplemented") } func (m *mockCache) InsertContainer(interface{}) (cache.Container, error) { panic("unimplemented") } func (m *mockCache) UpdateContainerID(string, interface{}) (cache.Container, error) { panic("unimplemented") } func (m *mockCache) DeleteContainer(string) cache.Container { panic("unimplemented") } func (m *mockCache) LookupContainer(string) (cache.Container, bool) { return m.returnValue1ForLookupContainer, m.returnValue2ForLookupContainer } func (m *mockCache) LookupContainerByCgroup(path string) (cache.Container, bool) { panic("unimplemented") } func (m *mockCache) GetPendingContainers() []cache.Container { panic("unimplemented") } func (m *mockCache) GetPods() []cache.Pod { panic("unimplemented") } func (m *mockCache) GetContainers() []cache.Container { panic("unimplemented") } func (m *mockCache) GetContainerCacheIds() []string { panic("unimplemented") } func (m *mockCache) GetContainerIds() []string { panic("unimplemented") } func (m *mockCache) FilterScope(*resmgr.Expression) []cache.Container { panic("unimplemented") } func (m *mockCache) EvaluateAffinity(*cache.Affinity) map[string]int32 { return map[string]int32{ "fake key": 1, } } func (m *mockCache) AddImplicitAffinities(map[string]cache.ImplicitAffinity) error { return nil } func (m *mockCache) GetActivePolicy() string { panic("unimplemented") } func (m *mockCache) SetActivePolicy(string) error { panic("unimplemented") } func (m *mockCache) ResetActivePolicy() error { panic("unimplemented") } func (m 
*mockCache) SetPolicyEntry(string, interface{}) {
}
func (m *mockCache) GetPolicyEntry(string, interface{}) bool {
	return m.returnValueForGetPolicyEntry
}
func (m *mockCache) SetConfig(*config.RawConfig) error {
	panic("unimplemented")
}
func (m *mockCache) GetConfig() *config.RawConfig {
	panic("unimplemented")
}
func (m *mockCache) ResetConfig() error {
	panic("unimplemented")
}
func (m *mockCache) SetAdjustment(*config.Adjustment) (bool, map[string]error) {
	panic("unimplemented")
}
func (m *mockCache) Save() error {
	return nil
}
func (m *mockCache) RefreshPods(*criv1.ListPodSandboxResponse, map[string]*cache.PodStatus) ([]cache.Pod, []cache.Pod, []cache.Container) {
	panic("unimplemented")
}
func (m *mockCache) RefreshContainers(*criv1.ListContainersResponse) ([]cache.Container, []cache.Container) {
	panic("unimplemented")
}
func (m *mockCache) ContainerDirectory(string) string {
	panic("unimplemented")
}
func (m *mockCache) OpenFile(string, string, os.FileMode) (*os.File, error) {
	panic("unimplemented")
}
func (m *mockCache) WriteFile(string, string, os.FileMode, []byte) error {
	panic("unimplemented")
}

================================================
FILE: pkg/cri/resource-manager/policy/builtin/topology-aware/node.go
================================================
// Copyright 2019 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package topologyaware

import (
	"fmt"

	system "github.com/intel/cri-resource-manager/pkg/sysfs"
	"github.com/intel/cri-resource-manager/pkg/topology"
	"github.com/intel/cri-resource-manager/pkg/utils/cpuset"
	idset "github.com/intel/goresctrl/pkg/utils"
)

//
// Nodes (currently) correspond to some tangible entity in the hardware topology
// hierarchy: the full machine (a virtual root in multi-socket systems), individual
// sockets, and NUMA nodes. These nodes are linked into a tree resembling the topology
// tree, with the full machine at the top and CPU cores at the bottom. In a single
// socket system, the virtual root is replaced with the single socket. In a single
// NUMA node case, the single node is omitted. Also, CPU cores are not modelled as
// nodes; instead they are properties of the nodes (as capacity and free CPU).
//

// NodeKind represents a unique node type.
type NodeKind string

const (
	// NilNode is the kind of a nil node.
	NilNode NodeKind = ""
	// UnknownNode is the kind of a node of unknown type.
	UnknownNode NodeKind = "unknown"
	// SocketNode represents a physical CPU package/socket in the system.
	SocketNode NodeKind = "socket"
	// DieNode represents a die within a physical CPU package/socket in the system.
	DieNode NodeKind = "die"
	// NumaNode represents a NUMA node in the system.
	NumaNode NodeKind = "numa node"
	// VirtualNode represents a virtual node, currently the root of multi-socket setups.
	VirtualNode NodeKind = "virtual node"
)

const (
	// OverfitPenalty is the per-layer penalty for overfitting in the node tree.
	OverfitPenalty = 0.9
)
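The comment above describes the pool-node tree. As a rough standalone sketch (types invented here, not from the repository), a virtual root over two sockets with two NUMA nodes each can be modelled and walked the same children-first way node.DepthFirst does further below.

package main

import "fmt"

// poolNode is a simplified stand-in for the policy's pool-node tree.
type poolNode struct {
	name     string
	children []*poolNode
}

// depthFirst visits children first, then the node itself, matching node.DepthFirst.
func (n *poolNode) depthFirst(fn func(*poolNode)) {
	for _, c := range n.children {
		c.depthFirst(fn)
	}
	fn(n)
}

func main() {
	numa := func(id int) *poolNode { return &poolNode{name: fmt.Sprintf("NUMA node #%d", id)} }
	root := &poolNode{
		name: "root",
		children: []*poolNode{
			{name: "socket #0", children: []*poolNode{numa(0), numa(1)}},
			{name: "socket #1", children: []*poolNode{numa(2), numa(3)}},
		},
	}
	// NUMA nodes print before their socket, sockets before the root
	root.depthFirst(func(n *poolNode) { fmt.Println(n.name) })
}

// Node is the abstract interface our partition tree nodes implement.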
type Node interface {
	// IsNil tests if this node is nil.
	IsNil() bool
	// Name returns the name of this node.
	Name() string
	// Kind returns the type of this node.
	Kind() NodeKind
	// NodeID returns the (enumerated) node id of this node.
	NodeID() int
	// Parent returns the parent node of this node.
	Parent() Node
	// Children returns the child nodes of this node.
	Children() []Node
	// LinkParent sets the given node as the parent node, and appends this node as its child.
	LinkParent(Node)
	// AddChildren appends the nodes to the children, *WITHOUT* updating their parents.
	AddChildren([]Node)
	// IsSameNode returns true if the given node is the same as this one.
	IsSameNode(Node) bool
	// IsRootNode returns true if this node has no parent.
	IsRootNode() bool
	// IsLeafNode returns true if this node has no children.
	IsLeafNode() bool
	// Get the distance of this node from the root node.
	RootDistance() int
	// Get the height of this node (inverse of depth: tree depth - node depth).
	NodeHeight() int
	// System returns the policy sysfs instance.
	System() system.System
	// Policy returns the policy back pointer.
	Policy() *policy
	// DiscoverSupply discovers the resource supply assigned to this node.
	DiscoverSupply(assignedNUMANodes []idset.ID) Supply
	// GetSupply returns the full CPU supply at this node.
	GetSupply() Supply
	// FreeSupply returns the available CPU supply of this node.
	FreeSupply() Supply
	// GrantedReservedCPU returns the amount of granted reserved CPU of this node and its children.
	GrantedReservedCPU() int
	// GrantedSharedCPU returns the amount of granted shared CPU of this node and its children.
	GrantedSharedCPU() int
	// GetMemset returns the set of memory of the given type attached to this node.
	GetMemset(mtype memoryType) idset.IDSet
	// AssignNUMANodes assigns the given set of NUMA nodes to this one.
	AssignNUMANodes(ids []idset.ID)
	// DepthFirst traverses the tree at this node, calling the function at each node.
	DepthFirst(func(Node) error) error
	// BreadthFirst traverses the tree at this node, calling the function at each node.
	BreadthFirst(func(Node) error) error
	// Dump state of the node.
	Dump(string, ...int)
	// Dump type-specific state of the node.
	dump(string, ...int)

	GetMemoryType() memoryType
	HasMemoryType(memoryType) bool
	GetPhysicalNodeIDs() []idset.ID

	GetScore(Request) Score
	HintScore(topology.Hint) float64
}

// node represents data common to all node types.
type node struct {
	policy   *policy     // policy back pointer
	self     nodeself    // upcasted/type-specific interface
	name     string      // node name
	id       int         // node id
	kind     NodeKind    // node type
	depth    int         // node depth in the tree
	parent   Node        // parent node
	children []Node      // child nodes
	noderes  Supply      // CPU and memory available at this node
	freeres  Supply      // CPU and memory allocatable at this node
	mem      idset.IDSet // controllers with normal DRAM attached
	pMem     idset.IDSet // controllers with PMEM attached
	hbm      idset.IDSet // controllers with HBM attached
}

// nodeself is used to 'upcast' a generic Node interface to a type-specific one.
type nodeself struct {
	node Node
}

// socketnode represents a physical CPU package/socket in the system.
type socketnode struct {
	node                     // common node data
	id     idset.ID          // socket id
	syspkg system.CPUPackage // corresponding system.Package
}

// dienode represents a die within a physical CPU package/socket in the system.
type dienode struct {
	node                     // common node data
	id     idset.ID          // die id within socket
	syspkg system.CPUPackage // corresponding system.Package
}
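The nodeself field above lets the embedded base type dispatch back to the concrete node type's methods. A minimal standalone sketch of the same idiom, with invented types, looks like this; the constructors later in this file wire the link the same way via n.self.node = n.

package main

import "fmt"

type animal interface{ Sound() string }

// base plays the role of node: it holds a 'self' reference back to the
// concrete type so shared methods can reach type-specific behavior.
type base struct {
	self animal
}

func (b *base) Describe() string { return "it says " + b.self.Sound() }

type dog struct{ base }

func (d *dog) Sound() string { return "woof" }

func main() {
	d := &dog{}
	d.self = d // link the upcast, as the New*Node constructors do
	fmt.Println(d.Describe()) // "it says woof"
}

// numanode represents a NUMA node in the system.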
type numanode struct {
	node                // common node data
	id      idset.ID    // NUMA node system id
	sysnode system.Node // corresponding system.Node
}

// virtualnode represents a virtual node (ATM only the root in a multi-socket system).
type virtualnode struct {
	node // common node data
}

// special node instance to represent a nonexistent node
var nilnode Node = &node{
	name:     "",
	id:       -1,
	kind:     NilNode,
	depth:    -1,
	children: nil,
}

// init initializes the node with common node data.
func (n *node) init(p *policy, name string, kind NodeKind, parent Node) {
	n.policy = p
	n.name = name
	n.kind = kind
	n.parent = parent
	n.id = -1
	n.LinkParent(parent)
	n.mem = idset.NewIDSet()
	n.pMem = idset.NewIDSet()
	n.hbm = idset.NewIDSet()
}

// IsNil tests if this node is nil.
func (n *node) IsNil() bool {
	return n.kind == NilNode
}

// Name returns the name of this node.
func (n *node) Name() string {
	if n.IsNil() {
		return ""
	}
	return n.name
}

// Kind returns the kind of this node.
func (n *node) Kind() NodeKind {
	return n.kind
}

// NodeID returns the node id of this node.
func (n *node) NodeID() int {
	if n.IsNil() {
		return -1
	}
	return n.id
}

// IsSameNode checks if the given node is the same as this one.
func (n *node) IsSameNode(other Node) bool {
	return n.NodeID() == other.NodeID()
}

// IsRootNode returns true if this node has no parent.
func (n *node) IsRootNode() bool {
	return n.parent.IsNil()
}

// IsLeafNode returns true if this node has no children.
func (n *node) IsLeafNode() bool {
	return len(n.children) == 0
}

// RootDistance returns the distance of this node from the root node.
func (n *node) RootDistance() int {
	if n.IsNil() {
		return -1
	}
	return n.depth
}

// NodeHeight returns the height of this node (tree depth - node depth).
func (n *node) NodeHeight() int {
	if n.IsNil() {
		return -1
	}
	return n.policy.depth - n.depth
}

// Parent returns the parent of this node.
func (n *node) Parent() Node {
	if n.IsNil() {
		return nil
	}
	return n.parent
}

// Children returns the children of this node.
func (n *node) Children() []Node {
	if n.IsNil() {
		return nil
	}
	return n.children
}

// LinkParent sets the given node as the node parent and appends this node to the parent's children.
func (n *node) LinkParent(parent Node) {
	n.parent = parent
	if !parent.IsNil() {
		parent.AddChildren([]Node{n})
	}
	n.depth = parent.RootDistance() + 1
}

// AddChildren appends the nodes to the children, *WITHOUT* setting their parent.
func (n *node) AddChildren(nodes []Node) {
	n.children = append(n.children, nodes...)
}

// Dump information/state of the node.
func (n *node) Dump(prefix string, level ...int) {
	if !log.DebugEnabled() {
		return
	}

	lvl := 0
	if len(level) > 0 {
		lvl = level[0]
	}
	idt := indent(prefix, lvl)
	n.self.node.dump(prefix, lvl)
	log.Debug("%s - %s", idt, n.noderes.DumpCapacity())
	log.Debug("%s - %s", idt, n.freeres.DumpAllocatable())
	n.freeres.DumpMemoryState(idt + " ")
	if n.mem.Size() > 0 {
		log.Debug("%s - normal memory: %v", idt, n.mem)
	}
	if n.hbm.Size() > 0 {
		log.Debug("%s - HBM memory: %v", idt, n.hbm)
	}
	if n.pMem.Size() > 0 {
		log.Debug("%s - PMEM memory: %v", idt, n.pMem)
	}
	for _, grant := range n.policy.allocations.grants {
		cpuNodeID := grant.GetCPUNode().NodeID()
		memNodeID := grant.GetMemoryNode().NodeID()
		switch {
		case cpuNodeID == n.id && memNodeID == n.id:
			log.Debug("%s + cpu+mem %s", idt, grant)
		case cpuNodeID == n.id:
			log.Debug("%s + cpuonly %s", idt, grant)
		case memNodeID == n.id:
			log.Debug("%s + memonly %s", idt, grant)
		}
	}
	if !n.Parent().IsNil() {
		log.Debug("%s - parent: <%s>", idt, n.Parent().Name())
	}
	if len(n.children) > 0 {
		log.Debug("%s - children:", idt)
		for _, c := range n.children {
			c.Dump(prefix, lvl+1)
		}
	}
}

// Dump type-specific information about the node.
func (n *node) dump(prefix string, level ...int) {
	n.self.node.dump(prefix, level...)
}

// Do a depth-first traversal starting at node calling the given function at each node.
func (n *node) DepthFirst(fn func(Node) error) error {
	for _, c := range n.children {
		if err := c.DepthFirst(fn); err != nil {
			return err
		}
	}
	return fn(n)
}

// Do a breadth-first traversal starting at node calling the given function at each node.
func (n *node) BreadthFirst(fn func(Node) error) error {
	if err := fn(n); err != nil {
		return err
	}
	for _, c := range n.children {
		if err := c.BreadthFirst(fn); err != nil {
			return err
		}
	}
	return nil
}

// System returns the policy System instance.
func (n *node) System() system.System {
	return n.policy.sys
}

// Policy returns the policy back pointer.
func (n *node) Policy() *policy {
	return n.policy
}

// GetSupply returns the full CPU supply of this node.
func (n *node) GetSupply() Supply {
	return n.self.node.GetSupply()
}

// Discover CPU available at this node.
func (n *node) DiscoverSupply(assignedNUMANodes []idset.ID) Supply {
	return n.self.node.DiscoverSupply(assignedNUMANodes)
}

// discoverSupply discovers the resource supply assigned to this pool node.
func (n *node) discoverSupply(assignedNUMANodes []idset.ID) Supply {
	if n.noderes != nil {
		return n.noderes.Clone()
	}

	if !n.IsLeafNode() {
		log.Debug("%s: cumulating child resources...", n.Name())
		if len(assignedNUMANodes) > 0 {
			log.Fatal("invalid pool setup: trying to attach NUMA nodes to non-leaf node %s", n.Name())
		}
		n.noderes = newSupply(n, cpuset.New(), cpuset.New(), cpuset.New(), 0, 0, nil, nil)
		for _, c := range n.children {
			supply := c.GetSupply()
			n.noderes.Cumulate(supply)
			n.mem.Add(c.GetMemset(memoryDRAM).Members()...)
			n.hbm.Add(c.GetMemset(memoryHBM).Members()...)
			n.pMem.Add(c.GetMemset(memoryPMEM).Members()...)
log.Debug(" + %s", supply.DumpCapacity()) } log.Debug(" = %s", n.noderes.DumpCapacity()) } else { log.Debug("%s: discovering attached/assigned resources...", n.Name()) mmap := createMemoryMap(0, 0, 0) cpus := cpuset.New() for _, nodeID := range assignedNUMANodes { node := n.System().Node(nodeID) nodeCPUs := node.CPUSet() meminfo, err := node.MemoryInfo() if err != nil { log.Fatal("%s: failed to get memory info for NUMA node #%d", n.Name(), nodeID) } switch node.GetMemoryType() { case system.MemoryTypeDRAM: n.mem.Add(nodeID) mmap.AddDRAM(meminfo.MemTotal) shortCPUs := cpuset.ShortCPUSet(nodeCPUs) log.Debug(" + assigned DRAM NUMA node #%d (cpuset: %s, DRAM %.2fM)", nodeID, shortCPUs, float64(meminfo.MemTotal)/float64(1024*1024)) case system.MemoryTypePMEM: n.pMem.Add(nodeID) mmap.AddPMEM(meminfo.MemTotal) log.Debug(" + assigned PMEM NUMA node #%d (DRAM %.2fM)", nodeID, float64(meminfo.MemTotal)/float64(1024*1024)) case system.MemoryTypeHBM: n.hbm.Add(nodeID) mmap.AddHBM(meminfo.MemTotal) log.Debug(" + assigned HBMEM NUMA node #%d (DRAM %.2fM)", nodeID, float64(meminfo.MemTotal)/float64(1024*1024)) default: log.Fatal("NUMA node #%d with unknown memory type %v", node.GetMemoryType()) } allowed := nodeCPUs.Intersection(n.policy.allowed) isolated := allowed.Intersection(n.policy.isolated) reserved := allowed.Intersection(n.policy.reserved).Difference(isolated) sharable := allowed.Difference(isolated).Difference(reserved) if !reserved.IsEmpty() { log.Debug(" allowed reserved CPUs: %s", cpuset.ShortCPUSet(reserved)) } if !sharable.IsEmpty() { log.Debug(" allowed sharable CPUs: %s", cpuset.ShortCPUSet(sharable)) } if !isolated.IsEmpty() { log.Debug(" allowed isolated CPUs: %s", cpuset.ShortCPUSet(isolated)) } cpus = cpus.Union(allowed) } isolated := cpus.Intersection(n.policy.isolated) reserved := cpus.Intersection(n.policy.reserved).Difference(isolated) sharable := cpus.Difference(isolated).Difference(reserved) n.noderes = newSupply(n, isolated, reserved, sharable, 0, 0, mmap, nil) log.Debug(" = %s", n.noderes.DumpCapacity()) } n.freeres = n.noderes.Clone() return n.noderes.Clone() } // FreeSupply returns the available CPU supply of this node. func (n *node) FreeSupply() Supply { return n.freeres } // Get the set of memory attached to this node. func (n *node) GetMemset(mtype memoryType) idset.IDSet { if n.self.node == nil { // protect against &node{}-abuse by test cases... return idset.NewIDSet() } return n.self.node.GetMemset(mtype) } // AssignNUMANodes assigns the given set of NUMA nodes to this one. func (n *node) AssignNUMANodes(ids []idset.ID) { n.self.node.AssignNUMANodes(ids) } // assignNUMANodes assigns the given set of NUMA nodes to this one. 
func (n *node) assignNUMANodes(ids []idset.ID) {
	mem := createMemoryMap(0, 0, 0)
	for _, numaNodeID := range ids {
		if n.mem.Has(numaNodeID) || n.pMem.Has(numaNodeID) || n.hbm.Has(numaNodeID) {
			log.Warn("*** NUMA node #%d already discovered by or assigned to %s", numaNodeID, n.Name())
			continue
		}
		numaNode := n.policy.sys.Node(numaNodeID)
		memTotal := uint64(0)
		if meminfo, err := numaNode.MemoryInfo(); err != nil {
			log.Error("%s: failed to get memory info for NUMA node #%d", n.Name(), numaNodeID)
		} else {
			memTotal = meminfo.MemTotal
		}
		switch numaNode.GetMemoryType() {
		case system.MemoryTypeDRAM:
			mem.Add(memTotal, 0, 0)
			n.mem.Add(numaNodeID)
			log.Info("*** DRAM NUMA node #%d assigned to pool node %q", numaNodeID, n.Name())
		case system.MemoryTypePMEM:
			n.pMem.Add(numaNodeID)
			mem.Add(0, memTotal, 0)
			log.Info("*** PMEM NUMA node #%d assigned to pool node %q", numaNodeID, n.Name())
		case system.MemoryTypeHBM:
			n.hbm.Add(numaNodeID)
			mem.Add(0, 0, memTotal)
			log.Info("*** HBM NUMA node #%d assigned to pool node %q", numaNodeID, n.Name())
		default:
			log.Fatal("can't assign NUMA node #%d of type %v to pool node %q",
				numaNodeID, numaNode.GetMemoryType(), n.Name())
		}
	}
	n.noderes.AssignMemory(mem)
	n.freeres.AssignMemory(mem)
}

// GetPhysicalNodeIDs returns the IDs of the physical nodes this node covers.
func (n *node) GetPhysicalNodeIDs() []idset.ID {
	return n.self.node.GetPhysicalNodeIDs()
}

// GrantedReservedCPU returns the amount of granted reserved CPU of this node and its children.
func (n *node) GrantedReservedCPU() int {
	grantedReserved := n.freeres.GrantedReserved()
	for _, c := range n.children {
		grantedReserved += c.GrantedReservedCPU()
	}
	return grantedReserved
}

// GrantedSharedCPU returns the amount of granted shared CPU of this node and its children.
func (n *node) GrantedSharedCPU() int {
	grantedShared := n.freeres.GrantedShared()
	for _, c := range n.children {
		grantedShared += c.GrantedSharedCPU()
	}
	return grantedShared
}

// GetScore calculates the score of this node for a CPU request.
func (n *node) GetScore(req Request) Score {
	f := n.FreeSupply()
	return f.GetScore(req)
}

// HintScore calculates the (CPU) score of the node for the given topology hint.
func (n *node) HintScore(hint topology.Hint) float64 {
	return n.self.node.HintScore(hint)
}

func (n *node) GetMemoryType() memoryType {
	var memoryMask memoryType = 0x0
	if n.pMem.Size() > 0 {
		memoryMask |= memoryPMEM
	}
	if n.mem.Size() > 0 {
		memoryMask |= memoryDRAM
	}
	if n.hbm.Size() > 0 {
		memoryMask |= memoryHBM
	}
	return memoryMask
}

func (n *node) HasMemoryType(reqType memoryType) bool {
	nodeType := n.GetMemoryType()
	return (nodeType & reqType) == reqType
}

// NewNumaNode creates a node for a NUMA node.
func (p *policy) NewNumaNode(id idset.ID, parent Node) Node {
	n := &numanode{}
	n.self.node = n
	n.node.init(p, fmt.Sprintf("NUMA node #%v", id), NumaNode, parent)
	n.id = id
	n.sysnode = p.sys.Node(id)
	return n
}

// Dump (the NUMA-specific parts of) this node.
func (n *numanode) dump(prefix string, level ...int) {
	log.Debug("%s<NUMA node #%v>", indent(prefix, level...), n.id)
}

// Get CPU supply available at this node.
func (n *numanode) GetSupply() Supply {
	return n.noderes.Clone()
}

func (n *numanode) GetPhysicalNodeIDs() []idset.ID {
	return []idset.ID{n.id}
}

// DiscoverSupply discovers the CPU supply available at this node.
func (n *numanode) DiscoverSupply(assignedNUMANodes []idset.ID) Supply {
	return n.node.discoverSupply(assignedNUMANodes)
}
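Since the memoryType values are bit flags, a node's memory type above is a mask and HasMemoryType requires every requested bit to be present. A small standalone sketch of the same checks, with the flag names shortened to avoid clashing with the real constants:

package main

import "fmt"

const (
	dram = 1 << iota
	pmem
	hbm
)

// hasMemoryType performs the same test as node.HasMemoryType: every
// requested bit must be set in the node's memory type mask.
func hasMemoryType(nodeType, reqType int) bool {
	return nodeType&reqType == reqType
}

func main() {
	nodeType := dram | pmem
	fmt.Println(hasMemoryType(nodeType, pmem))          // true
	fmt.Println(hasMemoryType(nodeType, dram|hbm))      // false: the node has no HBM
}

// GetMemset returns the set of memory attached to this node.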
func (n *numanode) GetMemset(mtype memoryType) idset.IDSet {
	mset := idset.NewIDSet()
	if mtype&memoryDRAM != 0 {
		mset.Add(n.mem.Members()...)
	}
	if mtype&memoryHBM != 0 {
		mset.Add(n.hbm.Members()...)
	}
	if mtype&memoryPMEM != 0 {
		mset.Add(n.pMem.Members()...)
	}
	return mset
}

// AssignNUMANodes assigns the given NUMA nodes to this one.
func (n *numanode) AssignNUMANodes(ids []idset.ID) {
	n.node.assignNUMANodes(ids)
}

// HintScore calculates the (CPU) score of the node for the given topology hint.
func (n *numanode) HintScore(hint topology.Hint) float64 {
	switch {
	case hint.CPUs != "":
		return cpuHintScore(hint, n.sysnode.CPUSet())
	case hint.NUMAs != "":
		return numaHintScore(hint, n.id)
	case hint.Sockets != "":
		pkgID := n.sysnode.PackageID()
		score := socketHintScore(hint, n.sysnode.PackageID())
		if score > 0.0 {
			// penalize underfit reciprocally (inverse-proportionally) to the socket size
			score /= float64(len(n.System().Package(pkgID).NodeIDs()))
		}
		return score
	}
	return 0.0
}

// NewDieNode creates a node for a CPU die.
func (p *policy) NewDieNode(id idset.ID, parent Node) Node {
	pkg := parent.(*socketnode)
	n := &dienode{}
	n.self.node = n
	n.node.init(p, fmt.Sprintf("die #%v/%v", pkg.id, id), DieNode, parent)
	n.id = id
	n.syspkg = p.sys.Package(pkg.id)
	return n
}

// Dump (the die-specific parts of) this node.
func (n *dienode) dump(prefix string, level ...int) {
	log.Debug("%s<die #%v/%v>", indent(prefix, level...), n.syspkg.ID(), n.id)
}

// Get CPU supply available at this node.
func (n *dienode) GetSupply() Supply {
	return n.noderes.Clone()
}

func (n *dienode) GetPhysicalNodeIDs() []idset.ID {
	ids := make([]idset.ID, 0)
	ids = append(ids, n.id)
	for _, c := range n.children {
		cIds := c.GetPhysicalNodeIDs()
		ids = append(ids, cIds...)
	}
	return ids
}

// DiscoverSupply discovers the CPU supply available at this die.
func (n *dienode) DiscoverSupply(assignedNUMANodes []idset.ID) Supply {
	return n.node.discoverSupply(assignedNUMANodes)
}

// GetMemset returns the set of memory attached to this die.
func (n *dienode) GetMemset(mtype memoryType) idset.IDSet {
	mset := idset.NewIDSet()
	if mtype&memoryDRAM != 0 {
		mset.Add(n.mem.Members()...)
	}
	if mtype&memoryHBM != 0 {
		mset.Add(n.hbm.Members()...)
	}
	if mtype&memoryPMEM != 0 {
		mset.Add(n.pMem.Members()...)
	}
	return mset
}

// AssignNUMANodes assigns the given NUMA nodes to this one.
func (n *dienode) AssignNUMANodes(ids []idset.ID) {
	n.node.assignNUMANodes(ids)
}

// HintScore calculates the (CPU) score of the node for the given topology hint.
func (n *dienode) HintScore(hint topology.Hint) float64 {
	switch {
	case hint.CPUs != "":
		return cpuHintScore(hint, n.syspkg.CPUSet())
	case hint.NUMAs != "":
		return OverfitPenalty * dieHintScore(hint, n.id, n.syspkg)
	case hint.Sockets != "":
		score := socketHintScore(hint, n.syspkg.ID())
		if score > 0.0 {
			// penalize underfit reciprocally (inverse-proportionally) to the socket size in dies
			score /= float64(len(n.syspkg.DieNodeIDs(n.id)))
		}
		return score
	}
	return 0.0
}

// NewSocketNode creates a node for a CPU socket.
func (p *policy) NewSocketNode(id idset.ID, parent Node) Node {
	n := &socketnode{}
	n.self.node = n
	n.node.init(p, fmt.Sprintf("socket #%v", id), SocketNode, parent)
	n.id = id
	n.syspkg = p.sys.Package(id)
	return n
}

// Dump (the socket-specific parts of) this node.
func (n *socketnode) dump(prefix string, level ...int) {
	log.Debug("%s<socket #%v>", indent(prefix, level...), n.id)
}
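As the HintScore implementations show, a hint is damped by OverfitPenalty once per tree layer above the hinted granularity: a NUMA hint scores 1.0 on the matching NUMA node, OverfitPenalty on its socket, and OverfitPenalty squared at the virtual root. A tiny standalone sketch of that decay:

package main

import "fmt"

const overfitPenalty = 0.9 // mirrors OverfitPenalty above

func main() {
	score := 1.0
	for _, level := range []string{"NUMA node", "socket", "virtual root"} {
		fmt.Printf("%-12s %.2f\n", level, score) // 1.00, 0.90, 0.81
		score *= overfitPenalty
	}
}

// Get CPU supply available at this node.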
func (n *socketnode) GetSupply() Supply {
	return n.noderes.Clone()
}

func (n *socketnode) GetPhysicalNodeIDs() []idset.ID {
	ids := make([]idset.ID, 0)
	ids = append(ids, n.id)
	for _, c := range n.children {
		cIds := c.GetPhysicalNodeIDs()
		ids = append(ids, cIds...)
	}
	return ids
}

// DiscoverSupply discovers the CPU supply available at this socket.
func (n *socketnode) DiscoverSupply(assignedNUMANodes []idset.ID) Supply {
	return n.node.discoverSupply(assignedNUMANodes)
}

// GetMemset returns the set of memory attached to this socket.
func (n *socketnode) GetMemset(mtype memoryType) idset.IDSet {
	mset := idset.NewIDSet()
	if mtype&memoryDRAM != 0 {
		mset.Add(n.mem.Members()...)
	}
	if mtype&memoryHBM != 0 {
		mset.Add(n.hbm.Members()...)
	}
	if mtype&memoryPMEM != 0 {
		mset.Add(n.pMem.Members()...)
	}
	return mset
}

// AssignNUMANodes assigns the given NUMA nodes to this one.
func (n *socketnode) AssignNUMANodes(ids []idset.ID) {
	n.node.assignNUMANodes(ids)
}

// HintScore calculates the (CPU) score of the node for the given topology hint.
func (n *socketnode) HintScore(hint topology.Hint) float64 {
	switch {
	case hint.CPUs != "":
		return cpuHintScore(hint, n.syspkg.CPUSet())
	case hint.NUMAs != "":
		return OverfitPenalty * numaHintScore(hint, n.syspkg.NodeIDs()...)
	case hint.Sockets != "":
		return socketHintScore(hint, n.id)
	}
	return 0.0
}

// NewVirtualNode creates a new virtual node.
func (p *policy) NewVirtualNode(name string, parent Node) Node {
	n := &virtualnode{}
	n.self.node = n
	n.node.init(p, name, VirtualNode, parent)
	return n
}

// Dump (the virtual-node specific parts of) this node.
func (n *virtualnode) dump(prefix string, level ...int) {
	log.Debug("%s<virtual node %s>", indent(prefix, level...), n.name)
}

// Get CPU supply available at this node.
func (n *virtualnode) GetSupply() Supply {
	return n.noderes.Clone()
}

// DiscoverSupply discovers the CPU supply available at this node.
func (n *virtualnode) DiscoverSupply(assignedNUMANodes []idset.ID) Supply {
	return n.node.discoverSupply(assignedNUMANodes)
}

// GetMemset returns the set of memory attached to this node.
func (n *virtualnode) GetMemset(mtype memoryType) idset.IDSet {
	mset := idset.NewIDSet()
	if mtype&memoryDRAM != 0 {
		mset.Add(n.mem.Members()...)
	}
	if mtype&memoryHBM != 0 {
		mset.Add(n.hbm.Members()...)
	}
	if mtype&memoryPMEM != 0 {
		mset.Add(n.pMem.Members()...)
	}
	return mset
}

// AssignNUMANodes assigns the given NUMA nodes to this one.
func (n *virtualnode) AssignNUMANodes(ids []idset.ID) {
	log.Panic("cannot assign NUMA nodes #%s to %s", idset.NewIDSet(ids...).String(), n.Name())
}

// HintScore calculates the (CPU) score of the node for the given topology hint.
func (n *virtualnode) HintScore(hint topology.Hint) float64 {
	// don't bother calculating any scores, the root should always score 1.0
	switch {
	case hint.CPUs != "":
		return cpuHintScore(hint, n.System().CPUSet())
	case hint.NUMAs != "":
		return OverfitPenalty * OverfitPenalty
	case hint.Sockets != "":
		return OverfitPenalty
	}
	return 0.0
}

func (n *virtualnode) GetPhysicalNodeIDs() []idset.ID {
	ids := make([]idset.ID, 0)
	for _, c := range n.children {
		cIds := c.GetPhysicalNodeIDs()
		ids = append(ids, cIds...)
	}
	return ids
}

// Finalize the setup of nilnode.
func init() {
	nilnode.(*node).self.node = nilnode
	nilnode.(*node).parent = nilnode.(*node).self.node
}

================================================
FILE: pkg/cri/resource-manager/policy/builtin/topology-aware/pod-preferences.go
================================================
// Copyright 2019 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package topologyaware

import (
	"encoding/json"
	"fmt"
	"path/filepath"
	"strconv"
	"strings"
	"time"

	"sigs.k8s.io/yaml"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"github.com/intel/cri-resource-manager/pkg/config"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/kubernetes"
)

const (
	// annotation key for opting in to multiple isolated exclusive CPUs per container.
	keyIsolationPreference = "prefer-isolated-cpus"
	// annotation key for opting out of exclusive allocation and relaxed topology fitting.
	keySharedCPUPreference = "prefer-shared-cpus"
	// annotation key for the type of memory to allocate.
	keyMemoryTypePreference = "memory-type"
	// annotation key for the "cold start" preference of workloads.
	keyColdStartPreference = "cold-start"
	// annotation key for the reserved CPU preference.
	keyReservedCPUsPreference = "prefer-reserved-cpus"

	// effective annotation key for isolated CPU preference
	preferIsolatedCPUsKey = keyIsolationPreference + "." + kubernetes.ResmgrKeyNamespace
	// effective annotation key for shared CPU preference
	preferSharedCPUsKey = keySharedCPUPreference + "." + kubernetes.ResmgrKeyNamespace
	// effective annotation key for memory type preference
	preferMemoryTypeKey = keyMemoryTypePreference + "." + kubernetes.ResmgrKeyNamespace
	// effective annotation key for "cold start" preference
	preferColdStartKey = keyColdStartPreference + "." + kubernetes.ResmgrKeyNamespace
	// effective annotation key for reserved CPU preference
	preferReservedCPUsKey = keyReservedCPUsPreference + "." + kubernetes.ResmgrKeyNamespace
)

// cpuClass is a type of CPU to allocate
type cpuClass int

// names by cpu class
var cpuClassNames = map[cpuClass]string{
	cpuNormal:   "normal",
	cpuReserved: "reserved",
}

const (
	cpuNormal cpuClass = iota
	cpuReserved
)

// types by memory type name
var memoryNamedTypes = map[string]memoryType{
	"dram":  memoryDRAM,
	"pmem":  memoryPMEM,
	"hbm":   memoryHBM,
	"mixed": memoryAll,
}

// names by memory type
var memoryTypeNames = map[memoryType]string{
	memoryDRAM: "DRAM",
	memoryPMEM: "PMEM",
	memoryHBM:  "HBM",
}

// memoryType is a bitmask of the types of memory to allocate
type memoryType int

// memoryType bits
const (
	memoryUnspec memoryType = (0x1 << iota) >> 1
	memoryDRAM
	memoryPMEM
	memoryHBM
	memoryFirstUnusedBit

	memoryAll = memoryFirstUnusedBit - 1

	// type of memory to use if none specified
	defaultMemoryType = memoryAll
)
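The preference helpers below all resolve annotations through the pod's effective-annotation lookup, where a container-scoped key wins over a pod-scoped key, which wins over the bare key. A hedged standalone sketch of that resolution order (the namespaced key shown is hypothetical; the real suffix comes from kubernetes.ResmgrKeyNamespace):

package main

import "fmt"

// effectiveAnnotation mirrors the lookup order of GetEffectiveAnnotation:
// container-scoped, then pod-scoped, then the bare key.
func effectiveAnnotation(annotations map[string]string, key, container string) (string, bool) {
	for _, k := range []string{key + "/container." + container, key + "/pod", key} {
		if v, ok := annotations[k]; ok {
			return v, true
		}
	}
	return "", false
}

func main() {
	key := "prefer-isolated-cpus.example.invalid" // hypothetical namespaced key
	pod := map[string]string{
		key + "/pod":            "true",
		key + "/container.main": "false",
	}
	v, _ := effectiveAnnotation(pod, key, "main")
	fmt.Println(v) // "false": the container-scoped value takes precedence
}

// isolatedCPUsPreference returns whether isolated CPUs should be preferred for
// containers that allocate multiple CPUs, and if the container was explicitly
// annotated with this setting.
//
// If the effective annotations are not found, this function falls back to
// looking for the deprecated syntax by calling podIsolationPreference.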
func isolatedCPUsPreference(pod cache.Pod, container cache.Container) (bool, bool) { key := preferIsolatedCPUsKey value, ok := pod.GetEffectiveAnnotation(key, container.GetName()) if !ok { return podIsolationPreference(pod, container) } preference, err := strconv.ParseBool(value) if err != nil { log.Error("invalid CPU isolation preference annotation (%q, %q): %v", key, value, err) return opt.PreferIsolated, false } log.Debug("%s: effective CPU isolation preference %v", container.PrettyName(), preference) return preference, true } // sharedCPUsPreference returns whether shared CPUs should be preferred for // containers otherwise eligible for exclusive allocation, and whether the // container was explicitly annotated with this setting. // // If the effective annotations are not found, this function falls back to // looking for the deprecated syntax by calling podSharedCPUPreference. func sharedCPUsPreference(pod cache.Pod, container cache.Container) (bool, bool) { key := preferSharedCPUsKey value, ok := pod.GetEffectiveAnnotation(key, container.GetName()) if !ok { return podSharedCPUPreference(pod, container) } preference, err := strconv.ParseBool(value) if err != nil { log.Error("invalid shared CPU preference annotation (%q, %q): %v", key, value, err) return opt.PreferShared, false } log.Debug("%s: effective shared CPU preference %v", container.PrettyName(), preference) return preference, true } // memoryTypePreference returns what type of memory should be allocated for the container. // // If the effective annotations are not found, this function falls back to // looking for the deprecated syntax by calling podMemoryTypePreference. func memoryTypePreference(pod cache.Pod, container cache.Container) memoryType { key := preferMemoryTypeKey value, ok := pod.GetEffectiveAnnotation(key, container.GetName()) if !ok { return podMemoryTypePreference(pod, container) } mtype, err := parseMemoryType(value) if err != nil { log.Error("invalid memory type preference (%q, %q): %v", key, value, err) return memoryUnspec } log.Debug("%s: effective memory type preference %v", container.PrettyName(), mtype) return mtype } // coldStartPreference figures out 'cold start' preferences for the container, IOW // if the container memory should be allocated for an initial 'cold start' period // from PMEM, and how long this initial period should be. // // If the effective annotations are not found, this function falls back to // looking for the deprecated syntax by calling podColdStartPreference. func coldStartPreference(pod cache.Pod, container cache.Container) (ColdStartPreference, error) { key := preferColdStartKey value, ok := pod.GetEffectiveAnnotation(key, container.GetName()) if !ok { return podColdStartPreference(pod, container) } preference := ColdStartPreference{} if err := yaml.Unmarshal([]byte(value), &preference); err != nil { log.Error("failed to parse cold start preference (%q, %q): %v", keyColdStartPreference, value, err) return ColdStartPreference{}, policyError("invalid cold start preference %q: %v", value, err) } if preference.Duration < 0 || time.Duration(preference.Duration) > time.Hour { return ColdStartPreference{}, policyError("cold start duration %s out of range", preference.Duration.String()) } log.Debug("%s: effective cold start preference %v", container.PrettyName(), preference.Duration.String()) return preference, nil } // podIsolationPreference checks if a container explicitly prefers to run on multiple isolated CPUs.
// The first return value indicates whether the container is isolated or not. // The second return value indicates whether that decision was explicit (true) or implicit (false). func podIsolationPreference(pod cache.Pod, container cache.Container) (bool, bool) { key := keyIsolationPreference value, ok := pod.GetResmgrAnnotation(key) if !ok { return opt.PreferIsolated, false } log.Warn("WARNING: using deprecated annotation %q", key) log.Warn("WARNING: consider using instead") log.Warn("WARNING: %q, or", preferIsolatedCPUsKey+"/container."+container.GetName()) log.Warn("WARNING: %q", preferIsolatedCPUsKey+"/pod") if value == "false" || value == "true" { return (value[0] == 't'), true } preferences := map[string]bool{} if err := yaml.Unmarshal([]byte(value), &preferences); err != nil { log.Error("failed to parse isolation preference %s = '%s': %v", keyIsolationPreference, value, err) return opt.PreferIsolated, false } name := container.GetName() if pref, ok := preferences[name]; ok { log.Debug("%s per-container isolation preference '%v'", name, pref) return pref, true } log.Debug("%s defaults to isolation preference '%v'", name, opt.PreferIsolated) return opt.PreferIsolated, false } // podSharedCPUPreference checks if a container wants to opt-out from exclusive allocation. // The first return value indicates if the container prefers to opt-out from // exclusive (sliced-off or isolated) CPU allocation even if it was otherwise // eligible for it. func podSharedCPUPreference(pod cache.Pod, container cache.Container) (bool, bool) { key := keySharedCPUPreference value, ok := pod.GetResmgrAnnotation(key) if !ok { return opt.PreferShared, false } log.Warn("WARNING: using deprecated annotation %q", key) log.Warn("WARNING: consider using instead") log.Warn("WARNING: %q, or", preferSharedCPUsKey+"/container."+container.GetName()) log.Warn("WARNING: %q", preferSharedCPUsKey+"/pod") if value == "false" || value == "true" { return value[0] == 't', true } preferences := map[string]string{} if err := yaml.Unmarshal([]byte(value), &preferences); err != nil { log.Error("failed to parse shared CPU preference %s = '%s': %v", keySharedCPUPreference, value, err) return opt.PreferShared, false } name := container.GetName() pref, ok := preferences[name] if !ok { return opt.PreferShared, false } if pref == "false" || pref == "true" { return pref[0] == 't', true } log.Error("invalid shared CPU boolean preference for container %s: %s", name, pref) return opt.PreferShared, false } // ColdStartPreference lists the various ways the container can be configured to trigger // cold start. Currently, only timer is supported. If the "duration" is set to a duration // greater than 0, cold start is enabled and the DRAM controller is added to the container // after the duration has passed. type ColdStartPreference struct { Duration config.Duration // `json:"duration,omitempty"` } // podColdStartPreference figures out if the container memory should be first allocated from PMEM. // It returns the time (in milliseconds) after which DRAM controller should be added to the mix. 
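//
// Illustrative sketch of the deprecated syntax (the container name "cnt0"
// is hypothetical): the annotation value is a YAML map from container name
// to a preference object, roughly
//
//	cnt0:
//	  duration: 30s
//
// where the duration must be non-negative and at most one hour to pass the
// validation below.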
func podColdStartPreference(pod cache.Pod, container cache.Container) (ColdStartPreference, error) { key := keyColdStartPreference value, ok := pod.GetResmgrAnnotation(key) if !ok { return ColdStartPreference{}, nil } log.Warn("WARNING: using deprecated annotation %q", key) log.Warn("WARNING: consider using instead") log.Warn("WARNING: %q, or", preferColdStartKey+"/container."+container.GetName()) log.Warn("WARNING: %q", preferColdStartKey+"/pod") preferences := map[string]ColdStartPreference{} if err := yaml.Unmarshal([]byte(value), &preferences); err != nil { log.Error("failed to parse cold start preference %s = '%s': %v", key, value, err) return ColdStartPreference{}, err } name := container.GetName() preference, ok := preferences[name] if !ok { log.Debug("container %s has no entry among cold start preferences", container.PrettyName()) return ColdStartPreference{}, nil } if preference.Duration < 0 || time.Duration(preference.Duration) > time.Hour { // Duration can't be negative. We also reject durations which are longer than one hour. return ColdStartPreference{}, fmt.Errorf("failed to validate cold start timeout %s: value out of scope", preference.Duration.String()) } return preference, nil } func checkReservedPoolNamespaces(namespace string) bool { if namespace == metav1.NamespaceSystem { return true } for _, str := range opt.ReservedPoolNamespaces { ret, err := filepath.Match(str, namespace) if err != nil { return false } if ret { return true } } return false } func checkReservedCPUsAnnotations(c cache.Container) (bool, bool) { hintSetting, ok := c.GetEffectiveAnnotation(preferReservedCPUsKey) if !ok { return false, false } preference, err := strconv.ParseBool(hintSetting) if err != nil { log.Error("failed to parse reserved CPU preference %s = '%s': %v", keyReservedCPUsPreference, hintSetting, err) return false, false } return preference, true } // cpuAllocationPreferences figures out the amount and kind of CPU to allocate. // Returned values: // 1. full: number of full CPUs // 2. fraction: amount of fractional CPU in milli-CPU // 3. isolate: (bool) whether to prefer isolated full CPUs // 4. cpuType: (cpuClass) class of CPU to allocate (reserved vs. normal) func cpuAllocationPreferences(pod cache.Pod, container cache.Container) (int, int, bool, cpuClass) { // // CPU allocation preferences for a container consist of // // - the number of exclusive cores to allocate // - the amount of fractional cores to allocate (in milli-CPU) // - whether kernel-isolated cores are preferred for exclusive allocation // - cpu class IOW, whether reserved or normal cores should be allocated // // The rules for determining these preferences are: // // - reserved cores are only and always preferred for kube-system namespace containers // - kube-system namespace containers: // => fractional/shared (reserved) cores // - BestEffort QoS class containers: // => fractional/shared cores // - Burstable QoS class containers: // => fractional/shared cores // - Guaranteed QoS class containers: // - 1 full core > CPU request // => fractional/shared cores // - 1 full core <= CPU request < 2 full cores: // a. fractional allocation: // - shared preference explicitly annotated/configured false: // => mixed cores, prefer isolated, unless annotated/configured otherwise (*) // - shared preference explicitly annotated/configured true: // => shared cores // b. 
non-fractional allocation: // - shared preference explicitly annotated true: // => shared cores // - isolated default preference false or explicitly annotated false: // => exclusive cores // - isolated default preference true or explicitly annotated true: // => exclusive cores, prefer isolated (*) // - 2 full cores <= CPU request // a. fractional allocation: // - shared preference explicitly annotated false: // => mixed cores, prefer isolated only if explicitly annotated (**) // - otherwise (no shared annotation): // => shared cores // b. non-fractional allocation: // - shared preference explicitly annotated true: // => shared cores // - otherwise (no shared annotation): // => exclusive cores, prefer isolated only if explicitly annotated (**) // // - Rationale for isolation defaults: // *) // In the single core case, a workload does not need to do anything extra to // benefit from running on isolated vs. ordinary exclusive cores. Therefore, // allocating isolated cores is a safe default choice. // **) // In the multiple cores case, a workload needs to be 'isolation-aware' to // benefit (or actually to not even get hindered) by running on isolated vs. // ordinary exclusive cores. If it gets isolated cores allocated, it needs // to actively spread itself/its correct processes over the cores, because // the scheduler is not going to do load-balancing for it. Therefore, the // safe choice in this case is to not allocate isolated cores by default. // namespace := container.GetNamespace() request := container.GetResourceRequirements().Requests[corev1.ResourceCPU] qosClass := pod.GetQOSClass() fraction := int(request.MilliValue()) // easy cases: kube-system namespace, Burstable or BestEffort QoS class containers preferReserved, explicitReservation := checkReservedCPUsAnnotations(container) switch { case preferReserved == true: return 0, fraction, false, cpuReserved case checkReservedPoolNamespaces(namespace) && !explicitReservation: return 0, fraction, false, cpuReserved case qosClass == corev1.PodQOSBurstable: return 0, fraction, false, cpuNormal case qosClass == corev1.PodQOSBestEffort: return 0, 0, false, cpuNormal } // complex case: Guaranteed QoS class containers cores := fraction / 1000 fraction = fraction % 1000 preferIsolated, explicitIsolated := isolatedCPUsPreference(pod, container) preferShared, explicitShared := sharedCPUsPreference(pod, container) switch { // sub-core CPU request case cores == 0: return 0, fraction, false, cpuNormal // 1 <= CPU request < 2 case cores < 2: // fractional allocation, potentially mixed if fraction > 0 { if preferShared { return 0, 1000*cores + fraction, false, cpuNormal } return cores, fraction, preferIsolated, cpuNormal } // non-fractional allocation if preferShared && explicitShared { return 0, 1000*cores + fraction, false, cpuNormal } return cores, fraction, preferIsolated, cpuNormal // CPU request >= 2 default: // fractional allocation, only mixed if explicitly annotated as unshared if fraction > 0 { if !preferShared && explicitShared { return cores, fraction, preferIsolated && explicitIsolated, cpuNormal } return 0, 1000*cores + fraction, false, cpuNormal } // non-fractional allocation if preferShared && explicitShared { return 0, 1000 * cores, false, cpuNormal } return cores, fraction, preferIsolated && explicitIsolated, cpuNormal } } // podMemoryTypePreference returns what type of memory should be allocated for the container. 
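//
// Illustrative sketch (container names hypothetical): the deprecated
// annotation value is either a plain comma-separated type list applied to
// all containers, e.g. "dram,pmem", or a YAML map keyed by container name:
//
//	cnt0: dram,hbm
//	cnt1: pmem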
func podMemoryTypePreference(pod cache.Pod, c cache.Container) memoryType { key := keyMemoryTypePreference value, ok := pod.GetResmgrAnnotation(key) if !ok { log.Debug("pod %s has no memory preference annotations", pod.GetName()) return memoryUnspec } log.Warn("WARNING: using deprecated annotation %q", key) log.Warn("WARNING: consider using instead") log.Warn("WARNING: %q, or", keyMemoryTypePreference+"/container."+c.GetName()) log.Warn("WARNING: %q", keyMemoryTypePreference+"/pod") // Try to parse as a per-container preference; if that fails, assume the value is common to all containers. pref := "" preferences := map[string]string{} if err := yaml.Unmarshal([]byte(value), &preferences); err == nil { name := c.GetName() p, ok := preferences[name] if !ok { log.Debug("container %s has no entry among memory preferences", c.PrettyName()) return memoryUnspec } pref = p } else { pref = value } mtype, err := parseMemoryType(pref) if err != nil { log.Error("invalid memory type preference ('%s') in annotation %s: %v", pref, keyMemoryTypePreference, err) return memoryUnspec } log.Debug("container %s has effective memory preference: %s", c.PrettyName(), mtype) return mtype } // memoryAllocationPreference returns the amount and kind of memory to allocate. func memoryAllocationPreference(pod cache.Pod, c cache.Container) (uint64, uint64, memoryType) { resources := c.GetResourceRequirements() mtype := memoryTypePreference(pod, c) req, lim := uint64(0), uint64(0) if memReq, ok := resources.Requests[corev1.ResourceMemory]; ok { req = uint64(memReq.Value()) } if memLim, ok := resources.Limits[corev1.ResourceMemory]; ok { lim = uint64(memLim.Value()) } return req, lim, mtype } // String stringifies a cpuClass. func (t cpuClass) String() string { if cpuClassName, ok := cpuClassNames[t]; ok { return cpuClassName } return fmt.Sprintf("#UNNAMED-CPUCLASS(%d)", int(t)) } // String stringifies a memoryType. func (t memoryType) String() string { str := "" sep := "" for _, bit := range []memoryType{memoryDRAM, memoryPMEM, memoryHBM} { if int(t)&int(bit) != 0 { str += sep + memoryTypeNames[bit] sep = "," } } return str } // parseMemoryType parses a memory type string, ideally produced by String() func parseMemoryType(value string) (memoryType, error) { if value == "" { return memoryUnspec, nil } mtype := 0 for _, typestr := range strings.Split(value, ",") { t, ok := memoryNamedTypes[strings.ToLower(typestr)] if !ok { return memoryUnspec, policyError("unknown memory type value '%s'", typestr) } mtype |= int(t) } return memoryType(mtype), nil } // MarshalJSON is the JSON marshaller for memoryType. func (t memoryType) MarshalJSON() ([]byte, error) { value := t.String() return json.Marshal(value) } // UnmarshalJSON is the JSON unmarshaller for memoryType. func (t *memoryType) UnmarshalJSON(data []byte) error { ival := 0 if err := json.Unmarshal(data, &ival); err == nil { *t = memoryType(ival) return nil } value := "" if err := json.Unmarshal(data, &value); err != nil { return policyError("failed to unmarshal memoryType '%s': %v", string(data), err) } mtype, err := parseMemoryType(value) if err != nil { return policyError("failed to parse memoryType '%s': %v", value, err) } *t = mtype return nil } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/topology-aware/pod-preferences_test.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved.
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package topologyaware import ( "testing" corev1 "k8s.io/api/core/v1" v1 "k8s.io/api/core/v1" resapi "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) func TestPodIsolationPreference(t *testing.T) { tcases := []struct { name string pod *mockPod container *mockContainer expectedIsolate bool expectedExplicit bool disabled bool }{ { name: "podIsolationPreference() should handle nil pod arg gracefully", disabled: true, }, { name: "return defaults", pod: &mockPod{}, container: &mockContainer{}, expectedIsolate: opt.PreferIsolated, }, { name: "prefer resmgr's annotation value", pod: &mockPod{ returnValue1FotGetResmgrAnnotation: "true", returnValue2FotGetResmgrAnnotation: true, }, container: &mockContainer{}, expectedIsolate: true, expectedExplicit: true, }, { name: "return defaults for unparsable", pod: &mockPod{ returnValue1FotGetResmgrAnnotation: "UNPARSABLE", returnValue2FotGetResmgrAnnotation: true, }, container: &mockContainer{}, expectedIsolate: opt.PreferIsolated, }, { name: "podIsolationPreference() should handle nil container arg gracefully", pod: &mockPod{ returnValue1FotGetResmgrAnnotation: "key: true", returnValue2FotGetResmgrAnnotation: true, }, container: &mockContainer{}, disabled: true, }, { name: "return defaults for missing preferences", pod: &mockPod{ returnValue1FotGetResmgrAnnotation: "key: true", returnValue2FotGetResmgrAnnotation: true, }, container: &mockContainer{}, expectedIsolate: opt.PreferIsolated, }, { name: "return defined preferences", pod: &mockPod{ returnValue1FotGetResmgrAnnotation: "testcontainer: false", returnValue2FotGetResmgrAnnotation: true, }, container: &mockContainer{ name: "testcontainer", }, expectedExplicit: true, }, // effective annotation tests { name: "prefer resmgr's annotation value", pod: &mockPod{ annotations: map[string]string{ preferIsolatedCPUsKey + "/container.c0": "true", }, }, container: &mockContainer{name: "c0"}, expectedIsolate: true, expectedExplicit: true, }, { name: "prefer resmgr's annotation value", pod: &mockPod{ annotations: map[string]string{ preferIsolatedCPUsKey + "/container.c0": "false", }, }, container: &mockContainer{name: "c0"}, expectedIsolate: false, expectedExplicit: true, }, { name: "return defaults for unparsable annotation value", pod: &mockPod{ annotations: map[string]string{ preferIsolatedCPUsKey + "/container.c0": "blah", }, }, container: &mockContainer{name: "c0"}, expectedIsolate: opt.PreferIsolated, }, { name: "return defaults for missing preferences", pod: &mockPod{ annotations: map[string]string{ preferIsolatedCPUsKey + "/container.c0": "true", }, }, container: &mockContainer{name: "c1"}, expectedIsolate: opt.PreferIsolated, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { if tc.disabled { t.Skipf("The case '%s' is skipped", tc.name) } isolate, explicit := isolatedCPUsPreference(tc.pod, tc.container) if isolate != tc.expectedIsolate || explicit != tc.expectedExplicit { t.Errorf("Expected (%v, %v), but got 
(%v, %v)", tc.expectedIsolate, tc.expectedExplicit, isolate, explicit) } }) } } func TestPodSharedCPUPreference(t *testing.T) { tcases := []struct { name string pod *mockPod container *mockContainer expectedShared bool disabled bool }{ { name: "podSharedCPUPreference() should handle nil pod arg gracefully", disabled: true, }, { name: "return defaults", pod: &mockPod{}, container: &mockContainer{}, expectedShared: opt.PreferShared, }, { name: "prefer resmgr's annotation value", pod: &mockPod{ returnValue1FotGetResmgrAnnotation: "true", returnValue2FotGetResmgrAnnotation: true, }, container: &mockContainer{}, expectedShared: true, }, { name: "return defaults for unparsable", pod: &mockPod{ returnValue1FotGetResmgrAnnotation: "UNPARSABLE", returnValue2FotGetResmgrAnnotation: true, }, container: &mockContainer{}, expectedShared: opt.PreferShared, }, { name: "podSharedCPUPreference() should handle nil container arg gracefully", pod: &mockPod{ returnValue1FotGetResmgrAnnotation: "key: true", returnValue2FotGetResmgrAnnotation: true, }, container: &mockContainer{}, disabled: true, }, { name: "return defaults for missing preferences", pod: &mockPod{ returnValue1FotGetResmgrAnnotation: "key: true", returnValue2FotGetResmgrAnnotation: true, }, container: &mockContainer{}, expectedShared: opt.PreferShared, }, { name: "return defined preferences", pod: &mockPod{ returnValue1FotGetResmgrAnnotation: "testcontainer: false", returnValue2FotGetResmgrAnnotation: true, }, container: &mockContainer{ name: "testcontainer", }, }, { name: "return defaults for unparsable annotation value", pod: &mockPod{ returnValue1FotGetResmgrAnnotation: "testcontainer: UNPARSABLE", returnValue2FotGetResmgrAnnotation: true, }, container: &mockContainer{ name: "testcontainer", }, expectedShared: opt.PreferShared, }, // effective annotation tests { name: "prefer resmgr's annotation value", pod: &mockPod{ annotations: map[string]string{ preferSharedCPUsKey + "/container.c0": "true", }, }, container: &mockContainer{name: "c0"}, expectedShared: true, }, { name: "prefer resmgr's annotation value", pod: &mockPod{ annotations: map[string]string{ preferSharedCPUsKey + "/container.c0": "false", }, }, container: &mockContainer{name: "c0"}, expectedShared: false, }, { name: "return defaults for unparsable annotation value", pod: &mockPod{ annotations: map[string]string{ preferSharedCPUsKey + "/container.c0": "blah", }, }, container: &mockContainer{name: "c0"}, expectedShared: opt.PreferShared, }, { name: "return defaults for missing preferences", pod: &mockPod{ annotations: map[string]string{ preferSharedCPUsKey + "/container.c0": "true", }, }, container: &mockContainer{name: "c1"}, expectedShared: opt.PreferShared, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { if tc.disabled { t.Skipf("The case '%s' is skipped", tc.name) } shared, _ := sharedCPUsPreference(tc.pod, tc.container) if shared != tc.expectedShared { t.Errorf("Expected %v, but got %v", tc.expectedShared, shared) } }) } } func TestCpuAllocationPreferences(t *testing.T) { tcases := []struct { name string pod *mockPod container *mockContainer preferIsolated bool preferShared bool expectedFull int expectedFraction int expectedIsolate bool expectedCpuType cpuClass disabled bool reservedPoolNamespaces []string }{ { name: "cpuAllocationPreferences() should handle nil container arg gracefully", disabled: true, }, { name: "no resource requirements", pod: &mockPod{}, container: &mockContainer{}, }, { name: "cpuAllocationPreferences() should handle nil pod arg 
gracefully", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("1"), }, }, }, disabled: true, }, { name: "return defaults", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("1"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSBurstable, }, expectedFraction: 1000, expectedIsolate: false, }, { name: "return request's value for system container", container: &mockContainer{ namespace: metav1.NamespaceSystem, returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSBurstable, }, expectedFraction: 2000, expectedCpuType: cpuReserved, }, { name: "return request's value for burstable QoS", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSBurstable, }, expectedFraction: 2000, }, { name: "guaranteed QoS with sub-core request", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("750m"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, }, expectedFull: 0, expectedFraction: 750, expectedIsolate: false, }, { name: "guaranteed QoS with sub-core request, prefer isolated", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("750m"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, }, preferIsolated: true, expectedFull: 0, expectedFraction: 750, expectedIsolate: false, }, { name: "guaranteed QoS with sub-core request, prefer shared", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("750m"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, }, preferShared: true, expectedFull: 0, expectedFraction: 750, expectedIsolate: false, }, { name: "guaranteed QoS with sub-core request, prefer isolated & shared", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("750m"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, }, preferIsolated: true, preferShared: true, expectedFull: 0, expectedFraction: 750, expectedIsolate: false, }, { name: "guaranteed QoS with single full core request, prefer isolated", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("1"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, }, preferIsolated: true, expectedFull: 1, expectedIsolate: true, }, { name: "guaranteed QoS with single full core request, prefer no isolated", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("1"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, }, preferIsolated: false, expectedFull: 1, expectedIsolate: false, 
}, { name: "guaranteed QoS with single full core request, prefer shared", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("1"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, }, preferShared: true, expectedFull: 1, expectedFraction: 0, expectedIsolate: false, }, { name: "guaranteed QoS with single full core request, prefer isolated & shared", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("1"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, }, preferIsolated: true, preferShared: true, expectedFull: 1, expectedFraction: 0, expectedIsolate: true, }, { name: "guaranteed QoS with single full core request, annotated shared", container: &mockContainer{ name: "testcontainer", returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("1"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, annotations: map[string]string{ preferSharedCPUsKey + "/container.testcontainer": "true", }, }, preferIsolated: true, preferShared: true, expectedFull: 0, expectedFraction: 1000, expectedIsolate: false, }, { name: "guaranteed QoS with single full core request, annotated no isolated", container: &mockContainer{ name: "testcontainer", returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("1"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, annotations: map[string]string{ preferIsolatedCPUsKey + "/container.testcontainer": "false", }, }, preferIsolated: true, preferShared: true, expectedFull: 1, expectedFraction: 0, expectedIsolate: false, }, { name: "guaranteed QoS with potential mixed request", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("1500m"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, }, expectedFull: 1, expectedFraction: 500, expectedIsolate: false, }, { name: "guaranteed QoS with potential mixed request, prefer isolated", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("1500m"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, }, preferIsolated: true, expectedFull: 1, expectedFraction: 500, expectedIsolate: true, }, { name: "guaranteed QoS with potential mixed request, prefer shared", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("1500m"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, }, preferShared: true, expectedFull: 0, expectedFraction: 1500, expectedIsolate: false, }, { name: "guaranteed QoS with potential mixed request, prefer isolated & shared", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("1500m"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, }, preferIsolated: true, preferShared: true, expectedFull: 0, expectedFraction: 1500, expectedIsolate: false, }, { name: "guaranteed QoS with 
multi-core full request", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, }, expectedFull: 2, expectedIsolate: false, }, { name: "guaranteed QoS with multi-core full request, prefer isolated", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, }, preferIsolated: true, expectedFull: 2, expectedIsolate: false, }, { name: "guaranteed QoS with multi-core full request, prefer shared", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, }, preferShared: true, expectedFull: 2, expectedIsolate: false, }, { name: "guaranteed QoS with multi-core full request, prefer isolated & shared", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, }, preferIsolated: true, preferShared: true, expectedFull: 2, expectedIsolate: false, }, { name: "guaranteed QoS with multi-core full request, annotate isolated", container: &mockContainer{ name: "testcontainer", returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, annotations: map[string]string{ preferIsolatedCPUsKey + "/container.testcontainer": "true", }, }, expectedFull: 2, expectedIsolate: true, }, { name: "guaranteed QoS with multi-core full request, annotate shared", container: &mockContainer{ name: "testcontainer", returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, annotations: map[string]string{ preferSharedCPUsKey + "/container.testcontainer": "true", }, }, expectedFull: 0, expectedFraction: 2000, expectedIsolate: false, }, { name: "guaranteed QoS with multi-core full request, annotate isolated & shared", container: &mockContainer{ name: "testcontainer", returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, annotations: map[string]string{ preferIsolatedCPUsKey + "/container.testcontainer": "true", preferSharedCPUsKey + "/container.testcontainer": "true", }, }, expectedFull: 0, expectedFraction: 2000, expectedIsolate: false, }, { name: "guaranteed QoS with multi-core mixed request", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2500m"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, }, expectedFull: 0, expectedFraction: 2500, expectedIsolate: false, }, { name: "guaranteed QoS with multi-core mixed request, prefer isolated", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: 
v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2500m"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, }, expectedFull: 0, expectedFraction: 2500, expectedIsolate: false, }, { name: "guaranteed QoS with multi-core mixed request, prefer shared", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2500m"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, }, expectedFull: 0, expectedFraction: 2500, expectedIsolate: false, }, { name: "guaranteed QoS with multi-core mixed request, prefer isolated & shared", container: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2500m"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, }, expectedFull: 0, expectedFraction: 2500, expectedIsolate: false, }, { name: "guaranteed QoS with multi-core mixed request, annotate isolated", container: &mockContainer{ name: "testcontainer", returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2500m"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, annotations: map[string]string{ preferIsolatedCPUsKey + "/container.testcontainer": "true", }, }, expectedFull: 0, expectedFraction: 2500, expectedIsolate: false, }, { name: "guaranteed QoS with multi-core mixed request, annotate shared", container: &mockContainer{ name: "testcontainer", returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2500m"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, annotations: map[string]string{ preferSharedCPUsKey + "/container.testcontainer": "true", }, }, expectedFull: 0, expectedFraction: 2500, expectedIsolate: false, }, { name: "guaranteed QoS with multi-core mixed request, annotate isolated & shared", container: &mockContainer{ name: "testcontainer", returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2500m"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, annotations: map[string]string{ preferIsolatedCPUsKey + "/container.testcontainer": "true", preferSharedCPUsKey + "/container.testcontainer": "true", }, }, expectedFull: 0, expectedFraction: 2500, expectedIsolate: false, }, { name: "guaranteed QoS with multi-core mixed request, annotate no shared", container: &mockContainer{ name: "testcontainer", returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2500m"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, annotations: map[string]string{ preferSharedCPUsKey + "/container.testcontainer": "false", }, }, expectedFull: 2, expectedFraction: 500, expectedIsolate: false, }, { name: "guaranteed QoS with multi-core mixed request, annotate isolated, no shared", container: &mockContainer{ name: "testcontainer", returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2500m"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, annotations: map[string]string{ preferIsolatedCPUsKey + "/container.testcontainer": "true", preferSharedCPUsKey + 
"/container.testcontainer": "false", }, }, expectedFull: 2, expectedFraction: 500, expectedIsolate: true, }, { name: "return request's value for reserved pool namespace container", container: &mockContainer{ namespace: "foobar", returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSBurstable, }, expectedFraction: 2000, expectedCpuType: cpuReserved, reservedPoolNamespaces: []string{"foobar"}, }, { name: "return request's value for reserved pool namespace container using a glob 1", container: &mockContainer{ namespace: "foobar2", returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSBurstable, }, expectedFraction: 2000, expectedCpuType: cpuReserved, reservedPoolNamespaces: []string{"foobar*"}, }, { name: "return request's value for reserved pool namespace container using a glob 2", container: &mockContainer{ namespace: "foobar-testing", returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSBurstable, }, expectedFraction: 2000, expectedCpuType: cpuReserved, reservedPoolNamespaces: []string{"barfoo", "foobar*"}, }, { name: "return request's value for reserved pool namespace container using a glob 3", container: &mockContainer{ namespace: "testing", returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSBurstable, }, expectedFraction: 2000, expectedCpuType: cpuNormal, reservedPoolNamespaces: []string{"barfoo", "foobar?"}, }, { name: "return request's value for reserved pool namespace container using a glob 4", container: &mockContainer{ namespace: "1foobar2", returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSBurstable, }, expectedFraction: 2000, expectedCpuType: cpuNormal, reservedPoolNamespaces: []string{"barfoo", "foobar?"}, }, { name: "return request's value for reserved pool namespace container using a glob 5", container: &mockContainer{ namespace: "foobar12", returnValueForGetResourceRequirements: v1.ResourceRequirements{ Requests: v1.ResourceList{ corev1.ResourceCPU: resapi.MustParse("2"), }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSBurstable, }, expectedFraction: 2000, expectedCpuType: cpuNormal, reservedPoolNamespaces: []string{"barfoo", "foobar?", "testing"}, }, { name: "return request's value for reserved cpu annotation container", container: &mockContainer{ name: "testcontainer", pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, annotations: map[string]string{ preferReservedCPUsKey + "/container.special": "false", }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSBurstable, }, expectedFraction: 0, expectedCpuType: cpuNormal, }, { name: "return request's value for reserved cpu annotation container", container: &mockContainer{ pod: &mockPod{ returnValueFotGetQOSClass: corev1.PodQOSGuaranteed, annotations: map[string]string{ preferReservedCPUsKey + "/pod": "true", }, }, }, pod: &mockPod{ returnValueFotGetQOSClass: 
corev1.PodQOSBurstable, }, expectedFraction: 0, expectedCpuType: cpuReserved, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { if tc.disabled { t.Skipf("The case '%s' is skipped", tc.name) } opt.PreferIsolated, opt.PreferShared = tc.preferIsolated, tc.preferShared opt.ReservedPoolNamespaces = tc.reservedPoolNamespaces full, fraction, isolate, cpuType := cpuAllocationPreferences(tc.pod, tc.container) if full != tc.expectedFull || fraction != tc.expectedFraction || isolate != tc.expectedIsolate || cpuType != tc.expectedCpuType { t.Errorf("Expected (%v, %v, %v, %s), but got (%v, %v, %v, %s)", tc.expectedFull, tc.expectedFraction, tc.expectedIsolate, tc.expectedCpuType, full, fraction, isolate, cpuType) } }) } } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/topology-aware/pools.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package topologyaware import ( "math" "sort" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/kubernetes" system "github.com/intel/cri-resource-manager/pkg/sysfs" "github.com/intel/cri-resource-manager/pkg/utils/cpuset" idset "github.com/intel/goresctrl/pkg/utils" ) // buildPoolsByTopology builds a hierarchical tree of pools based on HW topology. func (p *policy) buildPoolsByTopology() error { if err := p.checkHWTopology(); err != nil { return err } // Notes: // we never create pool nodes for PMEM-only NUMA nodes (as these // are always without any close/local set of CPUs). We instead // assign the PMEM memory of such a node to one of the closest // normal (DRAM) pool NUMA nodes. // // Akin to omitting lone dies from their parent, we omit from the // pool tree each NUMA node that would end up being the only child // of its parent (a die or a socket pool node). Resources for each // such node will get discovered by and assigned to the would-be // parent which is now a leaf (die or socket) node in the tree. // // The PMEM memory of (omitted) PMEM-only nodes is assigned // to one of the closest normal (DRAM) NUMA nodes. The right // assignment has already been calculated by assignNUMANodes(). // However, making the corresponding assignment in the pool // tree is a bit more involved as the DRAM node where a PMEM // node has been assigned to might have gotten omitted from the // tree if it ended up being a lone child. We use the recorded // per-NUMA-node surrogates to find out both whether and where the // resources of omitted DRAM NUMA nodes need to be assigned, and // also where PMEM NUMA node resources need to be assigned.
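//
// For illustration (a hypothetical two-socket box with one die and one
// CPU-ful DRAM NUMA node per socket, plus one PMEM-only node closest to
// socket #0), the tree built below would degenerate to
//
//	root (virtual)
//	├── socket #0  (DRAM node #0 omitted, socket is its surrogate;
//	│               PMEM node #2 assigned here)
//	└── socket #1  (DRAM node #1 omitted, socket is its surrogate)
//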
log.Debug("building topology pool tree...") p.nodes = make(map[string]Node) // create a virtual root node, if we have a multi-socket system if p.sys.SocketCount() > 1 { p.root = p.NewVirtualNode("root", nilnode) p.nodes[p.root.Name()] = p.root log.Debug(" + created pool (root) %q", p.root.Name()) } else { log.Debug(" - omitted pool virtual root (single-socket system)") } // create socket nodes, for a single-socket system set the only socket as the root sockets := map[idset.ID]Node{} for _, socketID := range p.sys.PackageIDs() { var socket Node if p.root != nil { socket = p.NewSocketNode(socketID, p.root) log.Debug(" + created pool %q", socket.Parent().Name()+"/"+socket.Name()) } else { socket = p.NewSocketNode(socketID, nilnode) p.root = socket log.Debug(" + created pool %q (as root)", socket.Name()) } p.nodes[socket.Name()] = socket sockets[socketID] = socket } // create dies for every socket, but only if we have more than one die in the socket numaDies := map[idset.ID]Node{} // created die Nodes per NUMA node id for socketID, socket := range sockets { dieIDs := p.sys.Package(socketID).DieIDs() if len(dieIDs) < 2 { log.Debug(" - omitted pool %q (die count: %d)", socket.Name()+"/die #0", len(dieIDs)) continue } for _, dieID := range dieIDs { die := p.NewDieNode(dieID, socket) p.nodes[die.Name()] = die for _, numaNodeID := range p.sys.Package(socketID).DieNodeIDs(dieID) { numaDies[numaNodeID] = die } log.Debug(" + created pool %q", die.Parent().Name()+"/"+die.Name()) } } // create pool nodes for NUMA nodes pmemNodes := map[idset.ID]system.Node{} // collected PMEM-only nodes dramNodes := map[idset.ID]system.Node{} // collected DRAM-only nodes numaSurrogates := map[idset.ID]Node{} // surrogate leaf nodes for omitted NUMA nodes for _, numaNodeID := range p.sys.NodeIDs() { var numaNode Node numaSysNode := p.sys.Node(numaNodeID) switch numaSysNode.GetMemoryType() { case system.MemoryTypeDRAM: dramNodes[numaNodeID] = numaSysNode case system.MemoryTypePMEM: pmemNodes[numaNodeID] = numaSysNode log.Debug(" - omitted pool \"NUMA node #%d\": PMEM node", numaNodeID) continue // don't create pool, will assign to a closest DRAM node default: log.Warn(" - ignored pool \"NUMA node #%d\": unhandled memory type %v", numaNodeID, numaSysNode.GetMemoryType()) continue } // // Notes: // We omit inserting NUMA nodes (as leaf nodes) in the tree, if that NUMA node // would be the only child of its parent. In this case, we record the would-be // parent as the surrogate for the NUMA node. This surrogate will get assigned // any closest PMEM-only NUMA node that the original one would have received. 
// if die, ok := numaDies[numaNodeID]; ok { if p.parentNumaNodeCountWithCPUs(numaSysNode) < 2 { numaSurrogates[numaNodeID] = die log.Debug(" - omitted pool \"NUMA node #%d\": using surrogate %q", numaNodeID, numaSurrogates[numaNodeID].Name()) continue } numaNode = p.NewNumaNode(numaNodeID, die) } else { socket := sockets[p.sys.Node(numaNodeID).PackageID()] if p.parentNumaNodeCountWithCPUs(numaSysNode) < 2 { numaSurrogates[numaNodeID] = socket log.Debug(" - omitted pool \"NUMA node #%d\": using surrogate %q", numaNodeID, numaSurrogates[numaNodeID].Name()) continue } numaNode = p.NewNumaNode(numaNodeID, socket) } p.nodes[numaNode.Name()] = numaNode numaSurrogates[numaNodeID] = numaNode log.Debug(" + created pool %q", numaNode.Parent().Name()+"/"+numaNode.Name()) } // set up assignment of PMEM and DRAM node resources to pool nodes and surrogates assigned := p.assignNUMANodes(numaSurrogates, pmemNodes, dramNodes) log.Debug("NUMA node to pool assignment:") for n, numaNodeIDs := range assigned { log.Debug(" pool %q: NUMA nodes #%s", n.Name(), idset.NewIDSet(numaNodeIDs...)) } // enumerate pools, calculate depth, discover resource capacity, assign NUMA nodes p.pools = make([]Node, 0) p.root.DepthFirst(func(n Node) error { p.pools = append(p.pools, n) n.(*node).id = p.nodeCnt p.nodeCnt++ if p.depth < n.(*node).depth { p.depth = n.(*node).depth } n.DiscoverSupply(assigned[n.(*node).self.node]) delete(assigned, n.(*node).self.node) return nil }) // make sure all PMEM nodes got assigned if len(assigned) > 0 { for node, pmem := range assigned { log.Error("failed to assign PMEM NUMA nodes #%s (to NUMA node/surrogate %s %v)", idset.NewIDSet(pmem...), node.Name(), node) } log.Fatal("internal error: unassigned PMEM NUMA nodes remaining") } p.root.Dump("") return nil } // parentNumaNodeCountWithCPUs returns the number of CPU-ful NUMA nodes in the parent die/socket. func (p *policy) parentNumaNodeCountWithCPUs(numaNode system.Node) int { socketID := numaNode.PackageID() socket := p.sys.Package(socketID) count := 0 for _, nodeID := range socket.DieNodeIDs(numaNode.DieID()) { node := p.sys.Node(nodeID) if !node.CPUSet().IsEmpty() { count++ } } return count } // assignNUMANodes assigns each PMEM node to one of the closest DRAM nodes func (p *policy) assignNUMANodes(surrogates map[idset.ID]Node, pmem, dram map[idset.ID]system.Node) map[Node][]idset.ID { // collect the closest DRAM NUMA nodes (sorted by idset.ID) for each PMEM NUMA node. 
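// Worked example with hypothetical distances: if PMEM node #4 sits at
// distance 17 from DRAM node #0 and 28 from DRAM node #1, min becomes
// [#0]; on a tie (17 vs. 17) both are kept, and the assignment pass
// below then picks the surrogate with the least PMEM assigned so far.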
closest := map[idset.ID][]idset.ID{} for pmemID := range pmem { var min []idset.ID for dramID := range dram { if len(min) < 1 { min = []idset.ID{dramID} } else { minDist := p.sys.NodeDistance(pmemID, min[0]) newDist := p.sys.NodeDistance(pmemID, dramID) switch { case newDist == minDist: min = append(min, dramID) case newDist < minDist: min = []idset.ID{dramID} } } } sort.Slice(min, func(i, j int) bool { return min[i] < min[j] }) closest[pmemID] = min } assigned := map[Node][]idset.ID{} // assign each PMEM node to the closest DRAM surrogate with the least PMEM assigned for pmemID, min := range closest { var taker Node var takerID idset.ID for _, dramID := range min { if taker == nil { taker = surrogates[dramID] takerID = dramID } else { if len(assigned[taker]) > len(assigned[surrogates[dramID]]) { taker = surrogates[dramID] takerID = dramID } } } if taker == nil { log.Panic("failed to assign CPU-less PMEM node #%d to any surrogate", pmemID) } assigned[taker] = append(assigned[taker], pmemID) log.Debug(" + PMEM node #%d assigned to %s with distance %v", pmemID, taker.Name(), p.sys.NodeDistance(pmemID, takerID)) } // assign each DRAM node to its own surrogate (can be the DRAM node itself) for dramID := range dram { taker := surrogates[dramID] assigned[taker] = append([]idset.ID{dramID}, assigned[taker]...) log.Debug(" + DRAM node #%d assigned to %s", dramID, taker.Name()) } return assigned } // checkHWTopology verifies our otherwise implicit assumptions about the HW. func (p *policy) checkHWTopology() error { // NUMA nodes (memory controllers) should not be shared by multiple sockets. socketNodes := map[idset.ID]cpuset.CPUSet{} for _, socketID := range p.sys.PackageIDs() { pkg := p.sys.Package(socketID) socketNodes[socketID] = system.CPUSetFromIDSet(idset.NewIDSet(pkg.NodeIDs()...)) } for id1, nodes1 := range socketNodes { for id2, nodes2 := range socketNodes { if id1 == id2 { continue } if shared := nodes1.Intersection(nodes2); !shared.IsEmpty() { log.Error("can't handle HW topology: sockets #%v, #%v share NUMA node(s) #%s", id1, id2, shared.String()) return policyError("unhandled HW topology: sockets #%v, #%v share NUMA node(s) #%s", id1, id2, shared.String()) } } } // NUMA nodes (memory controllers) should not be shared by multiple dies. for _, socketID := range p.sys.PackageIDs() { pkg := p.sys.Package(socketID) for _, id1 := range pkg.DieIDs() { nodes1 := idset.NewIDSet(pkg.DieNodeIDs(id1)...) for _, id2 := range pkg.DieIDs() { if id1 == id2 { continue } nodes2 := idset.NewIDSet(pkg.DieNodeIDs(id2)...) if shared := system.CPUSetFromIDSet(nodes1).Intersection(system.CPUSetFromIDSet(nodes2)); !shared.IsEmpty() { log.Error("can't handle HW topology: "+ "socket #%v, dies #%v,%v share NUMA node(s) #%s", socketID, id1, id2, shared.String()) return policyError("unhandled HW topology: "+ "socket #%v, dies #%v,#%v share NUMA node(s) #%s", socketID, id1, id2, shared.String()) } } } } // NUMA distance matrix should be symmetric. for _, from := range p.sys.NodeIDs() { for _, to := range p.sys.NodeIDs() { d1 := p.sys.NodeDistance(from, to) d2 := p.sys.NodeDistance(to, from) if d1 != d2 { log.Error("asymmetric NUMA distance (#%d, #%d): %d != %d", from, to, d1, d2) return policyError("asymmetric NUMA distance (#%d, #%d): %d != %d", from, to, d1, d2) } } } return nil } // Pick a pool and allocate resource from it to the container. 
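//
// Illustrative flow (pool names and numbers hypothetical): for a guaranteed
// container requesting 2 CPUs with an affinity towards "socket #0", the
// scored candidates might be logged as
//
//	- #0: node socket #0, score <...>, affinity: 2
//	- #1: node root, score <...>, affinity: 0
//
// and, absent a usable pool hint, pools[0] ("socket #0") is picked.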
func (p *policy) allocatePool(container cache.Container, poolHint string) (Grant, error) { var pool Node request := newRequest(container) if p.root.FreeSupply().ReservedCPUs().IsEmpty() && request.CPUType() == cpuReserved { // Fallback to allocating reserved CPUs from the shared pool // if there are no reserved CPUs. request.SetCPUType(cpuNormal) } // Assumption: in the beginning the CPUs and memory will be allocated from // the same pool. This assumption can be relaxed later, requires separate // (but connected) scoring of memory and CPU. if request.CPUType() == cpuReserved || container.GetNamespace() == kubernetes.NamespaceSystem { pool = p.root } else { affinity, err := p.calculatePoolAffinities(request.GetContainer()) if err != nil { return nil, policyError("failed to calculate affinity for container %s: %v", container.PrettyName(), err) } scores, pools := p.sortPoolsByScore(request, affinity) if log.DebugEnabled() { log.Debug("* node fitting for %s", request) for idx, n := range pools { log.Debug(" - #%d: node %s, score %s, affinity: %d", idx, n.Name(), scores[n.NodeID()], affinity[n.NodeID()]) } } if len(pools) == 0 { return nil, policyError("no suitable pool found for container %s", container.PrettyName()) } if poolHint != "" { for idx, p := range pools { if p.Name() == poolHint { log.Debug("* using hinted pool %q (#%d best fit)", poolHint, idx+1) pool = p break } } if pool == nil { log.Debug("* cannot use hinted pool %q", poolHint) } } if pool == nil { pool = pools[0] } } supply := pool.FreeSupply() grant, err := supply.Allocate(request) if err != nil { return nil, policyError("failed to allocate %s from %s: %v", request, supply.DumpAllocatable(), err) } log.Debug("allocated req '%s' to memory node '%s' (memset %s,%s,%s)", container.PrettyName(), grant.GetMemoryNode().Name(), grant.GetMemoryNode().GetMemset(memoryDRAM), grant.GetMemoryNode().GetMemset(memoryPMEM), grant.GetMemoryNode().GetMemset(memoryHBM)) // In case the workload is assigned to a memory node with multiple // child nodes, there is no guarantee that the workload will // allocate memory "nicely". Instead we'll have to make the // conservative assumption that the memory will all be allocated // from one single node, and that node can be any of the child // nodes in the system. Thus, we'll need to reserve the memory // from all child nodes, and move the containers already // assigned to the child nodes upwards in the topology tree, if // they no longer fit to the child node that they are in. In // other words, they'll need to have a wider range of memory // node options in order to fit to memory. // // // Example: // // Workload 1 and Workload 2 are running on the leaf nodes: // // +----------------+ // |Total mem: 4G | // |Total CPUs: 4 | Workload 1: // |Reserved: | // | 1.5G | 1G mem // | | // | | Workload 2: // | | // +----------------+ 0.5G mem // / \ // / \ // / \ // / \ // / \ // / \ // / \ // / \ // +----------------+ +----------------+ // |Total mem: 2G | |Total mem: 2G | // |Total CPUs: 2 | |Total CPUs: 2 | // |Reserved: | |Reserved: | // | 1G | | 0.5G | // | | | | // | | | | // | * WL 1 | | * WL 2 | // +----------------+ +----------------+ // // // Then Workload 3 comes in and is assigned to the root node. 
Memory // reservations are done on the leaf nodes: // // +----------------+ // |Total mem: 4G | // |Total CPUs: 4 | Workload 1: // |Reserved: | // | 3G | 1G mem // | | // | | Workload 2: // | * WL 3 | // +----------------+ 0.5G mem // / \ // / \ Workload 3: // / \ // / \ 1.5G mem // / \ // / \ // / \ // / \ // +----------------+ +----------------+ // |Total mem: 2G | |Total mem: 2G | // |Total CPUs: 2 | |Total CPUs: 2 | // |Reserved: | |Reserved: | // | 2.5G | | 2G | // | | | | // | | | | // | * WL 1 | | * WL 2 | // +----------------+ +----------------+ // // // Workload 1 no longer fits to the leaf node, because the total // reservation from the leaf node is over the memory maximum. // Thus, it's moved upwards in the tree to the root node. Memory // resevations are again updated accordingly: // // +----------------+ // |Total mem: 4G | // |Total CPUs: 4 | Workload 1: // |Reserved: | // | 3G | 1G mem // | | // | * WL 1 | Workload 2: // | * WL 3 | // +----------------+ 0.5G mem // / \ // / \ Workload 3: // / \ // / \ 1.5G mem // / \ // / \ // / \ // / \ // +----------------+ +----------------+ // |Total mem: 2G | |Total mem: 2G | // |Total CPUs: 2 | |Total CPUs: 2 | // |Reserved: | |Reserved: | // | 2.5G | | 3G | // | | | | // | | | | // | | | * WL 2 | // +----------------+ +----------------+ // // // Now Workload 2 doesn't fit to the leaf node either. It's also moved // to the root node: // // +----------------+ // |Total mem: 4G | // |Total CPUs: 4 | Workload 1: // |Reserved: | // | 3G | 1G mem // | * WL 2 | // | * WL 1 | Workload 2: // | * WL 3 | // +----------------+ 0.5G mem // / \ // / \ Workload 3: // / \ // / \ 1.5G mem // / \ // / \ // / \ // / \ // +----------------+ +----------------+ // |Total mem: 2G | |Total mem: 2G | // |Total CPUs: 2 | |Total CPUs: 2 | // |Reserved: | |Reserved: | // | 3G | | 3G | // | | | | // | | | | // | | | | // +----------------+ +----------------+ // // We need to analyze all existing containers which are a subset of current grant. memset := grant.GetMemoryNode().GetMemset(grant.MemoryType()) // Add an extra memory reservation to all subnodes. // TODO: no need to do any of this if no memory request grant.UpdateExtraMemoryReservation() // See how much memory reservations the workloads on the // nodes up from this one cause to the node. We only need to // analyze the workloads up until this node, because it's // guaranteed that the subtree can hold the workloads. // If it turns out that the current workloads no longer fit // to the node with the reservations from nodes from above // in the tree, move all nodes upward. Note that this // creates a reservation of the same size to the node, so in // effect the node has to be empty of its "own" workloads. // In this case move all the workloads one level up in the tree. changed := true for changed { changed = false for _, oldGrant := range p.allocations.grants { oldMemset := oldGrant.GetMemoryNode().GetMemset(grant.MemoryType()) if oldMemset.Size() < memset.Size() && memset.Has(oldMemset.Members()...) { changed, err = oldGrant.ExpandMemset() if err != nil { return nil, err } if changed { log.Debug("* moved container %s upward to node %s to guarantee memory", oldGrant.GetContainer().PrettyName(), oldGrant.GetMemoryNode().Name()) break } } } } p.allocations.grants[container.GetCacheID()] = grant p.saveAllocations() return grant, nil } // Apply the result of allocation to the requesting container. 
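//
// For example (values hypothetical): a normal-class grant of exclusive CPUs
// {4,5} plus a 500 mCPU shared portion drawn from shared CPUs {6,7} would,
// with PinCPU enabled, pin the container to cpuset "4-7" and set its CPU
// shares to correspond to the 500 mCPU portion.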
func (p *policy) applyGrant(grant Grant) { log.Debug("* applying grant %s", grant) container := grant.GetContainer() cpuType := grant.CPUType() exclusive := grant.ExclusiveCPUs() reserved := grant.ReservedCPUs() shared := grant.SharedCPUs() cpuPortion := grant.SharedPortion() cpus := "" kind := "" if cpuType == cpuNormal { if exclusive.IsEmpty() { cpus = shared.String() kind = "shared" } else { kind = "exclusive" if cpuPortion > 0 { kind += "+shared" cpus = exclusive.Union(shared).String() } else { cpus = exclusive.String() } } } else if cpuType == cpuReserved { kind = "reserved" cpus = reserved.String() cpuPortion = grant.ReservedPortion() } else { log.Debug("unsupported granted cpuType %s", cpuType) return } mems := "" if opt.PinMemory { mems = grant.Memset().String() } if opt.PinCPU { if cpus != "" { log.Debug(" => pinning to (%s) cpuset %s", kind, cpus) } else { log.Debug(" => not pinning CPUs, allocated cpuset is empty...") } container.SetCpusetCpus(cpus) // Notes: // It is extremely important to ensure that the exclusive subset of mixed // CPU allocations is really exclusive at the level of the whole system // and not just the orchestration. This is something we can't really do // from here reliably ATM. // // We set the CPU scheduling weight for the whole container (all processes // within the container) according to the container's partial allocation. // This is typically a sub-CPU allocation (< 1000 mCPU) which is meant to be // consumed by an 'infra/mgmt' process within the container from the shared subset // of CPUs assigned to the container. The container entry point or the processes // within the container are supposed to arrange things so that the 'infra' process(es) // are pinned to the shared CPUs and the 'data/performance critical' // process(es) to the exclusive CPU(s). // // With this setup the kernel will slice out the correct amount of CPU from // the shared pool for the 'infra' process as it competes with other workloads' // processes in the same pool. Also the 'data' process should run fine, since // it does not need to compete for CPU with any other processes in the system // as long as that allocation is genuinely system-wide exclusive. container.SetCPUShares(int64(cache.MilliCPUToShares(int64(cpuPortion)))) } if mems != "" { log.Debug(" => pinning to memory %s", mems) container.SetCpusetMems(mems) p.setDemotionPreferences(container, grant) } else { log.Debug(" => not pinning memory, memory set is empty...") } } // Release resources allocated by this grant. func (p *policy) releasePool(container cache.Container) (Grant, bool) { log.Debug("* releasing resources allocated to %s", container.PrettyName()) grant, ok := p.allocations.grants[container.GetCacheID()] if !ok { log.Debug(" => no grant found, nothing to do...") return nil, false } log.Debug(" => releasing grant %s...", grant) // Remove the grant from all supplies it uses. grant.Release() delete(p.allocations.grants, container.GetCacheID()) p.saveAllocations() return grant, true } // Update shared allocations affected by a grant.
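//
// For example (illustrative numbers): if containers A and B draw shared CPU
// from a pool whose sharable set is 2-7 and A is then granted CPUs 2-3
// exclusively, the pool's free sharable set shrinks to 4-7 and B must be
// re-pinned from 2-7 to 4-7. That is what the loop below does for every
// remaining grant that consumes shared capacity:
//
//	shared := other.GetCPUNode().FreeSupply().SharableCPUs()
//	other.GetContainer().SetCpusetCpus(shared.String())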
func (p *policy) updateSharedAllocations(grant *Grant) { if grant != nil { log.Debug("* updating shared allocations affected by %s", (*grant).String()) if (*grant).CPUType() == cpuReserved { log.Debug(" this grant uses reserved CPUs, does not affect shared allocations") return } } else { log.Debug("* updating shared allocations") } for _, other := range p.allocations.grants { if grant != nil { if other.GetContainer().GetCacheID() == (*grant).GetContainer().GetCacheID() { continue } } if other.CPUType() == cpuReserved { log.Debug(" => %s not affected (only reserved CPUs)...", other) continue } if other.SharedPortion() == 0 && !other.ExclusiveCPUs().IsEmpty() { log.Debug(" => %s not affected (only exclusive CPUs)...", other) continue } if opt.PinCPU { shared := other.GetCPUNode().FreeSupply().SharableCPUs() exclusive := other.ExclusiveCPUs() if exclusive.IsEmpty() { log.Debug(" => updating %s with shared CPUs of %s: %s...", other, other.GetCPUNode().Name(), shared.String()) other.GetContainer().SetCpusetCpus(shared.String()) } else { log.Debug(" => updating %s with exclusive+shared CPUs of %s: %s+%s...", other, other.GetCPUNode().Name(), exclusive.String(), shared.String()) other.GetContainer().SetCpusetCpus(exclusive.Union(shared).String()) } } } } // setDemotionPreferences sets the dynamic demotion preferences for a container. func (p *policy) setDemotionPreferences(c cache.Container, g Grant) { log.Debug("%s: setting demotion preferences...", c.PrettyName()) // System containers should not be demoted. if c.GetNamespace() == kubernetes.NamespaceSystem { c.SetPageMigration(nil) return } memType := g.GetMemoryNode().GetMemoryType() if memType&memoryDRAM == 0 || memType&memoryPMEM == 0 { c.SetPageMigration(nil) return } dram := g.GetMemoryNode().GetMemset(memoryDRAM) pmem := g.GetMemoryNode().GetMemset(memoryPMEM) log.Debug("%s: eligible for demotion from %s to %s NUMA node(s)", c.PrettyName(), dram, pmem) c.SetPageMigration(&cache.PageMigrate{ SourceNodes: dram, TargetNodes: pmem, }) } func (p *policy) filterInsufficientResources(req Request, originals []Node) []Node { sufficient := make([]Node, 0) for _, node := range originals { // TODO: Need to filter based on the memory demotion scheme here. For example, if the request is // of memory type memoryAll, the memory used might be PMEM until it's full and after that DRAM. If // it's DRAM, the amount of PMEM should not be considered and so on. How to find this out in a live // system? supply := node.FreeSupply() reqMemType := req.MemoryType() if reqMemType == memoryUnspec { // The algorithm for handling unspecified memory allocations is the same as for handling a request // with memory type all. reqMemType = memoryAll } required := req.MemAmountToAllocate() for _, memType := range []memoryType{memoryPMEM, memoryDRAM, memoryHBM} { if reqMemType&memType != 0 { extra := supply.ExtraMemoryReservation(memType) free := supply.MemoryLimit()[memType] if extra > free { continue } if required+extra <= free { sufficient = append(sufficient, node) required = 0 break } if req.ColdStart() > 0 { // For a "cold start" request, the memory request must fit completely in the PMEM. So reject the node. break } // Subtracting unsigned integers. // Here free >= extra, that is, (free - extra) is non-negative, // and required > free - extra, that is, required stays positive.
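// For example (illustrative numbers): with required=10G and a node
// offering free=8G of PMEM of which extra=2G is already promised to
// containers from above, the node can cover 6G, so required drops to
// 4G here and the loop tries to satisfy the rest from DRAM and HBM.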
required -= (free - extra) } } if required > 0 { log.Debug("%s: filtered out %s with insufficient memory", req.GetContainer().PrettyName(), node.Name()) } } return sufficient } // Score pools against the request and sort them by score. func (p *policy) sortPoolsByScore(req Request, aff map[int]int32) (map[int]Score, []Node) { scores := make(map[int]Score, p.nodeCnt) p.root.DepthFirst(func(n Node) error { scores[n.NodeID()] = n.GetScore(req) return nil }) // Filter out pools which don't have enough uncompressible resources // (memory) to satisfy the request. filteredPools := p.filterInsufficientResources(req, p.pools) sort.Slice(filteredPools, func(i, j int) bool { return p.compareScores(req, filteredPools, scores, aff, i, j) }) return scores, filteredPools } // Compare two pools by scores for allocation preference. func (p *policy) compareScores(request Request, pools []Node, scores map[int]Score, affinity map[int]int32, i int, j int) bool { node1, node2 := pools[i], pools[j] depth1, depth2 := node1.RootDistance(), node2.RootDistance() id1, id2 := node1.NodeID(), node2.NodeID() score1, score2 := scores[id1], scores[id2] cpuType := request.CPUType() isolated1, reserved1, shared1 := score1.IsolatedCapacity(), score1.ReservedCapacity(), score1.SharedCapacity() isolated2, reserved2, shared2 := score2.IsolatedCapacity(), score2.ReservedCapacity(), score2.SharedCapacity() a1 := affinityScore(affinity, node1) a2 := affinityScore(affinity, node2) log.Debug("comparing scores for %s and %s", node1.Name(), node2.Name()) log.Debug(" %s: %s, affinity score %f", node1.Name(), score1.String(), a1) log.Debug(" %s: %s, affinity score %f", node2.Name(), score2.String(), a2) // // Notes: // // Our scoring/score sorting algorithm is: // // 1) - insufficient isolated, reserved or shared capacity loses // 2) - if we have affinity, the higher affinity score wins // 3) - if only one node matches the memory type request, it wins // 4) - if we have topology hints // * better hint score wins // * for a tie, prefer the lower node then the smaller id // 5) - if a node is lower in the tree it wins // 6) - for reserved allocations // * more unallocated reserved capacity per colocated container wins // 7) - for (non-reserved) isolated allocations // * more isolated capacity wins // * for a tie, prefer the smaller id // 8) - for (non-reserved) exclusive allocations // * more slicable (shared) capacity wins // * for a tie, prefer the smaller id // 9) - for (non-reserved) shared-only allocations // * fewer colocated containers win // * for a tie prefer more shared capacity // 10) - lower id wins // // Before this comparison is reached, nodes with insufficient uncompressible resources // (memory) have been filtered out. 
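//
// For example (illustrative): if node A has affinity score 3.0 and
// node B only 1.5, A wins at step 2 and the later steps are never
// consulted; only when every earlier step is a tie does the comparison
// fall through to the deterministic node-id tie-breaker of step 10.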
// 1) a node with insufficient isolated or shared capacity loses switch { case cpuType == cpuNormal && ((isolated2 < 0 && isolated1 >= 0) || (shared2 <= 0 && shared1 > 0)): log.Debug(" => %s loses, insufficient isolated or shared", node2.Name()) return true case cpuType == cpuNormal && ((isolated1 < 0 && isolated2 >= 0) || (shared1 <= 0 && shared2 > 0)): log.Debug(" => %s loses, insufficient isolated or shared", node1.Name()) return false case cpuType == cpuReserved && reserved2 < 0 && reserved1 >= 0: log.Debug(" => %s loses, insufficient reserved", node2.Name()) return true case cpuType == cpuReserved && reserved1 < 0 && reserved2 >= 0: log.Debug(" => %s loses, insufficient reserved", node1.Name()) return false } log.Debug(" - isolated/reserved/shared insufficiency is a TIE") // 2) higher affinity score wins if a1 > a2 { log.Debug(" => %s loses on affinity", node2.Name()) return true } if a2 > a1 { log.Debug(" => %s loses on affinity", node1.Name()) return false } log.Debug(" - affinity is a TIE") // 3) matching memory type wins if reqType := request.MemoryType(); reqType != memoryUnspec { if node1.HasMemoryType(reqType) && !node2.HasMemoryType(reqType) { log.Debug(" => %s WINS on memory type", node1.Name()) return true } if !node1.HasMemoryType(reqType) && node2.HasMemoryType(reqType) { log.Debug(" => %s WINS on memory type", node2.Name()) return false } log.Debug(" - memory type is a TIE") } // 4) better topology hint score wins hScores1 := score1.HintScores() if len(hScores1) > 0 { hScores2 := score2.HintScores() hs1, nz1 := combineHintScores(hScores1) hs2, nz2 := combineHintScores(hScores2) if hs1 > hs2 { log.Debug(" => %s WINS on hints", node1.Name()) return true } if hs2 > hs1 { log.Debug(" => %s WINS on hints", node2.Name()) return false } log.Debug(" - hints are a TIE") if hs1 == 0 { if nz1 > nz2 { log.Debug(" => %s WINS on non-zero hints", node1.Name()) return true } if nz2 > nz1 { log.Debug(" => %s WINS on non-zero hints", node2.Name()) return false } log.Debug(" - non-zero hints are a TIE") } // for a tie, prefer lower nodes and smaller ids if hs1 == hs2 && nz1 == nz2 && (hs1 != 0 || nz1 != 0) { if depth1 > depth2 { log.Debug(" => %s WINS as it is lower", node1.Name()) return true } if depth1 < depth2 { log.Debug(" => %s WINS as it is lower", node2.Name()) return false } log.Debug(" => %s WINS based on equal hint scores, lower id", map[bool]string{true: node1.Name(), false: node2.Name()}[id1 < id2]) return id1 < id2 } } // 5) a lower node wins if depth1 > depth2 { log.Debug(" => %s WINS on depth", node1.Name()) return true } if depth1 < depth2 { log.Debug(" => %s WINS on depth", node2.Name()) return false } log.Debug(" - depth is a TIE") if request.CPUType() == cpuReserved { // 6) if requesting reserved CPUs, more reserved // capacity per colocated container wins. Reserved // CPUs cannot be precisely accounted for, as they also // run BestEffort containers that do not carry // information on their CPU needs.
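// For example (illustrative numbers): a node with 2000m of unallocated
// reserved capacity and 3 colocated containers offers 2000/(3+1) = 500m
// per container, while a node with 1500m and 1 colocated container
// offers 1500/(1+1) = 750m and therefore wins. The +1 accounts for the
// container being placed; integer division is good enough for this
// heuristic.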
if reserved1/(score1.Colocated()+1) > reserved2/(score2.Colocated()+1) { return true } if reserved2/(score2.Colocated()+1) > reserved1/(score1.Colocated()+1) { return false } log.Debug(" - reserved capacity is a TIE") } else if request.CPUType() == cpuNormal { // 7) more isolated capacity wins if request.Isolate() && (isolated1 > 0 || isolated2 > 0) { if isolated1 > isolated2 { return true } if isolated2 > isolated1 { return false } log.Debug(" => %s WINS based on equal isolated capacity, lower id", map[bool]string{true: node1.Name(), false: node2.Name()}[id1 < id2]) return id1 < id2 } // 8) more slicable shared capacity wins if request.FullCPUs() > 0 && (shared1 > 0 || shared2 > 0) { if shared1 > shared2 { log.Debug(" => %s WINS on more slicable capacity", node1.Name()) return true } if shared2 > shared1 { log.Debug(" => %s WINS on more slicable capacity", node2.Name()) return false } log.Debug(" => %s WINS based on equal slicable capacity, lower id", map[bool]string{true: node1.Name(), false: node2.Name()}[id1 < id2]) return id1 < id2 } // 9) fewer colocated containers win if score1.Colocated() < score2.Colocated() { log.Debug(" => %s WINS on colocation score", node1.Name()) return true } if score2.Colocated() < score1.Colocated() { log.Debug(" => %s WINS on colocation score", node2.Name()) return false } log.Debug(" - colocation score is a TIE") // more shared capacity wins if shared1 > shared2 { log.Debug(" => %s WINS on more shared capacity", node1.Name()) return true } if shared2 > shared1 { log.Debug(" => %s WINS on more shared capacity", node2.Name()) return false } } // 10) lower id wins log.Debug(" => %s WINS based on lower id", map[bool]string{true: node1.Name(), false: node2.Name()}[id1 < id2]) return id1 < id2 } // affinityScore calculates the 'goodness' of the affinity for a node. func affinityScore(affinities map[int]int32, node Node) float64 { Q := 0.75 // Calculate affinity for every node as a combination of // affinities of the nodes on the path from the node to // the root and the nodes in the subtree under the node. // // The combined affinity for node n is Sum_x(A_x*D_x), // where for every node x, A_x is the affinity for x and // D_x is Q ** (number of links from node to x). IOW, the // effective affinity is the sum of the affinity of n and // the affinity of each node x of the above mentioned set // diluted proportionally to the distance of x to n, with // Q being 0.75. var score float64 for n, q := node.Parent(), Q; !n.IsNil(); n, q = n.Parent(), q*Q { a := affinities[n.NodeID()] score += q * float64(a) } node.BreadthFirst(func(n Node) error { diff := float64(n.RootDistance() - node.RootDistance()) q := math.Pow(Q, diff) a := affinities[n.NodeID()] score += q * float64(a) return nil }) return score } // combineHintScores calculates combined full and zero-filtered hint scores. func combineHintScores(scores map[string]float64) (float64, float64) { if len(scores) == 0 { return 0.0, 0.0 } combined, filtered := 1.0, 0.0 for _, score := range scores { combined *= score if score != 0.0 { if filtered == 0.0 { filtered = score } else { filtered *= score } } } return combined, filtered } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/topology-aware/pools_test.go ================================================ // Copyright 2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License.
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package topologyaware import ( "fmt" "os" "path" "testing" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache" policyapi "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy" v1 "k8s.io/api/core/v1" resapi "k8s.io/apimachinery/pkg/api/resource" system "github.com/intel/cri-resource-manager/pkg/sysfs" "github.com/intel/cri-resource-manager/pkg/utils" "github.com/intel/cri-resource-manager/pkg/utils/cpuset" ) func findNodeWithID(id int, nodes []Node) Node { for _, node := range nodes { if node.NodeID() == id { return node } } panic("No node found with id " + fmt.Sprintf("%d", id)) } func findNodeWithName(name string, nodes []Node) Node { for _, node := range nodes { if node.Name() == name { return node } } panic("No node found with name " + name) } func setLinks(nodes []Node, tree map[int][]int) { hasParent := map[int]struct{}{} for parent, children := range tree { parentNode := findNodeWithID(parent, nodes) for _, child := range children { childNode := findNodeWithID(child, nodes) childNode.LinkParent(parentNode) hasParent[child] = struct{}{} } } orphans := []int{} for id := range tree { if _, ok := hasParent[id]; !ok { node := findNodeWithID(id, nodes) node.LinkParent(nilnode) orphans = append(orphans, id) } } if len(orphans) != 1 { panic(fmt.Sprintf("expected one root node, got %d with IDs %v", len(orphans), orphans)) } } func TestMemoryLimitFiltering(t *testing.T) { // Test the scoring algorithm with synthetic data. The assumptions are: // 1. The first node in "nodes" is the root of the tree. 
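// 2. The pool tree is encoded as a parent -> children map of node IDs;
//    setLinks() links children to parents and expects exactly one root.
//
// For example (synthetic IDs, mirroring the cases below):
//
//	tree: map[int][]int{100: {101, 102}, 101: {}, 102: {}}
//
// describes a root node 100 with two leaf children 101 and 102.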
tcases := []struct { name string nodes []Node numaNodes []system.Node req Request affinities map[int]int32 tree map[int][]int expectedRemainingNodes []int }{ { name: "single node memory limit (fits)", nodes: []Node{ &numanode{ node: node{ id: 100, name: "testnode0", kind: UnknownNode, noderes: newSupply(&node{}, cpuset.New(), cpuset.New(), cpuset.New(), 0, 0, createMemoryMap(10001, 0, 0), createMemoryMap(0, 0, 0)), freeres: newSupply(&node{}, cpuset.New(), cpuset.New(), cpuset.New(), 0, 0, createMemoryMap(10001, 0, 0), createMemoryMap(0, 0, 0)), }, id: 0, // system node id }, }, numaNodes: []system.Node{ &mockSystemNode{id: 0, memFree: 10001, memTotal: 10001}, }, req: &request{ memReq: 10000, memLim: 10000, memType: defaultMemoryType, container: &mockContainer{}, }, expectedRemainingNodes: []int{100}, tree: map[int][]int{100: {}}, }, { name: "single node memory limit (doesn't fit)", nodes: []Node{ &numanode{ node: node{ id: 100, name: "testnode0", kind: UnknownNode, noderes: newSupply(&node{}, cpuset.New(), cpuset.New(), cpuset.New(), 0, 0, createMemoryMap(9999, 0, 0), createMemoryMap(0, 0, 0)), freeres: newSupply(&node{}, cpuset.New(), cpuset.New(), cpuset.New(), 0, 0, createMemoryMap(9999, 0, 0), createMemoryMap(0, 0, 0)), }, id: 0, // system node id }, }, numaNodes: []system.Node{ &mockSystemNode{id: 0, memFree: 9999, memTotal: 9999}, }, req: &request{ memReq: 10000, memLim: 10000, memType: defaultMemoryType, container: &mockContainer{}, }, expectedRemainingNodes: []int{}, tree: map[int][]int{100: {}}, }, { name: "two node memory limit (fits to leaf)", nodes: []Node{ &virtualnode{ node: node{ id: 100, name: "testnode0", kind: UnknownNode, noderes: newSupply(&node{}, cpuset.New(), cpuset.New(), cpuset.New(), 0, 0, createMemoryMap(10001, 0, 0), createMemoryMap(0, 0, 0)), freeres: newSupply(&node{}, cpuset.New(), cpuset.New(), cpuset.New(), 0, 0, createMemoryMap(10001, 0, 0), createMemoryMap(0, 0, 0)), }, }, &numanode{ node: node{ id: 101, name: "testnode1", kind: UnknownNode, noderes: newSupply(&node{}, cpuset.New(), cpuset.New(), cpuset.New(), 0, 0, createMemoryMap(10001, 0, 0), createMemoryMap(0, 0, 0)), freeres: newSupply(&node{}, cpuset.New(), cpuset.New(), cpuset.New(), 0, 0, createMemoryMap(10001, 0, 0), createMemoryMap(0, 0, 0)), }, id: 0, // system node id }, }, numaNodes: []system.Node{ &mockSystemNode{id: 0, memFree: 10001, memTotal: 10001}, }, req: &request{ memReq: 10000, memLim: 10000, memType: defaultMemoryType, container: &mockContainer{}, }, expectedRemainingNodes: []int{100, 101}, tree: map[int][]int{100: {101}, 101: {}}, }, { name: "three node memory limit (fits to root)", nodes: []Node{ &virtualnode{ node: node{ id: 100, name: "testnode0", kind: UnknownNode, noderes: newSupply(&node{}, cpuset.New(), cpuset.New(), cpuset.New(), 0, 0, createMemoryMap(12000, 0, 0), createMemoryMap(0, 0, 0)), freeres: newSupply(&node{}, cpuset.New(), cpuset.New(), cpuset.New(), 0, 0, createMemoryMap(12000, 0, 0), createMemoryMap(0, 0, 0)), }, }, &numanode{ node: node{ id: 101, name: "testnode1", kind: UnknownNode, noderes: newSupply(&node{}, cpuset.New(), cpuset.New(), cpuset.New(), 0, 0, createMemoryMap(6000, 0, 0), createMemoryMap(0, 0, 0)), freeres: newSupply(&node{}, cpuset.New(), cpuset.New(), cpuset.New(), 0, 0, createMemoryMap(6000, 0, 0), createMemoryMap(0, 0, 0)), }, id: 0, // system node id }, &numanode{ node: node{ id: 102, name: "testnode2", kind: UnknownNode, noderes: newSupply(&node{}, cpuset.New(), cpuset.New(), cpuset.New(), 0, 0, createMemoryMap(6000, 0, 0), 
createMemoryMap(0, 0, 0)), freeres: newSupply(&node{}, cpuset.New(), cpuset.New(), cpuset.New(), 0, 0, createMemoryMap(6000, 0, 0), createMemoryMap(0, 0, 0)), }, id: 1, // system node id }, }, numaNodes: []system.Node{ &mockSystemNode{id: 0, memFree: 6000, memTotal: 6000}, &mockSystemNode{id: 1, memFree: 6000, memTotal: 6000}, }, req: &request{ memReq: 10000, memLim: 10000, memType: defaultMemoryType, container: &mockContainer{}, }, expectedRemainingNodes: []int{100}, tree: map[int][]int{100: {101, 102}, 101: {}, 102: {}}, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { setLinks(tc.nodes, tc.tree) policy := &policy{ sys: &mockSystem{ nodes: tc.numaNodes, }, pools: tc.nodes, cache: &mockCache{}, root: tc.nodes[0], nodeCnt: len(tc.nodes), allocations: allocations{}, } // back pointers for _, node := range tc.nodes { switch node.(type) { case *numanode: numaNode := node.(*numanode) numaNode.self.node = numaNode noderes := numaNode.noderes.(*supply) noderes.node = node freeres := numaNode.freeres.(*supply) freeres.node = node numaNode.policy = policy case *virtualnode: virtualNode := node.(*virtualnode) virtualNode.self.node = virtualNode noderes := virtualNode.noderes.(*supply) noderes.node = node freeres := virtualNode.freeres.(*supply) freeres.node = node virtualNode.policy = policy } } policy.allocations.policy = policy scores, filteredPools := policy.sortPoolsByScore(tc.req, tc.affinities) fmt.Printf("scores: %v, remaining pools: %v\n", scores, filteredPools) if len(filteredPools) != len(tc.expectedRemainingNodes) { t.Errorf("Wrong nodes in the filtered pool: expected %v but got %v", tc.expectedRemainingNodes, filteredPools) } for _, id := range tc.expectedRemainingNodes { found := false for _, node := range filteredPools { if node.NodeID() == id { found = true break } } if !found { t.Errorf("Did not find id %d in filtered pools: %v", id, filteredPools) } } }) } } func TestPoolCreation(t *testing.T) { // Test pool creation with "real" sysfs data. // Create a temporary directory for the test data. dir, err := os.MkdirTemp("", "cri-resource-manager-test-sysfs-") if err != nil { panic(err) } defer os.RemoveAll(dir) // Uncompress the test data to the directory. 
err = utils.UncompressTbz2(path.Join("testdata", "sysfs.tar.bz2"), dir) if err != nil { panic(err) } tcases := []struct { path string name string req Request affinities map[int]int32 expectedRemainingNodes []int expectedFirstNodeMemory memoryType expectedLeafNodeCPUs int expectedRootNodeCPUs int // TODO: expectedRootNodeMemory int }{ { path: path.Join(dir, "sysfs", "desktop", "sys"), name: "sysfs pool creation from a desktop system", req: &request{ memReq: 10000, memLim: 10000, memType: memoryAll, container: &mockContainer{}, }, expectedRemainingNodes: []int{0}, expectedFirstNodeMemory: memoryDRAM, expectedLeafNodeCPUs: 20, expectedRootNodeCPUs: 20, }, { path: path.Join(dir, "sysfs", "server", "sys"), name: "sysfs pool creation from a server system", req: &request{ memReq: 10000, memLim: 10000, memType: memoryDRAM, container: &mockContainer{}, }, expectedRemainingNodes: []int{0, 1, 2, 3, 4, 5, 6}, expectedFirstNodeMemory: memoryDRAM | memoryPMEM, expectedLeafNodeCPUs: 28, expectedRootNodeCPUs: 112, }, { path: path.Join(dir, "sysfs", "server", "sys"), name: "pmem request on a server system", req: &request{ memReq: 10000, memLim: 10000, memType: memoryDRAM | memoryPMEM, container: &mockContainer{}, }, expectedRemainingNodes: []int{0, 1, 2, 3, 4, 5, 6}, expectedFirstNodeMemory: memoryDRAM | memoryPMEM, expectedLeafNodeCPUs: 28, expectedRootNodeCPUs: 112, }, { path: path.Join(dir, "sysfs", "4-socket-server-nosnc", "sys"), name: "sysfs pool creation from a 4 socket server with SNC disabled", req: &request{ memReq: 10000, memLim: 10000, memType: memoryAll, container: &mockContainer{}, }, expectedRemainingNodes: []int{0, 1, 2, 3, 4}, expectedFirstNodeMemory: memoryDRAM, expectedLeafNodeCPUs: 36, expectedRootNodeCPUs: 36 * 4, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { sys, err := system.DiscoverSystemAt(tc.path) if err != nil { panic(err) } reserved, _ := resapi.ParseQuantity("750m") policyOptions := &policyapi.BackendOptions{ Cache: &mockCache{}, System: sys, Reserved: policyapi.ConstraintSet{ policyapi.DomainCPU: reserved, }, } log.EnableDebug() policy := CreateTopologyAwarePolicy(policyOptions).(*policy) if policy.root.GetSupply().SharableCPUs().Size()+policy.root.GetSupply().IsolatedCPUs().Size()+policy.root.GetSupply().ReservedCPUs().Size() != tc.expectedRootNodeCPUs { t.Errorf("Expected %d CPUs, got %d", tc.expectedRootNodeCPUs, policy.root.GetSupply().SharableCPUs().Size()+policy.root.GetSupply().IsolatedCPUs().Size()+policy.root.GetSupply().ReservedCPUs().Size()) } for _, p := range policy.pools { if p.IsLeafNode() { if len(p.Children()) != 0 { t.Errorf("Leaf node %v had %d children", p, len(p.Children())) } if p.GetSupply().SharableCPUs().Size()+p.GetSupply().IsolatedCPUs().Size()+p.GetSupply().ReservedCPUs().Size() != tc.expectedLeafNodeCPUs { t.Errorf("Expected %d CPUs, got %d (%s)", tc.expectedLeafNodeCPUs, p.GetSupply().SharableCPUs().Size()+p.GetSupply().IsolatedCPUs().Size()+p.GetSupply().ReservedCPUs().Size(), p.GetSupply().DumpCapacity()) } } } scores, filteredPools := policy.sortPoolsByScore(tc.req, tc.affinities) fmt.Printf("scores: %v, remaining pools: %v\n", scores, filteredPools) if len(filteredPools) != len(tc.expectedRemainingNodes) { t.Errorf("Wrong number of nodes in the filtered pool: expected %d but got %d", len(tc.expectedRemainingNodes), len(filteredPools)) } for _, id := range tc.expectedRemainingNodes { found := false for _, node := range filteredPools { if node.NodeID() == id { found = true break } } if !found { t.Errorf("Did not find id 
%d in filtered pools: %s", id, filteredPools) } } if len(filteredPools) > 0 && filteredPools[0].GetMemoryType() != tc.expectedFirstNodeMemory { t.Errorf("Expected first node memory type %v, got %v", tc.expectedFirstNodeMemory, filteredPools[0].GetMemoryType()) } }) } } func TestWorkloadPlacement(t *testing.T) { // Do some workloads (containers) and see how they are placed in the // server system. // Create a temporary directory for the test data. dir, err := os.MkdirTemp("", "cri-resource-manager-test-sysfs-") if err != nil { panic(err) } defer os.RemoveAll(dir) // Uncompress the test data to the directory. err = utils.UncompressTbz2(path.Join("testdata", "sysfs.tar.bz2"), dir) if err != nil { panic(err) } tcases := []struct { path string name string req Request affinities map[int]int32 expectedRemainingNodes []int expectedLeafNode bool }{ { path: path.Join(dir, "sysfs", "server", "sys"), name: "workload placement on a server system leaf node", req: &request{ memReq: 10000, memLim: 10000, memType: memoryUnspec, isolate: false, full: 25, // 28 - 2 isolated = 26: but fully exhausting the shared CPU subpool is disallowed container: &mockContainer{}, }, expectedRemainingNodes: []int{0, 1, 2, 3, 4, 5, 6}, expectedLeafNode: true, }, { path: path.Join(dir, "sysfs", "server", "sys"), name: "workload placement on a server system root node: CPUs don't fit to leaf", req: &request{ memReq: 10000, memLim: 10000, memType: memoryUnspec, isolate: false, full: 29, container: &mockContainer{}, }, expectedRemainingNodes: []int{0, 1, 2, 3, 4, 5, 6}, expectedLeafNode: false, }, { path: path.Join(dir, "sysfs", "server", "sys"), name: "workload placement on a server system root node: memory doesn't fit to leaf", req: &request{ memReq: 190000000000, memLim: 190000000000, memType: memoryUnspec, isolate: false, full: 28, container: &mockContainer{}, }, expectedRemainingNodes: []int{2, 6}, expectedLeafNode: false, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { sys, err := system.DiscoverSystemAt(tc.path) if err != nil { panic(err) } reserved, _ := resapi.ParseQuantity("750m") policyOptions := &policyapi.BackendOptions{ Cache: &mockCache{}, System: sys, Reserved: policyapi.ConstraintSet{ policyapi.DomainCPU: reserved, }, } log.EnableDebug() policy := CreateTopologyAwarePolicy(policyOptions).(*policy) scores, filteredPools := policy.sortPoolsByScore(tc.req, tc.affinities) fmt.Printf("scores: %v, remaining pools: %v\n", scores, filteredPools) if len(filteredPools) != len(tc.expectedRemainingNodes) { t.Errorf("Wrong number of nodes in the filtered pool: expected %d but got %d", len(tc.expectedRemainingNodes), len(filteredPools)) } for _, id := range tc.expectedRemainingNodes { found := false for _, node := range filteredPools { if node.NodeID() == id { found = true break } } if !found { t.Errorf("Did not find id %d in filtered pools: %s", id, filteredPools) } } if filteredPools[0].IsLeafNode() != tc.expectedLeafNode { t.Errorf("Workload should have been placed in a leaf node: %t", tc.expectedLeafNode) } }) } } func TestContainerMove(t *testing.T) { // In case there's not enough memory to guarantee that the // containers running on child nodes won't get OOM killed, they need // to be moved upwards in the tree. // Create a temporary directory for the test data. dir, err := os.MkdirTemp("", "cri-resource-manager-test-sysfs-") if err != nil { panic(err) } defer os.RemoveAll(dir) // Uncompress the test data to the directory. 
err = utils.UncompressTbz2(path.Join("testdata", "sysfs.tar.bz2"), dir) if err != nil { panic(err) } tcases := []struct { path string name string container1 cache.Container container2 cache.Container container3 cache.Container affinities map[int]int32 expectedLeafNodeForContainer1 bool expectedLeafNodeForContainer2 bool expectedLeafNodeForContainer3 bool expectedChangeForContainer1 bool expectedChangeForContainer2 bool expectedChangeForContainer3 bool }{ { path: path.Join(dir, "sysfs", "server", "sys"), name: "workload placement on a server system leaf node", container1: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Limits: v1.ResourceList{ v1.ResourceCPU: resapi.MustParse("2"), v1.ResourceMemory: resapi.MustParse("1000"), }, }, returnValueForGetCacheID: "first", }, container2: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Limits: v1.ResourceList{ v1.ResourceCPU: resapi.MustParse("2"), v1.ResourceMemory: resapi.MustParse("1000"), }, }, returnValueForGetCacheID: "second", }, container3: &mockContainer{ returnValueForGetResourceRequirements: v1.ResourceRequirements{ Limits: v1.ResourceList{ v1.ResourceCPU: resapi.MustParse("2"), v1.ResourceMemory: resapi.MustParse("1000"), }, }, returnValueForGetCacheID: "third", }, expectedLeafNodeForContainer1: true, expectedLeafNodeForContainer2: true, expectedLeafNodeForContainer3: true, }, { path: path.Join(dir, "sysfs", "server", "sys"), name: "workload placement on a server system non-leaf node", container1: &mockContainer{ name: "c1", returnValueForGetResourceRequirements: v1.ResourceRequirements{ Limits: v1.ResourceList{ v1.ResourceCPU: resapi.MustParse("2"), v1.ResourceMemory: resapi.MustParse("1000"), }, }, returnValueForGetCacheID: "first", }, container2: &mockContainer{ name: "c2", returnValueForGetResourceRequirements: v1.ResourceRequirements{ Limits: v1.ResourceList{ v1.ResourceCPU: resapi.MustParse("2"), v1.ResourceMemory: resapi.MustParse("190000000000"), // 180 GB }, }, returnValueForGetCacheID: "second", }, container3: &mockContainer{ name: "c3", returnValueForGetResourceRequirements: v1.ResourceRequirements{ Limits: v1.ResourceList{ v1.ResourceCPU: resapi.MustParse("2"), v1.ResourceMemory: resapi.MustParse("140000000000"), // 130 GB }, }, returnValueForGetCacheID: "third", }, expectedLeafNodeForContainer1: false, expectedLeafNodeForContainer2: false, expectedLeafNodeForContainer3: false, expectedChangeForContainer1: true, }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { sys, err := system.DiscoverSystemAt(tc.path) if err != nil { panic(err) } reserved, _ := resapi.ParseQuantity("750m") policyOptions := &policyapi.BackendOptions{ Cache: &mockCache{}, System: sys, Reserved: policyapi.ConstraintSet{ policyapi.DomainCPU: reserved, }, } log.EnableDebug() policy := CreateTopologyAwarePolicy(policyOptions).(*policy) grant1, err := policy.allocatePool(tc.container1, "") if err != nil { panic(err) } fmt.Printf("grant 1 memsets: dram %s, pmem %s\n", grant1.GetMemoryNode().GetMemset(memoryDRAM), grant1.GetMemoryNode().GetMemset(memoryPMEM)) grant2, err := policy.allocatePool(tc.container2, "") if err != nil { panic(err) } fmt.Printf("grant 2 memsets: dram %s, pmem %s\n", grant2.GetMemoryNode().GetMemset(memoryDRAM), grant2.GetMemoryNode().GetMemset(memoryPMEM)) grant3, err := policy.allocatePool(tc.container3, "") if err != nil { panic(err) } fmt.Printf("grant 3 memsets: dram %s, pmem %s\n", grant3.GetMemoryNode().GetMemset(memoryDRAM), 
grant3.GetMemoryNode().GetMemset(memoryPMEM)) if (grant1.GetCPUNode().IsSameNode(grant1.GetMemoryNode())) && tc.expectedChangeForContainer1 { t.Errorf("Workload 1 should have been relocated: %t, node: %s", tc.expectedChangeForContainer1, grant1.GetMemoryNode().Name()) } if (grant2.GetCPUNode().IsSameNode(grant2.GetMemoryNode())) && tc.expectedChangeForContainer2 { t.Errorf("Workload 2 should have been relocated: %t, node: %s", tc.expectedChangeForContainer2, grant2.GetMemoryNode().Name()) } if (grant3.GetCPUNode().IsSameNode(grant3.GetMemoryNode())) && tc.expectedChangeForContainer3 { t.Errorf("Workload 3 should have been relocated: %t, node: %s", tc.expectedChangeForContainer3, grant3.GetMemoryNode().Name()) } if grant1.GetMemoryNode().IsLeafNode() != tc.expectedLeafNodeForContainer1 { t.Errorf("Workload 1 should have been placed in a leaf node: %t, node: %s", tc.expectedLeafNodeForContainer1, grant1.GetMemoryNode().Name()) } if grant2.GetMemoryNode().IsLeafNode() != tc.expectedLeafNodeForContainer2 { t.Errorf("Workload 2 should have been placed in a leaf node: %t, node: %s", tc.expectedLeafNodeForContainer2, grant2.GetMemoryNode().Name()) } if grant3.GetMemoryNode().IsLeafNode() != tc.expectedLeafNodeForContainer3 { t.Errorf("Workload 3 should have been placed in a leaf node: %t, node: %s", tc.expectedLeafNodeForContainer3, grant3.GetMemoryNode().Name()) } }) } } func TestAffinities(t *testing.T) { // // Test how (already pre-calculated) affinities affect workload placement. // // Create a temporary directory for the test data. dir, err := os.MkdirTemp("", "cri-resource-manager-test-sysfs-") if err != nil { panic(err) } defer os.RemoveAll(dir) // Uncompress the test data to the directory. err = utils.UncompressTbz2(path.Join("testdata", "sysfs.tar.bz2"), dir) if err != nil { panic(err) } tcases := []struct { path string name string req Request affinities map[string]int32 expected string }{ { path: path.Join(dir, "sysfs", "server", "sys"), name: "no affinities", req: &request{ memReq: 10000, memLim: 10000, memType: memoryUnspec, isolate: false, full: 3, container: &mockContainer{}, }, affinities: map[string]int32{}, expected: "NUMA node #2", }, { path: path.Join(dir, "sysfs", "server", "sys"), name: "reserved - no affinities", req: &request{ cpuType: cpuReserved, memReq: 10000, memLim: 10000, memType: memoryUnspec, isolate: false, full: 0, container: &mockContainer{}, }, affinities: map[string]int32{}, expected: "NUMA node #0", }, { path: path.Join(dir, "sysfs", "server", "sys"), name: "affinity to NUMA node #1", req: &request{ memReq: 10000, memLim: 10000, memType: memoryUnspec, isolate: false, full: 3, container: &mockContainer{}, }, affinities: map[string]int32{ "NUMA node #1": 1, }, expected: "NUMA node #1", }, { path: path.Join(dir, "sysfs", "server", "sys"), name: "affinity to socket #1", req: &request{ memReq: 10000, memLim: 10000, memType: memoryUnspec, isolate: false, full: 3, container: &mockContainer{}, }, affinities: map[string]int32{ "socket #1": 1, }, expected: "socket #1", }, { path: path.Join(dir, "sysfs", "server", "sys"), name: "equal affinities to NUMA node #1, socket #1", req: &request{ memReq: 10000, memLim: 10000, memType: memoryUnspec, isolate: false, full: 3, container: &mockContainer{}, }, affinities: map[string]int32{ "socket #1": 1, "NUMA node #1": 1, }, expected: "NUMA node #1", }, { path: path.Join(dir, "sysfs", "server", "sys"), name: "equal affinities to NUMA node #1, NUMA node #3", req: &request{ memReq: 10000, memLim: 10000, memType: memoryUnspec, isolate: 
false, full: 3, container: &mockContainer{}, }, affinities: map[string]int32{ "NUMA node #1": 1, "NUMA node #3": 1, }, expected: "socket #1", }, { path: path.Join(dir, "sysfs", "server", "sys"), name: "double affinity to NUMA node #1 vs. #3", req: &request{ memReq: 10000, memLim: 10000, memType: memoryUnspec, isolate: false, full: 3, container: &mockContainer{}, }, affinities: map[string]int32{ "NUMA node #1": 2, "NUMA node #3": 1, }, expected: "socket #1", }, { path: path.Join(dir, "sysfs", "server", "sys"), name: "triple affinity to NUMA node #1 vs. #3", req: &request{ memReq: 10000, memLim: 10000, memType: memoryUnspec, isolate: false, full: 3, container: &mockContainer{}, }, affinities: map[string]int32{ "NUMA node #1": 3, "NUMA node #3": 1, }, expected: "NUMA node #1", }, { path: path.Join(dir, "sysfs", "server", "sys"), name: "double affinity to NUMA node #0,#3 vs. socket #1", req: &request{ memReq: 10000, memLim: 10000, memType: memoryUnspec, isolate: false, full: 3, container: &mockContainer{}, }, affinities: map[string]int32{ "NUMA node #0": 2, "NUMA node #3": 2, "socket #1": 1, }, expected: "root", }, { path: path.Join(dir, "sysfs", "server", "sys"), name: "equal affinity to NUMA node #0,#3 vs. socket #1", req: &request{ memReq: 10000, memLim: 10000, memType: memoryUnspec, isolate: false, full: 3, container: &mockContainer{}, }, affinities: map[string]int32{ "NUMA node #0": 1, "NUMA node #3": 1, "socket #1": 1, }, expected: "root", }, { path: path.Join(dir, "sysfs", "server", "sys"), name: "half the affinity to NUMA node #0,#3 vs. socket #1", req: &request{ memReq: 10000, memLim: 10000, memType: memoryUnspec, isolate: false, full: 3, container: &mockContainer{}, }, affinities: map[string]int32{ "NUMA node #0": 1, "NUMA node #3": 1, "socket #1": 2, }, expected: "socket #1", }, } for _, tc := range tcases { t.Run(tc.name, func(t *testing.T) { sys, err := system.DiscoverSystemAt(tc.path) if err != nil { panic(err) } reserved, _ := resapi.ParseQuantity("750m") policyOptions := &policyapi.BackendOptions{ Cache: &mockCache{}, System: sys, Reserved: policyapi.ConstraintSet{ policyapi.DomainCPU: reserved, }, } log.EnableDebug() policy := CreateTopologyAwarePolicy(policyOptions).(*policy) affinities := map[int]int32{} for name, weight := range tc.affinities { affinities[findNodeWithName(name, policy.pools).NodeID()] = weight } log.EnableDebug() scores, filteredPools := policy.sortPoolsByScore(tc.req, affinities) fmt.Printf("scores: %v, remaining pools: %v\n", scores, filteredPools) if len(filteredPools) < 1 { t.Errorf("pool scoring failed to find any pools") } node := filteredPools[0] if node.Name() != tc.expected { t.Errorf("expected best pool %s, got %s", tc.expected, node.Name()) } }) } } ================================================ FILE: pkg/cri/resource-manager/policy/builtin/topology-aware/resources.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
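// The types below model the topology-aware policy's resource bookkeeping:
// a Supply describes what a pool node has to offer, a Request what a
// container asks for, a Grant what was actually handed out, and a Score
// how well a Supply can satisfy a Request. As a rough, illustrative
// sketch of the allocation round trip (not part of the original source):
//
//	supply := pool.FreeSupply()
//	grant, err := supply.Allocate(newRequest(container))
//	if err == nil {
//		// ... and eventually, when the container goes away:
//		grant.Release()
//	}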
package topologyaware import ( "fmt" "strconv" "time" v1 "k8s.io/api/core/v1" "github.com/intel/cri-resource-manager/pkg/cpuallocator" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache" "github.com/intel/cri-resource-manager/pkg/topology" "github.com/intel/cri-resource-manager/pkg/utils/cpuset" idset "github.com/intel/goresctrl/pkg/utils" ) // Supply represents available CPU and memory capacity of a node. type Supply interface { // GetNode returns the node supplying this capacity. GetNode() Node // Clone creates a copy of this supply. Clone() Supply // IsolatedCPUs returns the isolated cpuset in this supply. IsolatedCPUs() cpuset.CPUSet // ReservedCPUs returns the reserved cpuset in this supply. ReservedCPUs() cpuset.CPUSet // SharableCPUs returns the sharable cpuset in this supply. SharableCPUs() cpuset.CPUSet // GrantedReserved returns the locally granted reserved CPU capacity in this supply. GrantedReserved() int // GrantedShared returns the locally granted shared CPU capacity in this supply. GrantedShared() int // GrantedMemory returns the locally granted memory capacity in this supply. GrantedMemory(memoryType) uint64 // Cumulate cumulates the given supply into this one. Cumulate(Supply) // AssignMemory adds extra memory to this supply (for extra NUMA nodes assigned to a pool). AssignMemory(mem memoryMap) // AccountAllocateCPU accounts for (removes) allocated exclusive capacity from the supply. AccountAllocateCPU(Grant) // AccountReleaseCPU accounts for (reinserts) released exclusive capacity into the supply. AccountReleaseCPU(Grant) // GetScore calculates how well this supply fits/fulfills the given request. GetScore(Request) Score // AllocatableSharedCPU calculates the allocatable amount of shared CPU of this supply. AllocatableSharedCPU(...bool) int // Allocate allocates CPU capacity from this supply and returns it as a grant. Allocate(Request) (Grant, error) // ReleaseCPU releases a previously allocated CPU grant from this supply. ReleaseCPU(Grant) // ReleaseMemory releases a previously allocated memory grant from this supply. ReleaseMemory(Grant) // ReallocateMemory updates the Grant to allocate memory from this supply. ReallocateMemory(Grant) error // ExtraMemoryReservation returns the extra memory reservation for the given memory type. ExtraMemoryReservation(memoryType) uint64 // SetExtraMemoryReservation sets the extra memory reservation based on the granted memory. SetExtraMemoryReservation(Grant) // ReleaseExtraMemoryReservation removes the extra memory reservations based on the granted memory. ReleaseExtraMemoryReservation(Grant) // MemoryLimit returns the amount of various memory types belonging to this grant. MemoryLimit() memoryMap // Reserve accounts for CPU grants after reloading cached allocations. Reserve(Grant) error // ReserveMemory accounts for memory grants after reloading cached allocations. ReserveMemory(Grant) error // DumpCapacity returns a printable representation of the supply's resource capacity. DumpCapacity() string // DumpAllocatable returns a printable representation of the supply's allocatable resources. DumpAllocatable() string // DumpMemoryState dumps the state of the available and allocated memory. DumpMemoryState(string) } // Request represents CPU and memory resources requested by a container. type Request interface { // GetContainer returns the container requesting CPU capacity. GetContainer() cache.Container // String returns a printable representation of this request. String() string // CPUType returns the type of requested CPU.
CPUType() cpuClass // SetCPUType sets the type of requested CPU. SetCPUType(cpuType cpuClass) // FullCPUs returns the number of full CPUs requested. FullCPUs() int // CPUFraction returns the amount of fractional milli-CPU requested. CPUFraction() int // Isolate returns whether isolated CPUs are preferred for this request. Isolate() bool // MemoryType returns the type(s) of requested memory. MemoryType() memoryType // MemAmountToAllocate returns how much memory we need to reserve for a request. MemAmountToAllocate() uint64 // ColdStart returns the cold start timeout. ColdStart() time.Duration } // Grant represents CPU and memory capacity allocated to a container from a node. type Grant interface { // SetCPUPortion sets the fractional CPU portion for the grant. SetCPUPortion(fraction int) // SetMemoryAllocation sets the memory allocation for the grant. SetMemoryAllocation(memoryType, memoryMap, time.Duration) // Clone creates a copy of this grant. Clone() Grant // RefetchNodes updates the stored cpu and memory nodes of this grant by name. RefetchNodes() error // GetContainer returns the container CPU capacity is granted to. GetContainer() cache.Container // GetCPUNode returns the node that granted CPU capacity to the container. GetCPUNode() Node // GetMemoryNode returns the node which granted memory capacity to // the container. GetMemoryNode() Node // CPUType returns the type of granted CPUs CPUType() cpuClass // CPUPortion returns granted milli-CPUs of non-full CPUs of CPUType(). // CPUPortion() == ReservedPortion() + SharedPortion(). CPUPortion() int // ExclusiveCPUs returns the exclusively granted non-isolated cpuset. ExclusiveCPUs() cpuset.CPUSet // ReservedCPUs returns the reserved granted cpuset. ReservedCPUs() cpuset.CPUSet // ReservedPortion returns the amount of reserved CPUs in milli-CPU granted. ReservedPortion() int // SharedCPUs returns the shared granted cpuset. SharedCPUs() cpuset.CPUSet // SharedPortion returns the amount of shared CPUs in milli-CPU granted. SharedPortion() int // IsolatedCPUs returns the exclusively granted isolated cpuset. IsolatedCPUs() cpuset.CPUSet // MemoryType returns the type(s) of granted memory. MemoryType() memoryType // SetMemoryNode updates the grant memory controllers. SetMemoryNode(Node) // Memset returns the granted memory controllers as an IDSet. Memset() idset.IDSet // ExpandMemset() makes the memory controller set larger as the grant // is moved up in the node hierarchy. ExpandMemset() (bool, error) // MemLimit returns the amount of memory that the container is // allowed to use. MemLimit() memoryMap // String returns a printable representation of this grant. String() string // Release releases the grant from all the Supplies it uses. Release() // AccountAllocateCPU accounts for (removes) allocated exclusive capacity for this grant. AccountAllocateCPU() // AccountReleaseCPU accounts for (reinserts) released exclusive capacity for this grant. AccountReleaseCPU() // UpdateExtraMemoryReservation() updates the reservations in the subtree // of nodes under the node from which the memory was granted. UpdateExtraMemoryReservation() // RestoreMemset restores the granted memory set to node maximum // and reapplies the grant. RestoreMemset() // ColdStart returns the cold start timeout. ColdStart() time.Duration // AddTimer adds a cold start timer. AddTimer(*time.Timer) // StopTimer stops a cold start timer. StopTimer() // ClearTimer clears the cold start timer pointer. ClearTimer() } // Score represents how well a supply can satisfy a request.
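//
// Scores are computed per pool node and consumed by the comparison
// logic in pools.go. An illustrative use (negative remaining capacity
// marks a supply that cannot satisfy the request):
//
//	score := node.FreeSupply().GetScore(req)
//	if score.IsolatedCapacity() < 0 || score.SharedCapacity() <= 0 {
//		// this node cannot fulfill the request's CPU needs
//	}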
type Score interface { // Calculate the actual score from the collected parameters. Eval() float64 // Supply returns the supply associated with this score. Supply() Supply // Request returns the request associated with this score. Request() Request IsolatedCapacity() int ReservedCapacity() int SharedCapacity() int Colocated() int HintScores() map[string]float64 String() string } type memoryMap map[memoryType]uint64 // supply implements our Supply interface. type supply struct { node Node // node supplying CPUs and memory isolated cpuset.CPUSet // isolated CPUs at this node reserved cpuset.CPUSet // reserved CPUs at this node sharable cpuset.CPUSet // sharable CPUs at this node grantedReserved int // amount of reserved CPUs allocated grantedShared int // amount of shareable CPUs allocated mem memoryMap // available memory for this node grantedMem memoryMap // total memory granted extraMemReservations map[Grant]memoryMap // how much memory each workload above has requested } var _ Supply = &supply{} // request implements our Request interface. type request struct { container cache.Container // container for this request full int // number of full CPUs requested fraction int // amount of fractional CPU requested isolate bool // prefer isolated exclusive CPUs cpuType cpuClass // preferred CPU type (normal, reserved) memReq uint64 // memory request memLim uint64 // memory limit memType memoryType // requested types of memory // coldStart tells how long to wait until a DRAM memory controller // should be added to a container asking for a mixed DRAM/PMEM memory // allocation. This allows for a "cold start" where initial memory // requests are made to the PMEM memory. A value of 0 indicates that // cold start is not explicitly requested. coldStart time.Duration } var _ Request = &request{} // grant implements our Grant interface. type grant struct { container cache.Container // container CPU is granted to node Node // node CPU is supplied from memoryNode Node // node memory is supplied from exclusive cpuset.CPUSet // exclusive CPUs cpuType cpuClass // type of CPUs (normal, reserved, ...) cpuPortion int // milliCPUs granted from CPUs of cpuType memType memoryType // requested types of memory memset idset.IDSet // assigned memory nodes allocatedMem memoryMap // memory limit coldStart time.Duration // how long until cold start is done coldStartTimer *time.Timer // timer to trigger cold start timeout } var _ Grant = &grant{} // score implements our Score interface. type score struct { supply Supply // CPU supply (node) req Request // CPU request (container) isolated int // remaining isolated CPUs reserved int // remaining reserved CPUs shared int // remaining shared capacity colocated int // number of colocated containers hints map[string]float64 // hint scores } var _ Score = &score{} // newSupply creates CPU supply for the given node, cpusets and existing grant.
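//
// For example (synthetic values, in the style of the unit tests):
//
//	s := newSupply(node, cpuset.New(), cpuset.New(), cpuset.New(0, 1, 2, 3),
//		0, 0, createMemoryMap(8<<30, 0, 0), nil)
//
// creates a supply of four sharable CPUs and 8G of DRAM with nothing
// granted yet; passing nil memory maps makes newSupply substitute
// empty ones.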
func newSupply(n Node, isolated, reserved, sharable cpuset.CPUSet, grantedReserved int, grantedShared int, mem, grantedMem memoryMap) Supply { if mem == nil { mem = createMemoryMap(0, 0, 0) } if grantedMem == nil { grantedMem = createMemoryMap(0, 0, 0) } return &supply{ node: n, isolated: isolated.Clone(), reserved: reserved.Clone(), sharable: sharable.Clone(), grantedReserved: grantedReserved, grantedShared: grantedShared, mem: mem, grantedMem: grantedMem, extraMemReservations: make(map[Grant]memoryMap), } } func createMemoryMap(dram, pmem, hbm uint64) memoryMap { return memoryMap{ memoryDRAM: dram, memoryPMEM: pmem, memoryHBM: hbm, memoryAll: dram + pmem + hbm, memoryUnspec: 0, } } func (m memoryMap) Add(dram, pmem, hbm uint64) { m[memoryDRAM] += dram m[memoryPMEM] += pmem m[memoryHBM] += hbm m[memoryAll] += dram + pmem + hbm } func (m memoryMap) AddDRAM(dram uint64) { m[memoryDRAM] += dram m[memoryAll] += dram } func (m memoryMap) AddPMEM(pmem uint64) { m[memoryPMEM] += pmem m[memoryAll] += pmem } func (m memoryMap) AddHBM(hbm uint64) { m[memoryHBM] += hbm m[memoryAll] += hbm } func (m memoryMap) String() string { mem, sep := "", "" dram, pmem, hbm, types := m[memoryDRAM], m[memoryPMEM], m[memoryHBM], 0 if dram > 0 || pmem > 0 || hbm > 0 { if dram > 0 { mem += "DRAM " + prettyMem(dram) sep = ", " types++ } if pmem > 0 { mem += sep + "PMEM " + prettyMem(pmem) sep = ", " types++ } if hbm > 0 { mem += sep + "HBM " + prettyMem(hbm) types++ } if types > 1 { mem += sep + "total " + prettyMem(pmem+dram+hbm) } } return mem } // GetNode returns the node supplying CPU and memory. func (cs *supply) GetNode() Node { return cs.node } // Clone clones the given CPU supply. func (cs *supply) Clone() Supply { // Copy the maps. mem := make(memoryMap) for key, value := range cs.mem { mem[key] = value } grantedMem := make(memoryMap) for key, value := range cs.grantedMem { grantedMem[key] = value } return newSupply(cs.node, cs.isolated, cs.reserved, cs.sharable, cs.grantedReserved, cs.grantedShared, mem, grantedMem) } // IsolatedCPUs returns the isolated CPUSet of this supply. func (cs *supply) IsolatedCPUs() cpuset.CPUSet { return cs.isolated.Clone() } // ReservedCPUs returns the reserved CPUSet of this supply. func (cs *supply) ReservedCPUs() cpuset.CPUSet { return cs.reserved.Clone() } // SharableCPUs returns the sharable CPUSet of this supply. func (cs *supply) SharableCPUs() cpuset.CPUSet { return cs.sharable.Clone() } // GrantedReserved returns the locally granted reserved CPU capacity. func (cs *supply) GrantedReserved() int { return cs.grantedReserved } // GrantedShared returns the locally granted sharable CPU capacity. func (cs *supply) GrantedShared() int { return cs.grantedShared } func (cs *supply) GrantedMemory(memType memoryType) uint64 { // Return only granted memory of correct type return cs.grantedMem[memType] } func (cs *supply) MemoryLimit() memoryMap { return cs.mem } // Cumulate more CPU to supply. func (cs *supply) Cumulate(more Supply) { mcs := more.(*supply) cs.isolated = cs.isolated.Union(mcs.isolated) cs.reserved = cs.reserved.Union(mcs.reserved) cs.sharable = cs.sharable.Union(mcs.sharable) cs.grantedReserved += mcs.grantedReserved cs.grantedShared += mcs.grantedShared for key, value := range mcs.mem { cs.mem[key] += value } for key, value := range mcs.grantedMem { cs.grantedMem[key] += value } } // AssignMemory adds memory (for extra NUMA nodes assigned to a pool node).
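//
// Note that a memoryMap tracks the per-type amounts and keeps the
// memoryAll entry as their running sum, so for example (illustrative):
//
//	m := createMemoryMap(4<<30, 2<<30, 0) // 4G DRAM + 2G PMEM
//	m.AddDRAM(1 << 30)                    // m[memoryAll] is now 7G
//
// AssignMemory below relies on this invariant holding for both maps.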
func (cs *supply) AssignMemory(mem memoryMap) { for key, value := range mem { cs.mem[key] += value } } // AccountAllocateCPU accounts for (removes) allocated exclusive capacity from the supply. func (cs *supply) AccountAllocateCPU(g Grant) { if cs.node.IsSameNode(g.GetCPUNode()) { return } exclusive := g.ExclusiveCPUs() cs.isolated = cs.isolated.Difference(exclusive) cs.sharable = cs.sharable.Difference(exclusive) } // AccountReleaseCPU accounts for (reinserts) released exclusive capacity into the supply. func (cs *supply) AccountReleaseCPU(g Grant) { if cs.node.IsSameNode(g.GetCPUNode()) { return } ncs := cs.node.GetSupply() nodecpus := ncs.IsolatedCPUs().Union(ncs.SharableCPUs()) grantcpus := g.ExclusiveCPUs().Intersection(nodecpus) isolated := grantcpus.Intersection(ncs.IsolatedCPUs()) sharable := grantcpus.Intersection(ncs.SharableCPUs()) cs.isolated = cs.isolated.Union(isolated) cs.sharable = cs.sharable.Union(sharable) } // allocateMemory tries to fulfill the memory allocation part of a request. func (cs *supply) allocateMemory(r Request) (memoryMap, error) { reqType := r.MemoryType() if reqType == memoryUnspec { reqType = memoryAll } allocated := createMemoryMap(0, 0, 0) requested := r.MemAmountToAllocate() remaining := requested // // Notes: // We try to allocate PMEM, then DRAM, and finally HBM, honoring // the types allowed by the request. We don't need to care about // extra memory reservations for this node as all the nodes with // insufficient memory have been filtered out before allocation. // // However, for cold started containers we do check if there is // enough PMEM free to accommodate the full request and bail out // if that check fails. // for _, memType := range []memoryType{memoryPMEM, memoryDRAM, memoryHBM} { if remaining > 0 && (reqType&memType) != 0 { available := cs.mem[memType] log.Debug("%s: trying %s %s of %s available", r.GetContainer().PrettyName(), prettyMem(remaining), memType.String(), prettyMem(available)) if remaining <= available { allocated[memType] = remaining } else { allocated[memType] = available } cs.grantedMem[memType] += allocated[memType] cs.mem[memType] -= allocated[memType] remaining -= allocated[memType] } if remaining > 0 { if r.ColdStart() > 0 && memType == memoryPMEM { return nil, policyError("internal error: "+ "not enough PMEM for cold start at %s", cs.GetNode().Name()) } } else { break } } if remaining > 0 { log.Debug("%s: %s allocation from %s fell short by %s", r.GetContainer().PrettyName(), reqType.String(), cs.GetNode().Name(), prettyMem(remaining)) for memType, amount := range allocated { if amount > 0 { cs.grantedMem[memType] -= amount cs.mem[memType] += amount } } return nil, policyError("internal error: "+ "not enough memory at %s", cs.node.Name()) } cs.grantedMem[memoryAll] += requested cs.mem[memoryAll] -= requested return allocated, nil } // Allocate allocates a grant from the supply. func (cs *supply) Allocate(r Request) (Grant, error) { grant, err := cs.AllocateCPU(r) if err != nil { return nil, err } memory, err := cs.allocateMemory(r) if err != nil { cs.ReleaseCPU(grant) return nil, err } grant.SetMemoryAllocation(r.MemoryType(), memory, r.ColdStart()) return grant, nil } // AllocateCPU allocates CPU for a grant from the supply.
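//
// Exclusive CPUs are taken from the isolated set when the request asks
// for isolation and enough isolated CPUs exist, and are otherwise
// sliced off the sharable set; fractional (milli-CPU) portions are
// accounted against the sharable or reserved capacity. For example
// (illustrative): a request with full=2, isolate=true and fraction=500
// consumes two isolated CPUs and bumps grantedShared by 500m.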
func (cs *supply) AllocateCPU(r Request) (Grant, error) {
	var exclusive cpuset.CPUSet
	var err error

	cr := r.(*request)

	full := cr.full
	fraction := cr.fraction
	cpuType := cr.cpuType

	if cpuType == cpuReserved && full > 0 {
		log.Warn("exclusive reserved CPUs not supported, allocating %d full CPUs as fractions", full)
		fraction += full * 1000
		full = 0
	}

	if cpuType == cpuReserved && fraction > 0 && cs.AllocatableReservedCPU() < fraction {
		log.Warn("possible misconfiguration of reserved resources:")
		log.Warn(" %s: allocatable %s", cs.GetNode().Name(), cs.DumpAllocatable())
		log.Warn(" %s: needs %d reserved, only %d available",
			cr.GetContainer().PrettyName(), fraction, cs.AllocatableReservedCPU())
		log.Warn(" falling back to using normal unreserved CPUs instead...")
		cpuType = cpuNormal
	}

	// allocate isolated exclusive CPUs or slice them off the sharable set
	switch {
	case full > 0 && cs.isolated.Size() >= full && cr.isolate:
		exclusive, err = cs.takeCPUs(&cs.isolated, nil, full)
		if err != nil {
			return nil, policyError("internal error: "+
				"%s: can't take %d exclusive isolated CPUs from %s: %v",
				cs.node.Name(), full, cs.isolated, err)
		}

	case full > 0 && cs.AllocatableSharedCPU() > 1000*full:
		exclusive, err = cs.takeCPUs(&cs.sharable, nil, full)
		if err != nil {
			return nil, policyError("internal error: "+
				"%s: can't take %d exclusive CPUs from %s: %v",
				cs.node.Name(), full, cs.sharable, err)
		}

	case full > 0:
		return nil, policyError("internal error: "+
			"%s: can't slice %d exclusive CPUs from %s, %dm available",
			cs.node.Name(), full, cs.sharable, cs.AllocatableSharedCPU())
	}

	grant := newGrant(cs.node, cr.GetContainer(), cpuType, exclusive, 0, 0, nil, 0)
	grant.AccountAllocateCPU()

	if fraction > 0 {
		if cpuType == cpuNormal {
			// allocate the requested portion of shared CPUs
			if cs.AllocatableSharedCPU() < fraction {
				cs.ReleaseCPU(grant)
				return nil, policyError("internal error: "+
					"%s: not enough sharable CPU for %dm in %s, only %dm available",
					cs.node.Name(), fraction, cs.sharable, cs.AllocatableSharedCPU())
			}
			cs.grantedShared += fraction
		} else if cpuType == cpuReserved {
			// allocate the requested portion of reserved CPUs
			if cs.AllocatableReservedCPU() < fraction {
				cs.ReleaseCPU(grant)
				return nil, policyError("internal error: "+
					"%s: not enough reserved CPU: %dm requested, %dm available",
					cs.node.Name(), fraction, cs.AllocatableReservedCPU())
			}
			cs.grantedReserved += fraction
		}
		grant.SetCPUPortion(fraction)
	}

	return grant, nil
}

func (cs *supply) ReallocateMemory(g Grant) error {
	log.Debug("%s: reallocating memory (%s) from %s to %s",
		g.GetContainer().PrettyName(), g.MemLimit().String(),
		g.GetMemoryNode().Name(), cs.GetNode().Name())

	// The grant has been previously allocated from another supply. Reallocate it here.
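	// The steps below: first return the granted memory to its current
	// supply, then re-account each memory type against this supply,
	// failing if some type no longer fits here.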
	g.GetMemoryNode().FreeSupply().ReleaseMemory(g)

	mem := uint64(0)
	allocatedMemory := g.MemLimit()
	for key, value := range allocatedMemory {
		if cs.mem[key] < value {
			return policyError("internal error: not enough memory for reallocation at %s (released from %s)",
				cs.GetNode().Name(), g.GetMemoryNode().Name())
		}
		cs.mem[key] -= value
		cs.grantedMem[key] += value
		mem += value
	}
	cs.grantedMem[memoryAll] += mem
	cs.mem[memoryAll] -= mem

	return nil
}

func (cs *supply) ReleaseCPU(g Grant) {
	isolated := g.ExclusiveCPUs().Intersection(cs.node.GetSupply().IsolatedCPUs())
	sharable := g.ExclusiveCPUs().Difference(isolated)

	cs.isolated = cs.isolated.Union(isolated)
	cs.sharable = cs.sharable.Union(sharable)
	cs.grantedReserved -= g.ReservedPortion()
	cs.grantedShared -= g.SharedPortion()

	g.AccountReleaseCPU()
}

// ReleaseMemory returns memory from the given grant to the supply.
func (cs *supply) ReleaseMemory(g Grant) {
	releasedMemory := uint64(0)
	log.Debug("%s: releasing granted memory (%s) from %s",
		g.GetContainer().PrettyName(), g.MemLimit().String(), cs.GetNode().Name())
	for key, value := range g.MemLimit() {
		cs.grantedMem[key] -= value
		cs.mem[key] += value
		releasedMemory += value
	}
	cs.grantedMem[memoryAll] -= releasedMemory
	cs.mem[memoryAll] += releasedMemory

	cs.node.DepthFirst(func(n Node) error {
		n.FreeSupply().ReleaseExtraMemoryReservation(g)
		return nil
	})
}

func (cs *supply) ExtraMemoryReservation(memType memoryType) uint64 {
	extra := uint64(0)
	for _, res := range cs.extraMemReservations {
		extra += res[memType]
	}
	return extra
}

func (cs *supply) ReleaseExtraMemoryReservation(g Grant) {
	if mems, ok := cs.extraMemReservations[g]; ok {
		log.Debug("%s: releasing extra memory reservation (%s) from %s",
			g.GetContainer().PrettyName(), mems.String(), cs.GetNode().Name())
		delete(cs.extraMemReservations, g)
	}
}

func (cs *supply) SetExtraMemoryReservation(g Grant) {
	res := make(memoryMap)
	extraMemory := uint64(0)
	for key, value := range g.MemLimit() {
		res[key] = value
		extraMemory += value
	}
	res[memoryAll] = extraMemory
	cs.extraMemReservations[g] = res
}

func (cs *supply) Reserve(g Grant) error {
	if g.CPUType() == cpuNormal {
		isolated := g.IsolatedCPUs()
		exclusive := g.ExclusiveCPUs().Difference(isolated)
		sharedPortion := g.SharedPortion()

		if !cs.isolated.Intersection(isolated).Equals(isolated) {
			return policyError("can't reserve isolated CPUs (%s) of %s from %s",
				isolated.String(), g.String(), cs.DumpAllocatable())
		}
		if !cs.sharable.Intersection(exclusive).Equals(exclusive) {
			return policyError("can't reserve exclusive CPUs (%s) of %s from %s",
				exclusive.String(), g.String(), cs.DumpAllocatable())
		}
		if cs.AllocatableSharedCPU() < 1000*exclusive.Size()+sharedPortion {
			return policyError("can't reserve %dm of shared CPUs of %s from %s",
				sharedPortion, g.String(), cs.DumpAllocatable())
		}

		cs.isolated = cs.isolated.Difference(isolated)
		cs.sharable = cs.sharable.Difference(exclusive)
		cs.grantedShared += sharedPortion
	} else if g.CPUType() == cpuReserved {
		sharedPortion := 1000*g.ExclusiveCPUs().Size() + g.SharedPortion()
		if sharedPortion > 0 && cs.AllocatableReservedCPU() < sharedPortion {
			return policyError("can't reserve %dm of reserved CPUs of %s from %s",
				sharedPortion, g.String(), cs.DumpAllocatable())
		}
		cs.grantedReserved += sharedPortion
	}

	g.AccountAllocateCPU()

	return nil
}

func (cs *supply) ReserveMemory(g Grant) error {
	mem := uint64(0)
	allocatedMemory := g.MemLimit()
	for key, value := range allocatedMemory {
		if cs.mem[key] < value {
			return policyError("internal error: not enough memory for allocation at %s",
				g.GetMemoryNode().Name())
		}
		cs.mem[key] -= value
		cs.grantedMem[key] += value
		mem += value
	}
	cs.grantedMem[memoryAll] += mem
	cs.mem[memoryAll] -= mem

	g.UpdateExtraMemoryReservation()

	return nil
}

// takeCPUs takes up to cnt CPUs from a given CPU set to another.
func (cs *supply) takeCPUs(from, to *cpuset.CPUSet, cnt int) (cpuset.CPUSet, error) {
	cset, err := cs.node.Policy().cpuAllocator.AllocateCpus(from, cnt, cpuallocator.PriorityHigh)
	if err != nil {
		return cset, err
	}

	if to != nil {
		*to = to.Union(cset)
	}

	return cset, err
}

// DumpCapacity returns a printable representation of the supply's resource capacity.
func (cs *supply) DumpCapacity() string {
	cpu, mem, sep := "", cs.mem.String(), ""

	if !cs.isolated.IsEmpty() {
		cpu = fmt.Sprintf("isolated:%s", cpuset.ShortCPUSet(cs.isolated))
		sep = ", "
	}
	if !cs.reserved.IsEmpty() {
		cpu += sep + fmt.Sprintf("reserved:%s (%dm)",
			cpuset.ShortCPUSet(cs.reserved), 1000*cs.reserved.Size())
		sep = ", "
	}
	if !cs.sharable.IsEmpty() {
		cpu += sep + fmt.Sprintf("sharable:%s (%dm)",
			cpuset.ShortCPUSet(cs.sharable), 1000*cs.sharable.Size())
	}

	capacity := "<" + cs.node.Name() + " capacity: "
	if cpu == "" && mem == "" {
		capacity += "-"
	} else {
		sep = ""
		if cpu != "" {
			capacity += "CPU: " + cpu
			sep = ", "
		}
		if mem != "" {
			capacity += sep + "MemLimit: " + mem
		}
	}
	capacity += ">"

	return capacity
}

// DumpAllocatable returns a printable representation of the supply's allocatable resources.
func (cs *supply) DumpAllocatable() string {
	cpu, mem, sep := "", cs.mem.String(), ""

	if !cs.isolated.IsEmpty() {
		cpu = fmt.Sprintf("isolated:%s", cpuset.ShortCPUSet(cs.isolated))
		sep = ", "
	}
	if !cs.reserved.IsEmpty() {
		cpu += sep + fmt.Sprintf("reserved:%s (allocatable: %dm)",
			cpuset.ShortCPUSet(cs.reserved), cs.AllocatableReservedCPU())
		sep = ", "
		if cs.grantedReserved > 0 {
			cpu += sep + fmt.Sprintf("grantedReserved:%dm", cs.grantedReserved)
		}
	}
	localGrantedShared := cs.grantedShared
	totalGrantedShared := cs.node.GrantedSharedCPU()
	if !cs.sharable.IsEmpty() {
		cpu += sep + fmt.Sprintf("sharable:%s (", cpuset.ShortCPUSet(cs.sharable))
		sep = ""
		if localGrantedShared > 0 || totalGrantedShared > 0 {
			cpu += "grantedShared:"
			kind := ""
			if localGrantedShared > 0 {
				cpu += fmt.Sprintf("%dm", localGrantedShared)
				kind = "local"
				sep = "/"
			}
			if totalGrantedShared > 0 {
				cpu += sep + fmt.Sprintf("%dm", totalGrantedShared)
				kind += sep + "subtree"
			}
			cpu += " " + kind
			sep = ", "
		}
		cpu += sep + fmt.Sprintf("allocatable:%dm)", cs.AllocatableSharedCPU(true))
	}

	allocatable := "<" + cs.node.Name() + " allocatable: "
	if cpu == "" && mem == "" {
		allocatable += "-"
	} else {
		sep = ""
		if cpu != "" {
			allocatable += "CPU: " + cpu
			sep = ", "
		}
		if mem != "" {
			allocatable += sep + "MemLimit: " + mem
		}
	}
	allocatable += ">"

	return allocatable
}

// prettyMem formats the given amount in k, M, G, or T units.
func prettyMem(value uint64) string {
	units := []string{"k", "M", "G", "T"}
	coeffs := []uint64{1 << 10, 1 << 20, 1 << 30, 1 << 40}

	c, u := uint64(1), ""
	for i := 0; i < len(units); i++ {
		if coeffs[i] > value {
			break
		}
		c, u = coeffs[i], units[i]
	}
	v := float64(value) / float64(c)

	return strconv.FormatFloat(v, 'f', 2, 64) + u
}

// DumpMemoryState dumps the state of the available and allocated memory.
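//
// With illustrative values, the per-type debug output produced below
// looks like
//
//	<prefix>- DRAM: free: 6.00G, granted 2.00G
//	<prefix>- total free: 6.00G, total granted 2.00G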
func (cs *supply) DumpMemoryState(prefix string) {
	memTypes := []memoryType{memoryDRAM, memoryPMEM, memoryHBM}
	totalFree := uint64(0)
	totalGranted := uint64(0)
	for _, kind := range memTypes {
		free := cs.mem[kind]
		granted := cs.grantedMem[kind]
		if free != 0 || granted != 0 {
			log.Debug(prefix+"- %s: free: %s, granted %s", kind, prettyMem(free), prettyMem(granted))
		}
		totalFree += free
		totalGranted += granted
	}
	log.Debug(prefix+"- total free: %s, total granted %s", prettyMem(totalFree), prettyMem(totalGranted))

	printHdr := true
	if len(cs.extraMemReservations) > 0 {
		for g, memMap := range cs.extraMemReservations {
			split := ""
			sep := ""
			total := uint64(0)
			if mem := memMap[memoryDRAM]; mem > 0 {
				split = "DRAM " + prettyMem(mem)
				sep = ", "
				total += mem
			}
			if mem := memMap[memoryPMEM]; mem > 0 {
				split += sep + "PMEM " + prettyMem(mem)
				sep = ", "
				total += mem
			}
			if mem := memMap[memoryHBM]; mem > 0 {
				split += sep + "HBM " + prettyMem(mem)
				sep = ", "
				total += mem
			}
			if total > 0 {
				if printHdr {
					log.Debug(prefix + "- extra reservations:")
					printHdr = false
				}
				log.Debug(prefix+" - %s: %s (%s)", g.GetContainer().PrettyName(), prettyMem(total), split)
			}
		}
	}
}

// newRequest creates a new request for the given container.
func newRequest(container cache.Container) Request {
	pod, _ := container.GetPod()
	full, fraction, isolate, cpuType := cpuAllocationPreferences(pod, container)
	req, lim, mtype := memoryAllocationPreference(pod, container)
	coldStart := time.Duration(0)

	log.Debug("%s: CPU preferences: cpuType=%s, full=%v, fraction=%v, isolate=%v",
		container.PrettyName(), cpuType, full, fraction, isolate)

	if mtype == memoryUnspec {
		mtype = defaultMemoryType
	}

	if mtype&memoryPMEM != 0 && mtype&memoryDRAM != 0 {
		parsedColdStart, err := coldStartPreference(pod, container)
		if err != nil {
			log.Error("failed to parse cold start preference: %v", err)
		} else {
			if parsedColdStart.Duration > 0 {
				if coldStartOff {
					log.Error("coldstart disabled (movable non-DRAM memory zones present)")
				} else {
					coldStart = time.Duration(parsedColdStart.Duration)
				}
			}
		}
	} else if mtype == memoryPMEM {
		if coldStartOff {
			mtype = mtype | memoryDRAM
			log.Error("%s: forced also DRAM usage (movable non-DRAM memory zones present)",
				container.PrettyName())
		}
	}

	return &request{
		container: container,
		full:      full,
		fraction:  fraction,
		isolate:   isolate,
		cpuType:   cpuType,
		memReq:    req,
		memLim:    lim,
		memType:   mtype,
		coldStart: coldStart,
	}
}

// GetContainer returns the container requesting CPU.
func (cr *request) GetContainer() cache.Container {
	return cr.container
}

// String returns a printable representation of the CPU request.
func (cr *request) String() string {
	mem := ""
	isolated := map[bool]string{false: "", true: "isolated "}[cr.isolate]
	switch {
	case cr.full == 0 && cr.fraction == 0:
		return "<CPU request: none>" + mem
	case cr.full > 0 && cr.fraction > 0:
		return fmt.Sprintf("<CPU request: %s%d full CPUs + %dm of shared CPUs>",
			isolated, cr.full, cr.fraction) + mem
	case cr.full > 0:
		return fmt.Sprintf("<CPU request: %s%d full CPUs>", isolated, cr.full) + mem
	default:
		return fmt.Sprintf("<CPU request: %dm of shared CPUs>", cr.fraction) + mem
	}
}

// CPUType returns the requested type of CPU for the grant.
func (cr *request) CPUType() cpuClass {
	return cr.cpuType
}

// SetCPUType sets the requested type of CPU for the grant.
func (cr *request) SetCPUType(cpuType cpuClass) {
	cr.cpuType = cpuType
}

// FullCPUs returns the number of full CPUs requested.
func (cr *request) FullCPUs() int {
	return cr.full
}

// CPUFraction returns the amount of fractional milli-CPU requested.
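//
// A request is expressed as full CPUs plus a fractional milli-CPU part.
// As an illustrative example (the exact split depends on the allocation
// preferences), a 2500m CPU limit for which exclusive allocation is
// preferred would split into FullCPUs() == 2 and CPUFraction() == 500.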
func (cr *request) CPUFraction() int {
	return cr.fraction
}

// Isolate returns whether isolated CPUs are preferred for this request.
func (cr *request) Isolate() bool {
	return cr.isolate
}

// MemAmountToAllocate returns how much memory we need to reserve for a request.
func (cr *request) MemAmountToAllocate() uint64 {
	var amount uint64 = 0
	switch cr.GetContainer().GetQOSClass() {
	case v1.PodQOSBurstable:
		// May be a request and/or limit. We focus on the limit because we
		// need to prepare for the case when all containers are using all
		// the memory they are allowed to. If the limit is not set then we'll
		// allocate the request (which the container will get).
		if cr.memLim > 0 {
			amount = cr.memLim
		} else {
			amount = cr.memReq
		}
	case v1.PodQOSGuaranteed:
		// Limit and request are the same.
		amount = cr.memLim
	case v1.PodQOSBestEffort:
		// No requests or limits.
		amount = 0
	}
	return amount
}

// MemoryType returns the requested type of memory for the grant.
func (cr *request) MemoryType() memoryType {
	return cr.memType
}

// ColdStart returns the cold start timeout.
func (cr *request) ColdStart() time.Duration {
	return cr.coldStart
}

// GetScore collects data for scoring this supply with respect to the given request.
func (cs *supply) GetScore(req Request) Score {
	score := &score{
		supply: cs,
		req:    req,
	}

	cr := req.(*request)
	full, part := cr.full, cr.fraction
	if full == 0 && part == 0 {
		part = 1
	}

	score.reserved = cs.AllocatableReservedCPU()
	score.shared = cs.AllocatableSharedCPU()

	if cr.CPUType() == cpuReserved {
		// calculate free reserved capacity
		score.reserved -= part
	} else {
		// calculate isolated node CPU capacity
		if cr.isolate {
			score.isolated = cs.isolated.Size() - full
		}

		// if we don't want isolated or there is not enough, calculate sliceable capacity
		if !cr.isolate || score.isolated < 0 {
			score.shared -= 1000 * full
		}

		// calculate fractional capacity
		score.shared -= part
	}

	// calculate colocation score
	for _, grant := range cs.node.Policy().allocations.grants {
		if cr.CPUType() == grant.CPUType() && grant.GetCPUNode().NodeID() == cs.node.NodeID() {
			score.colocated++
		}
	}

	// calculate real hint scores
	hints := cr.container.GetTopologyHints()
	score.hints = make(map[string]float64, len(hints))

	for provider, hint := range cr.container.GetTopologyHints() {
		if provider == topology.ProviderKubelet {
			log.Warn(" - ignoring topology pseudo-hint from kubelet allocation %s", hint)
			continue
		}
		log.Debug(" - evaluating topology hint %s", hint)
		score.hints[provider] = cs.node.HintScore(hint)
	}

	return score
}

// AllocatableReservedCPU calculates the allocatable amount of reserved CPU of this supply.
func (cs *supply) AllocatableReservedCPU() int {
	if cs.reserved.Size() == 0 {
		// This supply has no room for reserved allocations (not even zero-sized ones).
		return -1
	}
	reserved := 1000*cs.reserved.Size() - cs.node.GrantedReservedCPU()
	for node := cs.node.Parent(); !node.IsNil(); node = node.Parent() {
		pSupply := node.FreeSupply()
		pReserved := 1000*pSupply.ReservedCPUs().Size() - pSupply.GetNode().GrantedReservedCPU()
		if pReserved < reserved {
			reserved = pReserved
		}
	}
	return reserved
}

// AllocatableSharedCPU calculates the allocatable amount of shared CPU of this supply.
func (cs *supply) AllocatableSharedCPU(quiet ...bool) int {
	verbose := !(len(quiet) > 0 && quiet[0])
	//
	// Notes:
	//   Take into account the supplies/grants in all ancestors, making sure
	//   none of them gets overcommitted as the result of fulfilling this request.
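	//
	//   An illustrative example: a pool with 4 sharable CPUs (4000m) and
	//   1500m granted in its subtree has 2500m free locally, but if its
	//   parent has only 2000m free after the parent's own subtree grants,
	//   the result is capped at 2000m.
	//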
	shared := 1000*cs.sharable.Size() - cs.node.GrantedSharedCPU()
	if verbose {
		log.Debug("%s: unadjusted free shared CPU: %dm", cs.node.Name(), shared)
	}
	for node := cs.node.Parent(); !node.IsNil(); node = node.Parent() {
		pSupply := node.FreeSupply()
		pShared := 1000*pSupply.SharableCPUs().Size() - pSupply.GetNode().GrantedSharedCPU()
		if pShared < shared {
			if verbose {
				log.Debug("%s: capping free shared CPU (%dm -> %dm) to avoid overcommit of %s",
					cs.node.Name(), shared, pShared, node.Name())
			}
			shared = pShared
		}
	}
	if verbose {
		log.Debug("%s: ancestor-adjusted free shared CPU: %dm", cs.node.Name(), shared)
	}
	return shared
}

// Eval returns the overall score as a single scalar value; currently always 1.0.
func (score *score) Eval() float64 {
	return 1.0
}

func (score *score) Supply() Supply {
	return score.supply
}

func (score *score) Request() Request {
	return score.req
}

func (score *score) IsolatedCapacity() int {
	return score.isolated
}

func (score *score) ReservedCapacity() int {
	return score.reserved
}

func (score *score) SharedCapacity() int {
	return score.shared
}

func (score *score) Colocated() int {
	return score.colocated
}

func (score *score) HintScores() map[string]float64 {
	return score.hints
}

func (score *score) String() string {
	return fmt.Sprintf("<%s: isolated: %d, reserved: %d, shared: %d, colocated: %d, hints: %v>",
		score.supply.GetNode().Name(), score.isolated, score.reserved, score.shared,
		score.colocated, score.hints)
}

// newGrant creates a CPU grant from the given node for the container.
func newGrant(n Node, c cache.Container, cpuType cpuClass, exclusive cpuset.CPUSet, cpuPortion int, mt memoryType, allocated memoryMap, coldstart time.Duration) Grant {
	grant := &grant{
		node:       n,
		memoryNode: n,
		container:  c,
		cpuType:    cpuType,
		exclusive:  exclusive,
		cpuPortion: cpuPortion,
	}
	if allocated != nil {
		grant.SetMemoryAllocation(mt, allocated, coldstart)
	}
	return grant
}

// SetCPUPortion sets the fractional CPU portion for the grant.
func (cg *grant) SetCPUPortion(fraction int) {
	cg.cpuPortion = fraction
}

// SetMemoryAllocation sets the memory allocation for the grant.
func (cg *grant) SetMemoryAllocation(mt memoryType, allocated memoryMap, coldstart time.Duration) {
	initial := memoryPMEM
	if coldstart <= 0 {
		initial = mt
	}

	mems := cg.node.GetMemset(initial)
	if mems.Size() == 0 {
		mems = cg.node.GetMemset(memoryDRAM)
		if mems.Size() == 0 {
			mems = cg.node.GetMemset(memoryAll)
		}
	}
	mems = mems.Clone()

	cg.memType = mt
	cg.memset = mems
	cg.allocatedMem = allocated
	cg.coldStart = coldstart
}

// Clone creates a copy of this grant.
func (cg *grant) Clone() Grant {
	return &grant{
		node:         cg.GetCPUNode(),
		memoryNode:   cg.GetMemoryNode(),
		container:    cg.GetContainer(),
		exclusive:    cg.ExclusiveCPUs(),
		cpuType:      cg.CPUType(),
		cpuPortion:   cg.SharedPortion(),
		memType:      cg.MemoryType(),
		memset:       cg.Memset().Clone(),
		allocatedMem: cg.MemLimit(),
		coldStart:    cg.ColdStart(),
	}
}

// RefetchNodes updates the stored cpu and memory nodes of this grant by name.
func (cg *grant) RefetchNodes() error {
	node, ok := cg.node.Policy().nodes[cg.node.Name()]
	if !ok {
		return policyError("failed to refetch grant cpu node %s", cg.node.Name())
	}
	memoryNode, ok := cg.memoryNode.Policy().nodes[cg.memoryNode.Name()]
	if !ok {
		return policyError("failed to refetch grant memory node %s", cg.memoryNode.Name())
	}
	cg.node = node
	cg.memoryNode = memoryNode
	return nil
}

// GetContainer returns the container this grant is valid for.
func (cg *grant) GetContainer() cache.Container {
	return cg.container
}

// GetCPUNode returns the Node this grant gets its CPU allocation from.
func (cg *grant) GetCPUNode() Node {
	return cg.node
}

// GetMemoryNode returns the Node this grant gets its memory allocation from.
func (cg *grant) GetMemoryNode() Node {
	return cg.memoryNode
}

func (cg *grant) SetMemoryNode(n Node) {
	cg.memoryNode = n
	cg.memset = n.GetMemset(cg.MemoryType())
}

// CPUType returns the requested type of CPU for the grant.
func (cg *grant) CPUType() cpuClass {
	return cg.cpuType
}

// CPUPortion returns granted milli-CPUs of non-full CPUs of CPUType().
func (cg *grant) CPUPortion() int {
	return cg.cpuPortion
}

// ExclusiveCPUs returns the non-isolated exclusive CPUSet in this grant.
func (cg *grant) ExclusiveCPUs() cpuset.CPUSet {
	return cg.exclusive
}

// ReservedCPUs returns the reserved CPUSet in the supply of this grant.
func (cg *grant) ReservedCPUs() cpuset.CPUSet {
	return cg.node.GetSupply().ReservedCPUs()
}

// ReservedPortion returns the milli-CPU allocation for the reserved CPUSet in this grant.
func (cg *grant) ReservedPortion() int {
	if cg.cpuType == cpuReserved {
		return cg.cpuPortion
	}
	return 0
}

// SharedCPUs returns the shared CPUSet in the supply of this grant.
func (cg *grant) SharedCPUs() cpuset.CPUSet {
	return cg.node.FreeSupply().SharableCPUs()
}

// SharedPortion returns the milli-CPU allocation for the shared CPUSet in this grant.
func (cg *grant) SharedPortion() int {
	if cg.cpuType == cpuNormal {
		return cg.cpuPortion
	}
	return 0
}

// IsolatedCPUs returns the isolated exclusive CPUSet in this grant.
func (cg *grant) IsolatedCPUs() cpuset.CPUSet {
	return cg.node.GetSupply().IsolatedCPUs().Intersection(cg.exclusive)
}

// MemoryType returns the requested type of memory for the grant.
func (cg *grant) MemoryType() memoryType {
	return cg.memType
}

// Memset returns the granted memory controllers as an IDSet.
func (cg *grant) Memset() idset.IDSet {
	return cg.memset
}

// MemLimit returns the granted memory.
func (cg *grant) MemLimit() memoryMap {
	return cg.allocatedMem
}

// String returns a printable representation of the CPU grant.
func (cg *grant) String() string {
	var cpuType, isolated, exclusive, reserved, shared string
	cpuType = fmt.Sprintf("cputype: %s", cg.cpuType)
	isol := cg.IsolatedCPUs()
	if !isol.IsEmpty() {
		isolated = fmt.Sprintf(", isolated: %s", isol)
	}
	if !cg.exclusive.IsEmpty() {
		exclusive = fmt.Sprintf(", exclusive: %s", cg.exclusive)
	}
	if cg.ReservedPortion() > 0 {
		reserved = fmt.Sprintf(", reserved: %s (%dm)",
			cg.node.FreeSupply().ReservedCPUs(), cg.ReservedPortion())
	}
	if cg.SharedPortion() > 0 {
		shared = fmt.Sprintf(", shared: %s (%dm)",
			cg.node.FreeSupply().SharableCPUs(), cg.SharedPortion())
	}
	mem := cg.allocatedMem.String()
	if mem != "" {
		mem = ", MemLimit: " + mem
	}

	return fmt.Sprintf("<grant for %s from %s: %s%s%s%s%s%s>",
		cg.container.PrettyName(), cg.node.Name(), cpuType, isolated, exclusive,
		reserved, shared, mem)
}

func (cg *grant) AccountAllocateCPU() {
	cg.node.DepthFirst(func(n Node) error {
		n.FreeSupply().AccountAllocateCPU(cg)
		return nil
	})
	for node := cg.node.Parent(); !node.IsNil(); node = node.Parent() {
		node.FreeSupply().AccountAllocateCPU(cg)
	}
}

func (cg *grant) Release() {
	cg.GetCPUNode().FreeSupply().ReleaseCPU(cg)
	cg.GetMemoryNode().FreeSupply().ReleaseMemory(cg)
	cg.StopTimer()
}

func (cg *grant) AccountReleaseCPU() {
	cg.node.DepthFirst(func(n Node) error {
		n.FreeSupply().AccountReleaseCPU(cg)
		return nil
	})
	for node := cg.node.Parent(); !node.IsNil(); node = node.Parent() {
		node.FreeSupply().AccountReleaseCPU(cg)
	}
}

func (cg *grant) RestoreMemset() {
	mems := cg.GetMemoryNode().GetMemset(cg.memType)
	cg.memset = mems
	cg.GetMemoryNode().Policy().applyGrant(cg)
}

func (cg *grant) ExpandMemset() (bool, error) {
	supply := cg.GetMemoryNode().FreeSupply()
	node := cg.GetMemoryNode()
	parent := node.Parent()

	// We have to assume that the memory has been allocated how we granted it (if PMEM ran out
	// the allocations have been made from DRAM and so on).

	// Figure out if there is enough memory now to keep the grant as-is.
	extra := supply.ExtraMemoryReservation(memoryAll)
	free := supply.MemoryLimit()[memoryAll]
	if extra <= free {
		// The grant fits in the node even with the extra reservations.
		return false, nil
	}
	// Else it doesn't fit, so move the grant up in the memory tree.
	required := uint64(0)
	for _, memType := range []memoryType{memoryPMEM, memoryDRAM, memoryHBM} {
		required += cg.MemLimit()[memType]
	}

	log.Debug("out-of-memory risk in %s: extra reservations %s > free %s -> moving up %s total memory grant from %s",
		cg, prettyMem(extra), prettyMem(free), prettyMem(required), node.Name())

	// Find an ancestor where the grant fits. As reservations in
	// child nodes do not show up in free + extra in parent nodes,
	// releasing the grant is not necessary before searching.
	for ; !parent.IsNil(); parent = parent.Parent() {
		pSupply := parent.FreeSupply()
		parentFree := pSupply.MemoryLimit()[memoryAll]
		parentExtra := pSupply.ExtraMemoryReservation(memoryAll)
		if parentExtra+required <= parentFree {
			required = 0
			break
		}
		log.Debug("- %s has %s free but %s extra reservations, moving further up",
			parent.Name(), prettyMem(parentFree), prettyMem(parentExtra))
	}
	if required > 0 {
		return false, fmt.Errorf("internal error: cannot find enough memory (%s) for %s from ancestors of %s",
			prettyMem(required), cg, node.Name())
	}

	// Release granted memory from the node and allocate it from the parent node.
	err := parent.FreeSupply().ReallocateMemory(cg)
	if err != nil {
		return false, err
	}
	cg.SetMemoryNode(parent)
	cg.UpdateExtraMemoryReservation()

	// Make the container use the new memory set.
	// FIXME: this could be done in a second pass to avoid doing this many times
	cg.GetMemoryNode().Policy().applyGrant(cg)

	return true, nil
}

func (cg *grant) UpdateExtraMemoryReservation() {
	// For every subnode, make sure that this grant is added to the extra memory allocation.
	cg.GetMemoryNode().DepthFirst(func(n Node) error {
		// No extra allocation should be done to the node itself.
		if !n.IsSameNode(cg.GetMemoryNode()) {
			supply := n.FreeSupply()
			supply.SetExtraMemoryReservation(cg)
		}
		return nil
	})
}

func (cg *grant) ColdStart() time.Duration {
	return cg.coldStart
}

func (cg *grant) AddTimer(timer *time.Timer) {
	cg.coldStartTimer = timer
}

func (cg *grant) StopTimer() {
	if cg.coldStartTimer != nil {
		cg.coldStartTimer.Stop()
		cg.coldStartTimer = nil
	}
}

func (cg *grant) ClearTimer() {
	if cg.coldStartTimer != nil {
		cg.coldStartTimer = nil
	}
}

================================================
FILE: pkg/cri/resource-manager/policy/builtin/topology-aware/topology-aware-policy.go
================================================
// Copyright 2019 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package topologyaware

import (
	"errors"

	v1 "k8s.io/api/core/v1"
	resapi "k8s.io/apimachinery/pkg/api/resource"

	"github.com/prometheus/client_golang/prometheus"

	"github.com/intel/cri-resource-manager/pkg/config"
	"github.com/intel/cri-resource-manager/pkg/cpuallocator"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/events"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/introspect"
	"github.com/intel/cri-resource-manager/pkg/utils/cpuset"

	policyapi "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy"
	system "github.com/intel/cri-resource-manager/pkg/sysfs"
	idset "github.com/intel/goresctrl/pkg/utils"
)

const (
	// PolicyName is the name used to activate this policy implementation.
	PolicyName = "topology-aware"
	// PolicyDescription is a short description of this policy.
	PolicyDescription = "A policy for prototyping memory tiering."
	// PolicyPath is the path of this policy in the configuration hierarchy.
	PolicyPath = "policy." + PolicyName
	// AliasName is the 'memtier' alias name for this policy.
	AliasName = "memtier"
	// AliasPath is the 'memtier' alias configuration path for this policy.
	AliasPath = "policy." + AliasName
	// ColdStartDone is the event generated for the end of a container cold start period.
	ColdStartDone = "cold-start-done"
)

// allocations is our cache.Cachable for saving resource allocations in the cache.
type allocations struct {
	policy *policy
	grants map[string]Grant
}

// policy is our runtime state for this policy.
type policy struct {
	options      *policyapi.BackendOptions // options we were created or reconfigured with
	cache        cache.Cache               // pod/container cache
	sys          system.System             // system/HW topology info
	allowed      cpuset.CPUSet             // bounding set of CPUs we're allowed to use
	reserved     cpuset.CPUSet             // system-/kube-reserved CPUs
	reserveCnt   int                       // number of CPUs to reserve if given as resource.Quantity
	isolated     cpuset.CPUSet             // (our allowed set of) isolated CPUs
	nodes        map[string]Node           // pool nodes by name
	pools        []Node                    // pre-populated node slice for scoring, etc...
	root         Node                      // root of our pool/partition tree
	nodeCnt      int                       // number of pools
	depth        int                       // tree depth
	allocations  allocations               // container pool assignments
	cpuAllocator cpuallocator.CPUAllocator // CPU allocator used by the policy
	coldstartOff bool                      // coldstart forced off (have movable PMEM zones)
	isAlias      bool                      // whether started by referencing AliasName
}

// Make sure policy implements the policy.Backend interface.
var _ policyapi.Backend = &policy{}

// Whether we have coldstart forced off due to PMEM in movable memory zones.
var coldStartOff bool

// CreateTopologyAwarePolicy creates a new policy instance.
func CreateTopologyAwarePolicy(opts *policyapi.BackendOptions) policyapi.Backend {
	return createPolicy(opts, false)
}

// CreateMemtierPolicy creates a new policy instance, aliased as 'memtier'.
func CreateMemtierPolicy(opts *policyapi.BackendOptions) policyapi.Backend {
	return createPolicy(opts, true)
}

// createPolicy creates a new policy instance.
func createPolicy(opts *policyapi.BackendOptions, isAlias bool) policyapi.Backend {
	p := &policy{
		cache:        opts.Cache,
		sys:          opts.System,
		options:      opts,
		cpuAllocator: cpuallocator.NewCPUAllocator(opts.System),
		isAlias:      isAlias,
	}

	if isAlias {
		*opt = *aliasOpt
	}

	if err := p.initialize(); err != nil {
		log.Fatal("failed to initialize %s policy: %v", PolicyName, err)
	}

	p.registerImplicitAffinities()

	config.GetModule(policyapi.ConfigPath).AddNotify(p.configNotify)

	return p
}

// Name returns the name of this policy.
func (p *policy) Name() string {
	return PolicyName
}

// Description returns the description for this policy.
func (p *policy) Description() string {
	return PolicyDescription
}

// Start prepares this policy for accepting allocation/release requests.
func (p *policy) Start(add []cache.Container, del []cache.Container) error {
	if err := p.restoreCache(); err != nil {
		return policyError("failed to start: %v", err)
	}

	// Turn coldstart forcibly off if we have movable non-DRAM memory.
	// Note that although this can change dynamically we only check it
	// during startup and trust users to either not fiddle with memory
	// or restart us if they do.
	p.checkColdstartOff()

	p.root.Dump("")

	return p.Sync(add, del)
}

// Sync synchronizes the state of this policy.
func (p *policy) Sync(add []cache.Container, del []cache.Container) error {
	log.Debug("synchronizing state...")
	for _, c := range del {
		p.ReleaseResources(c)
	}
	for _, c := range add {
		p.AllocateResources(c)
	}

	return nil
}

// AllocateResources is a resource allocation request for this policy.
func (p *policy) AllocateResources(container cache.Container) error {
	log.Debug("allocating resources for %s...", container.PrettyName())

	grant, err := p.allocatePool(container, "")
	if err != nil {
		return policyError("failed to allocate resources for %s: %v",
			container.PrettyName(), err)
	}
	p.applyGrant(grant)
	p.updateSharedAllocations(&grant)

	p.root.Dump("")

	return nil
}

// ReleaseResources is a resource release request for this policy.
func (p *policy) ReleaseResources(container cache.Container) error {
	log.Debug("releasing resources of %s...", container.PrettyName())

	if grant, found := p.releasePool(container); found {
		p.updateSharedAllocations(&grant)
	}

	p.root.Dump("")

	return nil
}

// UpdateResources is a resource allocation update request for this policy.
func (p *policy) UpdateResources(c cache.Container) error {
	log.Debug("(not) updating container %s...", c.PrettyName())
	return nil
}

// Rebalance tries to find an optimal allocation of resources for the current containers.
func (p *policy) Rebalance() (bool, error) {
	var errors error

	containers := p.cache.GetContainers()
	movable := []cache.Container{}

	for _, c := range containers {
		if c.GetQOSClass() != v1.PodQOSGuaranteed {
			p.ReleaseResources(c)
			movable = append(movable, c)
		}
	}

	for _, c := range movable {
		if err := p.AllocateResources(c); err != nil {
			if errors == nil {
				errors = err
			} else {
				errors = policyError("%v, %v", errors, err)
			}
		}
	}

	return true, errors
}

// HandleEvent handles policy-specific events.
func (p *policy) HandleEvent(e *events.Policy) (bool, error) {
	log.Debug("received policy event %s.%s with data %v...", e.Source, e.Type, e.Data)

	switch e.Type {
	case events.ContainerStarted:
		c, ok := e.Data.(cache.Container)
		if !ok {
			return false, policyError("%s event: expecting cache.Container Data, got %T",
				e.Type, e.Data)
		}
		log.Info("triggering coldstart period (if necessary) for %s", c.PrettyName())
		return false, p.triggerColdStart(c)
	case ColdStartDone:
		id, ok := e.Data.(string)
		if !ok {
			return false, policyError("%s event: expecting container ID Data, got %T",
				e.Type, e.Data)
		}
		c, ok := p.cache.LookupContainer(id)
		if !ok {
			// TODO: This is probably a race condition. Should we return nil error here?
			return false, policyError("%s event: failed to lookup container %s",
				e.Type, id)
		}
		log.Info("finishing coldstart period for %s", c.PrettyName())
		return p.finishColdStart(c)
	}
	return false, nil
}

// Introspect provides data for external introspection.
func (p *policy) Introspect(state *introspect.State) {
	pools := make(map[string]*introspect.Pool, len(p.pools))
	for _, node := range p.nodes {
		cpus := node.GetSupply()
		pool := &introspect.Pool{
			Name:   node.Name(),
			CPUs:   cpus.SharableCPUs().Union(cpus.IsolatedCPUs()).String(),
			Memory: node.GetMemset(memoryAll).String(),
		}
		if parent := node.Parent(); !parent.IsNil() {
			pool.Parent = parent.Name()
		}
		if children := node.Children(); len(children) > 0 {
			pool.Children = make([]string, 0, len(children))
			for _, c := range children {
				pool.Children = append(pool.Children, c.Name())
			}
		}
		pools[pool.Name] = pool
	}
	state.Pools = pools

	assignments := make(map[string]*introspect.Assignment, len(p.allocations.grants))
	for _, g := range p.allocations.grants {
		a := &introspect.Assignment{
			ContainerID:   g.GetContainer().GetID(),
			CPUShare:      g.SharedPortion(),
			ExclusiveCPUs: g.ExclusiveCPUs().Union(g.IsolatedCPUs()).String(),
			Pool:          g.GetCPUNode().Name(),
		}
		if g.SharedPortion() > 0 || a.ExclusiveCPUs == "" {
			a.SharedCPUs = g.SharedCPUs().String()
		}
		assignments[a.ContainerID] = a
	}
	state.Assignments = assignments
}

// DescribeMetrics generates policy-specific prometheus metrics data descriptors.
func (p *policy) DescribeMetrics() []*prometheus.Desc {
	return nil
}

// PollMetrics provides policy metrics for monitoring.
func (p *policy) PollMetrics() policyapi.Metrics {
	return nil
}

// CollectMetrics generates prometheus metrics from cached/polled policy-specific metrics data.
func (p *policy) CollectMetrics(policyapi.Metrics) ([]prometheus.Metric, error) {
	return nil, nil
}

// ExportResourceData provides resource data to export for the container.
func (p *policy) ExportResourceData(c cache.Container) map[string]string {
	grant, ok := p.allocations.grants[c.GetCacheID()]
	if !ok {
		return nil
	}

	data := map[string]string{}
	shared := grant.SharedCPUs().String()
	isolated := grant.ExclusiveCPUs().Intersection(grant.GetCPUNode().GetSupply().IsolatedCPUs())
	exclusive := grant.ExclusiveCPUs().Difference(isolated).String()

	if grant.SharedPortion() > 0 && shared != "" {
		data[policyapi.ExportSharedCPUs] = shared
	}
	if isolated.String() != "" {
		data[policyapi.ExportIsolatedCPUs] = isolated.String()
	}
	if exclusive != "" {
		data[policyapi.ExportExclusiveCPUs] = exclusive
	}

	mems := grant.Memset()
	dram := idset.NewIDSet()
	pmem := idset.NewIDSet()
	hbm := idset.NewIDSet()
	for _, id := range mems.SortedMembers() {
		node := p.sys.Node(id)
		switch node.GetMemoryType() {
		case system.MemoryTypeDRAM:
			dram.Add(id)
		case system.MemoryTypePMEM:
			pmem.Add(id)
			/*
				case system.MemoryTypeHBM:
					hbm.Add(id)
			*/
		}
	}
	data["ALL_MEMS"] = mems.String()
	if dram.Size() > 0 {
		data["DRAM_MEMS"] = dram.String()
	}
	if pmem.Size() > 0 {
		data["PMEM_MEMS"] = pmem.String()
	}
	if hbm.Size() > 0 {
		data["HBM_MEMS"] = hbm.String()
	}

	return data
}

// reallocateResources reallocates the given containers using the given pool hints.
func (p *policy) reallocateResources(containers []cache.Container, pools map[string]string) error {
	errs := []error{}

	log.Info("reallocating resources...")

	cache.SortContainers(containers)
	for _, c := range containers {
		p.releasePool(c)
	}
	for _, c := range containers {
		log.Debug("reallocating resources for %s...", c.PrettyName())

		grant, err := p.allocatePool(c, pools[c.GetCacheID()])
		if err != nil {
			errs = append(errs, err)
		} else {
			p.applyGrant(grant)
		}
	}
	if len(errs) > 0 {
		return errors.Join(errs...)
	}
	p.updateSharedAllocations(nil)

	p.root.Dump("")

	return nil
}

func (p *policy) configNotify(event config.Event, source config.Source) error {
	policyName := PolicyName
	if p.isAlias {
		policyName = AliasName
		*opt = *aliasOpt
	}
	log.Info("%s configuration %s:", policyName, event)
	log.Info(" - pin containers to CPUs: %v", opt.PinCPU)
	log.Info(" - pin containers to memory: %v", opt.PinMemory)
	log.Info(" - prefer isolated CPUs: %v", opt.PreferIsolated)
	log.Info(" - prefer shared CPUs: %v", opt.PreferShared)
	log.Info(" - reserved pool namespaces: %v", opt.ReservedPoolNamespaces)

	var allowed, reserved cpuset.CPUSet
	var reinit bool

	if cpus, ok := p.options.Available[policyapi.DomainCPU]; ok {
		if cset, ok := cpus.(cpuset.CPUSet); ok {
			allowed = cset
		}
	}
	if cpus, ok := p.options.Reserved[policyapi.DomainCPU]; ok {
		switch v := cpus.(type) {
		case cpuset.CPUSet:
			reserved = v
		case resapi.Quantity:
			reserveCnt := (int(v.MilliValue()) + 999) / 1000
			if reserveCnt != p.reserveCnt {
				log.Warn("CPU reservation has changed (%v, was %v)", reserveCnt, p.reserveCnt)
				reinit = true
			}
		}
	}

	if !allowed.Equals(p.allowed) {
		if !(allowed.Size() == 0 && p.allowed.Size() == 0) {
			log.Warn("allowed cpuset changed (%s, was %s)",
				allowed.String(), p.allowed.String())
			reinit = true
		}
	}

	if !reserved.Equals(p.reserved) {
		if !(reserved.Size() == 0 && p.reserved.Size() == 0) {
			log.Warn("reserved cpuset changed (%s, was %s)",
				reserved.String(), p.reserved.String())
			reinit = true
		}
	}

	//
	// Notes:
	//   If the allowed or reserved resources have changed, we need to
	//   rebuild our pool hierarchy using the updated constraints and
	//   also update the existing allocations accordingly. We do this
	//   by first reinitializing the policy and then reloading the
	//   allocations from the cache. If we fail, we restore the original
	//   state of the policy and reject the new configuration.
	//
	if reinit {
		log.Warn("reinitializing %s policy...", PolicyName)

		savedPolicy := *p
		allocations := savedPolicy.allocations.clone()
		if err := p.initialize(); err != nil {
			*p = savedPolicy
			return policyError("failed to reconfigure: %v", err)
		}

		for _, grant := range allocations.grants {
			if err := grant.RefetchNodes(); err != nil {
				*p = savedPolicy
				return policyError("failed to reconfigure: %v", err)
			}
		}

		log.Warn("updating existing allocations...")
		if err := p.restoreAllocations(&allocations); err != nil {
			*p = savedPolicy
			return policyError("failed to reconfigure: %v", err)
		}

		p.root.Dump("")
	}

	return nil
}

// Initialize or reinitialize the policy.
func (p *policy) initialize() error {
	p.nodes = nil
	p.pools = nil
	p.root = nil
	p.nodeCnt = 0
	p.depth = 0
	p.allocations = p.newAllocations()

	if err := p.checkConstraints(); err != nil {
		return err
	}

	if err := p.buildPoolsByTopology(); err != nil {
		return err
	}

	return nil
}

// Check the constraints passed to us.
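//
// The CPU reservation may be given either as an explicit cpuset, which
// must be a subset of the allowed CPUs and must not overlap the isolated
// ones, or as a quantity, which is rounded up to full CPUs and picked
// from the allowed set. Hypothetical configuration values:
//
//	ReservedResources: { CPU: "cpuset:0-1" }  # an explicit reserved cpuset
//	ReservedResources: { CPU: "750m" }        # rounded up to 1 full CPU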
func (p *policy) checkConstraints() error {
	if c, ok := p.options.Available[policyapi.DomainCPU]; ok {
		p.allowed = c.(cpuset.CPUSet)
	} else {
		// default to all online cpus
		p.allowed = p.sys.CPUSet().Difference(p.sys.Offlined())
	}

	p.isolated = p.sys.Isolated().Intersection(p.allowed)

	c, ok := p.options.Reserved[policyapi.DomainCPU]
	if !ok {
		return policyError("cannot start without CPU reservation")
	}

	switch v := c.(type) {
	case cpuset.CPUSet:
		p.reserved = v
		// check that all reserved CPUs are in the allowed set
		if !p.reserved.Difference(p.allowed).IsEmpty() {
			return policyError("invalid reserved cpuset %s, some CPUs (%s) are not "+
				"part of the online allowed cpuset (%s)", p.reserved,
				p.reserved.Difference(p.allowed), p.allowed)
		}
		// check that none of the reserved CPUs are isolated
		if !p.reserved.Intersection(p.isolated).IsEmpty() {
			return policyError("invalid reserved cpuset %s, some CPUs (%s) are also isolated",
				p.reserved, p.reserved.Intersection(p.isolated))
		}

	case resapi.Quantity:
		p.reserveCnt = (int(v.MilliValue()) + 999) / 1000
		// Use the CPU allocator to pick reserved CPUs among the allowed
		// ones. Because using those CPUs is allowed, they remain (they
		// are put back) in the allowed set.
		cset, err := p.cpuAllocator.AllocateCpus(&p.allowed, p.reserveCnt, cpuallocator.PriorityNormal)
		if err != nil {
			log.Fatal("cannot reserve %dm CPUs for ReservedResources from AvailableResources: %s",
				v.MilliValue(), err)
		}
		p.allowed = p.allowed.Union(cset)
		p.reserved = cset
	}

	if p.reserved.IsEmpty() {
		return policyError("cannot start without CPU reservation")
	}

	return nil
}

func (p *policy) restoreCache() error {
	allocations := p.newAllocations()
	if p.cache.GetPolicyEntry(keyAllocations, &allocations) {
		if err := p.restoreAllocations(&allocations); err != nil {
			return policyError("failed to restore allocations from cache: %v", err)
		}
		p.allocations.Dump(log.Info, "restored ")
	}
	p.saveAllocations()

	return nil
}

func (p *policy) checkColdstartOff() {
	for _, id := range p.sys.NodeIDs() {
		node := p.sys.Node(id)
		if node.GetMemoryType() == system.MemoryTypePMEM {
			if !node.HasNormalMemory() {
				coldStartOff = true
				log.Error("coldstart forced off: NUMA node #%d does not have normal memory", id)
				return
			}
		}
	}
}

// newAllocations returns a new initialized empty set of allocations.
func (p *policy) newAllocations() allocations {
	return allocations{policy: p, grants: make(map[string]Grant)}
}

// clone creates a copy of the allocation.
func (a *allocations) clone() allocations {
	o := allocations{policy: a.policy, grants: make(map[string]Grant)}
	for id, grant := range a.grants {
		o.grants[id] = grant.Clone()
	}
	return o
}

// getContainerPoolHints creates container pool hints for the current grants.
func (a *allocations) getContainerPoolHints() ([]cache.Container, map[string]string) {
	containers := make([]cache.Container, 0, len(a.grants))
	hints := make(map[string]string)
	for _, grant := range a.grants {
		c := grant.GetContainer()
		containers = append(containers, c)
		hints[c.GetCacheID()] = grant.GetCPUNode().Name()
	}
	return containers, hints
}

// Register us as a policy implementation.
func init() {
	policyapi.Register(PolicyName, PolicyDescription, CreateTopologyAwarePolicy)
	policyapi.Register(AliasName, PolicyDescription, CreateMemtierPolicy)
}

================================================
FILE: pkg/cri/resource-manager/policy/error.go
================================================
// Copyright 2019 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package policy

import (
	"fmt"
)

func policyError(format string, args ...interface{}) error {
	return fmt.Errorf("policy: "+format, args...)
}

================================================
FILE: pkg/cri/resource-manager/policy/flags.go
================================================
// Copyright 2019 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package policy

import (
	"encoding/json"
	"errors"
	"os"
	"path/filepath"
	"sort"
	"strconv"
	"strings"

	"k8s.io/apimachinery/pkg/api/resource"

	"github.com/intel/cri-resource-manager/pkg/cgroups"
	"github.com/intel/cri-resource-manager/pkg/config"
	"github.com/intel/cri-resource-manager/pkg/utils/cpuset"
)

const (
	// NonePolicy is the name of our no-op policy.
	NonePolicy = "none"
	// DefaultPolicy is the name of our default policy.
	DefaultPolicy = NonePolicy
	// ConfigPath is the configuration module path for the generic policy layer.
	ConfigPath = "policy"
)

// Options captures our configurable parameters.
type options struct {
	// Policy is the name of the policy backend to activate.
	Policy string `json:"Active"`
	// Available hardware resources to use.
	Available ConstraintSet `json:"AvailableResources,omitempty"`
	// Reserved hardware resources, for system and kube tasks.
	Reserved ConstraintSet `json:"ReservedResources,omitempty"`
}

// Our runtime configuration.
var opt = defaultOptions().(*options)

// MarshalJSON implements JSON marshalling for ConstraintSets.
func (cs ConstraintSet) MarshalJSON() ([]byte, error) {
	obj := map[string]interface{}{}
	for domain, constraint := range cs {
		name := string(domain)
		switch v := constraint.(type) {
		case cpuset.CPUSet:
			obj[name] = "cpuset:" + v.String()
		case resource.Quantity:
			obj[name] = v.String()
		case int:
			obj[name] = strconv.Itoa(v)
		default:
			return nil, policyError("invalid %v constraint of type %T", domain, constraint)
		}
	}
	return json.Marshal(obj)
}

// UnmarshalJSON implements JSON unmarshalling for ConstraintSets.
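//
// Illustrative examples of accepted CPU constraint values:
//
//	{ "CPU": "cpuset:2-63" }  // an explicit cpuset
//	{ "CPU": "1500m" }        // a milli-CPU quantity
//	{ "CPU": 1.5 }            // a float, converted to 1500m
//	{ "CPU": "cgroup:/foo" }  // hypothetical path; read from that cgroup's cpuset.cpus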
func (cs *ConstraintSet) UnmarshalJSON(raw []byte) error {
	set := make(ConstraintSet)

	obj := map[string]interface{}{}
	if err := json.Unmarshal(raw, &obj); err != nil {
		return policyError("failed to unmarshal ConstraintSet: %v", err)
	}

	for name, value := range obj {
		switch strings.ToUpper(name) {
		case string(DomainCPU):
			switch v := value.(type) {
			case string:
				if err := set.parseCPU(v); err != nil {
					return err
				}
			case int:
				set.setCPUMilliQuantity(v)
			case float64:
				set.setCPUMilliQuantity(int(1000.0 * v))
			default:
				return policyError("invalid CPU constraint of type %T", value)
			}
		default:
			return policyError("internal error: unhandled ConstraintSet domain %s", name)
		}
	}

	*cs = set
	return nil
}

func (cs *ConstraintSet) String() string {
	ret := ""
	sep := ""
	for domain, value := range *cs {
		ret += sep + string(domain) + "=" + ConstraintToString(value)
		sep = ","
	}
	return ret
}

func (cs *ConstraintSet) parseCPU(value string) error {
	kind, spec := "", ""
	if sep := strings.IndexByte(value, ':'); sep != -1 {
		kind = value[:sep]
		spec = value[sep+1:]
	} else {
		spec = value
	}

	if len(spec) == 0 {
		return policyError("missing CPU constraint value")
	}

	switch {
	case kind == "cgroup" || spec[0] == '/':
		if err := cs.parseCPUFromCgroup(spec); err != nil {
			return err
		}
	case kind == "cpuset" || strings.IndexAny(spec, "-,") != -1:
		if err := cs.parseCPUSet(spec); err != nil {
			return err
		}
	case kind == "":
		if err := cs.parseCPUQuantity(spec); err != nil {
			return err
		}
	default:
		return policyError("invalid CPU constraint qualifier %q", kind)
	}

	return nil
}

func (cs *ConstraintSet) parseCPUSet(value string) error {
	cset, err := cpuset.Parse(value)
	if err != nil {
		return policyError("failed to parse CPU cpuset constraint %q: %v", value, err)
	}
	(*cs)[DomainCPU] = cset
	return nil
}

func (cs *ConstraintSet) parseCPUQuantity(value string) error {
	qty, err := resource.ParseQuantity(value)
	if err != nil {
		return policyError("failed to parse CPU Quantity constraint %q: %v", value, err)
	}
	(*cs)[DomainCPU] = qty
	return nil
}

func (cs *ConstraintSet) parseCPUFromCgroup(dir string) error {
	pathToCpuset := func(outPath *string, fragments ...string) bool {
		*outPath = filepath.Join(filepath.Join(fragments...), "cpuset.cpus")
		_, err := os.Stat(*outPath)
		return !errors.Is(err, os.ErrNotExist)
	}

	path := ""
	switch {
	case len(dir) == 0:
		return policyError("empty CPU cgroup constraint")
	case dir[0] == '/' && pathToCpuset(&path, dir):
		// dir is a direct, absolute path to an existing cgroup
	case pathToCpuset(&path, cgroups.GetMountDir(), dir):
		// dir is a relative path starting from the cgroup mount point
	case pathToCpuset(&path, cgroups.Cpuset.Path(), dir):
		// dir is a relative path starting from the cpuset controller (cgroup v1)
	default:
		// dir is none of the previous
		return policyError("failed to find cpuset.cpus for CPU cgroup constraint %q", dir)
	}

	bytes, err := os.ReadFile(path)
	if err != nil {
		return policyError("failed to read CPU cpuset cgroup constraint %q: %v", path, err)
	}

	cpus := strings.TrimSuffix(string(bytes), "\n")
	cset, err := cpuset.Parse(cpus)
	if err != nil {
		return policyError("failed to parse cpuset cgroup constraint %q: %v", cpus, err)
	}

	(*cs)[DomainCPU] = cset
	return nil
}

func (cs *ConstraintSet) setCPUMilliQuantity(value int) {
	qty := resource.NewMilliQuantity(int64(value), resource.DecimalSI)
	(*cs)[DomainCPU] = *qty
}

// AvailablePolicy describes an available policy.
type AvailablePolicy struct {
	// Name is the name of the policy.
	Name string
	// Description is a short description of the policy.
	Description string
}

// AvailablePolicies returns the available policies and their descriptions.
func AvailablePolicies() []*AvailablePolicy {
	policies := make([]*AvailablePolicy, 0, len(backends)+1)

	for name, be := range backends {
		policies = append(policies, &AvailablePolicy{
			Name:        name,
			Description: be.description,
		})
	}
	sort.Slice(policies, func(i, j int) bool {
		return policies[i].Name < policies[j].Name
	})

	return policies
}

// defaultOptions returns a new options instance, all initialized to defaults.
func defaultOptions() interface{} {
	return &options{
		Policy:    DefaultPolicy,
		Available: ConstraintSet{},
		Reserved:  ConstraintSet{},
	}
}

// Register us for configuration handling.
func init() {
	config.Register(ConfigPath, "Generic policy layer.", opt, defaultOptions,
		config.WithNotify(configNotify))
}

================================================
FILE: pkg/cri/resource-manager/policy/policy.go
================================================
// Copyright 2019 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package policy

import (
	"bytes"
	"fmt"
	"sort"
	"strconv"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"

	"github.com/intel/cri-resource-manager/pkg/blockio"
	"github.com/intel/cri-resource-manager/pkg/config"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/agent"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/control/rdt"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/events"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/introspect"
	"github.com/intel/cri-resource-manager/pkg/utils/cpuset"
	"github.com/prometheus/client_golang/prometheus"

	logger "github.com/intel/cri-resource-manager/pkg/log"
	system "github.com/intel/cri-resource-manager/pkg/sysfs"
)

// Domain represents a hardware resource domain that can be managed by a policy backend.
type Domain string

const (
	// DomainCPU is the CPU resource domain.
	DomainCPU Domain = "CPU"
	// DomainMemory is the memory resource domain.
	DomainMemory Domain = "Memory"
	// DomainHugePage is the hugepages resource domain.
	DomainHugePage Domain = "HugePages"
	// DomainCache is the CPU cache resource domain.
	DomainCache Domain = "Cache"
	// DomainMemoryBW is the memory bandwidth resource domain.
	DomainMemoryBW Domain = "MBW"
)

// Constraint describes a constraint on one hardware domain.
type Constraint interface{}

// ConstraintSet describes, per hardware domain, the resources available for a policy.
type ConstraintSet map[Domain]Constraint

// Options describes policy options.
type Options struct {
	// Client interface to cri-resmgr agent
	AgentCli agent.Interface
	// SendEvent is the function for delivering events back to the resource manager.
	SendEvent SendEventFn
}

// BackendOptions describes the options for a policy backend instance.
type BackendOptions struct {
	// System provides system/HW/topology information
	System system.System
	// System state/cache
	Cache cache.Cache
	// Resource availability constraint
	Available ConstraintSet
	// Resource reservation constraint
	Reserved ConstraintSet
	// Client interface to cri-resmgr agent
	AgentCli agent.Interface
	// SendEvent is the function for delivering events up to the resource manager.
	SendEvent SendEventFn
}

// CreateFn is the type for functions used to create a policy instance.
type CreateFn func(*BackendOptions) Backend

// SendEventFn is the type for a function to send events back to the resource manager.
type SendEventFn func(interface{}) error

const (
	// ExportedResources is the basename of the file container resources are exported to.
	ExportedResources = "resources.sh"
	// ExportSharedCPUs is the shell variable used to export shared container CPUs.
	ExportSharedCPUs = "SHARED_CPUS"
	// ExportIsolatedCPUs is the shell variable used to export isolated container CPUs.
	ExportIsolatedCPUs = "ISOLATED_CPUS"
	// ExportExclusiveCPUs is the shell variable used to export exclusive container CPUs.
	ExportExclusiveCPUs = "EXCLUSIVE_CPUS"
)

// Backend is the policy (decision making logic) interface exposed by implementations.
//
// A backend operates in a set of policy domains. Currently each policy domain
// corresponds to some particular hardware resource (CPU, memory, cache, etc).
type Backend interface {
	// Name gets the well-known name of this policy.
	Name() string
	// Description gives a verbose description about the policy implementation.
	Description() string
	// Start starts up and synchronizes the policy, using the given cache and resource constraints.
	Start([]cache.Container, []cache.Container) error
	// Sync synchronizes the policy, allocating/releasing the given containers.
	Sync([]cache.Container, []cache.Container) error
	// AllocateResources allocates resources to/for a container.
	AllocateResources(cache.Container) error
	// ReleaseResources releases resources of a container.
	ReleaseResources(cache.Container) error
	// UpdateResources updates resource allocations of a container.
	UpdateResources(cache.Container) error
	// Rebalance tries to find an optimal allocation of resources for the current containers.
	Rebalance() (bool, error)
	// HandleEvent processes the given event. The returned boolean indicates whether
	// changes have been made to any of the containers while handling the event.
	HandleEvent(*events.Policy) (bool, error)
	// ExportResourceData provides resource data to export for the container.
	ExportResourceData(cache.Container) map[string]string
	// Introspect provides data for external introspection.
	Introspect(*introspect.State)
	// DescribeMetrics generates policy-specific prometheus metrics data descriptors.
	DescribeMetrics() []*prometheus.Desc
	// PollMetrics provides policy metrics for monitoring.
	PollMetrics() Metrics
	// CollectMetrics generates prometheus metrics from cached/polled policy-specific metrics data.
	CollectMetrics(Metrics) ([]prometheus.Metric, error)
}

// Policy is the exposed interface for container resource allocation decision making.
type Policy interface {
	// Start starts up the policy, preparing it to serve resource management requests.
	Start([]cache.Container, []cache.Container) error
	// Sync synchronizes the state of the active policy.
	Sync([]cache.Container, []cache.Container) error
	// AllocateResources allocates resources to a container.
	AllocateResources(cache.Container) error
	// ReleaseResources releases the resources of a container.
	ReleaseResources(cache.Container) error
	// UpdateResources updates resource allocations of a container.
	UpdateResources(cache.Container) error
	// Rebalance tries to find an optimal allocation of resources for the current containers.
	Rebalance() (bool, error)
	// HandleEvent passes on the given event to the active policy. The returned boolean
	// indicates whether changes have been made to any of the containers while handling
	// the event.
	HandleEvent(*events.Policy) (bool, error)
	// ExportResourceData exports/updates resource data for the container.
	ExportResourceData(cache.Container)
	// Introspect provides data for external introspection.
	Introspect() *introspect.State
	// DescribeMetrics generates policy-specific prometheus metrics data descriptors.
	DescribeMetrics() []*prometheus.Desc
	// PollMetrics provides policy metrics for monitoring.
	PollMetrics() Metrics
	// CollectMetrics generates prometheus metrics from cached/polled policy-specific metrics data.
	CollectMetrics(Metrics) ([]prometheus.Metric, error)
}

type Metrics interface{}

// Policy instance/state.
type policy struct {
	options   Options            // policy options
	cache     cache.Cache        // system state cache
	active    Backend            // our active backend
	system    system.System      // system/HW/topology info
	inspsys   *introspect.System // ditto for introspection
	sendEvent SendEventFn        // function to send event up to the resource manager
}

// backend is a registered Backend.
type backend struct {
	name        string   // unique backend name
	description string   // verbose backend description
	create      CreateFn // backend creation function
}

// Our logger instance.
var log logger.Logger = logger.NewLogger("policy")

// Registered backends.
var backends = make(map[string]*backend)

// Options passed to created/activated backend.
var backendOpts = &BackendOptions{}

// ActivePolicy returns the name of the policy to be activated.
func ActivePolicy() string {
	return opt.Policy
}

// NewPolicy creates a policy instance using the selected backend.
func NewPolicy(cache cache.Cache, o *Options) (Policy, error) {
	sys, err := system.DiscoverSystem()
	if err != nil {
		return nil, policyError("failed to discover system topology: %v", err)
	}

	p := &policy{
		cache:   cache,
		system:  sys,
		options: *o,
	}

	active, ok := backends[opt.Policy]
	if !ok {
		return nil, policyError("unknown policy '%s' requested", opt.Policy)
	}

	log.Info("activating '%s' policy...", active.name)

	if len(opt.Available) != 0 {
		log.Info(" with available resources:")
		for n, r := range opt.Available {
			log.Info(" - %s=%s", n, ConstraintToString(r))
		}
	}
	if len(opt.Reserved) != 0 {
		log.Info(" with reserved resources:")
		for n, r := range opt.Reserved {
			log.Info(" - %s=%s", n, ConstraintToString(r))
		}
	}

	if log.DebugEnabled() {
		logger.Get(opt.Policy).EnableDebug()
	}

	backendOpts.Cache = p.cache
	backendOpts.System = p.system
	backendOpts.Available = opt.Available
	backendOpts.Reserved = opt.Reserved
	backendOpts.AgentCli = o.AgentCli
	backendOpts.SendEvent = o.SendEvent
	p.active = active.create(backendOpts)

	return p, nil
}

// Start starts up the policy, preparing it for serving requests.
func (p *policy) Start(add []cache.Container, del []cache.Container) error {
	log.Info("starting policy '%s'...", p.active.Name())
	return p.active.Start(add, del)
}

// Sync synchronizes the active policy state.
func (p *policy) Sync(add []cache.Container, del []cache.Container) error {
	return p.active.Sync(add, del)
}

// AllocateResources allocates resources for a container.
func (p *policy) AllocateResources(c cache.Container) error {
	return p.active.AllocateResources(c)
}

// ReleaseResources releases the resources of a container.
func (p *policy) ReleaseResources(c cache.Container) error {
	return p.active.ReleaseResources(c)
}

// UpdateResources updates resource allocations of a container.
func (p *policy) UpdateResources(c cache.Container) error {
	return p.active.UpdateResources(c)
}

// Rebalance tries to find a more optimal allocation of resources for the current containers.
func (p *policy) Rebalance() (bool, error) {
	return p.active.Rebalance()
}

// HandleEvent passes on the given event to the active policy.
func (p *policy) HandleEvent(e *events.Policy) (bool, error) {
	return p.active.HandleEvent(e)
}

// ExportResourceData exports/updates resource data for the container.
func (p *policy) ExportResourceData(c cache.Container) {
	var buf bytes.Buffer

	data := p.active.ExportResourceData(c)
	keys := []string{}
	for key := range data {
		keys = append(keys, key)
	}
	sort.Strings(keys)
	for _, key := range keys {
		value := data[key]
		if _, err := buf.WriteString(fmt.Sprintf("%s=%q\n", key, value)); err != nil {
			log.Error("container %s: failed to export resource data (%s=%q)",
				c.PrettyName(), key, value)
			buf.Reset()
			break
		}
	}

	p.cache.WriteFile(c.GetCacheID(), ExportedResources, 0644, buf.Bytes())
}

// Introspect provides data for external introspection/visualization.
func (p *policy) Introspect() *introspect.State {
	pods := p.cache.GetPods()
	state := &introspect.State{Pods: make(map[string]*introspect.Pod, len(pods))}

	for _, p := range pods {
		containers := p.GetContainers()
		if len(containers) == 0 {
			continue
		}
		pod := &introspect.Pod{
			ID:         p.GetID(),
			UID:        p.GetUID(),
			Name:       p.GetName(),
			Containers: make(map[string]*introspect.Container, len(containers)),
		}
		for _, c := range containers {
			container := &introspect.Container{
				ID:      c.GetID(),
				Name:    c.GetName(),
				Command: c.GetCommand(),
				Args:    c.GetArgs(),
				Hints:   introspect.TopologyHints(c.GetTopologyHints()),
			}
			resources := c.GetResourceRequirements()
			if req, ok := resources.Requests[corev1.ResourceCPU]; ok {
				if value := req.MilliValue(); value > 0 {
					container.CPURequest = value
				}
			}
			if lim, ok := resources.Limits[corev1.ResourceCPU]; ok {
				if value := lim.MilliValue(); value > 0 {
					container.CPULimit = value
				}
			}
			if req, ok := resources.Requests[corev1.ResourceMemory]; ok {
				if value := req.Value(); value > 0 {
					container.MemoryRequest = value
				}
			}
			if lim, ok := resources.Limits[corev1.ResourceMemory]; ok {
				if value := lim.Value(); value > 0 {
					container.MemoryLimit = value
				}
			}
			pod.Containers[container.ID] = container
		}
		state.Pods[pod.ID] = pod
	}

	if p.inspsys == nil {
		sys := &introspect.System{
			Sockets: make(map[int]*introspect.Socket, p.system.PackageCount()),
			Nodes:   make(map[int]*introspect.Node, p.system.NUMANodeCount()),
		}
		for _, id := range p.system.PackageIDs() {
			pkg := p.system.Package(id)
			sys.Sockets[int(id)] = &introspect.Socket{ID: int(id), CPUs: pkg.CPUSet().String()}
		}
		for _, id := range p.system.NodeIDs() {
			node := p.system.Node(id)
			sys.Nodes[int(id)] = &introspect.Node{ID: int(id), CPUs: node.CPUSet().String()}
		}
		sys.Isolated = p.system.Isolated().String()
		sys.Offlined = p.system.Offlined().String()
		p.inspsys = sys
	}

	rdtClassNames := []string{}
	for _, rdtClass := range rdt.GetClasses() {
		rdtClassNames = append(rdtClassNames, rdtClass.Name())
	}
	blkioClassNames := []string{}
	for _, blkioClass := range blockio.GetClasses() {
		blkioClassNames = append(blkioClassNames, blkioClass.Name)
	}
	p.inspsys.RDTClasses = rdtClassNames
	p.inspsys.Policy = opt.Policy

	state.System = p.inspsys
	p.active.Introspect(state)
	return state
}

// PollMetrics provides policy metrics for monitoring.
func (p *policy) PollMetrics() Metrics {
	return p.active.PollMetrics()
}

// DescribeMetrics generates policy-specific prometheus metrics data descriptors.
func (p *policy) DescribeMetrics() []*prometheus.Desc {
	return p.active.DescribeMetrics()
}

// CollectMetrics generates prometheus metrics from cached/polled policy-specific metrics data.
func (p *policy) CollectMetrics(m Metrics) ([]prometheus.Metric, error) {
	return p.active.CollectMetrics(m)
}

// Register registers a policy backend.
func Register(name, description string, create CreateFn) error {
	log.Info("registering policy '%s'...", name)
	if o, ok := backends[name]; ok {
		return policyError("policy %s already registered (%s)", name, o.description)
	}
	backends[name] = &backend{
		name:        name,
		description: description,
		create:      create,
	}
	return nil
}

// ConstraintToString returns the given constraint as a string.
func ConstraintToString(value Constraint) string {
	switch value.(type) {
	case cpuset.CPUSet:
		return "#" + value.(cpuset.CPUSet).String()
	case int:
		return strconv.Itoa(value.(int))
	case string:
		return value.(string)
	case resource.Quantity:
		qty := value.(resource.Quantity)
		return qty.String()
	default:
		return fmt.Sprintf("<unknown constraint %v>", value)
	}
}

// configNotify is the configuration change notification callback for the generic policy layer.
func configNotify(_ config.Event, _ config.Source) error {
	// let the active policy know of changes
	backendOpts.Available = opt.Available
	backendOpts.Reserved = opt.Reserved
	return nil
}

================================================
FILE: pkg/cri/resource-manager/requests.go
================================================
// Copyright 2019 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package resmgr

import (
	"context"
	"fmt"
	"strings"

	criv1 "k8s.io/cri-api/pkg/apis/runtime/v1"

	pkgcfg "github.com/intel/cri-resource-manager/pkg/config"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache"
	config "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/config"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/events"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy"
	"github.com/intel/cri-resource-manager/pkg/cri/server"
)

const (
	kubeAPIVersion = "0.1.0"
)

var knownRuntimes = []string{
	"containerd",
	"cri-o",
}

// setupRequestProcessing prepares the resource manager for CRI request processing.
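//
// Each entry in the interceptor map below wraps one CRI method of the relay.
// All interceptors share the server.Interceptor signature used throughout this
// file; a minimal, hypothetical interceptor (for illustration only) would look
// something like:
//
//	func (m *resmgr) MyIntercept(ctx context.Context, method string,
//		request interface{}, handler server.Handler) (interface{}, error) {
//		// examine/alter the request here...
//		reply, err := handler(ctx, request) // pass the request on to the runtime
//		// examine/alter the reply here...
//		return reply, err
//	}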
func (m *resmgr) setupRequestProcessing() error { interceptors := map[string]server.Interceptor{ "RunPodSandbox": m.RunPod, "StopPodSandbox": m.StopPod, "RemovePodSandbox": m.RemovePod, "CreateContainer": m.CreateContainer, "StartContainer": m.StartContainer, "StopContainer": m.StopContainer, "RemoveContainer": m.RemoveContainer, "ListContainers": m.ListContainers, "UpdateContainerResources": m.UpdateContainer, } if err := m.relay.Server().RegisterInterceptors(interceptors); err != nil { return resmgrError("failed to register resource-manager CRI interceptors: %v", err) } return nil } // disambiguate produces disambiguation context for a request/reply dump. func (m *resmgr) disambiguate(msg interface{}) string { var qualifier string m.RLock() defer m.RUnlock() switch req := msg.(type) { case *criv1.RunPodSandboxRequest: if req.Config != nil && req.Config.Metadata != nil { qualifier = req.Config.Metadata.Name } case *criv1.StopPodSandboxRequest: if pod, ok := m.cache.LookupPod(req.PodSandboxId); ok { qualifier = pod.GetName() } else { qualifier = "unknown pod " + req.PodSandboxId } case *criv1.RemovePodSandboxRequest: if pod, ok := m.cache.LookupPod(req.PodSandboxId); ok { qualifier = pod.GetName() } else { qualifier = "unknown pod " + req.PodSandboxId } case *criv1.CreateContainerRequest: switch { case req.SandboxConfig == nil || req.SandboxConfig.Metadata == nil: qualifier = "missing pod metadata in request" case req.Config == nil || req.Config.Metadata == nil: qualifier = "missing metadata in request" default: qualifier = req.SandboxConfig.Metadata.Name + ":" + req.Config.Metadata.Name } case *criv1.StartContainerRequest: if container, ok := m.cache.LookupContainer(req.ContainerId); ok { qualifier = container.PrettyName() } else { qualifier = "unknown container " + req.ContainerId } case *criv1.StopContainerRequest: if container, ok := m.cache.LookupContainer(req.ContainerId); ok { qualifier = container.PrettyName() } else { qualifier = "unknown container " + req.ContainerId } case *criv1.RemoveContainerRequest: if container, ok := m.cache.LookupContainer(req.ContainerId); ok { qualifier = container.PrettyName() } else { qualifier = "unknown container " + req.ContainerId } case *criv1.UpdateContainerResourcesRequest: if container, ok := m.cache.LookupContainer(req.ContainerId); ok { qualifier = container.PrettyName() } else { qualifier = "unknown container " + req.ContainerId } } if qualifier != "" { return "<" + qualifier + ">" } return "" } // startRequestProcessing starts request processing by starting the active policy. func (m *resmgr) startRequestProcessing() error { ctx := context.Background() add, del, err := m.syncWithCRI(ctx) if err != nil { return err } // // Notes: // While normally it is enough to release stale containers and allocate // newly discovered ones, if we are switching policies we need to force // reallocating everything. Otherwise containers already present in the // cache would not get properly updated by the new policy. // if m.policySwitch { containers := m.cache.GetContainers() cache.SortContainers(containers) add, del = containers, containers m.policySwitch = false } if err := m.policy.Start(add, del); err != nil { return resmgrError("failed to start policy %s: %v", policy.ActivePolicy(), err) } if err := m.runPostReleaseHooks(ctx, "startup", del...); err != nil { m.Error("startup: failed to run post-release hooks: %v", err) } return m.cache.Save() } // syncWithCRI synchronizes cache pods and containers with the CRI runtime. 
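//
// The first returned slice contains running containers unknown to the policy
// which should get resources allocated; the second contains stale containers
// whose resources should be released. Both are fed to the active policy's
// Start() by startRequestProcessing() above.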
func (m *resmgr) syncWithCRI(ctx context.Context) ([]cache.Container, []cache.Container, error) {
	if !m.relay.Client().HasRuntimeService() {
		return nil, nil, nil
	}

	m.Info("synchronizing cache state with CRI runtime...")

	add, del := []cache.Container{}, []cache.Container{}
	pods, err := m.relay.Client().ListPodSandbox(ctx, &criv1.ListPodSandboxRequest{})
	if err != nil {
		return nil, nil, resmgrError("cache synchronization pod query failed: %v", err)
	}

	status := map[string]*cache.PodStatus{}
	for _, pod := range pods.Items {
		if s, err := m.queryPodStatus(ctx, pod.Id); err != nil {
			m.Error("%s: failed to query pod status: %v", pod.Id, err)
		} else {
			status[pod.Id] = s
		}
	}

	_, _, deleted := m.cache.RefreshPods(pods, status)
	for _, c := range deleted {
		m.Info("discovered stale container %s...", c.GetID())
		del = append(del, c)
	}

	containers, err := m.relay.Client().ListContainers(ctx, &criv1.ListContainersRequest{})
	if err != nil {
		return nil, nil, resmgrError("cache synchronization container query failed: %v", err)
	}

	added, deleted := m.cache.RefreshContainers(containers)
	for _, c := range added {
		if c.GetState() != cache.ContainerStateRunning {
			m.Info("ignoring discovered container %s (in state %v)...",
				c.GetID(), c.GetState())
			continue
		}
		m.Info("discovered out-of-sync running container %s...", c.GetID())
		add = append(add, c)
	}
	for _, c := range deleted {
		m.Info("discovered stale container %s...", c.GetID())
		del = append(del, c)
	}

	return add, del, nil
}

func (m *resmgr) queryPodStatus(ctx context.Context, podID string) (*cache.PodStatus, error) {
	response, err := m.relay.Client().PodSandboxStatus(ctx, &criv1.PodSandboxStatusRequest{
		PodSandboxId: podID,
		Verbose:      true,
	})
	if err != nil {
		return nil, err
	}

	return cache.ParsePodStatus(response)
}

// RunPod intercepts CRI requests for Pod creation.
func (m *resmgr) RunPod(ctx context.Context, method string, request interface{},
	handler server.Handler) (interface{}, error) {
	reply, rqerr := handler(ctx, request)
	if rqerr != nil {
		m.Error("%s: failed to create pod: %v", method, rqerr)
		return reply, rqerr
	}

	podID := reply.(*criv1.RunPodSandboxResponse).PodSandboxId

	m.Lock()
	defer m.Unlock()

	pod, err := m.cache.InsertPod(podID, request, nil)
	if err != nil {
		m.Error("%s: failed to insert new pod to cache: %v", method, err)
		return nil, resmgrError("%s: failed to insert new pod to cache: %v", method, err)
	}
	m.updateIntrospection()

	// search for any lingering old version and clean up if found
	released := false
	del := []cache.Container{}
	for _, p := range m.cache.GetPods() {
		if p.GetUID() != pod.GetUID() || p == pod {
			continue
		}
		m.Warn("re-creation of pod %s, releasing old one", p.GetName())
		for _, c := range p.GetInitContainers() {
			m.Info("%s: removing stale init-container %s...", method, c.PrettyName())
			m.policy.ReleaseResources(c)
			c.UpdateState(cache.ContainerStateStale)
			released = true
			del = append(del, c)
		}
		for _, c := range p.GetContainers() {
			m.Info("%s: removing stale container %s...", method, c.PrettyName())
			m.policy.ReleaseResources(c)
			c.UpdateState(cache.ContainerStateStale)
			released = true
			del = append(del, c)
		}
		m.cache.DeletePod(p.GetID())
	}

	if released {
		if err := m.runPostReleaseHooks(ctx, method, del...); err != nil {
			m.Error("%s: failed to run post-release hooks for lingering pod %s: %v",
				method, pod.GetName(), err)
		}
	}

	m.Info("created pod %s (%s)", pod.GetName(), podID)

	return reply, nil
}

// StopPod intercepts CRI requests for stopping Pods.
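//
// Note that the request is passed through to the runtime first; the pod's
// containers are marked exited and their resources released only afterwards,
// followed by the post-release hooks.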
func (m *resmgr) StopPod(ctx context.Context, method string, request interface{}, handler server.Handler) (interface{}, error) { reply, rqerr := handler(ctx, request) m.Lock() defer m.Unlock() podID := request.(*criv1.StopPodSandboxRequest).PodSandboxId pod, ok := m.cache.LookupPod(podID) if !ok { m.Warn("%s: failed to look up pod %s, just passing request through", method, podID) return reply, rqerr } if rqerr != nil { m.Error("%s: failed to stop pod %s: %v", method, podID, rqerr) return reply, rqerr } m.Info("%s: stopped pod %s (%s)...", method, pod.GetName(), podID) released := []cache.Container{} for _, c := range pod.GetInitContainers() { m.Info("%s: releasing resources for %s...", method, c.PrettyName()) if err := m.policy.ReleaseResources(c); err != nil { m.Warn("%s: failed to release init-container %s: %v", method, c.PrettyName(), err) } c.UpdateState(cache.ContainerStateExited) released = append(released, c) } for _, c := range pod.GetContainers() { m.Info("%s: releasing resources for container %s...", method, c.PrettyName()) if err := m.policy.ReleaseResources(c); err != nil { m.Warn("%s: failed to release container %s: %v", method, c.PrettyName(), err) } c.UpdateState(cache.ContainerStateExited) released = append(released, c) } if err := m.runPostReleaseHooks(ctx, method, released...); err != nil { m.Error("%s: failed to run post-release hooks for pod %s: %v", method, pod.GetName(), err) } m.updateIntrospection() return reply, rqerr } // RemovePod intercepts CRI requests for Pod removal. func (m *resmgr) RemovePod(ctx context.Context, method string, request interface{}, handler server.Handler) (interface{}, error) { reply, rqerr := handler(ctx, request) m.Lock() defer m.Unlock() podID := request.(*criv1.RemovePodSandboxRequest).PodSandboxId pod, ok := m.cache.LookupPod(podID) if !ok { m.Warn("%s: failed to look up pod %s, just passing request through", method, podID) return reply, rqerr } if rqerr != nil { m.Error("%s: failed to remove pod %s: %v", method, podID, rqerr) } else { m.Info("%s: removed pod %s (%s)...", method, pod.GetName(), podID) } released := []cache.Container{} for _, c := range pod.GetInitContainers() { m.Info("%s: removing stale init-container %s...", method, c.PrettyName()) if err := m.policy.ReleaseResources(c); err != nil { m.Warn("%s: failed to release init-container %s: %v", method, c.PrettyName(), err) } c.UpdateState(cache.ContainerStateStale) released = append(released, c) } for _, c := range pod.GetContainers() { m.Info("%s: removing stale container %s...", method, c.PrettyName()) if err := m.policy.ReleaseResources(c); err != nil { m.Warn("%s: failed to release container %s: %v", method, c.PrettyName(), err) } c.UpdateState(cache.ContainerStateStale) released = append(released, c) } if err := m.runPostReleaseHooks(ctx, method, released...); err != nil { m.Error("%s: failed to run post-release hooks for pod %s: %v", method, pod.GetName(), err) } m.cache.DeletePod(podID) m.updateIntrospection() return reply, rqerr } // CreateContainer intercepts CRI requests for Container creation. 
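//
// Unlike in the pod stop/remove interceptors, resources are allocated and the
// pre-create/post-allocate hooks are run *before* the request is forwarded to
// the runtime, so the forwarded request already carries the adjusted resource
// assignment. On any failure the allocation is rolled back and the container
// is dropped from the cache.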
func (m *resmgr) CreateContainer(ctx context.Context, method string, request interface{}, handler server.Handler) (interface{}, error) { m.Lock() defer m.Unlock() // kubelet doesn't always clean up crashed containers so we try doing it here if msg, ok := request.(*criv1.CreateContainerRequest); ok { if pod, ok := m.cache.LookupPod(msg.PodSandboxId); ok { if msg.Config != nil && msg.Config.Metadata != nil { if c, ok := pod.GetContainer(msg.Config.Metadata.Name); ok { m.Warn("re-creation of container %s, releasing old one", c.PrettyName()) m.policy.ReleaseResources(c) } } } } container, err := m.cache.InsertContainer(request) if err != nil { m.Error("%s: failed to insert new container to cache: %v", method, err) return nil, resmgrError("%s: failed to insert new container to cache: %v", method, err) } container.SetCRIRequest(request) m.Info("%s: creating container %s...", method, container.PrettyName()) if err := m.policy.AllocateResources(container); err != nil { m.Error("%s: failed to allocate resources for container %s: %v", method, container.PrettyName(), err) m.cache.DeleteContainer(container.GetCacheID()) return nil, resmgrError("failed to allocate container resources: %v", err) } container.InsertMount(&cache.Mount{ Container: "/.cri-resmgr", Host: m.cache.ContainerDirectory(container.GetCacheID()), Readonly: true, Propagation: cache.MountHostToContainer, }) if err := m.runPostAllocateHooks(ctx, method); err != nil { m.Error("%s: failed to run post-allocate hooks for %s: %v", method, container.PrettyName(), err) m.policy.ReleaseResources(container) m.runPostReleaseHooks(ctx, method, container) m.cache.DeleteContainer(container.GetCacheID()) return nil, resmgrError("failed to allocate container resources: %v", err) } container.ClearCRIRequest() reply, rqerr := handler(ctx, request) if rqerr != nil { m.Error("%s: failed to create container %s: %v", method, container.PrettyName(), rqerr) m.policy.ReleaseResources(container) m.runPostReleaseHooks(ctx, method, container) m.cache.DeleteContainer(container.GetCacheID()) return nil, resmgrError("failed to create container: %v", rqerr) } m.cache.UpdateContainerID(container.GetCacheID(), reply) container.UpdateState(cache.ContainerStateCreated) m.updateIntrospection() return reply, nil } // StartContainer intercepts CRI requests for starting Containers. 
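//
// A container is only started if it is in the created state. On success it is
// marked running, a ContainerStarted policy event is delivered to the active
// policy, and the post-start hooks are run.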
func (m *resmgr) StartContainer(ctx context.Context, method string, request interface{},
	handler server.Handler) (interface{}, error) {
	m.Lock()
	defer m.Unlock()

	containerID := request.(*criv1.StartContainerRequest).ContainerId
	container, ok := m.cache.LookupContainer(containerID)
	if !ok {
		m.Warn("%s: failed to look up container %s, just passing request through",
			method, containerID)
		return handler(ctx, request)
	}

	m.Info("%s: starting container %s...", method, container.PrettyName())

	if container.GetState() != cache.ContainerStateCreated {
		m.Error("%s: refusing to start container %s in unexpected state %v",
			method, container.PrettyName(), container.GetState())
		return nil, resmgrError("refusing to start container %s in unexpected state %v",
			container.PrettyName(), container.GetState())
	}

	reply, rqerr := handler(ctx, request)
	if rqerr != nil {
		m.Error("%s: failed to start container %s: %v", method, container.PrettyName(), rqerr)
		return nil, rqerr
	}

	container.UpdateState(cache.ContainerStateRunning)

	e := &events.Policy{
		Type:   events.ContainerStarted,
		Source: "resource-manager",
		Data:   container,
	}
	if _, err := m.policy.HandleEvent(e); err != nil {
		m.Error("%s: policy failed to handle event %s: %v", method, e.Type, err)
	}

	if err := m.runPostStartHooks(method, container); err != nil {
		m.Error("%s: failed to run post-start hooks for %s: %v",
			method, container.PrettyName(), err)
	}

	m.updateIntrospection()

	return reply, rqerr
}

// StopContainer intercepts CRI requests for stopping Containers.
func (m *resmgr) StopContainer(ctx context.Context, method string, request interface{},
	handler server.Handler) (interface{}, error) {
	reply, rqerr := handler(ctx, request)

	m.Lock()
	defer m.Unlock()

	containerID := request.(*criv1.StopContainerRequest).ContainerId
	container, ok := m.cache.LookupContainer(containerID)
	if !ok {
		m.Warn("%s: failed to look up container %s, just passing request through",
			method, containerID)
		return reply, rqerr
	}

	if rqerr != nil {
		m.Error("%s: failed to stop container %s: %v", method, container.PrettyName(), rqerr)
		return reply, rqerr
	}

	m.Info("%s: stopped container %s...", method, container.PrettyName())

	// Notes:
	//   For now, we assume any error replies from CRI are about the container not
	//   being found, in which case we still go ahead and finish locally stopping it...
	if err := m.policy.ReleaseResources(container); err != nil {
		m.Error("%s: failed to release resources for container %s: %v",
			method, container.PrettyName(), err)
	}

	container.UpdateState(cache.ContainerStateExited)

	if err := m.runPostReleaseHooks(ctx, method, container); err != nil {
		m.Error("%s: failed to run post-release hooks for %s: %v",
			method, container.PrettyName(), err)
	}

	m.updateIntrospection()

	return reply, rqerr
}

// RemoveContainer intercepts CRI requests for Container removal.
func (m *resmgr) RemoveContainer(ctx context.Context, method string, request interface{}, handler server.Handler) (interface{}, error) { reply, rqerr := handler(ctx, request) m.Lock() defer m.Unlock() containerID := request.(*criv1.RemoveContainerRequest).ContainerId container, ok := m.cache.LookupContainer(containerID) if !ok { m.Warn("%s: failed to look up container %s, just passing request through", method, containerID) return reply, rqerr } if rqerr != nil { m.Error("%s: failed to remove container %s: %v", method, container.PrettyName(), rqerr) } else { m.Info("%s: removed container %s...", method, container.PrettyName()) } if err := m.policy.ReleaseResources(container); err != nil { m.Error("%s: failed to release resources for container %s: %v", method, container.PrettyName(), err) } container.UpdateState(cache.ContainerStateStale) if err := m.runPostReleaseHooks(ctx, method, container); err != nil { m.Error("%s: failed to run post-release hooks for %s: %v", method, container.PrettyName(), err) } m.updateIntrospection() return reply, rqerr } // ListContainers intercepts CRI requests for listing Containers. func (m *resmgr) ListContainers(ctx context.Context, method string, request interface{}, handler server.Handler) (interface{}, error) { reply, rqerr := handler(ctx, request) if rqerr != nil { return reply, rqerr } if f := request.(*criv1.ListContainersRequest).Filter; f != nil { if f.Id != "" || f.State != nil || f.PodSandboxId != "" || len(f.LabelSelector) > 0 { return reply, nil } } m.Lock() defer m.Unlock() clistmap := map[string]*criv1.Container{} released := []cache.Container{} for _, listed := range reply.(*criv1.ListContainersResponse).Containers { clistmap[listed.Id] = listed if listed.State != criv1.ContainerState_CONTAINER_EXITED { continue } if c, ok := m.cache.LookupContainer(listed.Id); ok { state := c.GetState() if state == cache.ContainerStateRunning || state == cache.ContainerStateCreated { m.Info("%s: exited, releasing its resources...", c.PrettyName()) if err := m.policy.ReleaseResources(c); err != nil { m.Error("%s: failed to release resources for container %s: %v", method, c.PrettyName(), err) } c.UpdateState(cache.ContainerStateExited) released = append(released, c) } } } for _, c := range m.cache.GetContainers() { if c.GetState() == cache.ContainerStateRunning { if _, ok := clistmap[c.GetID()]; !ok { m.Info("%s: absent from runtime, releasing its resources...", c.PrettyName()) if err := m.policy.ReleaseResources(c); err != nil { m.Error("%s: failed to release resources for container %s: %v", method, c.PrettyName(), err) } c.UpdateState(cache.ContainerStateStale) released = append(released, c) } } } if len(released) > 0 { if err := m.runPostReleaseHooks(ctx, method, released...); err != nil { m.Error("%s: failed to run post-release hooks: %v", method, err) } } m.updateIntrospection() return reply, nil } // UpdateContainer intercepts CRI requests for updating Containers. func (m *resmgr) UpdateContainer(_ context.Context, _ string, _ interface{}, _ server.Handler) (interface{}, error) { m.Lock() defer m.Unlock() // // Notes: // Once VPA is fully implemented, we need to start passing these // requests on to the active policy. 
// // containerID := request.(*criv1.UpdateContainerResourcesRequest).ContainerId // container, ok := m.cache.LookupContainer(containerID) // if !ok { // m.Warn("%s: failed to look up container %s, just passing request through", // method, containerID) // return handler(ctx, request) // } // // err := m.policy.UpdateResources(container) // if err != nil { // m.Error("%s: failed to update resources of container %s: %v", method, containerID, err) // return nil, err // } // // err := m.runPostUpdateHooks(ctx, method) // if err != nil { // m.Warn("%s: failed to run post-update hooks for update of container %s: %v", // method, containerID, err) // } // // return &criv1.UpdateContainerResourcesResponse{}, nil // if !m.warnedCRIUpdate { m.Warn("CRI UpdateContainerResources request received. Unless Vertical") m.Warn("Pod Autoscaling is fully implemented, this usually indicates that") m.Warn("kubelet is running with CPU Manager enabled and 'static' or some") m.Warn("other than 'none' policy active. This does not make much sense when") m.Warn("CRI Resource Manager is also active and on the kubelet-runtime") m.Warn("signalling path. Please consider disabling CPU Manager or setting") m.Warn("its active policy to 'none'.") m.warnedCRIUpdate = true } return &criv1.UpdateContainerResourcesResponse{}, nil } // RebalanceContainers tries to find a more optimal container resource allocation if necessary. func (m *resmgr) RebalanceContainers() error { m.Lock() defer m.Unlock() m.Info("rebalancing (reallocating) containers...") return m.rebalance("Rebalance") } // rebalance triggers a policy-specific rebalancing cycle of containers. func (m *resmgr) rebalance(method string) error { if m.policy == nil { return nil } changes, err := m.policy.Rebalance() if err != nil { m.Error("%s: rebalancing of containers failed: %v", method, err) } if changes { if err := m.runPostUpdateHooks(context.Background(), method); err != nil { m.Error("%s: failed to run post-update hooks: %v", method, err) return resmgrError("%s: failed to run post-update hooks: %v", method, err) } } return m.cache.Save() } // DeliverPolicyEvent delivers a policy-specific event to the active policy. func (m *resmgr) DeliverPolicyEvent(e *events.Policy) error { m.Lock() defer m.Unlock() if m.policy == nil { return nil } if e.Source == "" { e.Source = "unspecified" } m.Info("delivering policy event %s.%s...", e.Source, e.Type) method := "DeliverPolicyEvent" changes, err := m.policy.HandleEvent(e) if err != nil { m.Error("%s: handling event %s.%s failed: %v", method, e.Source, e.Type, err) return err } if changes { if err = m.runPostUpdateHooks(context.Background(), method); err != nil { m.Error("%s: failed to run post-update hooks: %v", method, err) return resmgrError("%s: failed to run post-update hooks: %v", method, err) } } m.cache.Save() return nil } // setConfig activates a new configuration, either from the agent or from a file. 
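//
// The argument selects the configuration source: a *config.RawConfig pushed
// by the node agent, or a string naming a configuration file. For example
// (a hypothetical call with a made-up path, for illustration only):
//
//	if err := m.setConfig("/etc/cri-resmgr/fallback.cfg"); err != nil {
//		m.Error("failed to apply configuration: %v", err)
//	}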
func (m *resmgr) setConfig(v interface{}) error { var err error m.Lock() defer m.Unlock() switch cfg := v.(type) { case *config.RawConfig: err = pkgcfg.SetConfig(cfg.Data) case string: err = pkgcfg.SetConfigFromFile(cfg) default: err = fmt.Errorf("invalid configuration source/type %T", v) } if err != nil { m.Error("configuration rejected: %v", err) return resmgrError("configuration rejected: %v", err) } if m.policy != nil { // synchronize state of controllers with new configuration if err = m.control.StartStopControllers(m.cache, m.relay.Client()); err != nil { m.Error("failed to synchronize controllers with new configuration: %v", err) return resmgrError("failed to synchronize controllers with new configuration: %v", err) } if err = m.runPostUpdateHooks(context.Background(), "setConfig"); err != nil { m.Error("failed to run post-update hooks after reconfiguration: %v", err) return resmgrError("failed to run post-update hooks after reconfiguration: %v", err) } } // if we managed to activate a configuration from the agent, store it in the cache if cfg, ok := v.(*config.RawConfig); ok { m.cache.SetConfig(cfg) } m.Info("successfully switched to new configuration") return nil } // runPostAllocateHooks runs the necessary hooks after allocating resources for some containers. func (m *resmgr) runPostAllocateHooks(ctx context.Context, method string) error { for _, c := range m.cache.GetPendingContainers() { switch c.GetState() { case cache.ContainerStateRunning, cache.ContainerStateCreated: if err := m.control.RunPostUpdateHooks(c); err != nil { m.Warn("%s post-update hook failed for %s: %v", method, c.PrettyName(), err) } if req, ok := c.ClearCRIRequest(); ok { if _, err := m.sendCRIRequest(ctx, req); err != nil { m.Warn("%s update of container %s failed: %v", method, c.PrettyName(), err) } } m.policy.ExportResourceData(c) case cache.ContainerStateCreating: if err := m.control.RunPreCreateHooks(c); err != nil { m.Warn("%s pre-create hook failed for %s: %v", method, c.PrettyName(), err) } m.policy.ExportResourceData(c) default: m.Warn("%s: skipping container %s (in state %v)", method, c.PrettyName(), c.GetState()) } } return nil } // runPostStartHooks runs the necessary hooks after having started a container. 
func (m *resmgr) runPostStartHooks(method string, c cache.Container) error {
	if err := m.control.RunPostStartHooks(c); err != nil {
		m.Error("%s: post-start hook failed for %s: %v", method, c.PrettyName(), err)
	}
	return nil
}

// runPostReleaseHooks runs the necessary hooks after releasing the resources of some containers.
func (m *resmgr) runPostReleaseHooks(ctx context.Context, method string,
	released ...cache.Container) error {
	for _, c := range released {
		if err := m.control.RunPostStopHooks(c); err != nil {
			m.Warn("post-stop hook failed for %s: %v", c.PrettyName(), err)
		}
		if c.GetState() == cache.ContainerStateStale {
			m.cache.DeleteContainer(c.GetCacheID())
		}
	}
	for _, c := range m.cache.GetPendingContainers() {
		switch state := c.GetState(); state {
		case cache.ContainerStateStale, cache.ContainerStateExited:
			if err := m.control.RunPostStopHooks(c); err != nil {
				m.Warn("post-stop hook failed for %s: %v", c.PrettyName(), err)
			}
			if state == cache.ContainerStateStale {
				m.cache.DeleteContainer(c.GetCacheID())
			}
		case cache.ContainerStateRunning, cache.ContainerStateCreated:
			if err := m.control.RunPostUpdateHooks(c); err != nil {
				m.Warn("post-update hook failed for %s: %v", c.PrettyName(), err)
			}
			if req, ok := c.ClearCRIRequest(); ok {
				if _, err := m.sendCRIRequest(ctx, req); err != nil {
					m.Warn("update of container %s failed: %v", c.PrettyName(), err)
				}
			}
			m.policy.ExportResourceData(c)
		default:
			m.Warn("%s: skipping pending container %s (in state %v)",
				method, c.PrettyName(), c.GetState())
		}
	}
	return nil
}

// runPostUpdateHooks runs the necessary hooks after reconciliation.
func (m *resmgr) runPostUpdateHooks(ctx context.Context, method string) error {
	for _, c := range m.cache.GetPendingContainers() {
		switch c.GetState() {
		case cache.ContainerStateRunning, cache.ContainerStateCreated:
			if err := m.control.RunPostUpdateHooks(c); err != nil {
				return err
			}
			if req, ok := c.GetCRIRequest(); ok {
				if _, err := m.sendCRIRequest(ctx, req); err != nil {
					m.Warn("%s update of container %s failed: %v",
						method, c.PrettyName(), err)
				} else {
					c.ClearCRIRequest()
				}
			}
			m.policy.ExportResourceData(c)
		default:
			m.Warn("%s: skipping container %s (in state %v)",
				method, c.PrettyName(), c.GetState())
		}
	}
	return nil
}

// sendCRIRequest sends the given CRI request, returning the received reply and error.
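//
// Currently UpdateContainerResourcesRequest is the only request type that can
// originate from the resource manager itself, so it is the only type handled here.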
func (m *resmgr) sendCRIRequest(ctx context.Context, request interface{}) (interface{}, error) {
	client := m.relay.Client()
	switch req := request.(type) {
	case *criv1.UpdateContainerResourcesRequest:
		m.Debug("sending update request for container %s...", req.ContainerId)
		return client.UpdateContainerResources(ctx, req)
	default:
		return nil, resmgrError("sendCRIRequest: unhandled request type %T", request)
	}
}

func (m *resmgr) checkRuntime(ctx context.Context) error {
	version, err := m.relay.Client().Version(ctx, &criv1.VersionRequest{
		Version: kubeAPIVersion,
	})
	if err != nil {
		return resmgrError("failed to query runtime version: %v", err)
	}

	for _, name := range knownRuntimes {
		if strings.HasPrefix(version.RuntimeName, name) {
			return nil
		}
	}

	if opt.AllowUntestedRuntimes {
		m.Warnf("running with untested/unknown runtime %q", version.RuntimeName)
		return nil
	}

	return rejectRuntimeError(version.RuntimeName)
}

func rejectRuntimeError(name string) error {
	return resmgrError("rejecting untested runtime %s, use --%s to allow it",
		name, allowUntestedRuntimesFlag)
}

================================================
FILE: pkg/cri/resource-manager/resource-manager.go
================================================
// Copyright 2019 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package resmgr

import (
	"context"
	"os"
	"os/signal"
	"path/filepath"
	"strings"
	"sync"

	"golang.org/x/sys/unix"

	pkgcfg "github.com/intel/cri-resource-manager/pkg/config"
	"github.com/intel/cri-resource-manager/pkg/cri/relay"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/agent"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache"
	config "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/config"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/control"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/introspect"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/metrics"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy"
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/visualizer"
	"github.com/intel/cri-resource-manager/pkg/instrumentation"
	logger "github.com/intel/cri-resource-manager/pkg/log"
	"github.com/intel/cri-resource-manager/pkg/pidfile"
	"github.com/intel/cri-resource-manager/pkg/sysfs"
	"github.com/intel/cri-resource-manager/pkg/topology"

	policyCollector "github.com/intel/cri-resource-manager/pkg/policycollector"
	"github.com/intel/cri-resource-manager/pkg/utils"
)

// ResourceManager is the interface we expose for controlling the CRI resource manager.
type ResourceManager interface {
	// Start starts the resource manager.
	Start() error
	// Stop stops the resource manager.
	Stop()
	// SetConfig dynamically updates the resource manager configuration.
	SetConfig(*config.RawConfig) error
	// SetAdjustment dynamically updates external adjustments.
	SetAdjustment(*config.Adjustment) map[string]error
	// SendEvent sends an event to be processed by the resource manager.
	SendEvent(event interface{}) error
	// Add-ons for testing.
	ResourceManagerTestAPI
}

// resmgr is the implementation of ResourceManager.
type resmgr struct {
	logger.Logger
	sync.RWMutex
	relay           relay.Relay        // our CRI relay
	cache           cache.Cache        // cached state
	policy          policy.Policy      // resource manager policy
	policySwitch    bool               // active policy is being switched
	configServer    config.Server      // configuration management server
	control         control.Control    // policy controllers/enforcement
	agent           agent.Interface    // connection to cri-resmgr agent
	conf            *config.RawConfig  // configuration pending to be saved in the cache
	metrics         *metrics.Metrics   // metrics collector/pre-processor
	events          chan interface{}   // channel for delivering events
	stop            chan interface{}   // channel for signalling shutdown to goroutines
	signals         chan os.Signal     // signal channel
	introspect      *introspect.Server // server for external introspection
	warnedCRIUpdate bool               // warned about CRI UpdateContainer calls
}

// NewResourceManager creates a new ResourceManager instance.
func NewResourceManager() (ResourceManager, error) {
	m := &resmgr{Logger: logger.NewLogger("resource-manager")}

	if err := m.setupCache(); err != nil {
		return nil, err
	}

	sysfs.SetSysRoot(opt.HostRoot)
	topology.SetSysRoot(opt.HostRoot)

	switch {
	case opt.ResetPolicy && opt.ResetConfig:
		os.Exit(m.resetCachedPolicy() + m.resetCachedConfig())
	case opt.ResetPolicy:
		os.Exit(m.resetCachedPolicy())
	case opt.ResetConfig:
		os.Exit(m.resetCachedConfig())
	}

	if err := m.checkOpts(); err != nil {
		return nil, err
	}

	if err := m.setupAgentInterface(); err != nil {
		return nil, err
	}

	if err := m.loadConfig(); err != nil {
		return nil, err
	}

	if err := m.setupConfigServer(); err != nil {
		return nil, err
	}

	if err := m.setupPolicy(); err != nil {
		return nil, err
	}

	if err := m.registerPolicyMetricsCollector(); err != nil {
		return nil, err
	}

	if err := m.setupRelay(); err != nil {
		pid, _ := pidfile.OwnerPid()
		if pid > 0 {
			m.Error("looks like we're already running as pid %d...", pid)
		}
		return nil, err
	}

	if err := m.setupRequestProcessing(); err != nil {
		return nil, err
	}

	if err := m.setupEventProcessing(); err != nil {
		return nil, err
	}

	if err := m.setupControllers(); err != nil {
		return nil, err
	}

	if err := m.setupIntrospection(); err != nil {
		return nil, err
	}

	return m, nil
}

// Start starts the resource manager.
func (m *resmgr) Start() error {
	m.Info("starting...")

	m.Lock()
	defer m.Unlock()

	if err := m.checkRuntime(context.Background()); err != nil {
		return err
	}

	if err := m.startControllers(); err != nil {
		return err
	}

	if err := m.startRequestProcessing(); err != nil {
		return err
	}

	if err := m.startEventProcessing(); err != nil {
		return err
	}

	m.startIntrospection()

	if err := m.relay.Start(); err != nil {
		return resmgrError("failed to start CRI relay: %v", err)
	}

	if err := pidfile.Remove(); err != nil {
		return resmgrError("failed to remove stale/old PID file: %v", err)
	}
	if err := pidfile.Write(); err != nil {
		return resmgrError("failed to write PID file: %v", err)
	}

	if opt.ForceConfig == "" {
		if err := m.configServer.Start(opt.ConfigSocket); err != nil {
			return resmgrError("failed to start configuration server: %v", err)
		}

		// We never store a forced configuration in the cache. However, if we're not
		// running with a forced configuration, and the configuration is pending to
		// get stored in the cache (IOW, it is a new one acquired from an agent),
		// then store it in the cache now.
		if m.conf != nil {
			m.cache.SetConfig(m.conf)
			m.conf = nil
		}
	}

	m.Info("up and running")

	return nil
}

// Stop stops the resource manager.
func (m *resmgr) Stop() {
	m.Info("shutting down...")

	m.Lock()
	defer m.Unlock()

	if m.signals != nil {
		close(m.signals)
		m.signals = nil
	}

	m.configServer.Stop()
	m.relay.Stop()
	m.stopIntrospection()
	m.stopEventProcessing()
}

// SetConfig pushes new configuration to the resource manager.
func (m *resmgr) SetConfig(conf *config.RawConfig) error {
	m.Info("applying new configuration from agent...")
	return m.setConfig(conf)
}

// SetAdjustment pushes new external adjustments to the resource manager.
func (m *resmgr) SetAdjustment(adjustment *config.Adjustment) map[string]error {
	m.Info("applying new adjustments from agent...")
	m.Lock()
	defer m.Unlock()
	return m.setAdjustment(adjustment)
}

// setConfigFromFile pushes new configuration to the resource manager from a file.
func (m *resmgr) setConfigFromFile(path string) error {
	m.Info("applying new configuration from file %s...", path)
	return m.setConfig(path)
}

// setAdjustment pushes new external adjustments to the resource manager.
func (m *resmgr) setAdjustment(adjustments *config.Adjustment) map[string]error {
	m.Info("applying new external adjustments from agent...")
	rebalance, errors := m.cache.SetAdjustment(adjustments)
	if rebalance {
		m.rebalance("setAdjustment")
	}
	return errors
}

// resetCachedPolicy resets the cached active policy and all of its data.
func (m *resmgr) resetCachedPolicy() int {
	m.Info("resetting active policy stored in cache...")
	defer logger.Flush()

	if ls, err := utils.IsListeningSocket(opt.RelaySocket); ls || err != nil {
		m.Error("refusing to reset, looks like an instance of %q is active at socket %q...",
			filepath.Base(os.Args[0]), opt.RelaySocket)
		return 1
	}

	if err := m.cache.ResetActivePolicy(); err != nil {
		m.Error("failed to reset active policy: %v", err)
		return 1
	}
	return 0
}

// resetCachedConfig resets any cached configuration.
func (m *resmgr) resetCachedConfig() int {
	m.Info("resetting cached configuration...")
	defer logger.Flush()

	if ls, err := utils.IsListeningSocket(opt.RelaySocket); ls || err != nil {
		m.Error("refusing to reset, looks like an instance of %q is active at socket %q...",
			filepath.Base(os.Args[0]), opt.RelaySocket)
		return 1
	}

	if err := m.cache.ResetConfig(); err != nil {
		m.Error("failed to reset cached configuration: %v", err)
		return 1
	}
	return 0
}

// setupCache creates a cache and reloads its last saved state if found.
func (m *resmgr) setupCache() error {
	var err error

	options := cache.Options{CacheDir: opt.RelayDir}
	if m.cache, err = cache.NewCache(options); err != nil {
		return resmgrError("failed to create cache: %v", err)
	}
	return nil
}

// setupAgentInterface sets up the connection to the node agent.
func (m *resmgr) setupAgentInterface() error {
	var err error

	if m.agent, err = agent.NewAgentInterface(opt.AgentSocket); err != nil {
		return err
	}
	return nil
}

// setupConfigServer sets up our configuration server for agent notifications.
func (m *resmgr) setupConfigServer() error {
	var err error

	if m.configServer, err = config.NewConfigServer(m.SetConfig, m.SetAdjustment); err != nil {
		return resmgrError("failed to create configuration notification server: %v", err)
	}
	return nil
}

// checkOpts checks the command line options for obvious errors.
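//
// Currently the only check is that a forced and a fallback configuration file
// are not given at the same time, since they are mutually exclusive.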
func (m *resmgr) checkOpts() error {
	if opt.ForceConfig != "" && opt.FallbackConfig != "" {
		return resmgrError("both fallback (%s) and forced (%s) configurations given",
			opt.FallbackConfig, opt.ForceConfig)
	}
	return nil
}

// loadConfig tries to pick and load (initial) configuration from a number of sources.
func (m *resmgr) loadConfig() error {
	//
	// We try to load initial configuration from a number of sources:
	//
	//   1. use forced configuration file if we were given one
	//   2. use last configuration stored in cache, if we have one and it applies
	//   3. use fallback configuration file if we were given one
	//   4. use empty/builtin default configuration, whatever that is...
	//

	if opt.ForceConfig != "" {
		m.Info("using forced configuration %s...", opt.ForceConfig)
		if err := pkgcfg.SetConfigFromFile(opt.ForceConfig); err != nil {
			return resmgrError("failed to load forced configuration %s: %v",
				opt.ForceConfig, err)
		}
		return m.setupConfigSignal(opt.ForceConfigSignal)
	}

	m.Info("trying last cached configuration...")
	if conf := m.cache.GetConfig(); conf != nil {
		err := pkgcfg.SetConfig(conf.Data)
		if err == nil {
			return nil
		}
		m.Error("failed to activate cached configuration: %v", err)
	}

	if opt.FallbackConfig != "" {
		m.Info("using fallback configuration %s...", opt.FallbackConfig)
		if err := pkgcfg.SetConfigFromFile(opt.FallbackConfig); err != nil {
			return resmgrError("failed to load fallback configuration %s: %v",
				opt.FallbackConfig, err)
		}
		return nil
	}

	m.Warn("no initial configuration found")
	return nil
}

// setupConfigSignal sets up a signal handler for reloading forced configuration.
func (m *resmgr) setupConfigSignal(signame string) error {
	if signame == "" || strings.HasPrefix(strings.ToLower(signame), "disable") {
		return nil
	}

	m.Info("setting up signal %s to reload forced configuration", signame)

	sig := unix.SignalNum(signame)
	if int(sig) == 0 {
		return resmgrError("invalid forced configuration reload signal '%s'", signame)
	}

	m.signals = make(chan os.Signal, 1)
	signal.Notify(m.signals, sig)

	go func(signals <-chan os.Signal) {
		// reload the forced configuration on every signal until the channel is closed
		for range signals {
			m.Info("reloading forced configuration %s...", opt.ForceConfig)
			if err := m.setConfigFromFile(opt.ForceConfig); err != nil {
				m.Error("failed to reload forced configuration %s: %v",
					opt.ForceConfig, err)
			}
		}
	}(m.signals)

	return nil
}

// setupPolicy sets up the policy with the configured/active backend.
func (m *resmgr) setupPolicy() error {
	var err error

	active := policy.ActivePolicy()
	cached := m.cache.GetActivePolicy()

	if active != cached {
		if cached != "" {
			if opt.DisablePolicySwitch {
				m.Error("can't switch policy from %q to %q: policy switching disabled",
					cached, active)
				return resmgrError("cannot load cache with policy %s for active policy %s",
					cached, active)
			}
			if err := m.cache.ResetActivePolicy(); err != nil {
				return resmgrError("failed to reset cached policy %q: %v", cached, err)
			}
		}
		m.cache.SetActivePolicy(active)
		m.policySwitch = true
	}

	options := &policy.Options{AgentCli: m.agent, SendEvent: m.SendEvent}
	if m.policy, err = policy.NewPolicy(m.cache, options); err != nil {
		return resmgrError("failed to create policy %s: %v", active, err)
	}

	return nil
}

// setupRelay sets up the CRI request relay.
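//
// Socket options may be given with a "unix://" prefix; the prefix is stripped
// here before the relay is created.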
func (m *resmgr) setupRelay() error {
	var err error

	options := relay.Options{
		RelaySocket:   opt.RelaySocket,
		ImageSocket:   opt.ImageSocket,
		RuntimeSocket: opt.RuntimeSocket,
		QualifyReqFn:  m.disambiguate,
	}

	options.ImageSocket = strings.TrimPrefix(options.ImageSocket, "unix://")
	options.RuntimeSocket = strings.TrimPrefix(options.RuntimeSocket, "unix://")
	options.RelaySocket = strings.TrimPrefix(options.RelaySocket, "unix://")

	if m.relay, err = relay.NewRelay(options); err != nil {
		return resmgrError("failed to create CRI relay: %v", err)
	}

	if err = m.relay.Setup(); err != nil {
		return resmgrError("failed to set up CRI relay: %v", err)
	}
	return nil
}

// setupControllers sets up the resource controllers.
func (m *resmgr) setupControllers() error {
	var err error

	if m.control, err = control.NewControl(); err != nil {
		return resmgrError("failed to create resource controller: %v", err)
	}

	return nil
}

// startControllers starts the resource controllers.
func (m *resmgr) startControllers() error {
	if err := m.control.StartStopControllers(m.cache, m.relay.Client()); err != nil {
		return resmgrError("failed to start resource controllers: %v", err)
	}

	return nil
}

// setupIntrospection prepares the resource manager for serving external introspection requests.
func (m *resmgr) setupIntrospection() error {
	mux := instrumentation.GetHTTPMux()

	i, err := introspect.Setup(mux, m.policy.Introspect())
	if err != nil {
		return resmgrError("failed to set up introspection service: %v", err)
	}
	m.introspect = i

	if !opt.DisableUI {
		if err := visualizer.Setup(mux); err != nil {
			m.Error("failed to set up UI for visualization: %v", err)
		}
	} else {
		m.Warn("built-in visualization UIs are disabled")
	}

	return nil
}

// startIntrospection starts serving the external introspection requests.
func (m *resmgr) startIntrospection() {
	m.introspect.Start()
	m.updateIntrospection()
}

// stopIntrospection stops serving external introspection requests.
func (m *resmgr) stopIntrospection() {
	m.introspect.Stop()
}

// updateIntrospection pushes updated data for external introspection.
func (m *resmgr) updateIntrospection() {
	m.introspect.Set(m.policy.Introspect())
}

// registerPolicyMetricsCollector registers the policy metrics collector.
func (m *resmgr) registerPolicyMetricsCollector() error {
	pc := &policyCollector.PolicyCollector{}
	pc.SetPolicy(m.policy)
	if pc.HasPolicySpecificMetrics() {
		return pc.RegisterPolicyMetricsCollector()
	}
	m.Info("%s policy has no policy-specific metrics.", policy.ActivePolicy())
	return nil
}

================================================
FILE: pkg/cri/resource-manager/sockets/sockets.go
================================================
// Copyright 2019 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package sockets

const (
	// Containerd is the CRI socket containerd listens on.
	Containerd = "/var/run/containerd/containerd.sock"
	// ResourceManagerRelay is the CRI socket the resource manager listens on.
	ResourceManagerRelay = "/var/run/cri-resmgr/cri-resmgr.sock"
	// ResourceManagerAgent is the socket the resource manager node agent listens on.
	ResourceManagerAgent = "/var/run/cri-resmgr/cri-resmgr-agent.sock"
	// ResourceManagerConfig is the socket for resource manager configuration notifications.
	ResourceManagerConfig = "/var/run/cri-resmgr/cri-resmgr-config.sock"

	// DirPermissions is the permissions to create the directory for sockets with.
	DirPermissions = 0711
)

================================================
FILE: pkg/cri/resource-manager/test-api.go
================================================
// Copyright 2019 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build test
// +build test

package resmgr

import (
	"github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache"
)

// ResourceManagerTestAPI is a post-test verification helper interface.
type ResourceManagerTestAPI interface {
	// GetCache returns the Cache the resource manager is running with.
	GetCache() cache.Cache
}

func (m *resmgr) GetCache() cache.Cache {
	return m.cache
}

================================================
FILE: pkg/cri/resource-manager/visualizer/bubbles/assets/css/style.css
================================================
body {
  font-family: Arial;
  color: white;
}

a {
  color: white;
  text-decoration: underline;
}

.node {
  cursor: pointer;
}

.node:hover {
  stroke: #000;
  stroke-width: 1.5px;
}

.node--leaf {
  fill: white;
}

.label {
  font: 11px "Helvetica Neue", Helvetica, Arial, sans-serif;
  text-anchor: middle;
  text-shadow: 0 1px 0 #fff, 1px 0 0 #fff, -1px 0 0 #fff, 0 -1px 0 #fff;
}

.label,
.inner--leaf,
.node--root {
  pointer-events: none;
}

.node--leaf:hover {
  fill: gainsboro;
}

.node--leaf:active {
  pointer-events: none;
}

================================================
FILE: pkg/cri/resource-manager/visualizer/bubbles/assets/index.html
================================================
================================================
FILE: pkg/cri/resource-manager/visualizer/bubbles/assets/js/ui-json-adapter.js
================================================
// CRI-RM introspection data to UI JSON data format adaptation.

"use strict";

function AdaptJSON(data) {
    "use strict";
    var root, nodes, containers

    console.log("should translate introspection to d3obj: %o", data)

    root = null
    nodes = new Object()
    containers = new Object()

    // create tree of pools
    for (var name in data.Pools) {
        var p = data.Pools[name]
        var node = new Object()

        console.log("got pool %o: %o", name, p)

        node.name = p.Name
        node.CPUs = p.CPUs
        node.Memory = p.Memory
        node.children = new Array()

        if (p.Parent == "") {
            root = node
            console.log("root set to %o: %o", p.Parent, node)
        }
        nodes[name] = node
    }
    for (var name in data.Pools) {
        var p = data.Pools[name]
        var n = nodes[name]

        if (n == null) {
            console.log("failed to look up node %o", name)
        }
        if (p.Children != null) {
            for (var i = 0; i < p.Children.length; i++) {
                var cname = p.Children[i]
                n.children.push(nodes[cname])
            }
        }
    }

    // create lookup table of containers
    for (var pid in data.Pods) {
        var p = data.Pods[pid]
        console.log("got pod %o", pid)
        for (var cid in p.Containers) {
            var c = p.Containers[cid]
            console.log("got container %o", cid)
            node = new Object()
            node.name = p.Name + ":" + c.Name
            node.CPURequest = c.CPURequest
            node.CPULimit = c.CPULimit
            node.MemoryRequest = c.MemoryRequest
            node.MemoryLimit = c.MemoryLimit
            node.Hints = c.Hints
            node.container = c
            containers[cid] = node
        }
    }

    // attach containers to pools
    for (var cid in data.Assignments) {
        var a = data.Assignments[cid]
        var n = containers[cid]
        var shared = ""
        var exclusive = ""
        var cpu = ""
        var sep = ""

        console.log("got assignment for container %o", cid)

        if (a.SharedCPUs != "") {
            shared = "shared:"+a.SharedCPUs+"(share:"+a.CPUShare+")"
        }
        if (a.ExclusiveCPUs != "") {
            exclusive = "exclusive:"+a.ExclusiveCPUs
        }
        if (exclusive != "") {
            cpu = exclusive
            sep = " + "
        }
        if (shared != "") {
            cpu += sep + shared
        }

        n.CPUs = cpu
        n.Memory = a.Memory
        n.RDTClass = a.RDTClass
        n.BlockIOClass = a.BlockIOClass

        p = nodes[a.Pool]
        p.children.push(n)
    }

    console.log("translated object: %o", root)

    return root
}

================================================
FILE: pkg/cri/resource-manager/visualizer/bubbles/assets/js/ui.js
================================================
var svg = d3.select("svg")
        .attr("preserveAspectRatio", "xMinYMin meet")
        .attr("viewBox", "0 0 800 800"),
    margin = 20,
    diameter = +svg.attr("width"),
    g = svg.append("g").attr("transform", "translate(" + diameter / 2 + "," + diameter / 2 + ")");

var green = d3.color("green");

var color = d3.scaleLinear()
    .domain([-1, 5])
    .range(["hsl(152,80%,80%)", "hsl(228,30%,40%)"])
    .interpolate(d3.interpolateHcl);

var pack = d3.pack()
    .size([diameter - margin, diameter - margin])
    .padding(100);

drawBubbleGraph("/introspect")

function drawBubbleGraph(filename) {
    console.log("redraw")
    g.selectAll("*").remove()
    d3.json(filename, function(error, introspectJSON) {
        if (error) throw error;
        var root = AdaptJSON(introspectJSON)

        root = d3.hierarchy(root)
            .sum(function(d) { return d.CPURequest; })
            .sort(function(a, b) { console.log(b.value + " - " + a.value); return b.value - a.value; });

        var focus = root,
            nodes = pack(root).descendants(),
            view;
        console.log(nodes);
        var circle = g.selectAll("circle")
            .data(nodes)
            .enter().append("circle")
            .attr("class", function(d) {
                console.log("dx: " + d.x + " dy: " + d.y + " dr: " + d.r);
                console.log(d.data.name);
                d.parent ? d.children ?
console.log("node") : console.log("node leaf") : console.log ("node root"); return d.parent ? d.children ? "node" : "node node--leaf" : "node node--root"; }) .on("click", function(d) { if (focus !== d) zoom(d), d3.event.stopPropagation(); }) .on("mouseover", function(d) {return d.children ? null : showData(d);}) .on("mouseout", function(d) {return d.children ? null : clearData(d);}) .style("fill", function(d) { return d.children ? color(d.depth) : null; }) let innercircle = g.selectAll("innercircle") .data(nodes) .enter().append("circle") .attr("class", function(d) { return d.parent ? d.children ? "inner--node" : "inner--leaf" : "inner--root"; }) let innerleaf = g.selectAll(".inner--leaf") .attr("r", function(d) {if (d.data.CPULimit || d.data.CPURequest) return (d.r * d.data.CPULimit / d.data.CPURequest);}) .style("fill-opacity", 0.2) .on("click", function(d) { if (focus !== d) zoom(d), d3.event.stopPropagation(); }) .style("fill", "red"); var text = g.selectAll("text") .data(nodes) .enter().append("text") .attr("class", "label") .style("fill-opacity", function(d) { return d.parent === root ? 1 : 0; }) .style("display", function(d) { return d.parent === root ? "inline" : "none"; }) .text(function(d) { return d.data.name;}); var node = g.selectAll("circle,innerleaf,text"); svg .style("background", color(-1)) .on("click", function() { zoom(root); }); zoomTo([root.x, root.y, root.r * 2 + margin]); function zoom(d) { var focus0 = focus; focus = d; var transition = d3.transition() .duration(d3.event.altKey ? 7500 : 750) .tween("zoom", function(d) { var i = d3.interpolateZoom(view, [focus.x, focus.y, focus.r * 2 + margin]); return function(t) { zoomTo(i(t)); }; }); svg.transition().selectAll("text") .filter(function(d) { return d.parent === focus || this.style.display === "inline"; }) .style("fill-opacity", function(d) { return d.parent === focus ? 
1 : 0; }) .on("start", function(d) { if (d.parent === focus) this.style.display = "inline"; }) .on("end", function(d) { if (d.parent !== focus) this.style.display = "none"; }); } function zoomTo(v) { var k = diameter / v[2]; view = v; node.attr("transform", function(d) { return "translate(" + (d.x - v[0]) * k + "," + (d.y - v[1]) * k + ")"; }); circle.attr("r", function(d) { if (d.children) return d.r *k; if (d.data.CPULimit && d.data.CPURequest) return d.r * k; else return 20 * k ; }) circle.style("fill", function(d) { if (d.children) return color(d.depth); if (!d.data.CPULimit || !d.data.CPURequest)return "grey"; else return color(d.depth);}); innerleaf.attr("r", function(d) { if (d.data.CPULimit && d.data.CPURequest) { if (d.data.CPULimit == d.data.CPURequest) return d.r * k; else return d.r * 2 *k; }}); } let current_circle = undefined; function clearData(d) { console.log("CLEAR DATA"); svg.selectAll("#details-popup").remove(); } function showData(d) { // clean up the previously selected circle if(current_circle !== undefined){ svg.selectAll("#details-popup").remove(); } console.log("here I am" + d.data.name); // select the circle current_circle = d3.select(this); console.log("here"); console.log(current_circle); let textblock = svg.selectAll("#details-popup") .data([d]) .enter() .append("g") .attr("id", "details-popup") .attr("font-size", 14) .attr("font-family", "sans-serif") .attr("text-anchor", "start") .attr("transform", d => `translate(0, 20)`); textblock.append("text") .text("Details:") .attr("font-weight", "bold"); textblock.append("text") .text(d => "Name: " + d.data.name) .attr("y", "16"); textblock.append("text") .text(d => "CPUs: " + d.data.CPUs) .attr("y", "32"); textblock.append("text") .text(d => "CPU Request: " + d.data.CPURequest) .attr("y", "48"); textblock.append("text") .text(d => "CPU Limit: " + d.data.CPULimit) .attr("y", "64"); textblock.append("text") .text(d => "Memory Request: " + d.data.MemoryRequest) .attr("y", "80"); textblock.append("text") .text(d => "Memory Limit: " + d.data.MemoryLimit) .attr("y", "96"); } }); } ================================================ FILE: pkg/cri/resource-manager/visualizer/bubbles/assets.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build test // +build test package bubbles import ( "net/http" ) // Assets is our UI assets for 'bubbles' visualizer, to serve over HTTP. var Assets = http.Dir("assets") ================================================ FILE: pkg/cri/resource-manager/visualizer/bubbles/assets_generate.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License.
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build ignore // +build ignore package main import ( "fmt" visualizer "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/visualizer/bubbles" "github.com/shurcooL/vfsgen" "log" ) const ( name = "bubbles" ) func main() { opts := vfsgen.Options{ PackageName: name, BuildTags: "!test", VariableName: "Assets", Filename: "assets_gendata.go", } if err := vfsgen.Generate(visualizer.Assets, opts); err != nil { log.Fatalln(fmt.Sprintf("failed to generate assets for %s UI: %v", name, err)) } } ================================================ FILE: pkg/cri/resource-manager/visualizer/bubbles/doc.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package bubbles import ( // The blank import is to make govendor happy. _ "github.com/shurcooL/vfsgen" ) //go:generate go run -tags=test assets_generate.go ================================================ FILE: pkg/cri/resource-manager/visualizer/builtins.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. //go:build !dev // +build !dev package visualizer import ( // Pull in builtin visualizer implementations. "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/visualizer/bubbles" ) func init() { visualizers.register("bubbles", bubbles.Assets) } ================================================ FILE: pkg/cri/resource-manager/visualizer/flags.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // limitations under the License. package visualizer import ( "flag" ) // externalDirs is a comma-separated list of directories to search for visualizers. var externalDirs string // Register our command line options. func init() { flag.StringVar(&externalDirs, "external-visualizers", "", "comma-separated list of directories to search for external visualizers.") } ================================================ FILE: pkg/cri/resource-manager/visualizer/visualizer.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package visualizer import ( "fmt" "net/http" "os" "path/filepath" "sort" "strings" xhttp "github.com/intel/cri-resource-manager/pkg/instrumentation/http" logger "github.com/intel/cri-resource-manager/pkg/log" ) const ( // HTTP URI prefix to register all visualizer implementations under. visualizerPrefix = "/ui" ) // Our logger instance. var log = logger.NewLogger("visualizer") // visualizer captures our runtime state. type visualizer struct { builtin map[string]http.FileSystem } // Visualizer singleton instance. var visualizers = &visualizer{ builtin: map[string]http.FileSystem{}, } // Register registers a builtin visualizer implementation. func Register(name string, dir http.FileSystem) { visualizers.register(name, dir) } // Setup sets up the given multiplexer to serve visualization implementations. func Setup(mux *xhttp.ServeMux) error { log.Info("activating visualization interface...") mux.Handle("/", http.RedirectHandler("/ui/index.html", http.StatusFound)) mux.Handle("/ui", http.RedirectHandler("/ui/index.html", http.StatusFound)) mux.Handle("/ui/builtin/", http.FileServer(visualizers)) mux.Handle("/ui/external/", http.FileServer(visualizers)) mux.HandleFunc("/ui/index.html", visualizers.generateIndexHTML) return nil } // Open is the http.FileSystem implementation for our visualizers. func (v *visualizer) Open(path string) (http.File, error) { log.Debug("HTTP request %q", path) relative, err := filepath.Rel(visualizerPrefix+"/", path) if err != nil { return nil, visualizerError("failed to resolve path %q: %v", path, err) } log.Debug("%s => %s", path, relative) split := strings.Split(relative, "/") if len(split) < 2 { return nil, visualizerError("failed to resolve relative path %q", relative) } kind, name := split[0], split[1] fs, err := v.getVisualizerFileSystem(kind, name) if err != nil { return nil, err } return fs.Open(filepath.Join(split[2:]...)) }
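// For example (derived from the code above, not part of the original file): a request for "/ui/builtin/bubbles/css/style.css" is resolved by Open to the relative path "builtin/bubbles/css/style.css", split into kind "builtin", name "bubbles" and the remaining "css/style.css", which is then opened from the http.FileSystem registered for the bubbles visualizer (http.Dir("assets") under the 'test' build tag, the vfsgen-generated filesystem otherwise).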
// getVisualizerFileSystem returns the http.FileSystem for the given visualizer. func (v *visualizer) getVisualizerFileSystem(kind, name string) (http.FileSystem, error) { switch kind { case "builtin": if dir, ok := v.builtin[name]; ok { return dir, nil } return nil, visualizerError("unknown builtin visualization UI %q", name) case "external": external := v.discoverExternalUIs() if path, ok := external[name]; ok { return http.FileSystem(http.Dir(path)), nil } return nil, visualizerError("unknown external visualization UI %q", name) } return nil, visualizerError("unknown visualization UI type %q", kind) } // Index page HTML header and footer. const ( uiPageHTMLHeader = `<html> <head> <title>CRI Resource Manager - Workload Placement Visualization</title> </head> <body> ` uiPageHTMLFooter = ` </body> </html> ` ) // generateIndexHTML generates an HTML page to access all known visualization UIs. func (v *visualizer) generateIndexHTML(w http.ResponseWriter, _ *http.Request) { builtinUIs := []string{} for name := range v.builtin { builtinUIs = append(builtinUIs, name) } sort.Strings(builtinUIs) externalUIs := []string{} for name := range v.discoverExternalUIs() { externalUIs = append(externalUIs, name) } sort.Strings(externalUIs) fmt.Fprintf(w, "%s", uiPageHTMLHeader) if len(builtinUIs)+len(externalUIs) == 0 { fmt.Fprintf(w, "No builtin or external visualization UIs found.") } else { for _, name := range builtinUIs { fmt.Fprintf(w, "&nbsp;&nbsp;&bull; <a href=\"builtin/%s/index.html\">%s</a><br>\n", name, name) } for _, name := range externalUIs { fmt.Fprintf(w, "&nbsp;&nbsp;&bull; <a href=\"external/%s/index.html\">external %s</a><br>\n", name, name) } } fmt.Fprintf(w, "%s\r\n", uiPageHTMLFooter) } // register registers a builtin visualizer implementation. func (v *visualizer) register(name string, dir http.FileSystem) { if _, ok := v.builtin[name]; ok { log.Error("builtin visualizer '%s' already registered", name) return } v.builtin[name] = dir log.Info("registered %s builtin visualizer...", name) } // discoverExternalUIs returns a map of external visualizer implementations. func (v *visualizer) discoverExternalUIs() map[string]string { external := make(map[string]string) for _, root := range strings.Split(externalDirs, ",") { filepath.Walk(root, func(path string, info os.FileInfo, err error) error { if err != nil || info.IsDir() || info.Name() != "index.html" { return nil } dir, err := filepath.Abs(filepath.Dir(path)) if err != nil { log.Error("failed to determine absolute directory for '%s': %v", path, err) return nil } name := v.uniqueExternalUIName(dir, external) external[name] = dir log.Debug("found external visualizer '%s' (%s)", name, dir) return nil }) } return external } // uniqueExternalUIName generates a unique name for the external visualizer. func (v *visualizer) uniqueExternalUIName(dir string, others map[string]string) string { base := filepath.Base(dir) if base == "assets" { base = filepath.Base(filepath.Dir(dir)) } cnt := 0 name := base for { if cnt > 0 { name = base + fmt.Sprintf("-%d", cnt) } if _, ok := others[name]; !ok { return name } cnt++ } } // visualizerError returns a formatted package-specific error. func visualizerError(format string, args ...interface{}) error { return fmt.Errorf("visualizer: "+format, args...) } ================================================ FILE: pkg/cri/server/server.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package server import ( "context" "fmt" "net" "os" "os/user" "path/filepath" "strconv" "strings" "time" "google.golang.org/grpc" criv1 "k8s.io/cri-api/pkg/apis/runtime/v1" "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/sockets" "github.com/intel/cri-resource-manager/pkg/dump" logger "github.com/intel/cri-resource-manager/pkg/log" "github.com/intel/cri-resource-manager/pkg/utils" "github.com/intel/cri-resource-manager/pkg/instrumentation" "go.opencensus.io/trace" ) // Options contains the configurable options of our CRI server. type Options struct { // Socket is the path of our gRPC server's unix-domain socket. Socket string // User is the user ID for our gRPC socket. User int // Group is the group ID for our gRPC socket. Group int // Mode is the permission mode bits for our gRPC socket. Mode os.FileMode // QualifyReqFn produces a qualifier for disambiguating a CRI request/reply. QualifyReqFn func(interface{}) string } // Handler is a CRI server generic request handler. type Handler grpc.UnaryHandler // Interceptor is a hook that intercepts processing a request by a handler. type Interceptor func(context.Context, string, interface{}, Handler) (interface{}, error)
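// // For illustration only (a sketch, not part of the original source): a // catch-all interceptor that times each handler could be registered on a // Server created with NewServer as // //	err := srv.RegisterInterceptors(map[string]Interceptor{ //		"*": func(ctx context.Context, name string, req interface{}, h Handler) (interface{}, error) { //			start := time.Now() //			rpl, err := h(ctx, req) //			fmt.Printf("%s took %v\n", name, time.Since(start)) //			return rpl, err //		}, //	}) // // The "*" key acts as a wildcard: getInterceptor below falls back to it for // any method that has no more specific entry.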
// Server is the interface we expose for controlling our CRI server. type Server interface { // RegisterImageService registers the provided image service with the server. RegisterImageService(criv1.ImageServiceServer) error // RegisterRuntimeService registers the provided runtime service with the server. RegisterRuntimeService(criv1.RuntimeServiceServer) error // RegisterInterceptors registers the given interceptors with the server. RegisterInterceptors(map[string]Interceptor) error // Start starts the request processing loop (goroutine) of the server. Start() error // Stop stops the request processing loop (goroutine) of the server. Stop() // Chmod changes the permissions of the server's socket. Chmod(mode os.FileMode) error // Chown changes ownership of the server's socket. Chown(uid, gid int) error } // server is the implementation of Server. type server struct { logger.Logger listener net.Listener // socket our gRPC server listens on server *grpc.Server // our gRPC server options Options // server options interceptors map[string]Interceptor // request intercepting hooks runtime *criv1.RuntimeServiceServer // CRI runtime service image *criv1.ImageServiceServer // CRI image service } // NewServer creates a new server instance. func NewServer(options Options) (Server, error) { if !filepath.IsAbs(options.Socket) { return nil, serverError("invalid socket '%s', expecting absolute path", options.Socket) } s := &server{ Logger: logger.NewLogger("cri/server"), options: options, } return s, nil } // RegisterImageService registers an image service with the server. func (s *server) RegisterImageService(service criv1.ImageServiceServer) error { if s.image != nil { return serverError("can't register image service, already registered") } if err := s.createGrpcServer(); err != nil { return err } is := service s.image = &is criv1.RegisterImageServiceServer(s.server, s) return nil } // RegisterRuntimeService registers a runtime service with the server. func (s *server) RegisterRuntimeService(service criv1.RuntimeServiceServer) error { if s.runtime != nil { return serverError("can't register runtime server, already registered") } if err := s.createGrpcServer(); err != nil { return err } rs := service s.runtime = &rs criv1.RegisterRuntimeServiceServer(s.server, s) return nil } // RegisterInterceptors registers the given interceptors with the server. func (s *server) RegisterInterceptors(intercept map[string]Interceptor) error { if s.interceptors == nil { s.interceptors = make(map[string]Interceptor) } for method, i := range intercept { if _, ok := s.interceptors[method]; ok { return serverError("server already has a registered interceptor for '%s'", method) } s.interceptors[method] = i } return nil } // Start starts the server's request processing goroutine. func (s *server) Start() error { s.trainMessageDumper() s.Debug("starting server on socket %s...", s.options.Socket) go func() { s.server.Serve(s.listener) }() s.Debug("waiting for server to become ready...") if err := utils.WaitForServer(s.options.Socket, time.Second); err != nil { return serverError("starting CRI server failed: %v", err) } return nil } // Stop serving CRI requests. func (s *server) Stop() { s.Debug("stopping server on socket %s...", s.options.Socket) s.server.Stop() } // createGrpcServer creates a gRPC server instance on our socket.
func (s *server) createGrpcServer() error { if s.server != nil { return nil } if err := os.MkdirAll(filepath.Dir(s.options.Socket), sockets.DirPermissions); err != nil { return serverError("failed to create directory for socket %s: %v", s.options.Socket, err) } l, err := net.Listen("unix", s.options.Socket) if err != nil { if ls, lsErr := utils.IsListeningSocket(s.options.Socket); ls || lsErr != nil { return serverError("failed to create server: socket %q already exists", s.options.Socket) } s.Warn("removing abandoned socket %q...", s.options.Socket) os.Remove(s.options.Socket) l, err = net.Listen("unix", s.options.Socket) if err != nil { return serverError("failed to create server on socket %s: %v", s.options.Socket, err) } } s.listener = l if s.options.User >= 0 { if err := s.Chown(s.options.User, s.options.Group); err != nil { l.Close() s.listener = nil return err } } if s.options.Mode != 0 { if err := s.Chmod(s.options.Mode); err != nil { l.Close() s.listener = nil return err } } s.server = grpc.NewServer(instrumentation.InjectGrpcServerTrace()...) return nil } // Chmod changes the permissions of the server's socket. func (s *server) Chmod(mode os.FileMode) error { if s.listener != nil { if err := os.Chmod(s.options.Socket, mode); err != nil { return serverError("failed to change permissions of socket %q to %v: %v", s.options.Socket, mode, err) } s.Info("changed permissions of socket %q to %v", s.options.Socket, mode) } s.options.Mode = mode return nil } // Chown changes ownership of the server's socket. func (s *server) Chown(uid, gid int) error { if s.listener != nil { userName := strconv.FormatInt(int64(uid), 10) if u, err := user.LookupId(userName); u != nil && err == nil { userName = u.Name } groupName := strconv.FormatInt(int64(gid), 10) if g, err := user.LookupGroupId(groupName); g != nil && err == nil { groupName = g.Name } if err := os.Chown(s.options.Socket, uid, gid); err != nil { return serverError("failed to change ownership of socket %q to %s/%s: %v", s.options.Socket, userName, groupName, err) } s.Info("changed ownership of socket %q to %s/%s", s.options.Socket, userName, groupName) } s.options.User = uid s.options.Group = gid return nil } // getInterceptor finds an interceptor for the given method. func (s *server) getInterceptor(method string) (Interceptor, string) { name := method[strings.LastIndex(method, "/")+1:] if fn, ok := s.interceptors[name]; ok { return fn, name } if fn, ok := s.interceptors["*"]; ok { return fn, name } return nil, name } // intercept processes requests with a registered interceptor or the default handler. func (s *server) intercept(ctx context.Context, req interface{}, info *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (interface{}, error) { // Notes: // We record timestamps at various phases of processing a request to later // calculate local, CRI-server and total request processing latencies. We // wrap the original handler to get the pre- and post-communication stamps // with reasonable accuracy without having to get the stamps at the client. // // One thing that we currently fail to measure separately is the latency of // internally generated CRI requests (UpdateContainerResources). These are // now accounted to the local processing latency of the triggering request. 
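// // To make the bookkeeping below concrete (illustrative numbers only): with // start = t0, send = t0+1ms, recv = t0+9ms and end = t0+11ms, preprocessing // took send-start = 1ms, the CRI server took recv-send = 8ms, and // postprocessing took end-recv = 2ms; collectStatistics below reports these // three deltas and their 11ms total for intercepted requests.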
var kind string var start, send, recv, end time.Time var sync bool wrapHandler := func(ctx context.Context, req interface{}) (interface{}, error) { send = time.Now() rpl, err := handler(ctx, req) recv = time.Now() return rpl, err } fn, name := s.getInterceptor(info.FullMethod) if fn != nil { kind = "intercepted" sync = true } else { kind = "passthrough" fn = func(c context.Context, n string, r interface{}, h Handler) (interface{}, error) { rpl, err := h(c, r) return rpl, err } } qualif := s.qualifier(req) dump.RequestMessage(kind, info.FullMethod, qualif, req, sync) if span := trace.FromContext(ctx); span != nil { span.AddAttributes(trace.StringAttribute("kind", kind)) } start = time.Now() rpl, err := fn(ctx, name, req, wrapHandler) end = time.Now() elapsed := end.Sub(start) if err != nil { dump.ReplyMessage(kind, info.FullMethod, qualif, err, elapsed, false) } else { dump.ReplyMessage(kind, info.FullMethod, qualif, rpl, elapsed, false) } s.collectStatistics(kind, name, start, send, recv, end) logger.Flush() return rpl, err } // collectStatistics collects (should collect) request processing statistics. func (s *server) collectStatistics(kind, name string, start, send, recv, end time.Time) { if kind == "passthrough" { return } pre := send.Sub(start) server := recv.Sub(send) post := end.Sub(recv) s.Debug(" * latency for %s: preprocess: %v, CRI server: %v, postprocess: %v, total: %v", name, pre, server, post, pre+server+post) } // trainMessageDumper pre-trains the message dumper with our full set of service methods. func (s server) trainMessageDumper() { methods := []string{} svcinfo := s.server.GetServiceInfo() for _, info := range svcinfo { for _, m := range info.Methods { methods = append(methods, m.Name) } } dump.Train(methods) } // qualifier pulls a qualifier for disambiguation from a CRI request message. func (s server) qualifier(msg interface{}) string { if fn := s.options.QualifyReqFn; fn != nil { return fn(msg) } return "" } // Return a formatted server error. func serverError(format string, args ...interface{}) error { return fmt.Errorf("cri/server: "+format, args...) } ================================================ FILE: pkg/cri/server/services.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License.
package server import ( "context" "go.opencensus.io/trace" "google.golang.org/grpc" criv1 "k8s.io/cri-api/pkg/apis/runtime/v1" ) const ( apiVersion = "v1" imageService = "ImageService" listImages = "ListImages" imageStatus = "ImageStatus" pullImage = "PullImage" removeImage = "RemoveImage" imageFsInfo = "ImageFsInfo" runtimeService = "RuntimeService" version = "Version" runPodSandbox = "RunPodSandbox" stopPodSandbox = "StopPodSandbox" removePodSandbox = "RemovePodSandbox" podSandboxStatus = "PodSandboxStatus" listPodSandbox = "ListPodSandbox" createContainer = "CreateContainer" startContainer = "StartContainer" stopContainer = "StopContainer" removeContainer = "RemoveContainer" listContainers = "ListContainers" containerStatus = "ContainerStatus" updateContainerResources = "UpdateContainerResources" reopenContainerLog = "ReopenContainerLog" execSync = "ExecSync" exec = "Exec" attach = "Attach" portForward = "PortForward" containerStats = "ContainerStats" listContainerStats = "ListContainerStats" podSandboxStats = "PodSandboxStats" listPodSandboxStats = "ListPodSandboxStats" updateRuntimeConfig = "UpdateRuntimeConfig" status = "Status" checkpointContainer = "CheckpointContainer" getContainerEvents = "GetContainerEvents" listMetricDescriptors = "ListMetricDescriptors" listPodSandboxMetrics = "ListPodSandboxMetrics" runtimeConfig = "RuntimeConfig" ) func fqmn(service, method string) string { return "/runtime." + apiVersion + "." + service + "/" + method } func (s *server) interceptRequest(ctx context.Context, service, method string, req interface{}, handler grpc.UnaryHandler) (interface{}, error) { if span := trace.FromContext(ctx); span != nil { span.AddAttributes( trace.StringAttribute("service", service), trace.StringAttribute("method", method)) } return s.intercept(ctx, req, &grpc.UnaryServerInfo{Server: s, FullMethod: fqmn(service, method)}, handler) } func (s *server) ListImages(ctx context.Context, req *criv1.ListImagesRequest) (*criv1.ListImagesResponse, error) { rsp, err := s.interceptRequest(ctx, imageService, listImages, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.image).ListImages(ctx, req.(*criv1.ListImagesRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.ListImagesResponse), err } func (s *server) ImageStatus(ctx context.Context, req *criv1.ImageStatusRequest) (*criv1.ImageStatusResponse, error) { rsp, err := s.interceptRequest(ctx, imageService, imageStatus, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.image).ImageStatus(ctx, req.(*criv1.ImageStatusRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.ImageStatusResponse), err } func (s *server) PullImage(ctx context.Context, req *criv1.PullImageRequest) (*criv1.PullImageResponse, error) { rsp, err := s.interceptRequest(ctx, imageService, pullImage, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.image).PullImage(ctx, req.(*criv1.PullImageRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.PullImageResponse), err } func (s *server) RemoveImage(ctx context.Context, req *criv1.RemoveImageRequest) (*criv1.RemoveImageResponse, error) { rsp, err := s.interceptRequest(ctx, imageService, removeImage, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.image).RemoveImage(ctx, req.(*criv1.RemoveImageRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.RemoveImageResponse), err } func (s *server) ImageFsInfo(ctx context.Context, req 
*criv1.ImageFsInfoRequest) (*criv1.ImageFsInfoResponse, error) { rsp, err := s.interceptRequest(ctx, imageService, imageFsInfo, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.image).ImageFsInfo(ctx, req.(*criv1.ImageFsInfoRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.ImageFsInfoResponse), err } func (s *server) Version(ctx context.Context, req *criv1.VersionRequest) (*criv1.VersionResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, version, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).Version(ctx, req.(*criv1.VersionRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.VersionResponse), err } func (s *server) RunPodSandbox(ctx context.Context, req *criv1.RunPodSandboxRequest) (*criv1.RunPodSandboxResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, runPodSandbox, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).RunPodSandbox(ctx, req.(*criv1.RunPodSandboxRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.RunPodSandboxResponse), err } func (s *server) StopPodSandbox(ctx context.Context, req *criv1.StopPodSandboxRequest) (*criv1.StopPodSandboxResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, stopPodSandbox, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).StopPodSandbox(ctx, req.(*criv1.StopPodSandboxRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.StopPodSandboxResponse), err } func (s *server) RemovePodSandbox(ctx context.Context, req *criv1.RemovePodSandboxRequest) (*criv1.RemovePodSandboxResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, removePodSandbox, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).RemovePodSandbox(ctx, req.(*criv1.RemovePodSandboxRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.RemovePodSandboxResponse), err } func (s *server) PodSandboxStatus(ctx context.Context, req *criv1.PodSandboxStatusRequest) (*criv1.PodSandboxStatusResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, podSandboxStatus, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).PodSandboxStatus(ctx, req.(*criv1.PodSandboxStatusRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.PodSandboxStatusResponse), err } func (s *server) ListPodSandbox(ctx context.Context, req *criv1.ListPodSandboxRequest) (*criv1.ListPodSandboxResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, listPodSandbox, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).ListPodSandbox(ctx, req.(*criv1.ListPodSandboxRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.ListPodSandboxResponse), err } func (s *server) CreateContainer(ctx context.Context, req *criv1.CreateContainerRequest) (*criv1.CreateContainerResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, createContainer, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).CreateContainer(ctx, req.(*criv1.CreateContainerRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.CreateContainerResponse), err } func (s *server) StartContainer(ctx context.Context, req *criv1.StartContainerRequest) (*criv1.StartContainerResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, startContainer, req, func(ctx 
context.Context, req interface{}) (interface{}, error) { return (*s.runtime).StartContainer(ctx, req.(*criv1.StartContainerRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.StartContainerResponse), err } func (s *server) StopContainer(ctx context.Context, req *criv1.StopContainerRequest) (*criv1.StopContainerResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, stopContainer, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).StopContainer(ctx, req.(*criv1.StopContainerRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.StopContainerResponse), err } func (s *server) RemoveContainer(ctx context.Context, req *criv1.RemoveContainerRequest) (*criv1.RemoveContainerResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, removeContainer, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).RemoveContainer(ctx, req.(*criv1.RemoveContainerRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.RemoveContainerResponse), err } func (s *server) ListContainers(ctx context.Context, req *criv1.ListContainersRequest) (*criv1.ListContainersResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, listContainers, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).ListContainers(ctx, req.(*criv1.ListContainersRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.ListContainersResponse), err } func (s *server) ContainerStatus(ctx context.Context, req *criv1.ContainerStatusRequest) (*criv1.ContainerStatusResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, containerStatus, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).ContainerStatus(ctx, req.(*criv1.ContainerStatusRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.ContainerStatusResponse), err } func (s *server) UpdateContainerResources(ctx context.Context, req *criv1.UpdateContainerResourcesRequest) (*criv1.UpdateContainerResourcesResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, updateContainerResources, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).UpdateContainerResources(ctx, req.(*criv1.UpdateContainerResourcesRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.UpdateContainerResourcesResponse), err } func (s *server) ReopenContainerLog(ctx context.Context, req *criv1.ReopenContainerLogRequest) (*criv1.ReopenContainerLogResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, reopenContainerLog, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).ReopenContainerLog(ctx, req.(*criv1.ReopenContainerLogRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.ReopenContainerLogResponse), err } func (s *server) ExecSync(ctx context.Context, req *criv1.ExecSyncRequest) (*criv1.ExecSyncResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, execSync, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).ExecSync(ctx, req.(*criv1.ExecSyncRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.ExecSyncResponse), err } func (s *server) Exec(ctx context.Context, req *criv1.ExecRequest) (*criv1.ExecResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, exec, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).Exec(ctx, 
req.(*criv1.ExecRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.ExecResponse), err } func (s *server) Attach(ctx context.Context, req *criv1.AttachRequest) (*criv1.AttachResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, attach, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).Attach(ctx, req.(*criv1.AttachRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.AttachResponse), err } func (s *server) PortForward(ctx context.Context, req *criv1.PortForwardRequest) (*criv1.PortForwardResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, portForward, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).PortForward(ctx, req.(*criv1.PortForwardRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.PortForwardResponse), err } func (s *server) ContainerStats(ctx context.Context, req *criv1.ContainerStatsRequest) (*criv1.ContainerStatsResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, containerStats, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).ContainerStats(ctx, req.(*criv1.ContainerStatsRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.ContainerStatsResponse), err } func (s *server) ListContainerStats(ctx context.Context, req *criv1.ListContainerStatsRequest) (*criv1.ListContainerStatsResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, listContainerStats, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).ListContainerStats(ctx, req.(*criv1.ListContainerStatsRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.ListContainerStatsResponse), err } func (s *server) PodSandboxStats(ctx context.Context, req *criv1.PodSandboxStatsRequest) (*criv1.PodSandboxStatsResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, podSandboxStats, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).PodSandboxStats(ctx, req.(*criv1.PodSandboxStatsRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.PodSandboxStatsResponse), err } func (s *server) ListPodSandboxStats(ctx context.Context, req *criv1.ListPodSandboxStatsRequest) (*criv1.ListPodSandboxStatsResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, listPodSandboxStats, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).ListPodSandboxStats(ctx, req.(*criv1.ListPodSandboxStatsRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.ListPodSandboxStatsResponse), err } func (s *server) UpdateRuntimeConfig(ctx context.Context, req *criv1.UpdateRuntimeConfigRequest) (*criv1.UpdateRuntimeConfigResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, updateRuntimeConfig, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).UpdateRuntimeConfig(ctx, req.(*criv1.UpdateRuntimeConfigRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.UpdateRuntimeConfigResponse), err } func (s *server) Status(ctx context.Context, req *criv1.StatusRequest) (*criv1.StatusResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, status, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).Status(ctx, req.(*criv1.StatusRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.StatusResponse), err } func (s *server) 
CheckpointContainer(ctx context.Context, req *criv1.CheckpointContainerRequest) (*criv1.CheckpointContainerResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, checkpointContainer, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).CheckpointContainer(ctx, req.(*criv1.CheckpointContainerRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.CheckpointContainerResponse), err } func (s *server) GetContainerEvents(req *criv1.GetEventsRequest, srv criv1.RuntimeService_GetContainerEventsServer) error { // TODO(klihub): interceptRequest is a unary interceptor. It can't handle streaming // requests so for now we short-circuit the call to the server here. return (*s.runtime).GetContainerEvents(req, srv) } func (s *server) ListMetricDescriptors(ctx context.Context, req *criv1.ListMetricDescriptorsRequest) (*criv1.ListMetricDescriptorsResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, listMetricDescriptors, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).ListMetricDescriptors(ctx, req.(*criv1.ListMetricDescriptorsRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.ListMetricDescriptorsResponse), err } func (s *server) ListPodSandboxMetrics(ctx context.Context, req *criv1.ListPodSandboxMetricsRequest) (*criv1.ListPodSandboxMetricsResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, listPodSandboxMetrics, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).ListPodSandboxMetrics(ctx, req.(*criv1.ListPodSandboxMetricsRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.ListPodSandboxMetricsResponse), err } func (s *server) RuntimeConfig(ctx context.Context, req *criv1.RuntimeConfigRequest) (*criv1.RuntimeConfigResponse, error) { rsp, err := s.interceptRequest(ctx, runtimeService, runtimeConfig, req, func(ctx context.Context, req interface{}) (interface{}, error) { return (*s.runtime).RuntimeConfig(ctx, req.(*criv1.RuntimeConfigRequest)) }) if err != nil { return nil, err } return rsp.(*criv1.RuntimeConfigResponse), err } ================================================ FILE: pkg/dump/doc.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package dump // // This package implements the dumping of (gRPC) method calls where // each method is called with a single request struct and returns a // single reply struct or an error. Configuring what to dump happens // by specifying a comma-separated dump request on the command line. // // A dump request is a comma-separated list of dump specs: // <spec>[,<spec>,...,<spec>], where each spec is of the form // <[target:]request> // A request is either a request's name (gRPC method name without // the leading path), or a regexp for matching requests. // The dump targets are: 'off', 'name', 'full', 'count' by default.
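// // For example, the dump specification 'full:.*,short:.*Stop.*,off:.*List.*' // dumps all methods in full, except that .*Stop.* methods are dumped as // one-liners and .*List.* methods are suppressed; regexp patterns are // evaluated in order of appearance, with later specs overriding earlier ones.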
// var configHelp = ` Dump CRI gRPC method calls as YAML. This package implements configurable message dumping of CRI gRPC method calls. Both requests and the resulting replies or errors can be dumped. Messages can be both logged and dumped to a given file. Configuring what to dump happens using a dump configuration string of the following format: level1:pattern1[,level2:pattern2,...][,debug] Each level specifies a level of detail for method calls with names matching the corresponding pattern. A pattern can be a method call name to match just a single method, or it can be a regexp to match several methods. For regexps all the patterns are evaluated in order of appearance with the last one staying in effect. Exact method name patterns terminate the evaluation without any regexp processing. The possible levels of dumping detail are: off: suppress dumping of matching requests and replies short: short dump of requests and potential error replies full: full dump of both request and reply content as YAML Additionally including 'debug' in the configuration string will cause messages to be logged as debug messages with the 'message' log source. Note that debugging for this source needs to be explicitly enabled, otherwise messages are suppressed. If a dump file is specified, messages are additionally written to that file. Here is a sample configuration fragment to suppress all .*List.* calls, produce short dumps of all .*Stop.* calls, and full dumps of everything else, with dumps also going to the file '/tmp/cri-dump.log': dump: config: full:.*,short:.*Stop.*,off:.*List.* file: /tmp/cri-dump.log ` ================================================ FILE: pkg/dump/dump.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package dump // // This package implements the dumping of (gRPC) method calls where // each method is called with a single request struct and returns a // single reply struct or an error. Configuring what to dump happens // by specifying a comma-separated dump request on the command line. // // A dump request is a comma-separated list of dump specs: // <spec>[,<spec>,...,<spec>], where each spec is of the form // <[target:]request> // A request is either a request's name (gRPC method name without // the leading path), or a regexp for matching requests. // The dump targets are: 'off', 'name', 'full', 'count' by default. // import ( "fmt" "os" "sigs.k8s.io/yaml" "strings" "sync" "time" logger "github.com/intel/cri-resource-manager/pkg/log" ) const ( // stampLayout is the timestamp format used in dump files. stampLayout = "2006-Jan-02 15:04:05.000" // stampLen is the length we adjust our printed latencies to. stampLen = len(stampLayout) )
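// // For example (illustrative values only): in a dump file a request line gets // a wall-clock stamp such as "[2020-Jan-02 15:04:05.000]", while its reply // gets the request latency in seconds, right-aligned to stampLen, such as // "[               +0.002304]"; see stamp() later in this file.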
// dumper encapsulates the runtime state of our message dumper. type dumper struct { sync.RWMutex // protect concurrent dumping/reconfiguration rules ruleset // dumping rules details map[string]level // corresponding dump details per method disabled bool // dumping globally disabled debug bool // dump as debug messages path string // extra dump file path file *os.File // extra dump file methods []string // training set for config q chan *dumpreq } // dumpreq is a request to dump a (CRI) request or a reply type dumpreq struct { dir direction kind string method string qualifier string msg interface{} latency time.Duration sync chan struct{} } // direction is a message direction, a request or a reply type direction int const ( request = iota reply nop ) // Our global dumper instance. var dump = newDumper() // Our logger instances, one for generic logging and another for message dumps. var log = logger.NewLogger("dump") var message = logger.NewLogger("message") // Train trains the message dumper for the given set of methods. func Train(methods []string) { dump.Lock() defer dump.Unlock() dump.train(methods) } // RequestMessage dumps a CRI request. func RequestMessage(kind, name, qualifier string, req interface{}, sync bool) { if !dump.disabled { var ch chan struct{} if sync { ch = make(chan struct{}) } dump.q <- &dumpreq{ dir: request, kind: kind, method: name, qualifier: qualifier, msg: req, sync: ch, } if ch != nil { _ = <-ch } } } // ReplyMessage dumps a CRI reply. func ReplyMessage(kind, name, qualifier string, rpl interface{}, latency time.Duration, sync bool) { if !dump.disabled { var ch chan struct{} if sync { ch = make(chan struct{}) } dump.q <- &dumpreq{ dir: reply, kind: kind, method: name, qualifier: qualifier, msg: rpl, latency: latency, sync: ch, } if ch != nil { _ = <-ch } } } // Sync returns once the last message currently being dumped is finished. func Sync() { if !dump.disabled { dump.sync() } } // newDumper creates a dumper instance. func newDumper() *dumper { d := &dumper{q: make(chan *dumpreq, 16)} d.run() return d } // run runs the dumping goroutine of the dumper. func (d *dumper) run() { go func() { for req := range d.q { if req.dir != nop { method := methodName(req.method) d.RLock() detail, ok := d.details[method] if !ok { detail = d.rules.detailOf(method) } d.RUnlock() switch detail { case Name: d.name(req.dir, req.kind, method, req.qualifier, req.msg, req.latency) case Full: d.full(req.dir, req.kind, method, req.qualifier, req.msg, req.latency) } } if req.sync != nil { close(req.sync) } } }() } // sync waits until all the present messages in the queue are dumped. func (d *dumper) sync() { ch := make(chan struct{}) dump.q <- &dumpreq{dir: nop, sync: ch} _ = <-ch } // configure (re)configures the dumper func (d *dumper) configure(o *options) { d.Lock() defer d.Unlock() d.debug = o.Debug d.rules = o.rules.duplicate() if d.path != o.File || d.disabled != o.Disabled { if d.file != nil { log.Info("closing old message dump file %q...", d.path) d.file.Close() d.file = nil } d.disabled = o.Disabled if d.disabled { return } d.path = o.File if d.path != "" { var err error log.Info("opening new message dump file %q...", d.path) d.file, err = os.Create(d.path) if err != nil { log.Error("failed to open file %q: %v", d.path, err) } } } d.train(nil) } // train trains the dumper with the given set of messages.
func (d *dumper) train(names []string) { if names != nil { d.methods = make([]string, len(names), len(names)) } else { names = d.methods } d.details = make(map[string]level) for idx, name := range names { method := methodName(name) detail := d.rules.detailOf(method) log.Info("%s: %v", method, detail) d.methods[idx] = method d.details[method] = detail } } // name does a name-only dump of the given message. func (d *dumper) name(dir direction, kind, method, qualifier string, msg interface{}, latency time.Duration) { var hdr string switch dir { case request: return case reply: if qualifier != "" { hdr = qualifier + " " + method + " " + dir.arrow() + " " } else { hdr = method + " " + dir.arrow() + " " } if err, ok := msg.(error); ok { d.warn(dir, latency, hdr+"(%s) FAILED: %v", kind, err) } else { d.line(dir, latency, hdr+"(%s) REQUEST", kind) } } } // full does a full dump of the given message. func (d *dumper) full(dir direction, kind, method, qualifier string, msg interface{}, latency time.Duration) { var hdr string if qualifier != "" { hdr = qualifier + " " + method + " " + dir.arrow() + " " } else { hdr = method + " " + dir.arrow() + " " } switch dir { case request: raw, _ := yaml.Marshal(msg) str := strings.TrimRight(string(raw), "\n") if strings.LastIndexByte(str, '\n') > 0 { d.line(dir, latency, hdr+"(%s) REQUEST", kind) d.block(dir, latency, hdr+" ", str) } else { d.line(dir, latency, hdr+"(%s) REQUEST %s", kind, str) } case reply: if err, ok := msg.(error); ok { d.warn(dir, latency, hdr+"(%s) FAILED", kind) d.warn(dir, latency, hdr+" %v", err) } else { raw, _ := yaml.Marshal(msg) str := strings.TrimRight(string(raw), "\n") if strings.LastIndexByte(str, '\n') > 0 { d.line(dir, latency, hdr+"(%s) REPLY", kind) d.block(dir, latency, hdr+" ", str) } else { d.line(dir, latency, hdr+"(%s) REPLY %s", kind, str) } } } } // line dumps a single line. func (d *dumper) line(dir direction, latency time.Duration, format string, args ...interface{}) { msg := fmt.Sprintf(format, args...) if !d.debug { message.Info("%s", msg) } else { message.Debug("%s", msg) } if d.file != nil { d.tofile(dir, latency, "%s", msg) } } // block dumps a block of lines. func (d *dumper) block(dir direction, latency time.Duration, prefix, msg string) { if !d.debug { message.InfoBlock(prefix, msg) } else { message.DebugBlock(prefix, msg) } if d.file != nil { for _, line := range strings.Split(msg, "\n") { d.tofile(dir, latency, "%s%s", prefix, line) } } } // warn dumps a single line as a warning. func (d *dumper) warn(dir direction, latency time.Duration, format string, args ...interface{}) { msg := fmt.Sprintf(format, args...) message.Warn("%s", msg) if d.file != nil { d.tofile(dir, latency, "%s", msg) } } // tofile dumps a single line to a file. func (d *dumper) tofile(dir direction, latency time.Duration, format string, args ...interface{}) { fmt.Fprintf(d.file, "["+stamp(dir, latency)+"] "+format+"\n", args...) } // stamp produces a stamp from a direction and a latency. func stamp(dir direction, latency time.Duration) string { switch dir { case request: return time.Now().Format(stampLayout) case reply: return fmt.Sprintf("%*s", stampLen, fmt.Sprintf("+%f", latency.Seconds())) } return "" } // String returns a string representing the direction. func (d direction) String() string { switch d { case request: return "request" case reply: return "reply" } return "unknown" } // arrow returns an 'ASCII arrow' for the direction. 
func (d direction) arrow() string { switch d { case request: return "=>" case reply: return "<=" } return "<=???=>" } // methodName returns the basename of a method. func methodName(method string) string { return method[strings.LastIndex(method, "/")+1:] } // dumpError produces a formatted package-specific error. func dumpError(format string, args ...interface{}) error { return fmt.Errorf("dump: "+format, args...) } ================================================ FILE: pkg/dump/dump_test.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package dump import ( "fmt" "os" "strings" "sync" "testing" "time" "github.com/intel/cri-resource-manager/pkg/config" ) // TestConfigParsing tests parsing of dump configuration strings. func TestConfigParsing(t *testing.T) { tcases := []string{ DefaultConfig, "off:.*", "full:.*", "name:.*", "off:.*,full:CreateContainer,StartContainer,StopContainer,RemoveContainer", "off:.*,full:.*((PodSandbox)|(Container)),off:.*((Status)|(List)).*", } for _, cfg := range tcases { t.Run("parse config "+cfg, func(t *testing.T) { r := ruleset{} if err := r.parse(cfg); err != nil { t.Errorf("failed to parse dump config string '%s': %v", cfg, err) } if chk := r.String(); chk != cfg { switch { case strings.Replace(cfg, "short:", "name:", 1) == chk: case strings.Replace(cfg, "suppress:", "off:", 1) == chk: case strings.Replace(cfg, "verbose:", "full:", 1) == chk: default: t.Errorf("expected %s, got %s", cfg, chk) } } }) } } // TestFiltering tests message filtering, and a bit of formatting.
func fooTestFiltering(t *testing.T) { messages := []interface{}{ mkmsg(&Type1Message1{}), mkmsg(&Type1Message2{}), mkmsg(&Type1Message3{}), mkmsg(&Type1Whatever{}), mkmsg(&Type2Message1{}), mkmsg(&Type2Message2{}), mkmsg(&Type2Message3{}), mkmsg(&Type2Whatever{}), mkmsg(&Type3Message1{}), mkmsg(&Type3Message2{}), mkmsg(&Type3Message3{}), mkmsg(&Type3Whatever{}), } tcases := []filterTest{ { messages: messages, config: "off:.*", }, { messages: messages, config: "name:Type1.*", details: map[string]level{ msgmethod(&Type1Message1{}): Name, msgmethod(&Type1Message2{}): Name, msgmethod(&Type1Message3{}): Name, msgmethod(&Type1Whatever{}): Name, }, }, { messages: messages, config: "full:.*Whatever.*", details: map[string]level{ msgmethod(&Type1Whatever{}): Full, msgmethod(&Type2Whatever{}): Full, msgmethod(&Type3Whatever{}): Full, }, }, { messages: messages, config: "full:.*Whatever.*,off:Type1.*", details: map[string]level{ msgmethod(&Type2Whatever{}): Full, msgmethod(&Type3Whatever{}): Full, }, }, { messages: messages, config: "full:.*Message.*,off:Type2.*,name:Type2Whatever", details: map[string]level{ msgmethod(&Type1Message1{}): Full, msgmethod(&Type1Message2{}): Full, msgmethod(&Type1Message3{}): Full, msgmethod(&Type2Whatever{}): Name, msgmethod(&Type3Message1{}): Full, msgmethod(&Type3Message2{}): Full, msgmethod(&Type3Message3{}): Full, }, }, } for _, tc := range tcases { t.Run("filter with config "+tc.config, func(t *testing.T) { tc.run(t) }) } } type filterTest struct { messages []interface{} config string details map[string]level } const ( // test log marker to identify logged messages marker = "" ) func (ft *filterTest) setup(train bool) *testlog { // override message logger logger := &testlog{} message = logger // create training set/reset messages methods := []string{} if train { for _, msg := range ft.messages { methods = append(methods, msgname(msg)) } } Train(methods) // trigger reconfiguration opt.Config = ft.config opt.configNotify(config.UpdateEvent, config.ConfigFile) return logger } func (ft *filterTest) dumpMessages(logger *testlog) []string { // dump all test messages and a fake reply for each for _, msg := range ft.messages { RequestMessage(marker, msgname(msg), "", msg, false) ReplyMessage(marker, msgname(msg), "", Reply, time.Duration(0), false) } dump.sync() return logger.info } func (ft *filterTest) parseLogs(t *testing.T, logged []string) (map[string]int, map[string]int) { // count logged entries and lines per message lines := map[string]int{} entries := map[string]int{} for _, entry := range logged { entry = strings.Trim(entry, " ") split := strings.Split(entry, " ") method := "" switch { // log line: (marker) {REQUEST|REPLY} method case len(split) > 1 && split[0] == "("+marker+")": method = split[2] entries[method] = entries[method] + 1 case len(split) > 1: // log line continuation: method {=>|<=} content... 
method = split[0] } if method == "" { t.Errorf("failed to parse log entry '%s' for config '%s'", entry, ft.config) } detail, ok := ft.details[method] if !ok || detail == Off { t.Errorf("message '%s' should have been filtered for config '%s'", method, ft.config) } } return lines, entries } func (ft *filterTest) checkResult(t *testing.T, entries map[string]int, lines map[string]int) { // check correctness of logged entries and lines per method for method, lineCnt := range lines { logcnt := entries[method] expected := 0 switch ft.details[method] { case Full: expected = logcnt/2*(1+LinesPerRequest) + logcnt/2*(1+LinesPerReply) case Name: expected = logcnt } if lineCnt != expected { t.Errorf("message '%s' expected %d logged lines, got %d for config '%s'", method, expected, lineCnt, ft.config) } } } func (ft *filterTest) run(t *testing.T) { for _, train := range []bool{false, true} { logger := ft.setup(train) logged := ft.dumpMessages(logger) lines, entries := ft.parseLogs(t, logged) ft.checkResult(t, entries, lines) } } // // a few message types for testing // type Message struct { Body []string } type Type1Message1 Message type Type1Message2 Message type Type1Message3 Message type Type1Whatever Message type Type2Message1 Message type Type2Message2 Message type Type2Message3 Message type Type2Whatever Message type Type3Message1 Message type Type3Message2 Message type Type3Message3 Message type Type3Whatever Message const ( LinesPerRequest = 6 LinesPerReply = 2 ) var ( Reply = []string{"reply", "OK"} msgCnt int ) func mkmsg(o interface{}) interface{} { msgCnt++ body := []string{ "this", "is", "message", fmt.Sprintf("#%d", msgCnt), fmt.Sprintf("of type (%T)", o), } switch o.(type) { case *Type1Message1: m := o.(*Type1Message1) m.Body = body case *Type1Message2: m := o.(*Type1Message2) m.Body = body case *Type1Message3: m := o.(*Type1Message3) m.Body = body case *Type1Whatever: m := o.(*Type1Whatever) m.Body = body case *Type2Message1: m := o.(*Type2Message1) m.Body = body case *Type2Message2: m := o.(*Type2Message2) m.Body = body case *Type2Message3: m := o.(*Type2Message3) m.Body = body case *Type2Whatever: m := o.(*Type2Whatever) m.Body = body case *Type3Message1: m := o.(*Type3Message1) m.Body = body case *Type3Message2: m := o.(*Type3Message2) m.Body = body case *Type3Message3: m := o.(*Type3Message3) m.Body = body case *Type3Whatever: m := o.(*Type3Whatever) m.Body = body } return o } func msgname(o interface{}) string { return strings.ReplaceAll(fmt.Sprintf("%T", o), ".", "/") } func msgmethod(o interface{}) string { return methodName(msgname(o)) } // // test logger to override and check dumping/logging for test. // type testlog struct { sync.Mutex info []string warn []string err []string debug []string } func (t *testlog) reset() { t.Lock() defer t.Unlock() t.info = nil t.warn = nil t.err = nil t.debug = nil } func (t *testlog) log(save *[]string, prefix, format string, args ...interface{}) { msg := fmt.Sprintf(format, args...) *save = append(*save, msg) fmt.Println(" " + prefix + " " + msg) } func (t *testlog) Info(format string, args ...interface{}) { t.Lock() defer t.Unlock() t.log(&t.info, "I:", format, args...) } func (t *testlog) Warn(format string, args ...interface{}) { t.Lock() defer t.Unlock() t.log(&t.warn, "W:", format, args...) } func (t *testlog) Error(format string, args ...interface{}) { t.Lock() defer t.Unlock() t.log(&t.err, "E:", format, args...) 
}

func (t *testlog) Debug(format string, args ...interface{}) {
	t.Lock()
	defer t.Unlock()
	t.log(&t.debug, "D:", format, args...)
}

func (t *testlog) Fatal(format string, args ...interface{}) {
	msg := fmt.Sprintf(format, args...)
	fmt.Printf(" Fatal error: %s\n", msg)
	os.Exit(1)
}

func (*testlog) Panic(format string, args ...interface{}) {
	msg := fmt.Sprintf(format, args...)
	fmt.Printf(" Panic: %s\n", msg)
	panic(msg)
}

func (t *testlog) Infof(format string, args ...interface{}) {
	t.Info(format, args...)
}

func (t *testlog) Warnf(format string, args ...interface{}) {
	t.Warn(format, args...)
}

func (t *testlog) Errorf(format string, args ...interface{}) {
	t.Error(format, args...)
}

func (t *testlog) Debugf(format string, args ...interface{}) {
	t.Debug(format, args...)
}

func (t *testlog) Fatalf(format string, args ...interface{}) {
	t.Fatal(format, args...)
}

func (t *testlog) Panicf(format string, args ...interface{}) {
	t.Panic(format, args...)
}

func (*testlog) Block(fn func(string, ...interface{}), prfx string, frmt string, a ...interface{}) {
	for _, line := range strings.Split(fmt.Sprintf(frmt, a...), "\n") {
		fn("%s%s", prfx, line)
	}
}

func (t *testlog) InfoBlock(prefix string, format string, args ...interface{}) {
	t.Lock()
	defer t.Unlock()
	for _, line := range strings.Split(fmt.Sprintf(format, args...), "\n") {
		t.log(&t.info, "I:", "%s%s", prefix, line)
	}
}

func (t *testlog) WarnBlock(prefix string, format string, args ...interface{}) {
	t.Lock()
	defer t.Unlock()
	for _, line := range strings.Split(fmt.Sprintf(format, args...), "\n") {
		// record warnings in the warning slice
		t.log(&t.warn, "W:", "%s%s", prefix, line)
	}
}

func (t *testlog) ErrorBlock(prefix string, format string, args ...interface{}) {
	t.Lock()
	defer t.Unlock()
	for _, line := range strings.Split(fmt.Sprintf(format, args...), "\n") {
		t.log(&t.err, "E:", "%s%s", prefix, line)
	}
}

func (t *testlog) DebugBlock(prefix string, format string, args ...interface{}) {
	t.Lock()
	defer t.Unlock()
	for _, line := range strings.Split(fmt.Sprintf(format, args...), "\n") {
		// record debug lines with the debug prefix
		t.log(&t.debug, "D:", "%s%s", prefix, line)
	}
}

func (*testlog) EnableDebug() bool { return true }
func (*testlog) DebugEnabled() bool { return true }
func (*testlog) Stop() {}
func (*testlog) Source() string { return "" }

================================================
FILE: pkg/dump/flags.go
================================================
// Copyright 2019-2020 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package dump

//
// This package implements the dumping of (gRPC) method calls where
// each method is called with a single request struct and returns a
// single reply struct or an error. Configuring what to dump happens
// by specifying a comma-separated dump request on the command line.
//
// A dump request is a comma-separated list of dump specs:
//     <spec>[,<spec>,...,<spec>], where each spec is of the form
//     <[target:]request>
// A request is either a request's name (gRPC method name without
// the leading path), or a regexp for matching requests.
// The dump targets are: 'off' (alias 'suppress'), 'name' (alias
// 'short'), and 'full' (alias 'verbose').
//

import (
	"fmt"
	re "regexp"
	"strings"

	"github.com/intel/cri-resource-manager/pkg/config"
)

const (
	// DefaultConfig is the default dump configuration.
	DefaultConfig = "off:.*,short:((Create)|(Start)|(Run)|(Update)|(Stop)|(Remove)).*,off:.*Image.*"
)

// Dumping options configurable via the command line or pkg/config.
type options struct {
	Debug    bool    // log messages as debug messages
	Disabled bool    // whether dumping is globally disabled
	File     string  // file to also dump to, if set
	Config   string  // dumping configuration
	rules    ruleset // corresponding dumping rules
}

// ruleset is an ordered set of dumping rules.
type ruleset []*rule

// rule is a single dumping rule, declaring verbosity of a single or a set of methods.
type rule struct {
	method string     // method, '*' wildcard, or regexp matching a set of methods
	regexp *re.Regexp // compiled regexp, if applicable
	detail level      // dumping verbosity
}

// level describes the level of detail to dump.
type level int

const (
	// Off suppresses dumping of matching methods.
	Off level = iota
	// Name dumps only success/failure status of matching methods.
	Name
	// Full dumps matching methods with full level of detail.
	Full
)

// Our runtime configuration.
var opt = defaultOptions().(*options)

// parse parses the given string into a ruleset.
func (set *ruleset) parse(value string) error {
	prev := Full
	for _, spec := range strings.Split(value, ",") {
		r := &rule{}
		split := strings.SplitN(spec, ":", 2)
		switch len(split) {
		case 1:
			r.detail = prev
			r.method = split[0]
		case 2:
			switch strings.ToLower(split[0]) {
			case "off", "suppress":
				r.detail = Off
			case "name", "short":
				r.detail = Name
			case "full", "verbose":
				r.detail = Full
			default:
				return dumpError("invalid dump level '%s'", split[0])
			}
			r.method = split[1]
			prev = r.detail
		}
		if strings.ContainsAny(r.method, ".*?+()[]|") && r.method != "*" {
			regexp, err := re.Compile(r.method)
			if err != nil {
				return dumpError("invalid dump method regexp '%s': %v", r.method, err)
			}
			r.regexp = regexp
		}
		*set = append(*set, r)
	}
	return nil
}

// String returns the ruleset as a string.
func (set *ruleset) String() string {
	if set == nil || *set == nil {
		return ""
	}
	prev := Off
	value, sep := "", ""
	for idx, r := range *set {
		detail := ""
		if idx == 0 || r.detail != prev {
			detail = r.detail.String() + ":"
		}
		value += sep + detail + r.method
		sep = ","
		prev = r.detail
	}
	return value
}

// detailOf returns the level of detail for dumping the given method.
func (set *ruleset) detailOf(method string) level {
	log.Debug("%s: checking level of detail...", method)
	if set == nil {
		return Off
	}
	detail := Off
	for _, r := range *set {
		log.Debug(" - checking rule '%s'...", r.method)
		switch {
		case r.method == method:
			log.Debug(" => exact match: %v", r.detail)
			return r.detail
		case r.method == "*":
			log.Debug(" => wildcard match: %v", r.detail)
			detail = r.detail
		case r.regexp != nil && r.regexp.MatchString(method):
			log.Debug(" => regexp match (%s): %v", r.method, r.detail)
			detail = r.detail
		}
	}
	return detail
}

// duplicate creates a (shallow) copy of the ruleset.
func (set *ruleset) duplicate() ruleset {
	if set == nil || *set == nil {
		return nil
	}
	cp := make([]*rule, len(*set))
	copy(cp, *set)
	return cp
}

// String returns the level of detail as a string.
func (detail level) String() string {
	switch detail {
	case Off:
		return "off"
	case Name:
		return "name"
	case Full:
		return "full"
	}
	// fallback for out-of-range values
	return fmt.Sprintf("<unknown level %d>", detail)
}
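// Editor's sketch (not part of the original sources): specs without an
// explicit target inherit the previous one, so the two strings below parse
// into rulesets with the same canonical form. Method names are hypothetical.
func exampleTargetInheritance() bool {
	a, b := ruleset{}, ruleset{}
	_ = a.parse("full:CreateContainer,StartContainer")
	_ = b.parse("full:CreateContainer,full:StartContainer")
	return a.String() == b.String() // true
}

// defaultOptions returns a new options instance, initialized to defaults.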
func defaultOptions() interface{} { o := &options{Config: DefaultConfig} o.rules.parse(DefaultConfig) return o } // configNotify updates our runtime configuration. func (o *options) configNotify(event config.Event, _ config.Source) error { log.Info("message dumper configuration %v", event) log.Info(" * config: %s", o.Config) rules := ruleset{} if err := rules.parse(o.Config); err != nil { return err } o.rules = rules log.Info(" * parsed: %s", o.rules.String()) log.Info(" * dump file: %v", opt.File) log.Info(" * log with debug: %v", opt.Debug) dump.configure(o) return nil } // Register us for command line parsing and configuration handling. func init() { opt.rules.parse(opt.Config) config.Register("dump", configHelp, opt, defaultOptions, config.WithNotify(opt.configNotify)) } ================================================ FILE: pkg/instrumentation/flags.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package instrumentation import ( "encoding/json" "os" "strconv" "strings" "time" "go.opencensus.io/trace" "github.com/intel/cri-resource-manager/pkg/config" "github.com/intel/cri-resource-manager/pkg/utils" ) // Sampling defines how often trace samples are taken. type Sampling float64 const ( // Disabled is the trace configuration for disabling tracing. Disabled Sampling = 0.0 // Production is a trace configuration for production use. Production Sampling = 0.1 // Testing is a trace configuration for testing. Testing Sampling = 1.0 // defaultSampling is the default sampling frequency. defaultSampling = "0" // defaultReportPeriod is the default report period defaultReportPeriod = "15s" // defaultJaegerCollector is the default Jaeger collector endpoint. defaultJaegerCollector = "" // defaultJaegerAgent is the default Jaeger agent endpoint. defaultJaegerAgent = "" // defaultHTTPEndpoint is the default HTTP endpoint serving Prometheus /metrics. defaultHTTPEndpoint = "" // defaultPrometheusExport is the default state for Prometheus exporting. defaultPrometheusExport = "false" ) // options encapsulates our configurable instrumentation parameters. type options optstruct type optstruct struct { // Sampling is the sampling frequency for traces. Sampling Sampling // ReportPeriod is the OpenCensus view reporting period. ReportPeriod time.Duration // jaegerCollector is the URL to the Jaeger HTTP Thrift collector. JaegerCollector string // jaegerAgent, if set, defines the address of a Jaeger agent to send spans to. JaegerAgent string // HTTPEndpoint is our HTTP endpoint, used among others to export Prometheus /metrics. HTTPEndpoint string // PrometheusExport defines whether we export /metrics to/for Prometheus. PrometheusExport bool `json:"PrometheusExport"` } // UnmarshalJSON is a resetting JSON unmarshaller for options. 
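// Editor's sketch (not part of the original sources): because the
// unmarshaller below resets the whole struct, fields absent from the
// incoming JSON revert to their zero values instead of keeping their
// previous settings. The endpoint value is hypothetical.
func exampleResettingUnmarshal() options {
	o := options{HTTPEndpoint: ":8891", PrometheusExport: true}
	_ = json.Unmarshal([]byte(`{"HTTPEndpoint":":8891"}`), &o)
	return o // o.PrometheusExport is now false; o.HTTPEndpoint is ":8891"
}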
func (o *options) UnmarshalJSON(raw []byte) error { ostruct := optstruct{} if err := json.Unmarshal(raw, &ostruct); err != nil { return instrumentationError("failed to unmashal options: %v", err) } *o = options(ostruct) return nil } // Our instrumentation options. var opt = defaultOptions().(*options) // MarshalJSON is the JSON marshaller for Sampling values. func (s Sampling) MarshalJSON() ([]byte, error) { return json.Marshal(s.String()) } // UnmarshalJSON is the JSON unmarshaller for Sampling values. func (s *Sampling) UnmarshalJSON(raw []byte) error { var obj interface{} if err := json.Unmarshal(raw, &obj); err != nil { return instrumentationError("failed to unmarshal Sampling value: %v", err) } switch v := obj.(type) { case string: if err := s.Parse(v); err != nil { return err } case float64: *s = Sampling(v) default: return instrumentationError("invalid Sampling value of type %T: %v", obj, obj) } return nil } // Parse parses the given string to a Sampling value. func (s *Sampling) Parse(value string) error { switch strings.ToLower(value) { case "disabled": *s = Disabled case "testing": *s = Testing case "production": *s = Production default: f, err := strconv.ParseFloat(value, 64) if err != nil { return instrumentationError("invalid Sampling value '%s': %v", value, err) } *s = Sampling(f) } return nil } // String returns the Sampling value as a string. func (s Sampling) String() string { switch s { case Disabled: return "disabled" case Production: return "production" case Testing: return "testing" } return strconv.FormatFloat(float64(s), 'f', -1, 64) } // Sampler returns a trace.Sampler corresponding to the Sampling value. func (s Sampling) Sampler() trace.Sampler { if s == Disabled { return trace.NeverSample() } return trace.ProbabilitySampler(float64(s)) } // parseEnv parses the environment for default values. func parseEnv(name, defval string, parsefn func(string) error) { if envval := os.Getenv(name); envval != "" { err := parsefn(envval) if err == nil { return } log.Error("invalid environment %s=%q: %v, using default %q", name, envval, err, defval) } if err := parsefn(defval); err != nil { log.Error("invalid default %s=%q: %v", name, defval, err) } } // defaultOptions returns a new options instance, all initialized to defaults. func defaultOptions() interface{} { o := &options{} type param struct { defval string parsefn func(string) error } params := map[string]param{ "JAEGER_COLLECTOR": { defaultJaegerCollector, func(v string) error { o.JaegerCollector = v; return nil }, }, "JAEGER_AGENT": { defaultJaegerAgent, func(v string) error { o.JaegerAgent = v; return nil }, }, "HTTP_ENDPOINT": { defaultHTTPEndpoint, func(v string) error { o.HTTPEndpoint = v; return nil }, }, "PROMETHEUS_EXPORT": { defaultPrometheusExport, func(v string) error { enabled, err := utils.ParseEnabled(v) if err != nil { return err } o.PrometheusExport = enabled return nil }, }, "SAMPLING_FREQUENCY": { defaultSampling, func(v string) error { return o.Sampling.Parse(v) }, }, "REPORT_PERIOD": { defaultReportPeriod, func(v string) error { d, err := time.ParseDuration(v) if err != nil { return err } o.ReportPeriod = d return nil }, }, } for envvar, p := range params { parseEnv(envvar, p.defval, p.parsefn) } return o } // configNotify is our configuration udpate notification handler. 
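// Editor's sketch (not part of the original sources): how parseEnv seeds an
// option from the environment, falling back to the built-in default when the
// value does not parse. The environment value set here is hypothetical.
func exampleEnvSeeding() Sampling {
	_ = os.Setenv("SAMPLING_FREQUENCY", "production")
	var s Sampling
	parseEnv("SAMPLING_FREQUENCY", defaultSampling, func(v string) error {
		return s.Parse(v)
	})
	return s // Production (0.1); an unparsable value would fall back to defaultSampling
}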
func configNotify(_ config.Event, _ config.Source) error { log.Info("instrumentation configuration is now %v", opt) log.Info("reconfiguring...") if err := svc.reconfigure(); err != nil { log.Error("failed to restart instrumentation: %v", err) } return nil } // Register us for for configuration handling. func init() { config.Register("instrumentation", "Instrumentation for traces and metrics.", opt, defaultOptions, config.WithNotify(configNotify)) } ================================================ FILE: pkg/instrumentation/grpc.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package instrumentation import ( "google.golang.org/grpc" "go.opencensus.io/plugin/ocgrpc" "go.opencensus.io/stats/view" ) // InjectGrpcClientTrace injects gRPC dial options for instrumentation if necessary. func InjectGrpcClientTrace(opts ...grpc.DialOption) []grpc.DialOption { extra := grpc.WithStatsHandler(&ocgrpc.ClientHandler{}) if len(opts) > 0 { opts = append(opts, extra) } else { opts = []grpc.DialOption{extra} } return opts } // InjectGrpcServerTrace injects gRPC server options for instrumentation if necessary. func InjectGrpcServerTrace(opts ...grpc.ServerOption) []grpc.ServerOption { extra := grpc.StatsHandler(&ocgrpc.ServerHandler{}) if len(opts) > 0 { opts = append(opts, extra) } else { opts = []grpc.ServerOption{extra} } return opts } // registerGrpcViews registers default client and server trace views for gRPC. func registerGrpcViews() error { log.Debug("registering gRPC trace views...") if err := view.Register(ocgrpc.DefaultClientViews...); err != nil { return instrumentationError("failed to register default gRPC client views: %v", err) } if err := view.Register(ocgrpc.DefaultServerViews...); err != nil { return instrumentationError("failed to register default gRPC server views: %v", err) } return nil } // unregisterGrpcViews unregisters default client and server trace views for gRPC. func unregisterGrpcViews() { view.Unregister(ocgrpc.DefaultClientViews...) view.Unregister(ocgrpc.DefaultServerViews...) } ================================================ FILE: pkg/instrumentation/http/http.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package http import ( "context" "fmt" "net" "net/http" "sync" logger "github.com/intel/cri-resource-manager/pkg/log" ) const ( // httpServer is used in log messages. httpServer = "HTTP server" ) // Our logger instance. var log = logger.NewLogger("http") // ServeMux is our HTTP request multiplexer with removable handlers. type ServeMux struct { sync.RWMutex handlers map[string]http.Handler mux *http.ServeMux } // NewServeMux create a new HTTP request multiplexer. func NewServeMux() *ServeMux { return &ServeMux{ handlers: make(map[string]http.Handler), mux: http.NewServeMux(), } } // Handle registers a handler for the given pattern. func (mux *ServeMux) Handle(pattern string, handler http.Handler) { mux.Lock() defer mux.Unlock() log.Debug("registering handler for %q...", pattern) if _, ok := mux.handlers[pattern]; ok { log.Error("can't register duplicate HTTP handler for %q", pattern) return } mux.handlers[pattern] = handler mux.mux.Handle(pattern, handler) } // HandleFunc registers a handler function for the given pattern. func (mux *ServeMux) HandleFunc(pattern string, fn func(http.ResponseWriter, *http.Request)) { mux.Lock() defer mux.Unlock() log.Debug("registering handler function for %q...", pattern) if _, ok := mux.handlers[pattern]; ok { log.Error("can't register duplicate HTTP handler function for '%s'", pattern) return } handler := http.HandlerFunc(fn) mux.handlers[pattern] = handler mux.mux.Handle(pattern, handler) } // Unregister unregister any handlers for the given pattern. func (mux *ServeMux) Unregister(pattern string) (http.Handler, bool) { mux.Lock() defer mux.Unlock() h, ok := mux.handlers[pattern] if !ok { return nil, false } log.Debug("unregistering handler for %q...", pattern) delete(mux.handlers, pattern) mux.mux = http.NewServeMux() for pattern, handler := range mux.handlers { mux.mux.Handle(pattern, handler) } return h, true } // ServeHTTP serves a HTTP request. func (mux *ServeMux) ServeHTTP(w http.ResponseWriter, r *http.Request) { mux.RLock() defer mux.RUnlock() log.Debug("serving %s...", r.URL) mux.mux.ServeHTTP(w, r) } // Server is our HTTP server, with support for unregistering handlers. type Server struct { sync.RWMutex server *http.Server mux *ServeMux } // NewServer creates a new server instance. func NewServer() *Server { return &Server{ mux: NewServeMux(), } } // GetMux returns the mux for this server. func (s *Server) GetMux() *ServeMux { return s.mux } // GetAddress returns the current server HTTP endpoint/address. func (s *Server) GetAddress() string { if s.server == nil { return "" } return s.server.Addr } // Start sets up the server to listen and serve on the given address. func (s *Server) Start(addr string) error { if addr == "" { log.Info("%s is disabled", httpServer) return nil } log.Info("starting %s...", httpServer) s.Lock() defer s.Unlock() s.server = &http.Server{Addr: addr, Handler: s} ln, err := net.Listen("tcp", s.server.Addr) if err != nil { return httpError("can't listen on HTTP TCP address '%s': %v", s.server.Addr, err) } // update address if port was autobound if ln.Addr().String() != s.server.Addr { s.server.Addr = ln.Addr().String() } go s.server.Serve(ln) return nil } // Stop Close()'s the server immediately. func (s *Server) Stop() { log.Info("stopping %s...", httpServer) s.Lock() defer s.Unlock() if s.server == nil { return } s.server.Close() s.server = nil } // Shutdown shuts down the server gracefully. 
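// Editor's sketch (not part of the original sources): starting the server on
// an autobound port and reading back the concrete address the kernel picked.
func exampleAutobind() (string, error) {
	srv := NewServer()
	if err := srv.Start("127.0.0.1:0"); err != nil {
		return "", err
	}
	addr := srv.GetAddress() // the actual address, e.g. "127.0.0.1:38231"
	srv.Stop()
	return addr, nil
}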
func (s *Server) Shutdown(wait bool) {
	var sync chan struct{}
	log.Info("shutting down %s...", httpServer)
	s.Lock()
	defer s.Unlock()
	if s.server == nil {
		return
	}
	if wait {
		sync = make(chan struct{})
		s.server.RegisterOnShutdown(func() { close(sync) })
	}
	s.server.Shutdown(context.Background())
	if wait {
		// only wait if the shutdown notification was registered above;
		// receiving from the nil channel would otherwise block forever
		<-sync
	}
	s.server = nil
}

// Reconfigure reconfigures the server.
func (s *Server) Reconfigure(addr string) error {
	log.Info("reconfiguring %s...", httpServer)
	if s.GetAddress() != addr {
		return s.Restart(addr)
	}
	return nil
}

// Restart restarts the server on the given address.
func (s *Server) Restart(addr string) error {
	log.Info("restarting %s...", httpServer)
	s.Stop()
	return s.Start(addr)
}

// ServeHTTP serves the given HTTP request.
func (s *Server) ServeHTTP(w http.ResponseWriter, r *http.Request) {
	s.RLock()
	defer s.RUnlock()
	s.mux.ServeHTTP(w, r)
}

// httpError returns a formatted instrumentation/http-specific error.
func httpError(format string, args ...interface{}) error {
	return fmt.Errorf("instrumentation/http: "+format, args...)
}

================================================
FILE: pkg/instrumentation/http/http_test.go
================================================
// Copyright 2019-2020 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package http import ( "io" "net/http" "testing" ) func TestStartStop(t *testing.T) { srv := NewServer() if err := srv.Start(":0"); err != nil { t.Errorf("failed to start HTTP server: %v", err) } srv.Stop() if err := srv.Start(":0"); err != nil { t.Errorf("failed to start HTTP server: %v", err) } if err := srv.Restart(":0"); err != nil { t.Errorf("failed to restart HTTP server on different port: %v", err) } if err := srv.Reconfigure(srv.GetAddress()); err != nil { t.Errorf("failed to reconfigure HTTP server on same port: %v", err) } if err := srv.Reconfigure(":0"); err != nil { t.Errorf("failed to reconfigure HTTP server on different port: %v", err) } srv.Stop() } type urlTest struct { pattern string response string fallback string } func checkURL(t *testing.T, srv *Server, path, response string, status int) { url := "http://" + srv.GetAddress() + path res, err := http.Get(url) if err != nil { t.Errorf("http.Get(%s) failed: %v", url, err) } if res.StatusCode != status { t.Errorf("http.Get(%s) status %d, expected %d", url, res.StatusCode, status) } txt, err := io.ReadAll(res.Body) if err != nil { t.Errorf("http.Get(%s) failed to read response: %v", url, err) } if string(txt) != response { t.Errorf("http.Get(%s) unexpected response: %v, expected: %v", url, txt, response) } } type testHandler struct { response string } func (h *testHandler) ServeHTTP(w http.ResponseWriter, _ *http.Request) { _, _ = w.Write([]byte(h.response)) } func TestPatternsp(t *testing.T) { srv := NewServer() mux := srv.GetMux() if err := srv.Start(":0"); err != nil { t.Errorf("failed to start HTTP server: %v", err) } rh := &testHandler{"/"} ah := &testHandler{"a"} bh := &testHandler{"b"} ch := &testHandler{"c"} mux.Handle("/a", ah) checkURL(t, srv, "/a", "a", 200) mux.Handle("/b", bh) checkURL(t, srv, "/b", "b", 200) mux.Handle("/", rh) checkURL(t, srv, "/b", "b", 200) mux.Unregister("/b") checkURL(t, srv, "/b", "/", 200) mux.Handle("/b", ch) checkURL(t, srv, "/b", "c", 200) mux.Unregister("/a") checkURL(t, srv, "/a", "/", 200) srv.Stop() } ================================================ FILE: pkg/instrumentation/instrumentation.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package instrumentation import ( "fmt" "github.com/intel/cri-resource-manager/pkg/instrumentation/http" logger "github.com/intel/cri-resource-manager/pkg/log" ) const ( // ServiceName is our service name in external tracing and metrics services. ServiceName = "CRI-RM" ) // Our logger instance. var log = logger.NewLogger("instrumentation") // Our instrumentation service instance. var svc = newService() // GetHTTPMux returns our HTTP request mux for external services. func GetHTTPMux() *http.ServeMux { if svc == nil { return nil } return svc.http.GetMux() } // TracingEnabled returns true if the Jaeger tracing sampler is not disabled. 
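// Editor's sketch (not part of the original sources, and assuming a
// google.golang.org/grpc import in this file): instrumenting both ends of a
// gRPC connection with the OpenCensus stats handlers injected by grpc.go above.
func exampleInjectTrace() ([]grpc.DialOption, []grpc.ServerOption) {
	dialOpts := InjectGrpcClientTrace(grpc.WithBlock())
	servOpts := InjectGrpcServerTrace()
	return dialOpts, servOpts
}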
func TracingEnabled() bool { if svc == nil { return false } return svc.TracingEnabled() } // Start our internal instrumentation services. func Start() error { if svc == nil { return instrumentationError("cannot start, no instrumentation service instance") } return svc.Start() } // Stop stops our internal instrumentation services. func Stop() { if svc != nil { svc.Stop() } } // Restart restarts our internal instrumentation services. func Restart() error { if svc == nil { return instrumentationError("cannot restart, no instrumentation service instance") } return svc.Restart() } // instrumentationError produces a formatted instrumentation-specific error. func instrumentationError(format string, args ...interface{}) error { return fmt.Errorf("instrumentation: "+format, args...) } ================================================ FILE: pkg/instrumentation/instrumentation_test.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package instrumentation import ( "io" "net/http" "strings" "testing" ) func TestSamplingIdempotency(t *testing.T) { tcases := []Sampling{ Disabled, Testing, Production, 0.2, 0.25, 0.5, 0.75, 0.8, } for _, tc := range tcases { var chk Sampling if err := chk.Parse(tc.String()); err != nil { t.Errorf("failed to parse Sampling.String() %q: %v", tc, err) } if chk != tc { t.Errorf("expected sampling value for %q: %v, got: %v", tc, tc, chk) } } } func TestPrometheusConfiguration(t *testing.T) { log.EnableDebug() if opt.HTTPEndpoint == "" { opt.HTTPEndpoint = ":0" } s := newService() s.Start() address := s.http.GetAddress() if strings.HasSuffix(opt.HTTPEndpoint, ":0") { opt.HTTPEndpoint = address } checkPrometheus(t, address, !opt.PrometheusExport) opt.PrometheusExport = !opt.PrometheusExport s.reconfigure() checkPrometheus(t, address, !opt.PrometheusExport) opt.PrometheusExport = !opt.PrometheusExport s.reconfigure() checkPrometheus(t, address, !opt.PrometheusExport) opt.PrometheusExport = !opt.PrometheusExport s.reconfigure() checkPrometheus(t, address, !opt.PrometheusExport) s.http.Shutdown(true) s.Stop() } func checkPrometheus(t *testing.T, server string, shouldFail bool) { rpl, err := http.Get("http://" + server + "/metrics") switch shouldFail { case false: if err != nil { t.Errorf("Prometheus HTTP GET failed: %v", err) return } if rpl.StatusCode != 200 { t.Errorf("Prometheus HTTP GET failed: %s", rpl.Status) return } _, err = io.ReadAll(rpl.Body) rpl.Body.Close() if err != nil { t.Errorf("failed to read Prometheus response: %v", err) } return case true: if err == nil && rpl.StatusCode == 200 { t.Errorf("Prometheus HTTP GET should have failed, but it didn't.") return } } } ================================================ FILE: pkg/instrumentation/jaeger.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package instrumentation import ( "contrib.go.opencensus.io/exporter/jaeger" "go.opencensus.io/trace" ) const ( // jaegerExporter is used in log messages. jaegerExporter = "Jaeger trace exporter" ) // tracing encapsulates the state of our Jaeger exporter. type tracing struct { exporter *jaeger.Exporter agent string collector string sampling Sampling } // start starts our Jaeger exporter. func (t *tracing) start(agent, collector string, sampling Sampling) error { if agent == "" && collector == "" { log.Info("%s is disabled", jaegerExporter) return nil } log.Info("creating %s...", jaegerExporter) cfg := jaeger.Options{ ServiceName: ServiceName, CollectorEndpoint: collector, AgentEndpoint: agent, Process: jaeger.Process{ServiceName: ServiceName}, OnError: func(err error) { log.Error("jaeger error: %v", err) }, } exp, err := jaeger.NewExporter(cfg) if err != nil { return instrumentationError("failed to create %s: %v", jaegerExporter, err) } t.exporter = exp t.agent = agent t.collector = collector t.sampling = sampling trace.RegisterExporter(t.exporter) trace.ApplyConfig(trace.Config{DefaultSampler: t.sampling.Sampler()}) return nil } // stop stops our Jaeger exporter. func (t *tracing) stop() { if t.exporter == nil { return } log.Info("stopping Jaeger trace exporter...") trace.UnregisterExporter(t.exporter) *t = tracing{} } // reconfigure reconfigures our Jaeger exporter. func (t *tracing) reconfigure(agent, collector string, sampling Sampling) error { log.Info("reconfiguring %s...", jaegerExporter) if agent == "" && collector == "" { t.stop() return nil } if t.agent != agent || t.collector != collector { t.stop() } if t.exporter != nil { t.sampling = sampling trace.ApplyConfig(trace.Config{DefaultSampler: t.sampling.Sampler()}) return nil } return t.start(agent, collector, sampling) } ================================================ FILE: pkg/instrumentation/prometheus.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package instrumentation import ( "strings" "sync" "time" "contrib.go.opencensus.io/exporter/prometheus" pclient "github.com/prometheus/client_golang/prometheus" model "github.com/prometheus/client_model/go" "go.opencensus.io/stats/view" "github.com/intel/cri-resource-manager/pkg/instrumentation/http" ) const ( // PrometheusMetricsPath is the URL path for exposing metrics to Prometheus. 
PrometheusMetricsPath = "/metrics" // prometheusExporter is used in log messages. prometheusExporter = "Prometheus metrics exporter" ) // metrics encapsulates the state of our Prometheus exporter. type metrics struct { exporter *prometheus.Exporter mux *http.ServeMux period time.Duration } // start starts our Prometheus exporter. func (m *metrics) start(mux *http.ServeMux, period time.Duration, enable bool) error { if !enable { log.Info("%s is disabled", prometheusExporter) return nil } log.Info("starting %s...", prometheusExporter) cfg := prometheus.Options{ Namespace: prometheusNamespace(ServiceName), Gatherer: pclient.Gatherers{dynamicGatherers}, OnError: func(err error) { log.Error("prometheus error: %v", err) }, } exp, err := prometheus.NewExporter(cfg) if err != nil { return instrumentationError("failed to create %s: %v", prometheusExporter, err) } m.exporter = exp m.mux = mux m.period = period m.mux.Handle(PrometheusMetricsPath, m.exporter) view.RegisterExporter(m.exporter) view.SetReportingPeriod(m.period) return nil } // stop stops our Prometheus exporter. func (m *metrics) stop() { if m.exporter == nil { return } log.Info("stopping %s...", prometheusExporter) view.UnregisterExporter(m.exporter) m.mux.Unregister(PrometheusMetricsPath) *m = metrics{} } // reconfigure reconfigures our Prometheus exporter. func (m *metrics) reconfigure(mux *http.ServeMux, period time.Duration, enable bool) error { log.Info("reconfiguring %s...", prometheusExporter) if !enable { m.stop() return nil } if m.exporter != nil { m.period = period view.SetReportingPeriod(m.period) return nil } return m.start(mux, period, enable) } // mutate service name into a valid Prometheus namespace name. func prometheusNamespace(service string) string { return strings.ReplaceAll(strings.ToLower(service), "-", "_") } // gatherers is a trivial wrapper around prometheus Gatherers. type gatherers struct { sync.RWMutex gatherers pclient.Gatherers } // Our dynamically registered Prometheus gatherers. var dynamicGatherers = &gatherers{gatherers: pclient.Gatherers{}} // Register registers a new gatherer. func (g *gatherers) Register(gatherer pclient.Gatherer) { g.Lock() defer g.Unlock() g.gatherers = append(g.gatherers, gatherer) } // Gather implements the pclient.Gatherer interface. func (g *gatherers) Gather() ([]*model.MetricFamily, error) { g.RLock() defer g.RUnlock() return g.gatherers.Gather() } // RegisterGatherer registers a new prometheus Gatherer. func RegisterGatherer(g pclient.Gatherer) { dynamicGatherers.Register(g) } ================================================ FILE: pkg/instrumentation/service.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package instrumentation import ( "sync" "github.com/intel/cri-resource-manager/pkg/instrumentation/http" ) // service is the state of our instrumentation services: HTTP endpoint, trace/metrics exporters. 
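// Editor's sketch (not part of the original sources, and assuming a
// github.com/prometheus/client_golang/prometheus import aliased to pclient
// in this file): plugging an extra metrics registry into the dynamic
// gatherer list. The metric name is hypothetical.
func exampleRegisterGatherer() pclient.Counter {
	reg := pclient.NewRegistry()
	cnt := pclient.NewCounter(pclient.CounterOpts{
		Name: "example_events_total",
		Help: "Total number of example events.",
	})
	reg.MustRegister(cnt)
	RegisterGatherer(reg) // now gathered and served under /metrics
	return cnt
}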
type service struct { sync.RWMutex // we're RW-lockable http *http.Server // HTTP server tracing *tracing // tracing data exporter metrics *metrics // metrics data exporter } // newService creates an instance of our instrumentation services. func newService() *service { return &service{ http: http.NewServer(), tracing: &tracing{}, metrics: &metrics{}, } } // Start starts instrumentation services. func (s *service) Start() error { log.Info("starting instrumentation services...") s.Lock() defer s.Unlock() err := s.http.Start(opt.HTTPEndpoint) if err != nil { return instrumentationError("failed to start HTTP server: %v", err) } err = s.tracing.start(opt.JaegerAgent, opt.JaegerCollector, opt.Sampling) if err != nil { return instrumentationError("failed to start tracing: %v", err) } err = s.metrics.start(s.http.GetMux(), opt.ReportPeriod, opt.PrometheusExport) if err != nil { return instrumentationError("failed to start metrics: %v", err) } if err := registerGrpcViews(); err != nil { s.metrics.stop() s.tracing.stop() s.http.Stop() return err } return nil } // Stop stops instrumentation services. func (s *service) Stop() { s.Lock() defer s.Unlock() unregisterGrpcViews() s.metrics.stop() s.tracing.stop() s.http.Stop() } // reconfigure reconfigures instrumentation services. func (s *service) reconfigure() error { s.Lock() defer s.Unlock() err := s.http.Reconfigure(opt.HTTPEndpoint) if err != nil { return instrumentationError("failed to reconfigure HTTP server: %v", err) } err = s.tracing.reconfigure(opt.JaegerAgent, opt.JaegerCollector, opt.Sampling) if err != nil { return instrumentationError("failed to reconfigure tracing: %v", err) } err = s.metrics.reconfigure(s.http.GetMux(), opt.ReportPeriod, opt.PrometheusExport) if err != nil { return instrumentationError("failed to reconfigure metrics: %v", err) } return nil } // Restart restarts instrumentation services. func (s *service) Restart() error { s.Stop() return s.Start() } // TracingEnabled returns true if the Jaeger tracing sampler is not disabled. func (s *service) TracingEnabled() bool { s.RLock() defer s.RUnlock() return float64(opt.Sampling) > 0.0 } ================================================ FILE: pkg/log/default.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package log import ( "os" "path/filepath" ) // our default logger var deflog = log.get(filepath.Base(filepath.Clean(os.Args[0]))) // Default returns the default Logger. func Default() Logger { return deflog } // Info formats and emits an informational message. func Info(format string, args ...interface{}) { deflog.Info(format, args...) } // Warn formats and emits a warning message. func Warn(format string, args ...interface{}) { deflog.Warn(format, args...) } // Error formats and emits an error message. func Error(format string, args ...interface{}) { deflog.Error(format, args...) } // Fatal formats and emits an error message and os.Exit()'s with status 1. 
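// Editor's sketch (not part of the original sources): the package-level
// helpers above all forward to the default logger, whose source is derived
// from the binary name.
func exampleDefaultLogger() {
	Info("default logger source is %q", Default().Source())
	Warn("this forwards to deflog.Warn")
}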
func Fatal(format string, args ...interface{}) { deflog.Fatal(format, args...) } // Panic formats and emits an error messages, and panics with the same. func Panic(format string, args ...interface{}) { deflog.Panic(format, args...) } // Debug formats and emits a debug message. func Debug(format string, args ...interface{}) { deflog.Debug(format, args...) } // InfoBlock formats and emits a multiline information message. func InfoBlock(prefix string, format string, args ...interface{}) { deflog.InfoBlock(prefix, format, args...) } // WarnBlock formats and emits a multiline warning message. func WarnBlock(prefix string, format string, args ...interface{}) { deflog.WarnBlock(prefix, format, args...) } // ErrorBlock formats and emits a multiline error message. func ErrorBlock(prefix string, format string, args ...interface{}) { deflog.ErrorBlock(prefix, format, args...) } // DebugBlock formats and emits a multiline debug message. func DebugBlock(prefix string, format string, args ...interface{}) { deflog.DebugBlock(prefix, format, args...) } func init() { binary := filepath.Clean(os.Args[0]) source := filepath.Base(binary) deflog = log.get(source) } ================================================ FILE: pkg/log/flags.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package log import ( "encoding/json" "os" "strings" pkgcfg "github.com/intel/cri-resource-manager/pkg/config" "github.com/intel/cri-resource-manager/pkg/log/klogcontrol" "github.com/intel/cri-resource-manager/pkg/utils" ) const ( // DefaultLevel is the default logging severity level. DefaultLevel = LevelInfo // debugEnvVar is the environment variable used to seed debugging flags. debugEnvVar = "LOGGER_DEBUG" // configModule is our module name in the runtime configuration. configModule = "logger" ) // options capture our runtime configuration. type options struct { // Klog contains klog-specific options. Klog klogcontrol.Options // Debug defines which sources produce debug messages. Debug srcmap // LogSource determines if messages are prefixed with the logger source LogSource bool } // srcmap tracks debugging settings for sources. type srcmap map[string]bool var ( // Runtime logging configuration. opt *options // Default debugging configuration. defaultDebugFlags srcmap // Default klog configuration. defaultKlogFlags klogcontrol.Options // klog control klogctl *klogcontrol.Control ) // parse parses the given string and updates the srcmap accordingly. 
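// Editor's sketch (not part of the original sources): how a LOGGER_DEBUG-style
// value parses into per-source debug states. Source names are hypothetical.
func exampleSrcmapParse() srcmap {
	m := srcmap{}
	// 'off' sticks for subsequent bare entries; 'all' aliases the '*' wildcard:
	_ = m.parse("off:resource-manager,cache,on:policy,all")
	return m // resource-manager:false, cache:false, policy:true, *:true
}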
func (m *srcmap) parse(value string) error { if *m == nil { *m = make(srcmap) } if value = strings.TrimSpace(value); value == "" { return nil } prev, state, src := "", "", "" for _, entry := range strings.Split(value, ",") { if entry = strings.TrimSpace(entry); entry == "" { continue } statesrc := strings.Split(entry, ":") switch len(statesrc) { case 2: state, src = statesrc[0], strings.TrimSpace(statesrc[1]) case 1: state, src = "", strings.TrimSpace(statesrc[0]) default: return loggerError("invalid state spec '%s' in source map", entry) } if state != "" { prev = state } else { state = prev if state == "" { state = "on" } } if src == "all" { src = "*" } enabled, err := utils.ParseEnabled(state) if err != nil { return loggerError("invalid state '%s' in source map", state) } (*m)[src] = enabled } return nil } // String returns a string representation of the srcmap. func (m *srcmap) String() string { off := "" on := "" for src, state := range *m { if state { if on == "" { on = src } else { on += "," + src } } else { if off == "" { off = src } else { off += "," + src } } } switch { case on == "" && off == "": return "" case off == "": return "on:" + on case on == "": return "off:" + off } return "on:" + on + "," + "off:" + off } // MarshalJSON is the JSON marshaller for srcmap. func (m srcmap) MarshalJSON() ([]byte, error) { return json.Marshal(m.String()) } // UnmarshalJSON is the JSON unmarshaller for srcmap. func (m *srcmap) UnmarshalJSON(raw []byte) error { cfgstr := "" if err := json.Unmarshal(raw, &cfgstr); err != nil { return loggerError("failed to unmarshal source map '%s': %v", string(raw), err) } if err := m.parse(cfgstr); err != nil { return loggerError("failed to unmarshal source map '%s': %v", string(raw), err) } return nil } // cloneFrom state from another srcmap. func (m *srcmap) cloneFrom(o srcmap) { *m = make(srcmap) for src, state := range o { (*m)[src] = state } } // clone returns a copy of the srcmap. func (m srcmap) clone() srcmap { if m == nil { return nil } o := make(srcmap) for src, state := range m { o[src] = state } return o } // configNotify is the configuration change notification callback for options. func (o *options) configNotify(event pkgcfg.Event, _ pkgcfg.Source) error { deflog.Info("logger configuration %v", event) deflog.Info(" * debugging: %s", o.Debug.String()) deflog.Info(" * log source: %v", o.LogSource) deflog.InfoBlock(" * klog: ", "%s", o.Klog.String()) // On the first configuration update event, we record the current values // of klog flags as the runtime defaults. Effectively this allows one to // override the built-in defaults using klog command line options (or // environment variables as interpreted by klogcontrol). The recorded // defaults will also reflect any potential programmatic changes done by // (mis-)using flag.Set() but there's not much we can do about that. if defaultKlogFlags == nil { defaultKlogFlags = klogctl.CurrentOptions() } if o.Klog == nil { o.Klog = make(klogcontrol.Options) } // The behavior of the options.Klog map across updates is difficult // to understand. To make it more user friendly we fill in runtime // defaults for each unset entry (klog flags) here. for flag, value := range defaultKlogFlags { if _, ok := o.Klog[flag]; !ok { o.Klog[flag] = value } } return o.apply() } // apply applies the options to logging. 
func (o *options) apply() error { log.Lock() defer log.Unlock() prefix := o.LogSource if logToStderr, ok := o.Klog["logtostderr"]; ok && logToStderr.(bool) { if skipHeaders, ok := o.Klog["skip_headers"]; ok && skipHeaders.(bool) { prefix = true } } log.setDbgMap(o.Debug.clone()) log.setPrefix(prefix) return klogctl.Configure(o.Klog) } // defaultOptions returns our current default runtime options. func defaultOptions() interface{} { o := &options{} o.Debug.cloneFrom(defaultDebugFlags) if defaultKlogFlags != nil { o.Klog.CloneFrom(defaultKlogFlags) } else { o.Klog = klogctl.CurrentOptions() } return o } // Set up klog control, set pkg/config logger, register us for configuration handling. func init() { klogctl = klogcontrol.Get() opt = defaultOptions().(*options) opt.apply() cfglog := log.get("config") pkgcfg.SetLogger(pkgcfg.Logger{ DebugEnabled: cfglog.DebugEnabled, Debug: cfglog.Debug, Info: cfglog.Info, Warning: cfglog.Warn, Error: cfglog.Error, Fatal: cfglog.Fatal, Panic: cfglog.Panic, }) defaultDebugFlags = make(srcmap) if value, ok := os.LookupEnv(debugEnvVar); ok { if err := defaultDebugFlags.parse(value); err != nil { Default().Error("failed to parse %s %q: %v", debugEnvVar, value, err) } else { log.setDbgMap(defaultDebugFlags) Default().Info("seeded debug flags ($%s): %s", debugEnvVar, defaultDebugFlags.String()) } } pkgcfg.Register(configModule, "logging control", opt, defaultOptions, pkgcfg.WithNotify(opt.configNotify)) } ================================================ FILE: pkg/log/grpc-logger.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package log import ( "fmt" "google.golang.org/grpc/grpclog" ) // SetGrpcLogger sets up a logger for (google.golang.org/)grpc. func SetGrpcLogger(source string, rate *Rate) { var l Logger if source == "" { l = Default() } else { l = log.get(source) } if rate != nil { l = RateLimit(l, *rate) } grpclog.SetLoggerV2(&grpclogger{Logger: l}) } // grpclogger implements grpclog.LoggerV2 interface for our logger. type grpclogger struct { Logger } func (g grpclogger) Info(args ...interface{}) { g.Logger.Debug("%s", fmt.Sprint(args...)) } func (g grpclogger) Infoln(args ...interface{}) { g.Logger.Debug("%s", fmt.Sprint(args...)) } func (g grpclogger) Infof(format string, args ...interface{}) { g.Logger.Debug(format, args...) } func (g grpclogger) Warning(args ...interface{}) { g.Logger.Warn("%s", fmt.Sprint(args...)) } func (g grpclogger) Warningln(args ...interface{}) { g.Logger.Warn("%s", fmt.Sprint(args...)) } func (g grpclogger) Warningf(format string, args ...interface{}) { g.Logger.Warn(format, args...) } func (g grpclogger) Error(args ...interface{}) { g.Logger.Error("%s", fmt.Sprint(args...)) } func (g grpclogger) Errorln(args ...interface{}) { g.Logger.Error("%s", fmt.Sprint(args...)) } func (g grpclogger) Errorf(format string, args ...interface{}) { g.Logger.Error(format, args...) 
} func (g grpclogger) Fatal(args ...interface{}) { g.Logger.Fatal("%s", fmt.Sprint(args...)) } func (g grpclogger) Fatalln(args ...interface{}) { g.Logger.Fatal("%s", fmt.Sprint(args...)) } func (g grpclogger) Fatalf(format string, args ...interface{}) { g.Logger.Fatal(format, args...) } func (g grpclogger) V(_ int) bool { return true } ================================================ FILE: pkg/log/klogcontrol/klogcontrol.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package klogcontrol import ( "flag" "fmt" "io" "os" "strings" "k8s.io/klog/v2" ) // Options captures runtime configuration for klog. type Options map[string]interface{} // Control implements runtime control for klog. type Control struct { flags *flag.FlagSet } // Our singleton klog Control instance. var ctl *Control // Get returns our singleton klog Control instance. func Get() *Control { return ctl } // CurrentOptions returns the current klog configuration as Options. func (c *Control) CurrentOptions() Options { o := make(Options) c.flags.VisitAll(func(f *flag.Flag) { o[f.Name] = flag.Lookup(f.Name).Value.(flag.Getter).Get() }) return o } // Configure reconfigures klog with the given Options. func (c *Control) Configure(options Options) error { for name, value := range options { if err := flag.Set(name, fmt.Sprintf("%v", value)); err != nil { return klogError("failed to set klog flag %q to %v: %v", name, value, err) } } return nil } // Set sets the value of the given klog flag. func (c *Control) Set(name, value string) error { return flag.Set(name, value) } // Get returns the current value of the given klog flag. func (c *Control) Get(name string) (interface{}, error) { if c.flags.Lookup(name) == nil { return nil, klogError("unknown klog flag %q", name) } return flag.Lookup(name).Value.(flag.Getter).Get(), nil } // CloneFrom clones src to o. func (o *Options) CloneFrom(src Options) { *o = make(Options) for name, value := range src { (*o)[name] = value } } // String returns a string representation of the Options. func (o *Options) String() string { if o == nil { return "" } str := "" sep := "" for name, value := range *o { str += sep + name + "=" + fmt.Sprintf("%v", value) sep = "\n" } return str } // klogflag wraps a klog flag for configuration. type klogflag struct { flag *flag.Flag } // Set implements flag.Value.Set() for wrapped klog flags. func (klogf *klogflag) Set(value string) error { if klogf.flag.Name == "stderrthreshold" { // klog expects thresholds in ALL CAPS value = strings.ToUpper(value) } if err := klogf.flag.Value.Set(value); err != nil { return err } return nil } // String implements flag.Value.String() for wrapped klog flags. func (klogf *klogflag) String() string { if klogf.flag == nil { // flag.isZeroValue() probing us... 
return "" } value := klogf.flag.Value.String() if klogf.flag.Name == "log_backtrace_at" && value == ":0" { value = "" } return value } // Get implements flag.Getter.Get() for wrapped klog flags. func (klogf *klogflag) Get() interface{} { if getter, ok := klogf.flag.Value.(flag.Getter); ok { if value := getter.Get(); value != nil { return value } } return klogf.String() } // boolFlag is identical to the unexported flag.boolFlag interface. type boolFlag interface { IsBoolFlag() bool } // IsBoolFlag implements flag.boolFlag.IsBoolFlag() for wrapped klog flags. func (klogf *klogflag) IsBoolFlag() bool { if klogf.flag == nil { return false } if boolf, ok := klogf.flag.Value.(boolFlag); ok { return boolf.IsBoolFlag() } return false } // getEnv returns a default value for the flag from the environment. func (klogf *klogflag) getEnv() (string, string, bool) { name := "LOGGER_" + strings.ToUpper(strings.ReplaceAll(klogf.flag.Name, "-", "_")) if value, ok := os.LookupEnv(name); ok { return name, value, true } return "", "", false } // klogError returns a package-specific formatted error. func klogError(format string, args ...interface{}) error { return fmt.Errorf("klogcontrol: "+format, args...) } // wrapKlogFlag wraps and registers the given klog flag. func wrapKlogFlag(f *flag.Flag) { klogf := &klogflag{flag: f} flag.Var(klogf, f.Name, f.Usage) if name, value, ok := klogf.getEnv(); ok { if err := klogf.Set(value); err != nil { klog.Errorf("klog flag %q: invalid environment default %s=%q: %v", f.Name, name, value, err) } } else { // Unless explicitly configured in the environment, by default // turn off headers (date, timestamp, etc.) when we're logging // to a journald stream. if f.Name == "skip_headers" { if value, _ := os.LookupEnv("JOURNAL_STREAM"); value != "" { klog.Infof("Logging to journald, forcing headers off...") klogf.Set("true") } } } } // init discovers klog flags and sets up dynamic control for them. func init() { ctl = &Control{flags: flag.NewFlagSet("klog flags", flag.ContinueOnError)} ctl.flags.SetOutput(io.Discard) klog.InitFlags(ctl.flags) ctl.flags.VisitAll(func(f *flag.Flag) { wrapKlogFlag(f) }) } ================================================ FILE: pkg/log/log.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package log import ( "fmt" "strings" "sync" "k8s.io/klog/v2" ) // Level describes the severity of a log message. type Level int const ( // levelUnset denotes an unset level. levelUnset Level = iota // LevelDebug is the severity for debug messages. LevelDebug // LevelInfo is the severity for informational messages. LevelInfo // LevelWarn is the severity for warnings. LevelWarn // LevelError is the severity for errors. LevelError // LevelPanic is the severity for panic messages. LevelPanic // LevelFatal is the severity for fatal errors. LevelFatal ) // Per-level prefix tags. 
var levelTag = map[Level]string{ levelUnset: "?: ", LevelDebug: "D: ", LevelInfo: "I: ", LevelWarn: "W: ", LevelError: "E: ", LevelFatal: "F: ", LevelPanic: "P: ", } // Logger is the interface for producing log messages for/from a particular source. type Logger interface { // Standardized Logger interface functions so that this interface can be // used from goresctrl library. Debugf(format string, v ...interface{}) Infof(format string, v ...interface{}) Warnf(format string, v ...interface{}) Errorf(format string, v ...interface{}) Panicf(format string, v ...interface{}) Fatalf(format string, v ...interface{}) // Debug formats and emits a debug message. Debug(format string, args ...interface{}) // Info formats and emits an informational message. Info(format string, args ...interface{}) // Warn formats and emits a warning message. Warn(format string, args ...interface{}) // Error formats and emits an error message. Error(format string, args ...interface{}) // Panic formats and emits an error message then panics with the same. Panic(format string, args ...interface{}) // Fatal formats and emits an error message and os.Exit()'s with status 1. Fatal(format string, args ...interface{}) // DebugBlock formats and emits a multiline debug message. DebugBlock(prefix string, format string, args ...interface{}) // InfoBlock formats and emits a multiline information message. InfoBlock(prefix string, format string, args ...interface{}) // WarnBlock formats and emits a multiline warning message. WarnBlock(prefix string, format string, args ...interface{}) // ErrorBlock formats and emits a multiline error message. ErrorBlock(prefix string, format string, args ...interface{}) // EnableDebug enables debug messages for this Logger. EnableDebug() bool // DebugEnabled checks if debug messages are enabled for this Logger. DebugEnabled() bool // Source returns the source name of this Logger. Source() string } // logger implements Logger. type logger uint // logging encapsulates the full runtime state of logging. type logging struct { sync.RWMutex level Level // logging threshold for stderr dbgmap srcmap // debug configuration loggers map[string]logger // source to logger mapping sources map[logger]string // logger to source mapping debug map[logger]struct{} // loggers with debugging enabled maxlen int // max source length. forced bool // forced global debugging prefix bool // prefix messages with logger source aligned map[logger]string // logger sources aligned to maxlen } // log tracks our runtime state. var log = &logging{ level: DefaultLevel, loggers: make(map[string]logger), sources: make(map[logger]string), aligned: make(map[logger]string), debug: make(map[logger]struct{}), } // Get returns the named Logger. func Get(source string) Logger { log.Lock() defer log.Unlock() return log.get(source) } // NewLogger creates the named logger. func NewLogger(source string) Logger { return Get(source) } // EnableDebug enables debug logging for the source. func EnableDebug(source string) bool { log.Lock() defer log.Unlock() return log.setDebug(source, true) } // DisableDebug disables debug logging for the source. func DisableDebug(source string) bool { log.Lock() defer log.Unlock() return log.setDebug(source, false) } // DebugEnabled checks if debug logging is enabled for the source. func DebugEnabled(source string) bool { log.Lock() defer log.Unlock() return log.getDebug(source) } // SetLevel sets the logging severity level. 
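A minimal usage sketch of the Logger interface above, assuming a made-up source name "demo"; per-source debug filtering behaves as described by EnableDebug/DebugEnabled.

package main

import (
    logger "github.com/intel/cri-resource-manager/pkg/log"
)

func main() {
    // Each subsystem gets its own named Logger.
    log := logger.NewLogger("demo")
    log.Info("starting up")

    // Debug messages are dropped until debugging is enabled for the source.
    log.Debug("this is filtered out")
    logger.EnableDebug("demo")
    log.Debug("now visible, source %q", log.Source())
}
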
func SetLevel(level Level) {
    log.Lock()
    defer log.Unlock()
    log.setLevel(level)
}

// Flush flushes any pending log messages.
func Flush() {
    log.RLock()
    defer log.RUnlock()
    klog.Flush()
}

//
// logging
//

func (l Level) String() string {
    switch l {
    case LevelDebug:
        return "debug"
    case LevelInfo:
        return "info"
    case LevelWarn:
        return "warning"
    case LevelError:
        return "error"
    case LevelPanic:
        return "panic"
    case LevelFatal:
        return "fatal"
    }
    return "unknown"
}

// setLevel sets the logging severity level.
func (log *logging) setLevel(level Level) error {
    log.level = level
    threshold := ""
    switch level {
    case LevelDebug, LevelInfo:
        threshold = "INFO"
    case LevelWarn:
        threshold = "WARNING"
    case LevelError, LevelPanic, LevelFatal:
        threshold = "ERROR"
    }
    if err := klogctl.Set("stderrthreshold", threshold); err != nil {
        return loggerError("failed to set log level/threshold to %s: %v", threshold, err)
    }
    return nil
}

// setDebug sets the debug state for the given source and returns the previous one.
func (log *logging) setDebug(source string, enabled bool) bool {
    l := log.get(source)
    _, old := log.debug[l]
    if enabled {
        log.debug[l] = struct{}{}
    } else {
        delete(log.debug, l)
    }
    return old
}

// getDebug returns the current debug state of the given source.
func (log *logging) getDebug(source string) bool {
    if log.forced {
        return true
    }
    l := log.get(source)
    _, enabled := log.debug[l]
    return enabled
}

// setDbgMap updates the debug configuration of logging.
func (log *logging) setDbgMap(dbgmap srcmap) {
    log.dbgmap = dbgmap
    log.debug = make(map[logger]struct{})
    for source := range log.loggers {
        state, ok := log.dbgmap[source]
        if !ok {
            state = log.dbgmap["*"]
        }
        log.setDebug(source, state)
    }
}

// setPrefix sets the prefix (source) logging preference.
func (log *logging) setPrefix(prefix bool) {
    log.prefix = prefix
}

// align calculates and stores an aligned prefix for the given logger.
func (log *logging) align(l logger) {
    source := log.sources[l]
    srclen := len(source)
    if srclen > log.maxlen {
        log.realign(srclen)
        return
    }
    pad := log.maxlen - srclen
    pre := (pad + 1) / 2
    suf := pad - pre
    log.aligned[l] = "[" + fmt.Sprintf("%*s", pre, "") + source + fmt.Sprintf("%*s", suf, "") + "] "
}

// realign recalculates aligned prefixes for all loggers.
func (log *logging) realign(maxlen int) {
    if maxlen <= 0 {
        for _, source := range log.sources {
            if srclen := len(source); srclen > maxlen {
                maxlen = srclen
            }
        }
    }
    log.maxlen = maxlen
    log.aligned = make(map[logger]string)
    for l := range log.sources {
        log.align(l)
    }
}

//
// Logger
//

// get returns the logger for source, creating one if necessary.
func (log *logging) get(source string) logger {
    if l, ok := log.loggers[source]; ok {
        return l
    }
    l := logger(len(log.loggers))
    log.loggers[source] = l
    log.sources[l] = source
    log.align(l)
    state, ok := log.dbgmap[source]
    if !ok {
        state = log.dbgmap["*"]
    }
    log.setDebug(source, state)
    return l
}

func (l logger) EnableDebug() bool {
    log.Lock()
    defer log.Unlock()
    if _, ok := log.sources[l]; !ok {
        return false
    }
    _, old := log.debug[l]
    log.debug[l] = struct{}{}
    return old
}

func (l logger) DebugEnabled() bool {
    log.RLock()
    defer log.RUnlock()
    _, enabled := log.debug[l]
    return enabled || log.forced
}

func (l logger) Source() string {
    log.RLock()
    defer log.RUnlock()
    return log.sources[l]
}

func (l logger) Debug(format string, args ...interface{}) {
    log.RLock()
    defer log.RUnlock()
    if !log.forced {
        if _, ok := log.debug[l]; !ok {
            return
        }
    }
    msg := fmt.Sprintf(format, args...)
if log.prefix { klog.InfoDepth(1, levelTag[LevelDebug], log.aligned[l], msg) } else { klog.InfoDepth(1, msg) } } func (l logger) Info(format string, args ...interface{}) { log.RLock() defer log.RUnlock() msg := fmt.Sprintf(format, args...) if log.prefix { klog.InfoDepth(1, levelTag[LevelInfo], log.aligned[l], msg) } else { klog.InfoDepth(1, msg) } } func (l logger) Warn(format string, args ...interface{}) { log.RLock() defer log.RUnlock() msg := fmt.Sprintf(format, args...) if log.prefix { klog.WarningDepth(1, levelTag[LevelWarn], log.aligned[l], msg) } else { klog.WarningDepth(1, msg) } } func (l logger) Error(format string, args ...interface{}) { log.RLock() defer log.RUnlock() msg := fmt.Sprintf(format, args...) if log.prefix { klog.ErrorDepth(1, levelTag[LevelError], log.aligned[l], msg) } else { klog.ErrorDepth(1, msg) } } func (l logger) Fatal(format string, args ...interface{}) { log.RLock() defer log.RUnlock() msg := fmt.Sprintf(format, args...) if log.prefix { klog.ExitDepth(1, levelTag[LevelFatal], log.aligned[l], msg) } else { klog.ExitDepth(1, msg) } } func (l logger) Panic(format string, args ...interface{}) { log.RLock() defer log.RUnlock() msg := fmt.Sprintf(format, args...) if log.prefix { klog.ErrorDepth(1, levelTag[LevelPanic], log.aligned[l], msg) } else { klog.ErrorDepth(1, msg) } panic(msg) } func (l logger) DebugBlock(prefix string, format string, args ...interface{}) { if l.DebugEnabled() { l.block(LevelDebug, prefix, format, args...) } } func (l logger) InfoBlock(prefix string, format string, args ...interface{}) { l.block(LevelInfo, prefix, format, args...) } func (l logger) WarnBlock(prefix string, format string, args ...interface{}) { l.block(LevelWarn, prefix, format, args...) } func (l logger) ErrorBlock(prefix string, format string, args ...interface{}) { l.block(LevelError, prefix, format, args...) } func (l logger) block(level Level, prefix, format string, args ...interface{}) { log.Lock() defer log.Unlock() var logFn func(int, ...interface{}) switch level { case LevelDebug, LevelInfo: logFn = klog.InfoDepth case LevelWarn: logFn = klog.WarningDepth case LevelError: logFn = klog.ErrorDepth default: return } if log.prefix { src := log.aligned[l] for _, msg := range strings.Split(fmt.Sprintf(format, args...), "\n") { logFn(2, levelTag[level], src, prefix, msg) } } else { for _, msg := range strings.Split(fmt.Sprintf(format, args...), "\n") { logFn(2, prefix, msg) } } } // loggerError produces a formatted logger-specific error. func loggerError(format string, args ...interface{}) error { return fmt.Errorf("logger: "+format, args...) } func (l logger) Debugf(format string, args ...interface{}) { l.Debug(format, args...) } func (l logger) Infof(format string, args ...interface{}) { l.Info(format, args...) } func (l logger) Warnf(format string, args ...interface{}) { l.Warn(format, args...) } func (l logger) Errorf(format string, args ...interface{}) { l.Error(format, args...) } func (l logger) Panicf(format string, args ...interface{}) { l.Panic(format, args...) } func (l logger) Fatalf(format string, args ...interface{}) { l.Fatal(format, args...) } ================================================ FILE: pkg/log/ratelimit.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package log import ( "fmt" "sync" "time" goxrate "golang.org/x/time/rate" ) // Rate specifies maximum per-message logging rate. type Rate struct { // rate limit Limit goxrate.Limit // allowed bursts Burst int // optional message window size Window int } // ratelimited implements rate-limited logging with a sliding window of unique messages. type ratelimited struct { Logger sync.Mutex rate Rate window []string limits map[string]*goxrate.Limiter } const ( // DefaultWindow is the default message window size for rate limiting. DefaultWindow = 256 // MinimumWindow is the smallest message window size for rate limiting. MinimumWindow = 32 ) // Every defines a rate limit for the given interval. func Every(interval time.Duration) goxrate.Limit { return goxrate.Every(interval) } // Interval returns a Rate for the given interval. func Interval(interval time.Duration) Rate { return Rate{Limit: Every(interval), Burst: 1} } // RateLimit returns a ratelimited version of the given logger. func RateLimit(log Logger, rate Rate) Logger { switch { case rate.Window == 0: rate.Window = DefaultWindow case rate.Window < MinimumWindow: rate.Window = MinimumWindow } if rate.Burst < 1 { rate.Burst = 1 } return &ratelimited{ Logger: log, rate: rate, window: make([]string, 0, rate.Window), limits: make(map[string]*goxrate.Limiter), } } func (rl *ratelimited) Debug(format string, args ...interface{}) { msg := fmt.Sprintf(format, args...) if limit := rl.getMessageLimit(msg); limit.Allow() { rl.Logger.Debug(" %s", msg) } } func (rl *ratelimited) Info(format string, args ...interface{}) { msg := fmt.Sprintf(format, args...) if limit := rl.getMessageLimit(msg); limit.Allow() { rl.Logger.Info(" %s", msg) } } func (rl *ratelimited) Warn(format string, args ...interface{}) { msg := fmt.Sprintf(format, args...) if limit := rl.getMessageLimit(msg); limit.Allow() { rl.Logger.Warn(" %s", msg) } } func (rl *ratelimited) Error(format string, args ...interface{}) { msg := fmt.Sprintf(format, args...) if limit := rl.getMessageLimit(msg); limit.Allow() { rl.Logger.Error(" %s", msg) } } // Get existing message limit or create a new one, shifting out the oldest if window is full. func (rl *ratelimited) getMessageLimit(msg string) *goxrate.Limiter { rl.Lock() defer rl.Unlock() limit, ok := rl.limits[msg] if ok { return limit } limit = goxrate.NewLimiter(rl.rate.Limit, rl.rate.Burst) if len(rl.limits) == rl.rate.Window { delete(rl.limits, rl.window[0]) rl.window = rl.window[1:] } rl.window = append(rl.window, msg) rl.limits[msg] = limit return limit } ================================================ FILE: pkg/log/ratelimit_test.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package log

import (
    "fmt"
    "testing"
    "time"

    goxrate "golang.org/x/time/rate"
)

func TestRateLimit(t *testing.T) {
    ratelimit := RateLimit(Default(), Rate{Window: MinimumWindow, Limit: Every(time.Second)})
    rl := ratelimit.(*ratelimited)
    limiters := make(map[string]*goxrate.Limiter)

    // fill message window, store limiters for checking
    messages := make([]string, 0, MinimumWindow)
    for idx := 0; idx < cap(messages); idx++ {
        msg := fmt.Sprintf("message #%d", idx)
        messages = append(messages, msg)
        limiters[msg] = rl.getMessageLimit(msg)
    }
    // check looked up vs. stored limiters
    for msg, limiter := range limiters {
        if rl.getMessageLimit(msg) != limiter {
            t.Errorf("unexpected new limiter for message %s", msg)
        }
    }
    // create more messages, store limiters for checking
    recent := make([]string, 0, MinimumWindow/5)
    for i := 0; i < cap(recent); i++ {
        msg := fmt.Sprintf("message #%d", len(messages)+i)
        recent = append(recent, msg)
        limiters[msg] = rl.getMessageLimit(msg)
    }
    // check looked up vs. stored limiters
    for _, msg := range recent {
        if rl.getMessageLimit(msg) != limiters[msg] {
            t.Errorf("unexpected new limiter for recent message %s", msg)
        }
    }
    // check in-window part of old messages
    for idx := len(recent); idx < len(messages); idx++ {
        msg := messages[idx]
        l := rl.getMessageLimit(msg)
        if l != limiters[msg] {
            t.Errorf("unexpected new limiter for old message %s", msg)
        }
    }
    // check shifted out part of old messages
    for idx := 0; idx < len(recent); idx++ {
        msg := messages[idx]
        l := rl.getMessageLimit(msg)
        if l == limiters[msg] {
            t.Errorf("unexpected old limiter for old message %s", msg)
        }
    }
}

================================================
FILE: pkg/log/signal.go
================================================
// Copyright 2019-2020 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package log

import (
    "os"
    "os/signal"
)

// signal notification channel
var signals chan os.Signal

// SetupDebugToggleSignal sets up a signal handler to toggle full debugging on/off.
func SetupDebugToggleSignal(sig os.Signal) {
    log.Lock()
    defer log.Unlock()
    clearDebugToggleSignal()
    signals = make(chan os.Signal, 1)
    signal.Notify(signals, sig)
    go func(sig <-chan os.Signal) {
        state := map[bool]string{false: "off", true: "on"}
        for {
            select {
            case _, ok := <-sig:
                if !ok {
                    return
                }
            }
            log.forced = !log.forced
            deflog.Warn("forced full debugging is now %s...", state[log.forced])
        }
    }(signals)
}

// ClearDebugToggleSignal removes any signal handlers for toggling debug on/off.
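A minimal sketch of the rate-limited wrapper exercised by the test above. Note that messages are formatted before limiting, so every distinct formatted string gets its own limiter within the sliding window of unique messages.

package main

import (
    "time"

    logger "github.com/intel/cri-resource-manager/pkg/log"
)

func main() {
    // Let each unique message through at most once per second.
    rl := logger.RateLimit(logger.NewLogger("flood"), logger.Interval(time.Second))

    for i := 0; i < 5; i++ {
        rl.Warn("link flapped")         // emitted once, then suppressed
        rl.Warn("attempt %d failed", i) // unique per i, so never suppressed here
    }
}
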
func ClearDebugToggleSignal() { log.Lock() defer log.Unlock() clearDebugToggleSignal() } func clearDebugToggleSignal() { if signals != nil { signal.Stop(signals) close(signals) signals = nil } } ================================================ FILE: pkg/log/stdlog-logger.go ================================================ // Copyright 2019-2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package log import ( stdlog "log" ) // stdlogger implements an io.Writer to redirect logging by the stock log package. type stdlogger struct { l Logger } // SetStdLogger sets up a logger for the standard log package. func SetStdLogger(source string) { var l Logger if source == "" { l = Default() } else { l = log.get(source) } stdlog.SetPrefix("") stdlog.SetFlags(0) stdlog.SetOutput(&stdlogger{l: l}) } // Write implements io.Writer for stdlogger. func (s *stdlogger) Write(p []byte) (int, error) { s.l.Debug("%s", string(p)) return len(p), nil } ================================================ FILE: pkg/metrics/metrics.go ================================================ package metrics import ( "fmt" logger "github.com/intel/cri-resource-manager/pkg/log" "github.com/prometheus/client_golang/prometheus" ) var ( builtInCollectors = make(map[string]InitCollector) registeredCollectors = []prometheus.Collector{} initializedCollectors = make(map[string]struct{}) log = logger.NewLogger("collectors") ) // InitCollector is the type for functions that initialize collectors. type InitCollector func() (prometheus.Collector, error) // RegisterCollector registers the named prometheus.Collector for metrics collection. func RegisterCollector(name string, init InitCollector) error { log.Info("registering collector %s...", name) if _, found := builtInCollectors[name]; found { return metricsError("Collector %s already registered", name) } builtInCollectors[name] = init return nil } // NewMetricGatherer creates a new prometheus.Gatherer with all registered collectors. func NewMetricGatherer() (prometheus.Gatherer, error) { reg := prometheus.NewPedanticRegistry() for name, cb := range builtInCollectors { if _, ok := initializedCollectors[name]; ok { continue } c, err := cb() if err != nil { log.Error("Failed to initialize collector '%s': %v. Skipping it.", name, err) continue } registeredCollectors = append(registeredCollectors, c) initializedCollectors[name] = struct{}{} } reg.MustRegister(registeredCollectors[:]...) return reg, nil } func metricsError(format string, args ...interface{}) error { return fmt.Errorf("metrics: "+format, args...) } ================================================ FILE: pkg/metrics/register/register_metrics.go ================================================ package register import ( // Pull in cgroup-based metric collector. 
_ "github.com/intel/cri-resource-manager/pkg/cgroupstats" ) ================================================ FILE: pkg/metrics/register/register_metrics_avx.go ================================================ //go:build !noavx // +build !noavx package register import ( // Pull in avx collector. _ "github.com/intel/cri-resource-manager/pkg/avx" ) ================================================ FILE: pkg/pidfile/pidfile.go ================================================ // Copyright 2022 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package pidfile import ( "fmt" "os" "path/filepath" "strconv" "strings" "syscall" "github.com/pkg/errors" ) var ( pidFilePath = defaultPath() pidFile *os.File ) // GetPath returns the current pidfile path. func GetPath() string { return pidFilePath } // SetPath sets the pidfile path to the given one. func SetPath(path string) { closePIDFile() pidFilePath = path } // Write opens the PID file and writes os.Getpid() to it. If the PID file already // exists Write() fails with an error. On successful completion, Write keeps the // PID file open. func Write() error { if pidFile != nil { return nil } err := os.MkdirAll(filepath.Dir(pidFilePath), 0755) if err != nil { return errors.Wrap(err, "failed to create PID file") } pidFile, err = os.OpenFile(pidFilePath, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0644) if err != nil { return errors.Wrap(err, "failed to create PID file") } _, err = pidFile.Write([]byte(fmt.Sprintf("%d\n", os.Getpid()))) if err != nil { closePIDFile() return errors.Wrap(err, "failed to write PID file") } return nil } // Read reads the content of the PID file. It returns the process ID found // in the file. If opening the file or reading an integer process ID fails // Read() returns -1 and an error. func Read() (int, error) { var ( pid int buf []byte err error ) if buf, err = os.ReadFile(pidFilePath); err != nil { if os.IsNotExist(err) { return 0, nil } return -1, errors.Wrap(err, "failed to read PID file") } if pid, err = strconv.Atoi(strings.TrimRight(string(buf), "\n")); err != nil { return -1, errors.Wrapf(err, "invalid PID (%q) in PID file", string(buf)) } return pid, nil } // closePIDFile closes the PID file and truncates it to zero length. func closePIDFile() { if pidFile != nil { pidFile.Truncate(0) pidFile.Close() pidFile = nil } } // Remove removes the PID file for the process unconditionally, regardless if // the current process had created the PID file or not. func Remove() error { closePIDFile() err := os.Remove(pidFilePath) if err != nil { if os.IsNotExist(err) { return nil } } return err } // OwnerPid returns the ID of the process owning the PID file. 0 is returned // if it is known that no process owns the file. -1 and an error is returned // if the owner or its existence could not be determined. 
func OwnerPid() (int, error) { var ( pid int p *os.Process err error ) pid, err = Read() if err != nil { return -1, err } if pid == 0 { return 0, nil } p, err = os.FindProcess(pid) if err != nil { return -1, errors.Wrapf(err, "FindProcess() failed for PID %d", pid) } err = p.Signal(syscall.Signal(0)) if err == os.ErrProcessDone { return 0, nil } if err == nil { return pid, nil } return -1, errors.Wrapf(err, "failed to check process %d", pid) } // defaultPath returns the default pidfile path. func defaultPath() string { var path string if len(os.Args) > 0 { name := filepath.Base(os.Args[0]) if euid := os.Geteuid(); euid > 0 { path = filepath.Join("/tmp", name+".pid") } else { path = filepath.Join("/", "var", "run", name+".pid") } } return path } ================================================ FILE: pkg/pidfile/pidfile_test.go ================================================ // Copyright 2022 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package pidfile import ( "os" "path/filepath" "testing" "github.com/pkg/errors" "github.com/stretchr/testify/require" ) const ( testPidFile = "pidfile-test.pid" ) func prepare(t *testing.T) string { dir, err := mkTestDir(t) if err != nil { t.Errorf("failed to create test directory: %v", err) os.Exit(1) } SetPath(filepath.Join(dir, testPidFile)) return dir } func TestDefaults(t *testing.T) { t.Run("TestDefaults", func(t *testing.T) { var ( pid int err error ) Remove() err = Write() require.Nil(t, err) pid, err = Read() require.Nil(t, err) require.Equal(t, pid, os.Getpid()) err = Write() require.Nil(t, err) pid, err = Read() require.Nil(t, err) require.Equal(t, pid, os.Getpid()) closePIDFile() err = Write() require.NotNil(t, err) Remove() err = Write() require.Nil(t, err) pid, err = Read() require.Nil(t, err) require.Equal(t, pid, os.Getpid()) }) } func TestGetSetPath(t *testing.T) { t.Run("TestTestGetSetPath", func(t *testing.T) { var ( dir string path string ) dir = prepare(t) path = GetPath() require.Equal(t, path, filepath.Join(dir, testPidFile)) }) } func TestReadNonExisting(t *testing.T) { t.Run("TestReadNonExisting", func(t *testing.T) { var ( pid int err error ) prepare(t) pid, err = Read() require.Nil(t, err) require.Equal(t, pid, 0) }) } func TestRemoveNonExisting(t *testing.T) { t.Run("TestRemoveNonExisting", func(t *testing.T) { prepare(t) err := Remove() require.Nil(t, err) }) } func TestRemoveExisting(t *testing.T) { t.Run("TestRemoveExisting", func(t *testing.T) { var ( err error ) prepare(t) err = Write() require.Nil(t, err) err = Remove() require.Nil(t, err) }) } func TestWrite(t *testing.T) { t.Run("TestWrite", func(t *testing.T) { var ( pid int err error ) prepare(t) err = Write() require.Nil(t, err) pid, err = Read() require.Nil(t, err) require.Equal(t, pid, os.Getpid()) }) } func TestReadClosed(t *testing.T) { t.Run("TestReadClosed", func(t *testing.T) { var ( pid int err error ) prepare(t) err = Write() require.Nil(t, err) pid, err = Read() require.Nil(t, err) require.Equal(t, pid, os.Getpid()) 
closePIDFile() pid, err = Read() require.NotNil(t, err) require.Equal(t, pid, -1) }) } func TestFailToOverwrite(t *testing.T) { t.Run("TestFailToOverwrite", func(t *testing.T) { var ( pid int err error ) prepare(t) err = Write() require.Nil(t, err) pid, err = Read() require.Nil(t, err) require.Equal(t, pid, os.Getpid()) closePIDFile() err = Write() require.NotNil(t, err) }) } func TestRemoveToOverwrite(t *testing.T) { t.Run("TestRemoveToOverwrite", func(t *testing.T) { var ( pid int err error ) prepare(t) err = Write() require.Nil(t, err) pid, err = Read() require.Nil(t, err) require.Equal(t, pid, os.Getpid()) err = Remove() require.Nil(t, err) err = Write() require.Nil(t, err) pid, err = Read() require.Nil(t, err) require.Equal(t, pid, os.Getpid()) }) } func TestOwnerPid(t *testing.T) { t.Run("TestOwnerPid", func(t *testing.T) { var ( pid int chk int err error ) prepare(t) err = Write() require.Nil(t, err) pid, err = Read() require.Nil(t, err) require.Equal(t, pid, os.Getpid()) chk, err = OwnerPid() require.Nil(t, err) require.Equal(t, pid, chk) }) } func mkTestDir(t *testing.T) (string, error) { tmp, err := os.MkdirTemp("", ".pidfile-test*") if err != nil { return "", errors.Wrapf(err, "failed to create test directory") } t.Cleanup(func() { os.RemoveAll(tmp) }) return tmp, nil } ================================================ FILE: pkg/policycollector/collector.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package policycollector import ( "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/policy" "github.com/intel/cri-resource-manager/pkg/metrics" "github.com/prometheus/client_golang/prometheus" ) type PolicyCollector struct { policy policy.Policy } func (c *PolicyCollector) SetPolicy(policy policy.Policy) { c.policy = policy } // HasPolicySpecificMetrics judges whether the policy defines the policy-specific metrics func (c *PolicyCollector) HasPolicySpecificMetrics() bool { if c.policy.DescribeMetrics() == nil { return false } return true } // Describe implements prometheus.Collector interface func (c *PolicyCollector) Describe(ch chan<- *prometheus.Desc) { for _, d := range c.policy.DescribeMetrics() { ch <- d } } // Collect implements prometheus.Collector interface func (c *PolicyCollector) Collect(ch chan<- prometheus.Metric) { prometheusMetrics, err := c.policy.CollectMetrics(c.policy.PollMetrics()) if err != nil { return } for _, m := range prometheusMetrics { ch <- m } } // RegisterPolicyMetricsCollector registers policy-specific collector func (c *PolicyCollector) RegisterPolicyMetricsCollector() error { return metrics.RegisterCollector("policyMetrics", func() (prometheus.Collector, error) { return c, nil }) } ================================================ FILE: pkg/procstats/procstats.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package procstats import ( "os" "strconv" "strings" "sync" "github.com/intel/cri-resource-manager/pkg/log" "github.com/intel/cri-resource-manager/pkg/sysfs" ) // CPUTimeStat is used to calculate the CPU usage. type CPUTimeStat struct { sync.RWMutex PrevIdleTime []uint64 PrevTotalTime []uint64 CurIdleTime []uint64 CurTotalTime []uint64 DeltaIdleTime []uint64 DeltaTotalTime []uint64 CPUUsage []float64 IsGetCPUUsageBegin bool } var ( // procRoot is the mount point for the proc filesystem procRoot = "/proc" procStat = procRoot + "/stat" ) // GetCPUTimeStat calculates CPU usage by using the CPU time statistics from /proc/stat func (t *CPUTimeStat) GetCPUTimeStat() error { // /proc/stat looks like this: // cpuid: user, nice, system, idle, iowait, irq, softirq // cpu 130216 19944 162525 1491240 3784 24749 17773 0 0 0 // cpu0 40321 11452 49784 403099 2615 6076 6748 0 0 0 // cpu1 26585 2425 36639 151166 404 2533 3541 0 0 0 // ... stats, err := os.ReadFile(procStat) if err != nil { return err } t.Lock() defer t.Unlock() sys, err := sysfs.DiscoverSystem() if err != nil { return err } cpuCount := len(sys.CPUIDs()) for index, line := range strings.Split(string(stats), "\n") { if index > cpuCount { break } split := strings.Split(line, " ") if strings.HasPrefix(split[0], "cpu") && split[0] != "cpu" { i, err := strconv.Atoi(split[0][3:]) if err != nil { log.Error("Fail to get CPU index.") return err } t.CurIdleTime[i], err = strconv.ParseUint(split[4], 10, 64) if err != nil { log.Error("Fail to get idle time.") return err } totalTime := uint64(0) for _, s := range split[1:] { u, err := strconv.ParseUint(s, 10, 64) if err == nil { totalTime += u } } t.CurTotalTime[i] = totalTime t.CPUUsage[i] = 0.0 if t.IsGetCPUUsageBegin { t.DeltaIdleTime[i] = t.CurIdleTime[i] - t.PrevIdleTime[i] t.DeltaTotalTime[i] = t.CurTotalTime[i] - t.PrevTotalTime[i] if t.DeltaTotalTime[i] != 0 { t.CPUUsage[i] = (1.0 - float64(t.DeltaIdleTime[i])/float64(t.DeltaTotalTime[i])) * 100.0 } } t.PrevIdleTime[i] = t.CurIdleTime[i] t.PrevTotalTime[i] = t.CurTotalTime[i] } } for _, i := range sys.Offlined().List() { t.DeltaIdleTime[i] = 0.0 t.DeltaTotalTime[i] = 0.0 t.PrevIdleTime[i] = t.CurIdleTime[i] t.PrevTotalTime[i] = t.CurTotalTime[i] t.CPUUsage[i] = 0.0 } t.IsGetCPUUsageBegin = true return nil } ================================================ FILE: pkg/sysfs/error.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
// See the License for the specific language governing permissions and
// limitations under the License.

package sysfs

import (
    "fmt"
)

func sysfsError(path, format string, args ...interface{}) error {
    return fmt.Errorf(path+": "+format, args...)
}

================================================
FILE: pkg/sysfs/parsers.go
================================================
// Copyright 2019 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package sysfs

import (
    "os"
    "strconv"
    "strings"
)

// unit multipliers
const (
    k = (int64(1) << 10)
    M = (int64(1) << 20)
    G = (int64(1) << 30)
    T = (int64(1) << 40)
)

// unit name to multiplier mapping
var units = map[string]int64{
    "k": k, "kB": k,
    "M": M, "MB": M,
    "G": G, "GB": G,
    "T": T, "TB": T,
}

// PickEntryFn picks a given input line apart into an entry of key and value.
type PickEntryFn func(string) (string, string, error)

// splitNumericAndUnit splits a string into a numeric and a unit part.
func splitNumericAndUnit(path string, value string) (string, int64, error) {
    fields := strings.Fields(value)
    switch len(fields) {
    case 1:
        return fields[0], 1, nil
    case 2:
        num := fields[0]
        unit, ok := units[fields[1]]
        if !ok {
            return "", -1, sysfsError(path, "failed to parse '%s': invalid unit '%s'", value, fields[1])
        }
        return num, unit, nil
    }
    return "", -1, sysfsError(path, "invalid numeric value %s", value)
}

// parseNumeric parses a numeric string into a numeric value of the right type and size.
func parseNumeric(path, value string, ptr interface{}) error {
    var numstr string
    var num, unit int64
    var f float64
    var err error

    if numstr, unit, err = splitNumericAndUnit(path, value); err != nil {
        return err
    }

    switch ptr.(type) {
    case *int:
        num, err = strconv.ParseInt(numstr, 0, strconv.IntSize)
        *ptr.(*int) = int(num * unit)
    case *int8:
        num, err = strconv.ParseInt(numstr, 0, 8)
        *ptr.(*int8) = int8(num * unit)
    case *int16:
        num, err = strconv.ParseInt(numstr, 0, 16)
        *ptr.(*int16) = int16(num * unit)
    case *int32:
        num, err = strconv.ParseInt(numstr, 0, 32)
        *ptr.(*int32) = int32(num * unit)
    case *int64:
        num, err = strconv.ParseInt(numstr, 0, 64)
        *ptr.(*int64) = int64(num * unit)
    case *uint:
        num, err = strconv.ParseInt(numstr, 0, strconv.IntSize)
        *ptr.(*uint) = uint(num * unit)
    case *uint8:
        num, err = strconv.ParseInt(numstr, 0, 8)
        *ptr.(*uint8) = uint8(num * unit)
    case *uint16:
        num, err = strconv.ParseInt(numstr, 0, 16)
        *ptr.(*uint16) = uint16(num * unit)
    case *uint32:
        num, err = strconv.ParseInt(numstr, 0, 32)
        *ptr.(*uint32) = uint32(num * unit)
    case *uint64:
        num, err = strconv.ParseInt(numstr, 0, 64)
        *ptr.(*uint64) = uint64(num * unit)
    case *float32:
        f, err = strconv.ParseFloat(numstr, 32)
        *ptr.(*float32) = float32(f) * float32(unit)
    case *float64:
        f, err = strconv.ParseFloat(numstr, 64)
        *ptr.(*float64) = f * float64(unit)
    default:
        err = sysfsError(path, "can't parse numeric value '%s' into type %T", value, ptr)
    }

    return err
}
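A minimal sketch of driving ParseFileEntries (defined next) with a custom PickEntryFn, assuming /proc/meminfo-style "Key: value kB" input; unit suffixes are scaled with the multiplier table above.

package main

import (
    "fmt"
    "strings"

    "github.com/intel/cri-resource-manager/pkg/sysfs"
)

func main() {
    var total, free uint64
    err := sysfs.ParseFileEntries("/proc/meminfo", map[string]interface{}{
        "MemTotal": &total,
        "MemFree":  &free,
    }, func(line string) (string, string, error) {
        // Split "MemTotal:       16384 kB" into "MemTotal" and "16384 kB".
        key, value, ok := strings.Cut(line, ":")
        if !ok {
            return "", "", nil // non-matching lines are simply skipped
        }
        return key, strings.TrimSpace(value), nil
    })
    if err != nil {
        fmt.Println("parse failed:", err)
        return
    }
    fmt.Printf("total %d, free %d bytes\n", total, free)
}
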
// ParseFileEntries parses a sysfs file for the given entries.
func ParseFileEntries(path string, values map[string]interface{}, pickFn PickEntryFn) error {
    var err error

    data, err := os.ReadFile(path)
    if err != nil {
        return sysfsError(path, "failed to read file: %v", err)
    }

    left := len(values)
    for _, line := range strings.Split(string(data), "\n") {
        key, value, err := pickFn(line)
        if err != nil {
            return err
        }
        ptr, ok := values[key]
        if !ok {
            continue
        }
        switch ptr.(type) {
        case *int, *int8, *int32, *int16, *int64, *uint, *uint8, *uint16, *uint32, *uint64:
            if err = parseNumeric(path, value, ptr); err != nil {
                return err
            }
        case *float32, *float64:
            if err = parseNumeric(path, value, ptr); err != nil {
                return err
            }
        case *string:
            *ptr.(*string) = value
        case *bool:
            *ptr.(*bool), err = strconv.ParseBool(value)
            if err != nil {
                return sysfsError(path, "failed to parse line %s, value '%s' for boolean key '%s'",
                    line, value, key)
            }
        default:
            return sysfsError(path, "don't know how to parse key '%s' of type %T", key, ptr)
        }
        left--
        if left == 0 {
            break
        }
    }
    return nil
}

================================================
FILE: pkg/sysfs/system.go
================================================
// Copyright 2019 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package sysfs

import (
    "fmt"
    "os"
    "path/filepath"
    "sort"
    "strconv"
    "strings"

    logger "github.com/intel/cri-resource-manager/pkg/log"
    "github.com/intel/cri-resource-manager/pkg/utils"
    "github.com/intel/cri-resource-manager/pkg/utils/cpuset"
    "github.com/intel/goresctrl/pkg/sst"
    idset "github.com/intel/goresctrl/pkg/utils"
)

var (
    // Parent directory under which host sysfs, etc. is mounted (if non-standard location).
sysRoot = "" ) const ( // sysfs devices/cpu subdirectory path sysfsCPUPath = "devices/system/cpu" // sysfs device/node subdirectory path sysfsNumaNodePath = "devices/system/node" ) // MemoryType is an enum for the Node memory type MemoryType int const ( // MemoryTypeDRAM means that the node has regular DRAM-type memory MemoryTypeDRAM MemoryType = iota // MemoryTypePMEM means that the node has persistent memory MemoryTypePMEM // MemoryTypeHBM means that the node has high bandwidth memory MemoryTypeHBM ) // System devices type System interface { Discover() error SetCpusOnline(online bool, cpus idset.IDSet) (idset.IDSet, error) SetCPUFrequencyLimits(min, max uint64, cpus idset.IDSet) error PackageIDs() []idset.ID NodeIDs() []idset.ID CPUIDs() []idset.ID PackageCount() int SocketCount() int CPUCount() int NUMANodeCount() int ThreadCount() int CPUSet() cpuset.CPUSet Package(id idset.ID) CPUPackage Node(id idset.ID) Node NodeDistance(from, to idset.ID) int CPU(id idset.ID) CPU Offlined() cpuset.CPUSet Isolated() cpuset.CPUSet } // System devices type system struct { logger.Logger // our logger instance path string // sysfs mount point packages map[idset.ID]*cpuPackage // physical packages nodes map[idset.ID]*node // NUMA nodes cpus map[idset.ID]*cpu // CPUs cache map[idset.ID]*Cache // Cache offline idset.IDSet // offlined CPUs isolated idset.IDSet // isolated CPUs threads int // hyperthreads per core } // CPUPackage is a physical package (a collection of CPUs). type CPUPackage interface { ID() idset.ID CPUSet() cpuset.CPUSet DieIDs() []idset.ID NodeIDs() []idset.ID DieNodeIDs(idset.ID) []idset.ID DieCPUSet(idset.ID) cpuset.CPUSet SstInfo() *sst.SstPackageInfo } type cpuPackage struct { id idset.ID // package id cpus idset.IDSet // CPUs in this package nodes idset.IDSet // nodes in this package dies idset.IDSet // dies in this package dieCPUs map[idset.ID]idset.IDSet // CPUs per die dieNodes map[idset.ID]idset.IDSet // NUMA nodes per die sstInfo *sst.SstPackageInfo // Speed Select Technology info } // Node represents a NUMA node. type Node interface { ID() idset.ID PackageID() idset.ID DieID() idset.ID CPUSet() cpuset.CPUSet Distance() []int DistanceFrom(id idset.ID) int MemoryInfo() (*MemInfo, error) GetMemoryType() MemoryType HasNormalMemory() bool } type node struct { path string // sysfs path id idset.ID // node id pkg idset.ID // package id die idset.ID // die id cpus idset.IDSet // cpus in this node memoryType MemoryType // node memory type normalMem bool // node has memory in a normal (kernel space allocatable) zone distance []int // distance/cost to other NUMA nodes } // CPU is a CPU core. 
type CPU interface { ID() idset.ID PackageID() idset.ID DieID() idset.ID NodeID() idset.ID CoreID() idset.ID ThreadCPUSet() cpuset.CPUSet BaseFrequency() uint64 FrequencyRange() CPUFreq EPP() EPP Online() bool Isolated() bool SetFrequencyLimits(min, max uint64) error SstClos() int } type cpu struct { path string // sysfs path id idset.ID // CPU id pkg idset.ID // package id die idset.ID // die id node idset.ID // node id core idset.ID // core id threads idset.IDSet // sibling/hyper-threads baseFreq uint64 // CPU base frequency freq CPUFreq // CPU frequencies epp EPP // Energy Performance Preference from cpufreq governor online bool // whether this CPU is online isolated bool // whether this CPU is isolated sstClos int // SST-CP CLOS the CPU is associated with } // CPUFreq is a CPU frequency scaling range type CPUFreq struct { min uint64 // minimum frequency (kHz) max uint64 // maximum frequency (kHz) all []uint64 // discrete set of frequencies if applicable/known } // EPP represents the value of a CPU energy performance profile type EPP int const ( EPPPerformance EPP = iota EPPBalancePerformance EPPBalancePower EPPPower EPPUnknown ) // MemInfo contains data read from a NUMA node meminfo file. type MemInfo struct { MemTotal uint64 MemFree uint64 MemUsed uint64 } // CPU cache. // Notes: cache-discovery is forced off now (by forcibly clearing the related discovery bit) // Can't seem to make sense of the cache information exposed under sysfs. The cache ids // do not seem to be unique, which IIUC is contrary to the documentation. // CacheType specifies a cache type. type CacheType string const ( // DataCache marks data cache. DataCache CacheType = "Data" // InstructionCache marks instruction cache. InstructionCache CacheType = "Instruction" // UnifiedCache marks a unified data/instruction cache. UnifiedCache CacheType = "Unified" ) // Cache has details about cache. type Cache struct { id idset.ID // cache id kind CacheType // cache type size uint64 // cache size level uint8 // cache level cpus idset.IDSet // CPUs sharing this cache } // SetSysRoot sets the sys root directory. func SetSysRoot(path string) { sysRoot = path } // SysRoot returns the sys root directory. func SysRoot() string { return sysRoot } // DiscoverSystem performs discovery of the running systems details. func DiscoverSystem() (System, error) { return DiscoverSystemAt(filepath.Join("/", sysRoot, "sys")) } // DiscoverSystemAt performs discovery of the running systems details from sysfs mounted at path. func DiscoverSystemAt(path string) (System, error) { sys := &system{ Logger: logger.NewLogger("sysfs"), path: path, offline: idset.NewIDSet(), } if err := sys.Discover(); err != nil { return nil, err } return sys, nil } // Discover performs system/hardware discovery. 
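A minimal sketch of discovering and querying the system topology through the interfaces above; "usable" CPUs are derived here by subtracting the offlined and isolated sets.

package main

import (
    "fmt"

    "github.com/intel/cri-resource-manager/pkg/sysfs"
)

func main() {
    sys, err := sysfs.DiscoverSystem()
    if err != nil {
        fmt.Println("discovery failed:", err)
        return
    }
    fmt.Printf("%d package(s), %d NUMA node(s), %d CPU(s), %d thread(s)/core\n",
        sys.PackageCount(), sys.NUMANodeCount(), sys.CPUCount(), sys.ThreadCount())
    for _, id := range sys.PackageIDs() {
        fmt.Printf("package #%d: cpus %s\n", id, sys.Package(id).CPUSet().String())
    }
    // CPUs generally usable for workloads: all CPUs minus offlined and isolated ones.
    usable := sys.CPUSet().Difference(sys.Offlined()).Difference(sys.Isolated())
    fmt.Println("usable CPUs:", usable.String())
}
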
func (sys *system) Discover() error { if err := sys.discoverCPUs(); err != nil { return err } if err := sys.discoverNodes(); err != nil { return err } if err := sys.discoverPackages(); err != nil { return err } if err := sys.discoverSst(); err != nil { // Just consider SST unsupported if our detection fails for some reason sys.Warn("%v", err) } if len(sys.nodes) > 0 { for _, pkg := range sys.packages { for _, nodeID := range pkg.nodes.SortedMembers() { if node, ok := sys.nodes[nodeID]; ok { node.pkg = pkg.id } else { return sysfsError("NUMA nodes", "can't find NUMA node for ID %d", nodeID) } } for _, dieID := range pkg.DieIDs() { for _, nodeID := range pkg.DieNodeIDs(dieID) { if node, ok := sys.nodes[nodeID]; ok { node.die = dieID } else { return sysfsError("NUMA nodes", "can't find NUMA node for ID %d", nodeID) } } } } } if sys.DebugEnabled() { for id, pkg := range sys.packages { sys.Info("package #%d:", id) sys.Debug(" cpus: %s", pkg.cpus) sys.Debug(" nodes: %s", pkg.nodes) sys.Debug(" dies: %s", pkg.dies) for die := range pkg.dies { sys.Debug(" die #%v nodes: %v", die, pkg.DieNodeIDs(die)) sys.Debug(" die #%v cpus: %s", die, pkg.DieCPUSet(die).String()) } } for id, node := range sys.nodes { sys.Debug("node #%d:", id) sys.Debug(" cpus: %s", node.cpus) sys.Debug(" distance: %v", node.distance) sys.Debug(" package: #%d", node.pkg) sys.Debug(" die: #%d", node.die) } for id, cpu := range sys.cpus { sys.Debug("CPU #%d:", id) sys.Debug(" pkg: %d", cpu.pkg) sys.Debug(" die: %d", cpu.die) sys.Debug(" node: %d", cpu.node) sys.Debug(" core: %d", cpu.core) sys.Debug(" threads: %s", cpu.threads) sys.Debug(" base freq: %d", cpu.baseFreq) sys.Debug(" freq: %d - %d", cpu.freq.min, cpu.freq.max) sys.Debug(" epp: %d", cpu.epp) } sys.Debug("offline CPUs: %s", sys.offline) sys.Debug("isolated CPUs: %s", sys.isolated) for id, cch := range sys.cache { sys.Debug("cache #%d:", id) sys.Debug(" type: %v", cch.kind) sys.Debug(" size: %d", cch.size) sys.Debug(" level: %d", cch.level) sys.Debug(" CPUs: %s", cch.cpus) } } return nil } // SetCpusOnline puts a set of CPUs online. Return the toggled set. Nil set implies all CPUs. func (sys *system) SetCpusOnline(online bool, cpus idset.IDSet) (idset.IDSet, error) { var entries []string if cpus == nil { entries, _ = filepath.Glob(filepath.Join(sys.path, sysfsCPUPath, "cpu[0-9]*")) } else { entries = make([]string, cpus.Size()) for idx, id := range cpus.Members() { entries[idx] = sys.path + "/" + sysfsCPUPath + "/cpu" + strconv.Itoa(int(id)) } } desired := map[bool]int{false: 0, true: 1}[online] changed := idset.NewIDSet() for _, entry := range entries { var current int id := getEnumeratedID(entry) if id <= 0 { continue } if _, err := writeSysfsEntry(entry, "online", desired, ¤t); err != nil { return nil, sysfsError(entry, "failed to set online to %d: %v", desired, err) } if desired != current { changed.Add(id) if cpu, found := sys.cpus[id]; found { cpu.online = online if online { sys.offline.Del(id) } else { sys.offline.Add(id) } } } } return changed, nil } // SetCPUFrequencyLimits sets the CPU frequency scaling limits. Nil set implies all CPUs. func (sys *system) SetCPUFrequencyLimits(min, max uint64, cpus idset.IDSet) error { if cpus == nil { cpus = idset.NewIDSet(sys.CPUIDs()...) } for _, id := range cpus.Members() { if cpu, ok := sys.cpus[id]; ok { if err := cpu.SetFrequencyLimits(min, max); err != nil { return err } } } return nil } // PackageIDs gets the ids of all packages present in the system. 
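A sketch of the two mutating operations above. Both write sysfs and need root; the CPU ids are made up, and the frequency unit (Hz) is an assumption inferred from the division by 1000 against the kHz-scale sysfs values in cpu.SetFrequencyLimits further below.

package main

import (
    "fmt"

    "github.com/intel/cri-resource-manager/pkg/sysfs"
    idset "github.com/intel/goresctrl/pkg/utils"
)

func main() {
    sys, err := sysfs.DiscoverSystem()
    if err != nil {
        fmt.Println("discovery failed:", err)
        return
    }
    // Take CPUs 2 and 3 offline; the returned set holds the CPUs actually toggled.
    toggled, err := sys.SetCpusOnline(false, idset.NewIDSet(2, 3))
    if err != nil {
        fmt.Println("offlining failed:", err)
        return
    }
    fmt.Println("toggled:", toggled)
    // Clamp the scaling range of all CPUs (nil set) to 1.2-2.4 GHz (values assumed in Hz).
    if err := sys.SetCPUFrequencyLimits(1200000000, 2400000000, nil); err != nil {
        fmt.Println("setting frequency limits failed:", err)
    }
}
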
func (sys *system) PackageIDs() []idset.ID {
    ids := make([]idset.ID, len(sys.packages))
    idx := 0
    for id := range sys.packages {
        ids[idx] = id
        idx++
    }
    sort.Slice(ids, func(i, j int) bool {
        return int(ids[i]) < int(ids[j])
    })
    return ids
}

// NodeIDs gets the ids of all NUMA nodes present in the system.
func (sys *system) NodeIDs() []idset.ID {
    ids := make([]idset.ID, len(sys.nodes))
    idx := 0
    for id := range sys.nodes {
        ids[idx] = id
        idx++
    }
    sort.Slice(ids, func(i, j int) bool {
        return int(ids[i]) < int(ids[j])
    })
    return ids
}

// CPUIDs gets the ids of all CPUs present in the system.
func (sys *system) CPUIDs() []idset.ID {
    ids := make([]idset.ID, len(sys.cpus))
    idx := 0
    for id := range sys.cpus {
        ids[idx] = id
        idx++
    }
    sort.Slice(ids, func(i, j int) bool {
        return int(ids[i]) < int(ids[j])
    })
    return ids
}

// PackageCount returns the number of discovered CPU packages (sockets).
func (sys *system) PackageCount() int {
    return len(sys.packages)
}

// SocketCount returns the number of discovered CPU packages (sockets).
func (sys *system) SocketCount() int {
    return len(sys.packages)
}

// CPUCount returns the number of discovered CPUs/cores.
func (sys *system) CPUCount() int {
    return len(sys.cpus)
}

// NUMANodeCount returns the number of discovered NUMA nodes.
func (sys *system) NUMANodeCount() int {
    cnt := len(sys.nodes)
    if cnt < 1 {
        cnt = 1
    }
    return cnt
}

// ThreadCount returns the number of threads per core discovered.
func (sys *system) ThreadCount() int {
    return sys.threads
}

// CPUSet gets the ids of all CPUs present in the system as a CPUSet.
func (sys *system) CPUSet() cpuset.CPUSet {
    return CPUSetFromIDSet(idset.NewIDSet(sys.CPUIDs()...))
}

// Package gets the package with a given package id.
func (sys *system) Package(id idset.ID) CPUPackage {
    return sys.packages[id]
}

// Node gets the node with a given node id.
func (sys *system) Node(id idset.ID) Node {
    return sys.nodes[id]
}

// NodeDistance gets the distance between two NUMA nodes.
func (sys *system) NodeDistance(from, to idset.ID) int {
    return sys.nodes[from].DistanceFrom(to)
}

// CPU gets the CPU with a given CPU id.
func (sys *system) CPU(id idset.ID) CPU {
    return sys.cpus[id]
}

// Offlined gets the set of offlined CPUs.
func (sys *system) Offlined() cpuset.CPUSet {
    return CPUSetFromIDSet(sys.offline)
}

// Isolated gets the set of isolated CPUs.
func (sys *system) Isolated() cpuset.CPUSet {
    return CPUSetFromIDSet(sys.isolated)
}

// Discover CPUs present in the system.
func (sys *system) discoverCPUs() error {
    if sys.cpus != nil {
        return nil
    }
    sys.cpus = make(map[idset.ID]*cpu)
    _, err := readSysfsEntry(sys.path, filepath.Join(sysfsCPUPath, "isolated"), &sys.isolated, ",")
    if err != nil {
        sys.Error("failed to get set of isolated cpus: %v", err)
    }
    entries, _ := filepath.Glob(filepath.Join(sys.path, sysfsCPUPath, "cpu[0-9]*"))
    for _, entry := range entries {
        if err := sys.discoverCPU(entry); err != nil {
            return fmt.Errorf("failed to discover cpu for entry %s: %v", entry, err)
        }
    }
    return nil
}

// Discover details of the given CPU.
func (sys *system) discoverCPU(path string) error { cpu := &cpu{path: path, id: getEnumeratedID(path), online: true, sstClos: -1} cpu.isolated = sys.isolated.Has(cpu.id) if online, err := readSysfsEntry(path, "online", nil); err == nil { cpu.online = (online != "" && online[0] != '0') } if cpu.online { if _, err := readSysfsEntry(path, "topology/physical_package_id", &cpu.pkg); err != nil { return err } readSysfsEntry(path, "topology/die_id", &cpu.die) if _, err := readSysfsEntry(path, "topology/core_id", &cpu.core); err != nil { return err } if _, err := readSysfsEntry(path, "topology/thread_siblings_list", &cpu.threads, ","); err != nil { return err } } else { sys.offline.Add(cpu.id) } if _, err := readSysfsEntry(path, "cpufreq/base_frequency", &cpu.baseFreq); err != nil { cpu.baseFreq = 0 } if _, err := readSysfsEntry(path, "cpufreq/cpuinfo_min_freq", &cpu.freq.min); err != nil { cpu.freq.min = 0 } if _, err := readSysfsEntry(path, "cpufreq/cpuinfo_max_freq", &cpu.freq.max); err != nil { cpu.freq.max = 0 } if _, err := readSysfsEntry(path, "cpufreq/energy_performance_preference", &cpu.epp); err != nil { cpu.epp = EPPUnknown } if node, _ := filepath.Glob(filepath.Join(path, "node[0-9]*")); len(node) == 1 { cpu.node = getEnumeratedID(node[0]) } else { return fmt.Errorf("exactly one node per cpu allowed") } if sys.threads < 1 { sys.threads = 1 } if cpu.threads.Size() > sys.threads { sys.threads = cpu.threads.Size() } sys.cpus[cpu.id] = cpu return nil } // ID returns the id of this CPU. func (c *cpu) ID() idset.ID { return c.id } // PackageID returns package id of this CPU. func (c *cpu) PackageID() idset.ID { return c.pkg } // DieID returns the die id of this CPU. func (c *cpu) DieID() idset.ID { return c.die } // NodeID returns the node id of this CPU. func (c *cpu) NodeID() idset.ID { return c.node } // CoreID returns the core id of this CPU (lowest CPU id of all thread siblings). func (c *cpu) CoreID() idset.ID { return c.core } // ThreadCPUSet returns the CPUSet for all threads in this core. func (c *cpu) ThreadCPUSet() cpuset.CPUSet { return CPUSetFromIDSet(c.threads) } // BaseFrequency returns the base frequency setting for this CPU. func (c *cpu) BaseFrequency() uint64 { return c.baseFreq } // FrequencyRange returns the frequency range for this CPU. func (c *cpu) FrequencyRange() CPUFreq { return c.freq } // EPP returns the energy performance profile of this CPU. func (c *cpu) EPP() EPP { return c.epp } // Online returns if this CPU is online. func (c *cpu) Online() bool { return c.online } // Isolated returns if this CPU is isolated. func (c *cpu) Isolated() bool { return c.isolated } // SstClos returns the Speed Select Core Power CLOS number assigned to the CPU // -1 implies that no SST prioritization is in effect func (c *cpu) SstClos() int { return c.sstClos } // SetFrequencyLimits sets the frequency scaling limits for this CPU. 
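A sketch of the per-CPU accessors above, dumping one line of details per discovered CPU; topology fields may be zero for offline CPUs, since their sysfs topology is not read.

package main

import (
    "fmt"

    "github.com/intel/cri-resource-manager/pkg/sysfs"
)

func main() {
    sys, err := sysfs.DiscoverSystem()
    if err != nil {
        fmt.Println("discovery failed:", err)
        return
    }
    for _, id := range sys.CPUIDs() {
        cpu := sys.CPU(id)
        fmt.Printf("cpu #%d: pkg %d, node %d, core %d, base %d kHz, epp %q, sst-cp clos %d\n",
            cpu.ID(), cpu.PackageID(), cpu.NodeID(), cpu.CoreID(),
            cpu.BaseFrequency(), cpu.EPP().String(), cpu.SstClos())
    }
}
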
func (c *cpu) SetFrequencyLimits(min, max uint64) error {
    if c.freq.min == 0 {
        return nil
    }
    min /= 1000
    max /= 1000
    if min < c.freq.min && min != 0 {
        min = c.freq.min
    }
    if min > c.freq.max {
        min = c.freq.max
    }
    if max < c.freq.min && max != 0 {
        max = c.freq.min
    }
    if max > c.freq.max {
        max = c.freq.max
    }
    if _, err := writeSysfsEntry(c.path, "cpufreq/scaling_min_freq", min, nil); err != nil {
        return err
    }
    if _, err := writeSysfsEntry(c.path, "cpufreq/scaling_max_freq", max, nil); err != nil {
        return err
    }
    return nil
}

func readCPUsetFile(base, entry string) (cpuset.CPUSet, error) {
    path := filepath.Join(base, entry)
    blob, err := os.ReadFile(path)
    if err != nil {
        return cpuset.New(), sysfsError(path, "failed to read sysfs entry: %v", err)
    }
    return cpuset.Parse(strings.Trim(string(blob), "\n"))
}

// Discover NUMA nodes present in the system.
func (sys *system) discoverNodes() error {
    if sys.nodes != nil {
        return nil
    }
    sysNodesPath := filepath.Join(sys.path, sysfsNumaNodePath)
    sys.nodes = make(map[idset.ID]*node)
    entries, _ := filepath.Glob(filepath.Join(sysNodesPath, "node[0-9]*"))
    for _, entry := range entries {
        if err := sys.discoverNode(entry); err != nil {
            return fmt.Errorf("failed to discover node for entry %s: %v", entry, err)
        }
    }
    normalMemNodeIDs, err := readSysfsEntry(sysNodesPath, "has_normal_memory", nil)
    if err != nil {
        return fmt.Errorf("failed to discover nodes with normal memory: %v", err)
    }
    normalMemNodes, err := cpuset.Parse(normalMemNodeIDs)
    if err != nil {
        return fmt.Errorf("failed to parse nodes with normal memory (%q): %v", normalMemNodeIDs, err)
    }
    memoryNodeIDs, err := readSysfsEntry(sysNodesPath, "has_memory", nil)
    if err != nil {
        return fmt.Errorf("failed to discover nodes with memory: %v", err)
    }
    memoryNodes, err := cpuset.Parse(memoryNodeIDs)
    if err != nil {
        return fmt.Errorf("failed to parse nodes with memory (%q): %v", memoryNodeIDs, err)
    }
    cpuNodesSlice := []int{}
    for id, node := range sys.nodes {
        if node.cpus.Size() > 0 {
            cpuNodesSlice = append(cpuNodesSlice, int(id))
        }
        if normalMemNodes.Contains(int(id)) {
            node.normalMem = true
        }
    }
    cpuNodes := cpuset.New(cpuNodesSlice...)
    sys.Logger.Info("NUMA nodes with CPUs: %s", cpuNodes.String())
    sys.Logger.Info("NUMA nodes with (any) memory: %s", memoryNodes.String())
    sys.Logger.Info("NUMA nodes with normal memory: %s", normalMemNodes.String())
    dramNodes := memoryNodes.Intersection(cpuNodes)
    pmemOrHbmNodes := memoryNodes.Difference(dramNodes)
    dramNodeIds := IDSetFromCPUSet(dramNodes)
    pmemOrHbmNodeIds := IDSetFromCPUSet(pmemOrHbmNodes)
    infos := make(map[idset.ID]*MemInfo)
    dramAvg := uint64(0)
    if len(pmemOrHbmNodeIds) > 0 && len(dramNodeIds) > 0 {
        // There is special memory present in the system.
        //
        // FIXME assumption: if a node only has memory (and no CPUs), it's PMEM or HBM,
        // otherwise it's DRAM. Also, we figure out whether the memory is HBM or PMEM
        // based on the amount: if the amount of memory is smaller than the average
        // amount of DRAM per node, it's HBM, otherwise PMEM.
        dramTotal := uint64(0)
        for _, node := range sys.nodes {
            info, err := node.MemoryInfo()
            if err != nil {
                return fmt.Errorf("failed to get memory info for node %v: %s", node, err)
            }
            infos[node.id] = info
            if _, ok := dramNodeIds[node.id]; ok {
                dramTotal += info.MemTotal
            }
        }
        dramAvg = dramTotal / uint64(len(dramNodeIds))
        if dramAvg == 0 {
            // FIXME: should be no reason to bail out when memory types are properly determined.
return fmt.Errorf("no dram in the system, cannot determine special memory types") } } for _, node := range sys.nodes { if _, ok := pmemOrHbmNodeIds[node.id]; ok { mem, ok := infos[node.id] if !ok { return fmt.Errorf("not able to determine system special memory types") } if mem.MemTotal < dramAvg { sys.Logger.Info("node %d has HBM memory", node.id) node.memoryType = MemoryTypeHBM } else { sys.Logger.Info("node %d has PMEM memory", node.id) node.memoryType = MemoryTypePMEM } } else if _, ok := dramNodeIds[node.id]; ok { sys.Logger.Info("node %d has DRAM memory", node.id) node.memoryType = MemoryTypeDRAM } else { return fmt.Errorf("Unknown memory type for node %v (pmem nodes: %s, dram nodes: %s)", node, pmemOrHbmNodes, dramNodes) } } return nil } // Discover details of the given NUMA node. func (sys *system) discoverNode(path string) error { node := &node{path: path, id: getEnumeratedID(path)} if _, err := readSysfsEntry(path, "cpulist", &node.cpus, ","); err != nil { return err } if _, err := readSysfsEntry(path, "distance", &node.distance); err != nil { return err } sys.nodes[node.id] = node return nil } // ID returns id of this node. func (n *node) ID() idset.ID { return n.id } // PackageID returns the package id for this node. func (n *node) PackageID() idset.ID { return n.pkg } // DieID returns the die id for this node. func (n *node) DieID() idset.ID { return n.die } // CPUSet returns the CPUSet for all cores/threads in this node. func (n *node) CPUSet() cpuset.CPUSet { return CPUSetFromIDSet(n.cpus) } // Distance returns the distance vector for this node. func (n *node) Distance() []int { return n.distance } // DistanceFrom returns the distance of this and a given node. func (n *node) DistanceFrom(id idset.ID) int { if int(id) < len(n.distance) { return n.distance[int(id)] } return -1 } // MemoryInfo memory info for the node (partial content from the meminfo sysfs entry). func (n *node) MemoryInfo() (*MemInfo, error) { meminfo := filepath.Join(n.path, "meminfo") buf := &MemInfo{} err := ParseFileEntries(meminfo, map[string]interface{}{ "MemTotal:": &buf.MemTotal, "MemFree:": &buf.MemFree, }, func(line string) (string, string, error) { fields := strings.Fields(strings.TrimSpace(line)) if len(fields) < 4 { return "", "", sysfsError(meminfo, "failed to parse entry: '%s'", line) } key := fields[2] val := fields[3] if len(fields) == 5 { val += " " + fields[4] } return key, val, nil }, ) if err != nil { return nil, err } // // On some HW and kernel combinations we've seen more free than total // memory being reported. This causes exorbitant usage of memory being // reported which later can cause failures in policies which trust and // rely on this information. // // Give here a clear(er) error about that. This should also prevent us // immediately from starting up. // if buf.MemFree > buf.MemTotal { return nil, sysfsError(meminfo, "System reports more free than total memory. "+ "This can be caused by a kernel bug. Please update your kernel.") } buf.MemUsed = buf.MemTotal - buf.MemFree return buf, nil } // GetMemoryType returns the memory type for this node. func (n *node) GetMemoryType() MemoryType { return n.memoryType } // HasNormalMemory returns true if the node has memory that belongs to a normal zone. func (n *node) HasNormalMemory() bool { return n.normalMem } // Discover physical packages (CPU sockets) present in the system. 
func (sys *system) discoverPackages() error { if sys.packages != nil { return nil } sys.packages = make(map[idset.ID]*cpuPackage) for _, cpu := range sys.cpus { pkg, found := sys.packages[cpu.pkg] if !found { pkg = &cpuPackage{ id: cpu.pkg, cpus: idset.NewIDSet(), nodes: idset.NewIDSet(), dies: idset.NewIDSet(), dieCPUs: make(map[idset.ID]idset.IDSet), dieNodes: make(map[idset.ID]idset.IDSet), } sys.packages[cpu.pkg] = pkg } pkg.cpus.Add(cpu.id) pkg.nodes.Add(cpu.node) pkg.dies.Add(cpu.die) if dieCPUs, ok := pkg.dieCPUs[cpu.die]; !ok { pkg.dieCPUs[cpu.die] = idset.NewIDSet(cpu.id) } else { dieCPUs.Add(cpu.id) } if dieNodes, ok := pkg.dieNodes[cpu.die]; !ok { pkg.dieNodes[cpu.die] = idset.NewIDSet(cpu.node) } else { dieNodes.Add(cpu.node) } } return nil } func (sys *system) discoverSst() error { if !sst.SstSupported() { sys.Info("Speed Select Technology (SST) support not detected") return nil } for _, pkg := range sys.packages { sstInfo, err := sst.GetPackageInfo(pkg.id) if err != nil { return fmt.Errorf("failed to get SST info for package %d: %v", pkg.id, err) } sys.DebugBlock("", "Speed Select Technology info detected for package %d:\n%s", pkg.id, utils.DumpJSON(sstInfo)) if sstInfo[pkg.id].CPEnabled { ids := pkg.cpus.SortedMembers() for _, id := range ids { clos, err := sst.GetCPUClosID(id) if err != nil { return fmt.Errorf("failed to get SST-CP clos id for cpu %d: %v", id, err) } sys.cpus[id].sstClos = clos } } pkg.sstInfo = sstInfo[pkg.id] } return nil } // ID returns the id of this package. func (p *cpuPackage) ID() idset.ID { return p.id } // CPUSet returns the CPUSet for all cores/threads in this package. func (p *cpuPackage) CPUSet() cpuset.CPUSet { return CPUSetFromIDSet(p.cpus) } // DieIDs returns the die ids for this package. func (p *cpuPackage) DieIDs() []idset.ID { return p.dies.SortedMembers() } // NodeIDs returns the NUMA node ids for this package. func (p *cpuPackage) NodeIDs() []idset.ID { return p.nodes.SortedMembers() } // DieNodeIDs returns the set of NUMA nodes in the given die of this package. func (p *cpuPackage) DieNodeIDs(id idset.ID) []idset.ID { if dieNodes, ok := p.dieNodes[id]; ok { return dieNodes.SortedMembers() } return []idset.ID{} } // DieCPUSet returns the set of CPUs in the given die of this package. func (p *cpuPackage) DieCPUSet(id idset.ID) cpuset.CPUSet { if dieCPUs, ok := p.dieCPUs[id]; ok { return CPUSetFromIDSet(dieCPUs) } return cpuset.New() } func (p *cpuPackage) SstInfo() *sst.SstPackageInfo { return p.sstInfo } // eppStrings initialized this way to better catch changes in the enum var eppStrings = func() [EPPUnknown]string { var e [EPPUnknown]string e[EPPPerformance] = "performance" e[EPPBalancePerformance] = "balance_performance" e[EPPBalancePower] = "balance_power" e[EPPPower] = "power" return e }() var eppValues = func() map[string]EPP { m := make(map[string]EPP, len(eppStrings)) for i, v := range eppStrings { m[v] = EPP(i) } return m }() // String returns EPP value as string func (e EPP) String() string { if int(e) < len(eppStrings) { return eppStrings[e] } return "" } // EPPFromString converts string to EPP value func EPPFromString(s string) EPP { if v, ok := eppValues[s]; ok { return v } return EPPUnknown } ================================================ FILE: pkg/sysfs/utils.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package sysfs import ( "fmt" "os" "path/filepath" "strconv" "strings" "github.com/intel/cri-resource-manager/pkg/utils/cpuset" idset "github.com/intel/goresctrl/pkg/utils" ) // Get the trailing enumeration part of a name. func getEnumeratedID(name string) idset.ID { id := 0 base := 1 for idx := len(name) - 1; idx > 0; idx-- { d := name[idx] if '0' <= d && d <= '9' { id += base * (int(d) - '0') base *= 10 } else { if base > 1 { return idset.ID(id) } return idset.ID(-1) } } return idset.ID(-1) } // Read content of a sysfs entry and convert it according to the type of a given pointer. func readSysfsEntry(base, entry string, ptr interface{}, args ...interface{}) (string, error) { var buf string path := filepath.Join(base, entry) blob, err := os.ReadFile(path) if err != nil { return "", sysfsError(path, "failed to read sysfs entry: %v", err) } buf = strings.Trim(string(blob), "\n") if ptr == interface{}(nil) { return buf, nil } switch ptr.(type) { case *string, *int, *uint, *int8, *uint8, *int16, *uint16, *int32, *uint32, *int64, *uint64: err := parseValue(buf, ptr) if err != nil { return "", sysfsError(path, "%v", err) } return buf, nil case *idset.IDSet, *[]int, *[]uint, *[]int8, *[]uint8, *[]int16, *[]uint16, *[]int32, *[]uint32, *[]int64, *[]uint64: sep, err := getSeparator(" ", args) if err != nil { return "", sysfsError(path, "%v", err) } err = parseValueList(buf, sep, ptr) if err != nil { return "", sysfsError(path, "%v", err) } return buf, nil case *EPP: *ptr.(*EPP) = EPPFromString(buf) return buf, nil } return "", sysfsError(path, "unsupported sysfs entry type %T", ptr) } // Write a value to a sysfs entry. An optional item separator can be specified for slice values. func writeSysfsEntry(base, entry string, val, oldp interface{}, args ...interface{}) (string, error) { var buf, old string var err error if oldp != nil { if old, err = readSysfsEntry(base, entry, oldp, args...); err != nil { return "", err } } path := filepath.Join(base, entry) switch val.(type) { case string, int, uint, int8, uint8, int16, uint16, int32, uint32, int64, uint64: buf, err = formatValue(val) if err != nil { return "", sysfsError(path, "%v", err) } case idset.IDSet, []int, []uint, []int8, []uint8, []int16, []uint16, []int32, []uint32, []int64, []uint64: sep, err := getSeparator(" ", args) if err != nil { return "", sysfsError(path, "%v", err) } buf, err = formatValueList(sep, val) if err != nil { return "", sysfsError(path, "%v", err) } default: return "", sysfsError(path, "unsupported sysfs entry type %T", val) } f, err := os.OpenFile(path, os.O_WRONLY, 0) if err != nil { return "", sysfsError(path, "cannot open: %v", err) } defer f.Close() if _, err = f.Write([]byte(buf + "\n")); err != nil { return "", sysfsError(path, "cannot write: %v", err) } return old, nil } // Determine list separator string, given an optional separator variadic argument. 
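// // For example, a caller such as readSysfsEntry(path, "topology/thread_siblings_list", &cpu.threads, ",") above ends up here with args = [","], overriding the default " " separator.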
func getSeparator(defaultVal string, args []interface{}) (string, error) { switch len(args) { case 0: return defaultVal, nil case 1: if sep, ok := args[0].(string); ok { return sep, nil } } return "", fmt.Errorf("invalid separator option (%v), a single string expected", args) } // Parse a value from a string. func parseValue(str string, value interface{}) error { switch value.(type) { case *string: *value.(*string) = str case *int, *int8, *int16, *int32, *int64: v, err := strconv.ParseInt(str, 0, 0) if err != nil { return fmt.Errorf("invalid entry '%s': %v", str, err) } switch value.(type) { case *int: *value.(*int) = int(v) case *int8: *value.(*int8) = int8(v) case *int16: *value.(*int16) = int16(v) case *int32: *value.(*int32) = int32(v) case *int64: *value.(*int64) = v } case *uint, *uint8, *uint16, *uint32, *uint64: v, err := strconv.ParseUint(str, 0, 0) if err != nil { return fmt.Errorf("invalid entry '%s': %v", str, err) } switch value.(type) { case *uint: *value.(*uint) = uint(v) case *uint8: *value.(*uint8) = uint8(v) case *uint16: *value.(*uint16) = uint16(v) case *uint32: *value.(*uint32) = uint32(v) case *uint64: *value.(*uint64) = v } } return nil } // Parse a list of values from a string into a slice. func parseValueList(str, sep string, valuep interface{}) error { var value interface{} switch valuep.(type) { case *idset.IDSet: value = idset.NewIDSet() case *[]int: value = []int{} case *[]uint: value = []uint{} case *[]int8: value = []int8{} case *[]uint8: value = []uint8{} case *[]int16: value = []int16{} case *[]uint16: value = []uint16{} case *[]int32: value = []int32{} case *[]uint32: value = []uint32{} case *[]int64: value = []int64{} case *[]uint64: value = []uint64{} default: return fmt.Errorf("invalid slice value type: %T", valuep) } for _, s := range strings.Split(str, sep) { if s == "" { break } switch value.(type) { case idset.IDSet: if rng := strings.Split(s, "-"); len(rng) == 1 { id, err := strconv.Atoi(s) if err != nil { return fmt.Errorf("invalid entry '%s': %v", s, err) } value.(idset.IDSet).Add(idset.ID(id)) } else { beg, err := strconv.Atoi(rng[0]) if err != nil { return fmt.Errorf("invalid entry '%s': %v", s, err) } end, err := strconv.Atoi(rng[1]) if err != nil { return fmt.Errorf("invalid entry '%s': %v", s, err) } for id := beg; id <= end; id++ { value.(idset.IDSet).Add(idset.ID(id)) } } case []int, []int8, []int16, []int32, []int64: v, err := strconv.ParseInt(s, 0, 0) if err != nil { return fmt.Errorf("invalid entry '%s': %v", s, err) } switch value.(type) { case []int: value = append(value.([]int), int(v)) case []int8: value = append(value.([]int8), int8(v)) case []int16: value = append(value.([]int16), int16(v)) case []int32: value = append(value.([]int32), int32(v)) case []int64: value = append(value.([]int64), v) } case []uint, []uint8, []uint16, []uint32, []uint64: v, err := strconv.ParseUint(s, 0, 0) if err != nil { return fmt.Errorf("invalid entry '%s': %v", s, err) } switch value.(type) { case []uint: value = append(value.([]uint), uint(v)) case []uint8: value = append(value.([]uint8), uint8(v)) case []uint16: value = append(value.([]uint16), uint16(v)) case []uint32: value = append(value.([]uint32), uint32(v)) case []uint64: value = append(value.([]uint64), v) } } } switch valuep.(type) { case *idset.IDSet: *valuep.(*idset.IDSet) = value.(idset.IDSet) case *[]int: *valuep.(*[]int) = value.([]int) case *[]uint: *valuep.(*[]uint) = value.([]uint) case *[]int8: *valuep.(*[]int8) = value.([]int8) case *[]uint8: *valuep.(*[]uint8) = value.([]uint8) case *[]int16:
*valuep.(*[]int16) = value.([]int16) case *[]uint16: *valuep.(*[]uint16) = value.([]uint16) case *[]int32: *valuep.(*[]int32) = value.([]int32) case *[]uint32: *valuep.(*[]uint32) = value.([]uint32) case *[]int64: *valuep.(*[]int64) = value.([]int64) case *[]uint64: *valuep.(*[]uint64) = value.([]uint64) } return nil } // Format a value into a string. func formatValue(value interface{}) (string, error) { switch value.(type) { case string: return value.(string), nil case int, uint, int8, uint8, int16, uint16, int32, uint32, int64, uint64: return fmt.Sprintf("%d", value), nil default: return "", fmt.Errorf("invalid value type %T", value) } } // Format a list of values from a slice into a string. func formatValueList(sep string, value interface{}) (string, error) { switch value.(type) { case idset.IDSet: return value.(idset.IDSet).StringWithSeparator(sep), nil case []int, []uint, []int8, []uint8, []int16, []uint16, []int32, []uint32, []int64, []uint64: // fmt renders numeric slices as "[1 2 3]"; strip the brackets and join the elements with the requested separator. str := strings.Trim(fmt.Sprintf("%d", value), "[]") return strings.ReplaceAll(str, " ", sep), nil } return "", fmt.Errorf("invalid value type %T", value) } // IDSetFromCPUSet returns an id set corresponding to a cpuset.CPUSet. func IDSetFromCPUSet(cset cpuset.CPUSet) idset.IDSet { return idset.NewIDSetFromIntSlice(cset.List()...) } // CPUSetFromIDSet returns a cpuset.CPUSet corresponding to an id set. func CPUSetFromIDSet(s idset.IDSet) cpuset.CPUSet { cpus := []int{} for id := range s { cpus = append(cpus, int(id)) } return cpuset.New(cpus...) } ================================================ FILE: pkg/testutils/verify.go ================================================ package testutils import ( "reflect" "strings" "testing" ) // VerifyDeepEqual checks that two values (including structures) are equal, or else it fails the test. func VerifyDeepEqual(t *testing.T, valueName string, expectedValue interface{}, seenValue interface{}) bool { if reflect.DeepEqual(expectedValue, seenValue) { return true } t.Errorf("expected %s value %+v, got %+v", valueName, expectedValue, seenValue) return false } // VerifyError checks that a (multi)error has expected properties, or else it fails the test.
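// // A small usage sketch (not from the repository): errors.Join from the standard library returns an error whose Unwrap() []error method exposes the individual errors, which is the shape this helper expects when expectedCount > 0: // // err := errors.Join(errors.New("no such pool"), errors.New("no such class")) // VerifyError(t, err, 2, []string{"pool", "class"})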
func VerifyError(t *testing.T, err error, expectedCount int, expectedSubstrings []string) bool { if expectedCount > 0 { if err == nil { t.Errorf("error expected, got nil") return false } if merr, ok := err.(interface{ Unwrap() []error }); !ok { t.Errorf("expected %d errors, but got %#v instead of multierror", expectedCount, err) return false } else if errs := merr.Unwrap(); len(errs) != expectedCount { t.Errorf("expected %d errors, but got %d: %v", expectedCount, len(errs), merr) return false } } else if expectedCount == 0 { if err != nil { t.Errorf("expected 0 errors, but got %v", err) return false } } for _, substring := range expectedSubstrings { if err == nil { t.Errorf("expected error with substring %#v, got nil", substring) return false } if !strings.Contains(err.Error(), substring) { t.Errorf("expected error with substring %#v, got \"%v\"", substring, err) } } return true } ================================================ FILE: pkg/topology/go.mod ================================================ module github.com/intel/cri-resource-manager/pkg/topology go 1.22.0 require ( github.com/pkg/errors v0.9.1 golang.org/x/sys v0.18.0 ) ================================================ FILE: pkg/topology/test-cleanup.sh ================================================ rm -fr testdata ================================================ FILE: pkg/topology/test-setup.sh ================================================ tar -xvzf test-data.tar.gz ================================================ FILE: pkg/topology/topology.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package topology import ( "fmt" "os" "path/filepath" "strings" "syscall" "github.com/pkg/errors" "golang.org/x/sys/unix" ) // to mock in tests var ( sysRoot = "" ) const ( // ProviderKubelet is a constant to distinguish that topology hint comes // from parameters passed to CRI create/update requests from Kubelet ProviderKubelet = "kubelet" ) // Hint represents various hints that can be detected from sysfs for the device type Hint struct { Provider string CPUs string NUMAs string Sockets string } // Hints represents set of hints collected from multiple providers type Hints map[string]Hint // SetSysRoot sets the sysfs root directory to use.
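// // Typically only used by tests to point discovery at a fake sysfs tree, as topology_test.go does: // // SetSysRoot("./testdata") // defer SetSysRoot("")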
func SetSysRoot(root string) { sysRoot = root } func getDevicesFromVirtual(realDevPath string) (devs []string, err error) { if !filepath.HasPrefix(realDevPath, "/sys/devices/virtual") { return nil, fmt.Errorf("%s is not a virtual device", realDevPath) } relPath, _ := filepath.Rel("/sys/devices/virtual", realDevPath) dir, file := filepath.Split(relPath) switch dir { case "vfio/": iommuGroup := filepath.Join(sysRoot, "/sys/kernel/iommu_groups", file, "devices") files, err := os.ReadDir(iommuGroup) if err != nil { return nil, errors.Wrapf(err, "failed to read IOMMU group %s", iommuGroup) } for _, file := range files { realDev, err := filepath.EvalSymlinks(filepath.Join(iommuGroup, file.Name())) if err != nil { return nil, errors.Wrapf(err, "failed to get real path for %s", file.Name()) } devs = append(devs, realDev) } return devs, nil default: return nil, nil } } func getTopologyHint(sysFSPath string) (*Hint, error) { hint := Hint{Provider: sysFSPath} fileMap := map[string]*string{ "local_cpulist": &hint.CPUs, "numa_node": &hint.NUMAs, } if err := readFilesInDirectory(fileMap, sysFSPath); err != nil { return nil, err } // Workarounds for broken information provided by the kernel if hint.NUMAs == "-1" { // non-NUMA aware device or system, ignore it hint.NUMAs = "" } if hint.NUMAs != "" && hint.CPUs == "" { // Broken topology hint: BIOS reports the socket id as the NUMA node. // First, try to get hints from the parent device or bus. parentHints, er := NewTopologyHints(filepath.Dir(sysFSPath)) if er == nil { cpulist := map[string]bool{} numalist := map[string]bool{} for _, h := range parentHints { if h.CPUs != "" { cpulist[h.CPUs] = true } if h.NUMAs != "" { numalist[h.NUMAs] = true } } if cpus := strings.Join(mapKeys(cpulist), ","); cpus != "" { hint.CPUs = cpus } if numas := strings.Join(mapKeys(numalist), ","); numas != "" { hint.NUMAs = numas } } // If we still don't have CPU hints after checking the parents, use the numa hint as sockets. if hint.CPUs == "" && hint.NUMAs != "" { hint.Sockets = hint.NUMAs hint.NUMAs = "" } } return &hint, nil } // NewTopologyHints returns an array of hints for the device and its slaves (e.g. RAID). func NewTopologyHints(devPath string) (hints Hints, err error) { hints = make(Hints) realDevPath, err := filepath.EvalSymlinks(devPath) if err != nil { return nil, errors.Wrapf(err, "failed to get realpath for %s", devPath) } for p := realDevPath; strings.HasPrefix(p, sysRoot+"/sys/devices/"); p = filepath.Dir(p) { hint, err := getTopologyHint(p) if err != nil { return nil, err } if hint.CPUs != "" || hint.NUMAs != "" || hint.Sockets != "" { hints[hint.Provider] = *hint break } } fromVirtual, _ := getDevicesFromVirtual(realDevPath) slaves, _ := filepath.Glob(filepath.Join(realDevPath, "slaves/*")) for _, device := range append(slaves, fromVirtual...) { deviceHints, er := NewTopologyHints(device) if er != nil { return nil, er } hints = MergeTopologyHints(hints, deviceHints) } return } // MergeTopologyHints combines org and hints. func MergeTopologyHints(org, hints Hints) (res Hints) { if org != nil { res = org } else { res = make(Hints) } for k, v := range hints { if _, ok := res[k]; ok { continue } res[k] = v } return } // String returns the hints as a string.
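// // For example, a hint with CPUs "0-7" and NUMAs "1" renders as "CPUs:0-7, NUMAs:1".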
func (h *Hint) String() string { cpus, nodes, sockets, sep := "", "", "", "" if h.CPUs != "" { cpus = "CPUs:" + h.CPUs sep = ", " } if h.NUMAs != "" { nodes = sep + "NUMAs:" + h.NUMAs sep = ", " } if h.Sockets != "" { sockets = sep + "sockets:" + h.Sockets } return cpus + nodes + sockets } // FindSysFsDevice returns the physical device that the given argument is linked to. // For device nodes it returns the path of the device itself. For regular files or directories // it returns the physical device where the inode resides (the storage device). // If the resulting device is a virtual one (e.g. tmpfs), an error is returned. // For a non-existing path, no error is returned and the result is empty. func FindSysFsDevice(dev string) (string, error) { fi, err := os.Stat(dev) if err != nil { if os.IsNotExist(err) { return "", nil } return "", errors.Wrapf(err, "unable to get stat for %s", dev) } devType := "block" rdev := fi.Sys().(*syscall.Stat_t).Dev if mode := fi.Mode(); mode&os.ModeDevice != 0 { rdev = fi.Sys().(*syscall.Stat_t).Rdev if mode&os.ModeCharDevice != 0 { devType = "char" } } major := unix.Major(rdev) minor := unix.Minor(rdev) if major == 0 { return "", errors.Errorf("%s is a virtual device node", dev) } devPath := fmt.Sprintf("/sys/dev/%s/%d:%d", devType, major, minor) realDevPath, err := filepath.EvalSymlinks(devPath) if err != nil { return "", errors.Wrapf(err, "failed to get realpath for %s", devPath) } return realDevPath, nil } // readFilesInDirectory is a small helper to fill a struct with content from sysfs entries func readFilesInDirectory(fileMap map[string]*string, dir string) error { for k, v := range fileMap { b, err := os.ReadFile(filepath.Join(dir, k)) if err != nil { if os.IsNotExist(err) { continue } return errors.Wrapf(err, "%s: unable to read file %q", dir, k) } *v = strings.TrimSpace(string(b)) } return nil } // mapKeys is a small helper that returns the slice of keys of a given map func mapKeys(m map[string]bool) []string { ret := make([]string, len(m)) i := 0 for k := range m { ret[i] = k i++ } return ret } ================================================ FILE: pkg/topology/topology_test.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License.
package topology import ( "os" "path/filepath" "reflect" "sort" "testing" ) func setupTestEnv(t *testing.T) func() { pwd, err := os.Getwd() if err != nil { t.Fatal("unable to get current directory") } if path, err := filepath.EvalSymlinks(pwd); err == nil { pwd = path } SetSysRoot(pwd + "/testdata") teardown := func() { SetSysRoot("") } return teardown } func TestMapKeys(t *testing.T) { cases := []struct { name string input map[string]bool output []string }{ { name: "empty", input: map[string]bool{}, output: []string{}, }, { name: "one", input: map[string]bool{"a": false}, output: []string{"a"}, }, { name: "multiple", input: map[string]bool{"a": false, "b": true, "c": false}, output: []string{"a", "b", "c"}, }, } for _, tc := range cases { test := tc t.Run(test.name, func(t *testing.T) { t.Parallel() output := mapKeys(test.input) sort.Strings(output) if !reflect.DeepEqual(output, test.output) { t.Fatalf("expected output: %+v got: %+v", test.output, output) } }) } } func TestFindSysFsDevice(t *testing.T) { if testing.Short() { t.Skip("skipping test in short mode.") } teardown := setupTestEnv(t) defer teardown() cases := []struct { name string input string output string expectedErr bool }{ { name: "empty", input: "", output: "", expectedErr: false, }, { name: "null", input: "/dev/null", output: "/sys/devices/virtual/mem/null", expectedErr: false, }, { name: "proc", input: "/proc/self", output: "", expectedErr: true, }, } for _, tc := range cases { test := tc t.Run(test.name, func(t *testing.T) { t.Parallel() output, err := FindSysFsDevice(test.input) switch { case err != nil && !test.expectedErr: t.Fatalf("unexpected error returned: %+v", err) case err == nil && test.expectedErr: t.Fatalf("unexpected success: %+v", output) case output != test.output: t.Fatalf("expected: %q got: %q", test.output, output) } }) } } func TestReadFilesInDirectory(t *testing.T) { var file, empty string fname := "test-a" content := []byte(" something\n") expectedContent := "something" fileMap := map[string]*string{ fname: &file, "non_existing": &empty, } dir, err := os.MkdirTemp("", "readFilesInDirectory") if err != nil { t.Fatalf("unable to create test directory: %+v", err) } defer os.RemoveAll(dir) os.WriteFile(filepath.Join(dir, fname), content, 0644) if err = readFilesInDirectory(fileMap, dir); err != nil { t.Fatalf("unexpected failure: %v", err) } if empty != "" { t.Fatalf("unexpected content: %q", empty) } if file != expectedContent { t.Fatalf("unexpected content: %q expected: %q", file, expectedContent) } } func TestGetDevicesFromVirtual(t *testing.T) { teardown := setupTestEnv(t) defer teardown() cases := []struct { name string input string output []string expectedErr bool }{ { name: "vfio", input: "/sys/devices/virtual/vfio/42", output: []string{sysRoot + "/sys/devices/pci0000:00/0000:00:02.0"}, expectedErr: false, }, { name: "misc", input: "/sys/devices/virtual/misc/vfio", output: nil, expectedErr: false, }, { name: "missing-iommu-group", input: "/sys/devices/virtual/vfio/84", output: nil, expectedErr: true, }, { name: "non-virtual", input: "/sys/devices/pci0000:00/0000:00:02.0", output: nil, expectedErr: true, }, } for _, tc := range cases { test := tc t.Run(test.name, func(t *testing.T) { output, err := getDevicesFromVirtual(test.input) switch { case err != nil && !test.expectedErr: t.Fatalf("unexpected error returned: %+v", err) case err == nil && test.expectedErr: t.Fatalf("unexpected success: %+v", output) case len(output) != len(test.output): t.Fatalf("expected: %q got: %q", len(test.output), 
len(output)) } for i, p := range test.output { if test.output[i] != p { t.Fatalf("expected: %q got: %q", test.output[i], p) } } }) } } func TestMergeTopologyHints(t *testing.T) { cases := []struct { name string inputA Hints inputB Hints expectedOutput Hints expectedErr bool }{ { name: "empty", inputA: nil, inputB: nil, expectedOutput: Hints{}, }, { name: "one,nil", inputA: Hints{"test": Hint{Provider: "test", CPUs: "0"}}, inputB: nil, expectedOutput: Hints{"test": Hint{Provider: "test", CPUs: "0"}}, }, { name: "nil, one", inputA: nil, inputB: Hints{"test": Hint{Provider: "test", CPUs: "0"}}, expectedOutput: Hints{"test": Hint{Provider: "test", CPUs: "0"}}, }, { name: "duplicate", inputA: Hints{"test": Hint{Provider: "test", CPUs: "0"}}, inputB: Hints{"test": Hint{Provider: "test", CPUs: "0"}}, expectedOutput: Hints{"test": Hint{Provider: "test", CPUs: "0"}}, }, { name: "two", inputA: Hints{"test1": Hint{Provider: "test1", CPUs: "0"}}, inputB: Hints{"test2": Hint{Provider: "test2", CPUs: "1"}}, expectedOutput: Hints{ "test1": Hint{Provider: "test1", CPUs: "0"}, "test2": Hint{Provider: "test2", CPUs: "1"}, }, }, } for _, tc := range cases { test := tc t.Run(test.name, func(t *testing.T) { t.Parallel() output := MergeTopologyHints(test.inputA, test.inputB) if !reflect.DeepEqual(output, test.expectedOutput) { t.Fatalf("expected output: %+v got: %+v", test.expectedOutput, output) } }) } } func TestNewTopologyHints(t *testing.T) { if testing.Short() { t.Skip("skipping test in short mode.") } teardown := setupTestEnv(t) defer teardown() cases := []struct { name string input string output Hints expectedErr bool }{ { name: "empty", input: "non-existing", output: nil, expectedErr: true, }, { name: "pci card1", input: sysRoot + "/sys/devices/pci0000:00/0000:00:02.0/drm/card1", output: Hints{ sysRoot + "/sys/devices/pci0000:00/0000:00:02.0": Hint{ Provider: sysRoot + "/sys/devices/pci0000:00/0000:00:02.0", CPUs: "0-7", NUMAs: "", Sockets: ""}, }, expectedErr: false, }, } for _, test := range cases { t.Run(test.name, func(t *testing.T) { output, err := NewTopologyHints(test.input) switch { case err != nil && !test.expectedErr: t.Fatalf("unexpected error returned: %+v", err) case err == nil && test.expectedErr: t.Fatalf("unexpected success: %+v", output) case !reflect.DeepEqual(output, test.output): t.Fatalf("expected: %q got: %q", test.output, output) } }) } } ================================================ FILE: pkg/utils/cpuset/cpuset.go ================================================ // Copyright The NRI Plugins Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cpuset import ( "fmt" "strconv" "strings" "k8s.io/utils/cpuset" ) // CPUSet is an alias for k8s.io/utils/cpuset.CPUSet. type CPUSet = cpuset.CPUSet var ( // New is an alias for cpuset.New. New = cpuset.New // Parse is an alias for cpuset.Parse. Parse = cpuset.Parse ) // MustParse panics if parsing the given cpuset string fails. 
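// // Meant for inputs that are known to be valid ahead of time, e.g. constants in tests or configuration defaults: // // reserved := MustParse("0-1")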
func MustParse(s string) cpuset.CPUSet { cset, err := cpuset.Parse(s) if err != nil { panic(fmt.Errorf("failed to parse CPUSet %s: %w", s, err)) } return cset } // ShortCPUSet prints the cpuset as a string, trying to further shorten compared to .String(). func ShortCPUSet(cset cpuset.CPUSet) string { str, sep := "", "" beg, end, step := -1, -1, -1 for _, cpu := range strings.Split(cset.String(), ",") { if strings.Contains(cpu, "-") { // Flush any pending run before passing a native range through as-is. if beg >= 0 { str += sep + mkRange(beg, end, step) sep = "," beg, end, step = -1, -1, -1 } str += sep + cpu sep = "," continue } i, err := strconv.ParseInt(cpu, 10, 0) if err != nil { return cset.String() } id := int(i) if beg < 0 { beg, end = id, id continue } if step < 0 { end = id step = end - beg continue } if id-end == step { end = id continue } str += sep + mkRange(beg, end, step) sep = "," beg, end = id, id step = -1 } if beg >= 0 { str += sep + mkRange(beg, end, step) } return str } func mkRange(beg, end, step int) string { if beg < 0 { return "" } if beg == end { return strconv.FormatInt(int64(beg), 10) } b, e := strconv.FormatInt(int64(beg), 10), strconv.FormatInt(int64(end), 10) if step == 1 { return b + "-" + e } if beg+step == end { return b + "," + e } s := strconv.FormatInt(int64(step), 10) return b + "-" + e + ":" + s } ================================================ FILE: pkg/utils/cpuset/cpuset_test.go ================================================ // Copyright 2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package cpuset import ( "testing" ) func TestShortCPUSet(t *testing.T) { tcases := []struct { source string native string short string }{ {source: "", native: "", short: ""}, {source: "1", native: "1", short: "1"}, {source: "1,2", native: "1-2", short: "1-2"}, {source: "1,2,3,4,5,6,7", native: "1-7", short: "1-7"}, {source: "1,3,5,7,9,11", native: "1,3,5,7,9,11", short: "1-11:2"}, {source: "1,3,5,7,8,10,12,14,16", native: "1,3,5,7-8,10,12,14,16", short: "1-5:2,7-8,10-16:2"}, { source: "0,2,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98,100,102,104,106,108,110", native: "0,2,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,64,66,68,70,72,74,76,78,80,82,84,86,88,90,92,94,96,98,100,102,104,106,108,110", short: "0,2,8-58:2,64-110:2", }, } for _, tc := range tcases { cset := MustParse(tc.source) native := cset.String() if native != tc.native { t.Errorf("incorrect native CPUSet for %q, expected %q, got %q", tc.source, tc.native, native) } short := ShortCPUSet(cset) if short != tc.short { t.Errorf("incorrect shortened CPUSet for %q, expected %q, got %q", tc.source, tc.short, short) } } } ================================================ FILE: pkg/utils/json.go ================================================ /* Copyright 2019 Intel Corporation Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package utils import ( "fmt" "sigs.k8s.io/yaml" ) // DumpJSON dumps a json-compatible struct in human-readable form func DumpJSON(r interface{}) string { out, err := yaml.Marshal(r) if err != nil { return fmt.Sprintf("!!!!!\nUnable to stringify %T: %v\n!!!!!", r, err) } return string(out) } ================================================ FILE: pkg/utils/net.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package utils import ( "errors" "fmt" "net" "os" "syscall" "time" "google.golang.org/grpc" ) // WaitForServer waits for a gRPC server to start accepting connections on a socket. func WaitForServer(socket string, timeout time.Duration, opts ...interface{}) error { var errChecker []func(error) bool var dialOpts []grpc.DialOption var connp **grpc.ClientConn for _, o := range opts { switch o.(type) { case func(error) bool: errChecker = append(errChecker, o.(func(error) bool)) case grpc.DialOption: dialOpts = append(dialOpts, o.(grpc.DialOption)) case []grpc.DialOption: dialOpts = append(dialOpts, o.([]grpc.DialOption)...) case **grpc.ClientConn: if connp != nil { return fmt.Errorf("WaitForServer: multiple net.Conn pointer options given") } connp = o.(**grpc.ClientConn) default: return fmt.Errorf("WaitForServer: invalid option of type %T", o) } } if len(errChecker) < 1 { errChecker = []func(error) bool{isFatalDialError} } if len(dialOpts) == 0 { dialOpts = []grpc.DialOption{ grpc.WithInsecure(), grpc.WithBlock(), grpc.FailOnNonTempDialError(true), grpc.WithTimeout(timeout), grpc.WithDialer(func(socket string, timeout time.Duration) (net.Conn, error) { conn, err := net.Dial("unix", socket) return conn, err }), } } start := time.Now() for { conn, err := grpc.Dial(socket, dialOpts...) if err == nil { if connp != nil { *connp = conn } else { conn.Close() } return nil } for _, f := range errChecker { if f(err) { return err } } switch { case timeout >= 0 && start.Add(timeout).Before(time.Now()): return err case timeout < 0 || timeout > time.Second: time.Sleep(time.Second) default: time.Sleep(timeout / 2) } } } // IsListeningSocket returns true if connections are accepted on the socket. func IsListeningSocket(socket string) (bool, error) { conn, err := net.Dial("unix", socket) if err == nil { conn.Close() return true, nil } if errors.Is(err, syscall.ECONNREFUSED) || os.IsNotExist(err) { return false, nil } return false, err } // Check if a socket connection error looks fatal. // // Notes: // Hmm... I wonder if it is really so difficult or I am just doing // it wrong ? 
We would like to find out if a connection attempt to // a unix-domain socket fails with a fatal error, in which case we // don't want to stick around retrying it later. // // We treat errors which the originating layer considers a timeout // or a temporary error as non-fatal. Otherwise, we single out // a few special errors: // - EPERM: fatal error // - EACCES: fatal error // - ENOENT: non-fatal, server might still come around // - ECONNREFUSED: fatal, the socket exists but nothing is accepting connections on it // type temporary interface { Temporary() bool } type timeout interface { Timeout() bool } type origin interface { Origin() error } func isFatalDialError(err error) bool { for { if e, ok := err.(temporary); ok { if e.Temporary() { return false } } if e, ok := err.(timeout); ok { if e.Timeout() { return false } } switch err.(type) { case *net.OpError: err = err.(*net.OpError).Err continue case *os.SyscallError: ne := err.(*os.SyscallError) switch { case os.IsPermission(ne): return true case os.IsNotExist(ne): return false case ne.Err == syscall.ECONNREFUSED: return true default: // Unwrap the underlying error to avoid looping on the same *os.SyscallError. err = ne.Err continue } default: if oe, ok := err.(origin); ok { err = oe.Origin() continue } } return true } } ================================================ FILE: pkg/utils/parse.go ================================================ // Copyright 2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package utils import ( "fmt" "strings" ) // ParseEnabled returns whether the given string represents an 'enabled' state. func ParseEnabled(value string) (bool, error) { switch strings.ToLower(value) { case "true", "on", "enable", "enabled", "1": return true, nil case "false", "off", "disable", "disabled", "0": return false, nil default: return false, fmt.Errorf("ParseEnabled: invalid string %q", value) } } ================================================ FILE: pkg/utils/sort.go ================================================ // Copyright 2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package utils import ( "sort" ) // SortUint64s sorts a slice of uint64 in increasing order. func SortUint64s(a []uint64) { sort.Sort(Uint64Slice(a)) } // Uint64Slice implements sort.Interface for a slice of uint64.
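// // Sorting in place via the SortUint64s helper above: // // a := []uint64{3, 1, 2} // SortUint64s(a) // a is now [1 2 3]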
type Uint64Slice []uint64 // Len returns the length of a Uint64Slice func (s Uint64Slice) Len() int { return len(s) } // Less returns true if element at 'i' is less than the element at 'j' func (s Uint64Slice) Less(i, j int) bool { return s[i] < s[j] } // Swap swaps the values of two elements func (s Uint64Slice) Swap(i, j int) { s[i], s[j] = s[j], s[i] } ================================================ FILE: pkg/utils/tar.go ================================================ // Copyright 2019-2021 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package utils import ( "archive/tar" "compress/bzip2" "fmt" "io" "os" "path" "strings" ) // UncompressTbz2 extracts a bzip2-compressed tar archive into the given directory. func UncompressTbz2(archive string, dir string) error { file, err := os.Open(archive) if err != nil { return err } defer file.Close() data := bzip2.NewReader(file) tr := tar.NewReader(data) for { header, err := tr.Next() if err != nil { if err == io.EOF { return nil } return err } // Reject entries whose names would escape the target directory ("zip slip"). target := path.Join(dir, header.Name) if target != path.Clean(dir) && !strings.HasPrefix(target, path.Clean(dir)+"/") { return fmt.Errorf("archive entry %q escapes target directory %q", header.Name, dir) } if header.Typeflag == tar.TypeDir { // Create a directory. err = os.MkdirAll(target, 0755) if err != nil { return err } } else if header.Typeflag == tar.TypeReg { // Create a regular file. targetFile, err := os.Create(target) if err != nil { return err } _, err = io.Copy(targetFile, tr) targetFile.Close() if err != nil { return err } } else if header.Typeflag == tar.TypeSymlink { // Create a symlink and all the directories it needs. err = os.MkdirAll(path.Dir(target), 0755) if err != nil { return err } err := os.Symlink(header.Linkname, target) if err != nil { return err } } } } ================================================ FILE: pkg/version/version.go ================================================ // Copyright 2019 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // // This module lets one tag built binaries with version metadata. // // Currently two pieces of metadata are tracked/provided: // - Version: version number, by convention one provided by 'git describe' // - Build: build id, by convention the git SHA1 the binary has been built from.
// // To enable automatic versioning metadata for your binary, you need to // // 1) import this package // 2) add the linker flags to override the dummy package variables, for instance: // LDFLAGS=-ldflags \ // "-X=github.com/intel/cri-resource-manager/pkg/version.Version= \ // -X=github.com/intel/cri-resource-manager/pkg/version.Build=" // // Note that further metadata can be trivially added in a similar fashion: // // 1) add the corresponding variables to this module // 2) arrange the default values to be correctly overridden during linking // 3) add printing of the new metadata to PrintVersionInfo() // package version import ( "flag" "fmt" "os" "path/filepath" "strconv" ) // Default values of variables we'll override with the linker. var ( // Version is our version as given by 'git describe'. Version = "" // Build is the SHA1 of the repository we've been built from. Build = "" ) // PrintVersionInfo prints version information about this binary. func PrintVersionInfo() { fmt.Printf("%s version information:\n", filepath.Base(os.Args[0])) fmt.Printf(" - version: %s\n", Version) fmt.Printf(" - build: %s\n", Build) } // Dummy struct used to hook into flag.Value.Set of -version during command-line parsing. type version struct{} // IsBoolFlag tells flag that we only have optional arguments. func (version) IsBoolFlag() bool { return true } // Set is our dummy flag.Value setter. func (version) Set(value string) error { printVersion, err := strconv.ParseBool(value) if err != nil { return err } if printVersion { PrintVersionInfo() os.Exit(0) } return nil } // String is our dummy flag.Value stringification function. func (*version) String() string { return "false" } // Put in place a '--version' command line option for us. func init() { flag.Var(&version{}, "version", "Print version information about "+filepath.Base(os.Args[0])) } ================================================ FILE: runtime-deps.csv ================================================ Go,https://github.com/golang/go,9051 fsnotify,https://github.com/fsnotify/fsnotify,8402 yaml,https://github.com/ghodss/yaml,9746 grpc-go,https://github.com/grpc/grpc-go,7283 kubernetes,https://github.com/kubernetes/kubernetes,9641 ================================================ FILE: sample-configs/balloons-policy.cfg ================================================ policy: Active: balloons # Use only 15 CPUs in total, leave cpu0 for processes other than # Kubernetes. AvailableResources: CPU: cpuset:1-15 # Reserve one of our CPUs (cpu15) for kube-system tasks. ReservedResources: CPU: cpuset:15 balloons: # PinCPU: allow containers to use only the CPUs in their balloons. PinCPU: true # PinMemory: allow containers to use only the closest memory to # the CPUs in their balloons. PinMemory: true # IdleCPUClass: how to configure CPUs that are not included in any # of the balloons. IdleCPUClass: idle BalloonTypes: - Name: "full-core-turbo" # MinCPUs: minimum number of logical cores in every balloon # instance of this type. # The default is 0. MinCPUs: 2 # MaxCPUs: maximum number of logical cores in every balloon # instance of this type. # The default is 0 (unlimited). MaxCPUs: 2 # CPUClass: how to configure CPUs of these balloons. # The default is "". CPUClass: "turbo" # Namespaces: assign pods in listed namespaces to these # balloons, even if there is no explicit annotation: # balloon.balloons.cri-resource-manager.intel.com: full-core-turbo # The default is to assign only annotated pods.
Namespaces: - "highperf" # AllocatorPriority: CPU allocator priority (0: High, 1: # Normal, 2: Low, 3: None). Affects the performance/type of # CPUs that are selected into the balloon. CPUs for static # balloon instances (MinBalloons > 0) with highest # AllocatorPriority are reserved first. # The default is 0. AllocatorPriority: 2 # MinBalloons: how many balloon instances of this type are always # kept in the system, even if there are no workloads for them. # The default is 0. MinBalloons: 2 # PreferNewBalloons: prefer creating a new balloon for # separate pods, even if their CPU requirements would allow # putting them in the same balloon. # The default is false. PreferNewBalloons: true # PreferPerNamespaceBalloon: if true, containers in the same # namespace are preferably placed in the same balloon, and # containers in different namespaces in different # balloons. The default is false: namespaces have no effect on # placement. PreferPerNamespaceBalloon: false # PreferSpreadingPods: if true, containers of a single pod can # be assigned to different balloons, based on which balloons # have most free CPU resources. # The default is false: prefer running containers of the same # pod in the same balloon(s). PreferSpreadingPods: false - Name: "socket-size" MaxCPUs: 8 AllocatorPriority: 2 Namespaces: - "default" CPUClass: "normal" # CPU controller configuration specifies CPU class properties. CPUs of # each balloon are configured based on its CPUClass. If a balloon has # no CPUClass, the properties of the default class are applied. cpu: classes: default: minFreq: 800 maxFreq: 1600 turbo: minFreq: 3300 maxFreq: 3600 normal: minFreq: 800 maxFreq: 2400 instrumentation: # The balloons policy exports containers running in each balloon, # and cpusets of balloons. Accessible in command line: # curl --silent http://localhost:8891/metrics HTTPEndpoint: :8891 PrometheusExport: true logger: Debug: policy ================================================ FILE: sample-configs/blockio.cfg ================================================ # This configuration demonstrates how to configure cgroups block io # controller for pods. # # The configuration defines block device parameters for three blockio # classes (LowPrioThrottled, HighPrioFullSpeed and Default, feel free # to choose any names here). Finally resource-manager.blockio maps QoS # classes BestEffort, Burstable (via wildcard), and Guaranteed to # these classes. # # Try with: cri-resmgr -force-config blockio.cfg policy: Active: none logger: Debug: blockio,cgroupblkio blockio: Classes: # LowPrioThrottled and HighPrioFullSpeed are user-defined blockio classes # in this example. Pods and containers can be assigned to these classes using Pod # metadata annotations. For example in Pod yaml: # ... # metadata: # annotations: # # Default blockio class for containers in the pod: # blockioclass.cri-resource-manager.intel.com/pod: LowPrioThrottled # # Special blockio class for a container in the pod: # blockioclass.cri-resource-manager.intel.com/container.mycontainer: HighPrioFullSpeed LowPrioThrottled: # Default io-scheduler weight for all devices that are not # explicitly mentioned in the following items. - Weight: 80 # will be written to cgroups(.bfq).weight # Configuration for all virtio and scsi block devices.
- Devices: - /dev/vd* - /dev/sd* ThrottleReadBps: 50M # max read bytes per second ThrottleWriteBps: 10M # max write bytes per second ThrottleReadIOPS: 10k # max read io operations per second ThrottleWriteIOPS: 5k # max write io operations per second Weight: 50 # io-scheduler (cfq/bfq) weight for these devices, # will be written to cgroups(.bfq).weight_device # Configuration for SSD devices. # This overrides above configuration for those /dev/sd* devices # whose disk id contains "SSD" - Devices: - /dev/disk/by-id/*SSD* ThrottleReadBps: 100M ThrottleWriteBps: 40M # Not mentioning Throttle*IOPS means no io operations throttling for matching devices. Weight: 50 HighPrioFullSpeed: - Weight: 400 # When Pod annotations do not define blockio class, QoS class # names (BestEffort, Burstable, Guaranteed) are used as blockio # class names for the pod. By default no blockio configuration # takes place for them, but here we define I/O scheduler weight # difference: BestEffort: - Weight: 90 Guaranteed: - Weight: 200 ================================================ FILE: sample-configs/cri-full-message-dump.cfg ================================================ # run with no-op policy policy: Active: none # enable full dumps of all messages dump: Config: full:.* ================================================ FILE: sample-configs/cri-resmgr-configmap.example.yaml ================================================ # # This example creates 3 ConfigMaps: # - cri-resmgr-config.default: the default configuration # - cri-resmgr-config.group.foo: the configuration for nodes in group foo # - cri-resmgr-config.node.cl0-slave1: the configuration for node cl0-slave1 # # You can assign nodes to group foo using the command # kubectl label --overwrite node $NODE_NAME cri-resource-manager.intel.com/group=foo # # You can remove nodes from group foo using the command # kubectl label node $NODE_NAME cri-resource-manager.intel.com/group- # apiVersion: v1 kind: ConfigMap metadata: name: cri-resmgr-config.default namespace: kube-system data: policy: |+ Active: topology-aware AvailableResources: cpu: cpuset:0-63 ReservedResources: cpu: cpuset:0-1 topology-aware: PinCPU: true PinMemory: true PreferIsolatedCPUs: true PreferSharedCPUs: false static: RelaxedIsolation: true static-pools: # Filesystem path to legacy configuration directory structure ConfDirPath: "/etc/cmk" # Filesystem path to legacy configuration file ConfFilePath: "" # Whether to create CMK node label LabelNode: false # Whether to create CMK node taint TaintNode: false # Pool configuration. # The imaginary example system below consists of 4 sockets, 4 cores, 2 # threads each. 
pools: exclusive: # 6 exclusive cores, 3 on sockets 1, 2 and 3 each cpuLists: - Cpuset: 8,9 Socket: 1 - Cpuset: 10,11 Socket: 1 - Cpuset: 16,17 Socket: 2 - Cpuset: 18,19 Socket: 2 - Cpuset: 24,25 Socket: 3 - Cpuset: 26,27 Socket: 3 exclusive: true shared: # 2 cores in shared pool, all on socket 1 cpuLists: - Cpuset: 12,13,14,15 Socket: 1 exclusive: false infra: # Rest of cores designated to infra pool cpuLists: - Cpuset: 0,1,2,3,4,5,6,7 Socket: 0 - Cpuset: 20,21,22,23 Socket: 2 - Cpuset: 28,29,30,31 Socket: 3 exclusive: false rdt: |+ # Common options options: # One of Full, Discovery or Disabled mode: Full # Set to true to disable creation of monitoring groups monitoringDisabled: false l3: # Make this false if L3 CAT must be available optional: true mb: # Make this false if MBA must be available optional: true # Configuration of classes partitions: exclusive: # Allocate 60% of all L3 cache to the "exclusive" partition l3Allocation: "60%" mbAllocation: ["100%"] classes: Guaranteed: # Allocate all of the partitions cache lines to "Guaranteed" l3Allocation: "100%" shared: # Allocate 40% L3 cache IDs to the "shared" partition # These will NOT overlap with the cache lines allocated for "exclusive" partition l3Allocation: "40%" mbAllocation: ["50%"] classes: Burstable: # Allow "Burstable" to use all cache lines of the "shared" partition l3Allocation: "100%" BestEffort: # Allow "Besteffort" to use only half of the L3 cache # lines of the "shared" partition. # These will overlap with those used by "Burstable" l3Allocation: "50%" --- apiVersion: v1 kind: ConfigMap metadata: name: cri-resmgr-config.group.foo namespace: kube-system data: policy: |+ Active: topology-aware AvailableResources: cpu: cpuset:0-63 ReservedResources: cpu: cpuset:0-1 topology-aware: PinCPU: true PinMemory: false PreferIsolatedCPUs: false PreferSharedCPUs: false static: RelaxedIsolation: true static-pools: # This is an example configuration for static-pools policy. # The imaginary example system here consists of 4 sockets, 4 cores, 2 threads each. pools: exclusive: # 6 exclusive cores, 3 on sockets 1, 2 and 3 each cpuLists: - Cpuset: 8,9 Socket: 1 - Cpuset: 10,11 Socket: 1 - Cpuset: 16,17 Socket: 2 - Cpuset: 18,19 Socket: 2 - Cpuset: 24,25 Socket: 3 - Cpuset: 26,27 Socket: 3 exclusive: true shared: # 2 cores in shared pool, all on socket 1 cpuLists: - Cpuset: 12,13,14,15 Socket: 1 exclusive: false infra: # Rest of cores designated to infra pool cpuLists: - Cpuset: 0,1,2,3,4,5,6,7 Socket: 0 - Cpuset: 20,21,22,23 Socket: 2 - Cpuset: 28,29,30,31 Socket: 3 exclusive: false rdt: |+ # Common options options: # One of Full, Discovery or Disabled mode: Full # Set to true to disable creation of monitoring groups monitoringDisabled: false l3: # Make this false if L3 CAT must be available optional: true mb: # Make this false if MBA must be available optional: true # Configuration of classes partitions: exclusive: # Allocate 60% of all L3 cache to the "exclusive" partition l3Allocation: "60%" mbAllocation: ["100%"] classes: Guaranteed: # Allocate all of the partitions cache lines to "Guaranteed" l3Allocation: "100%" shared: # Allocate 40% L3 cache IDs to the "shared" partition # These will NOT overlap with the cache lines allocated for "exclusive" partition l3Allocation: "40%" mbAllocation: ["50%"] classes: Burstable: # Allow "Burstable" to use all cache lines of the "shared" partition l3Allocation: "100%" BestEffort: # Allow "Besteffort" to use only half of the L3 cache # lines of the "shared" partition. 
            # These will overlap with those used by "Burstable"
            l3Allocation: "50%"
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: cri-resmgr-config.node.cl0-slave1
  namespace: kube-system
data:
  policy: |+
    Active: topology-aware
    AvailableResources:
      cpu: cpuset:0-63
    ReservedResources:
      cpu: cpuset:0-1
    topology-aware:
      PinCPU: false
      PinMemory: true
      PreferIsolatedCPUs: false
      PreferSharedCPUs: false
    static:
      RelaxedIsolation: true
    static-pools:
      # This is an example configuration for static-pools policy.
      # The imaginary example system here consists of 4 sockets, 4 cores, 2 threads each.
      pools:
        exclusive:
          # 6 exclusive cores, 3 on sockets 1, 2 and 3 each
          cpuLists:
            - Cpuset: 8,9
              Socket: 1
            - Cpuset: 10,11
              Socket: 1
            - Cpuset: 16,17
              Socket: 2
            - Cpuset: 18,19
              Socket: 2
            - Cpuset: 24,25
              Socket: 3
            - Cpuset: 26,27
              Socket: 3
          exclusive: true
        shared:
          # 2 cores in shared pool, all on socket 1
          cpuLists:
            - Cpuset: 12,13,14,15
              Socket: 1
          exclusive: false
        infra:
          # Rest of cores designated to infra pool
          cpuLists:
            - Cpuset: 0,1,2,3,4,5,6,7
              Socket: 0
            - Cpuset: 20,21,22,23
              Socket: 2
            - Cpuset: 28,29,30,31
              Socket: 3
          exclusive: false
  rdt: |+
    # Common options
    options:
      # One of Full, Discovery or Disabled
      mode: Full
      # Set to true to disable creation of monitoring groups
      monitoringDisabled: false
      l3:
        # Make this false if L3 CAT must be available
        optional: true
      mb:
        # Make this false if MBA must be available
        optional: true
    # Configuration of classes
    partitions:
      exclusive:
        # Allocate 60% of all L3 cache to the "exclusive" partition
        l3Allocation: "60%"
        mbAllocation: ["100%"]
        classes:
          Guaranteed:
            # Allocate all of the partition's cache lines to "Guaranteed"
            l3Allocation: "100%"
      shared:
        # Allocate 40% L3 cache IDs to the "shared" partition
        # These will NOT overlap with the cache lines allocated for "exclusive" partition
        l3Allocation: "40%"
        mbAllocation: ["50%"]
        classes:
          Burstable:
            # Allow "Burstable" to use all cache lines of the "shared" partition
            l3Allocation: "100%"
          BestEffort:
            # Allow "BestEffort" to use only half of the L3 cache
            # lines of the "shared" partition.
            # These will overlap with those used by "Burstable"
            l3Allocation: "50%"
  dump: |+
    Config: full:.*,short:.*Stop.*,off:.*List.*
    File: /tmp/cri-selective-debug.dump
  logger: |+
    Debug: resource-manager,cache


================================================
FILE: sample-configs/external-adjustment.yaml
================================================
apiVersion: criresmgr.intel.com/v1alpha1
kind: Adjustment
metadata:
  name: external-adjustment
  namespace: kube-system
spec:
  scope:
    - nodes: [ node-1 ]
      containers:
        - key: ":,:pod/name,name"
          operator: Matches
          values: [ "*:container" ]
    - nodes: [ node-2 ]
      containers:
        - key: ":,:pod/name,name"
          operator: Matches
          values: [ "pod:*" ]
    - nodes: [ node-3, node-4 ]
      containers:
        - key: ":,:pod/name,name"
          operator: Equals
          values: [ "anotherpod:container" ]
  resources:
    requests:
      cpu: 750m
      memory: 500Mi
    limits:
      cpu: 1500m
      memory: 750Mi
    toptierLimit: 500Mi
  classes:
    rdt: rdt-class-1
    blockio: blockio-class-1


================================================
FILE: sample-configs/podpools-policy.cfg
================================================
# This example demonstrates pod-based CPU and memory pinning.
# All containers of a pod run in the same CPU/memory pool.
# The capacity of a pool is defined as a number of pods it can
# contain.
#
# The two steps for running a pod in a pod pool are:
#
# 1. Annotate the pod:
#
#    metadata:
#      annotations:
#        pool.podpools.cri-resource-manager.intel.com: POOLNAME
#
# 2. Make sure that total CPU resources required by the containers
#    in the pod match the CPUs per pod in the pod pool.
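#
# As a sketch (a hypothetical pod, not part of this configuration): a pod
# annotated to run in the "singlecpu" pool defined below (1 CPU per
# instance, at most 2 pods per instance) should request CPU/MaxPods = 500m
# in total:
#
#    metadata:
#      annotations:
#        pool.podpools.cri-resource-manager.intel.com: singlecpu
#    spec:
#      containers:
#      - name: c0
#        resources:
#          requests:
#            cpu: 500m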
policy:
  # pod-based CPU and memory pinning is implemented in the podpools policy.
  Active: podpools
  # AvailableResources specifies CPUs that active policy is allowed to
  # use: containers will not run outside AvailableResources
  # CPUs. Other CPUs are considered reserved for system. Corresponding
  # kubelet parameter: --system-reserved. By default
  # AvailableResources contains all CPUs.
  AvailableResources:
    # "CPU" can be the number of CPUs or explicitly defined set of
    # CPUs. In this example we use 14 CPUs, excluding CPUs #0 and #1
    # (hyperthreads of core 0).
    CPU: cpuset:2-15
  # ReservedResources specifies CPU(s) that active policy dedicates
  # for running kube-system pods. Corresponding kubelet parameter:
  # --kube-reserved.
  ReservedResources:
    # Here we dedicate CPU #15 for these pods.
    # This leaves 13 out of 14 available CPUs unallocated.
    CPU: cpuset:15
  # podpools-specific configuration specifies the following.
  # 1. Pod pool definitions ("Pools").
  #    The policy creates one or more pool instances from a definition.
  # 2. Resources (CPUs) needed by each pod pool definition in total.
  #    This can be given as one of the following:
  #    1. a number of pool instances: "Instances: <N>"
  #    2. a number of CPUs: "Instances: <N> CPUs"
  #    3. percentage of non-reserved CPUs: "Instances: <N> %"
  #    In case 1, CPUs needed by the definition is <N> * CPUs per pool.
  # 3. How many CPUs each pool instance gets from the CPUs allocated
  #    to its definition in total.
  # 4. Capacity of each pool instance.
  #    This is the maximum number of pods in a single pool instance.
  podpools:
    # By default podpools pins both CPU and memory of all containers.
    # Pinning either of them can be disabled with:
    # pinCPU: false
    # pinMemory: false
    Pools:
      # Define the "singlecpu" pod pool type:
      - Name: singlecpu
        # Take 3 out of 13 AvailableResources CPUs to be used by
        # all "singlecpu" pod pool instances in total.
        # This leaves 10 CPUs unallocated for other pools.
        Instances: 3 CPUs
        # Every "singlecpu" pod pool instance has 1 CPU to run all
        # pods assigned to the instance.
        # As the definition can use 3 CPUs in total, there will be 3
        # "singlecpu" pool instances.
        CPU: 1
        # Every "singlecpu" pod pool instance holds at most 2 pods.
        MaxPods: 2
        # Note that every pod that is annotated to run on a singlecpu
        # pool is assumed to consume CPU/MaxPods = 500m CPU. Therefore
        # the sum of request.cpu's of all containers in this kind of
        # pod should be 500m. Otherwise kube-scheduler may overload or
        # underload the node.
      # Define the "dualcpu" pod pool type:
      - Name: dualcpu
        # FillOrder specifies the order in which the capacity of pod
        # pool instances of this pool type is filled with pods. The
        # default is Balanced: new pod is assigned to a pool instance
        # with most free capacity. The opposite is Packed: new pod is
        # assigned to a pool instance with least free capacity.
        FillOrder: Packed
        # Take at most 50 % of non-reserved CPUs (50 % * 13 = 6.5)
        # to be used by all "dualcpu" pool instances in total.
        Instances: 50 %
        # Every "dualcpu" pool instance has 2 CPUs.
        # That is, floor(6.5 / 2) = 3 pool instances of this type will
        # be created, and therefore 6 CPUs actually consumed by this
        # pool type.
        # This leaves 4 CPUs unallocated.
        CPU: 2
        # Every "dualcpu" pool instance holds at most 3 pods.
        MaxPods: 3
    # In addition to user-defined pools, there are two built-in
    # pools:
    #
    # - "reserved" contains the ReservedResources CPUs and runs all
    #   kube-system pods.
    #
    # - "default" contains CPUs that are neither reserved nor
    #   allocated to any user-defined pools. It runs all pods that
    #   are not kube-system and are not assigned to any user-defined
    #   pool. The number of CPUs in the default pool can be
    #   overridden by defining "default" pool like other pools. If
    #   CPUs were not left over for the default pool, it will use
    #   the same CPUs as the reserved pool.

logger:
  Debug: policy


================================================
FILE: sample-configs/static-policy.cfg
================================================
policy:
  Active: static
  ReservedResources:
    CPU: 1000m
logger:
  Debug: policy,static
dump:
  Config: off:.*,full:((Create)|(Remove)|(Run)|(Update)|(Start)|(Stop)).*


================================================
FILE: sample-configs/static-pools-policy.conf.example
================================================
# This is an example configuration file for the builtin cmk policy
# The imaginary example system here consists of 4 sockets, 4 cores (8
# multithreaded CPUs)
#
# NOTE: only pools configuration may be specified in this file. Other
# configuration options must be set through the dynamic configuration system
pools:
  exclusive:
    # 6 exclusive cores, 3 on sockets 1, 2 and 3 each
    cpuLists:
      - Cpuset: 8,9
        Socket: 1
      - Cpuset: 10,11
        Socket: 1
      - Cpuset: 16,17
        Socket: 2
      - Cpuset: 18,19
        Socket: 2
      - Cpuset: 24,25
        Socket: 3
      - Cpuset: 26,27
        Socket: 3
    exclusive: true
  shared:
    # 2 cores in shared pool, all on socket 1
    cpuLists:
      - Cpuset: 12,13,14,15
        Socket: 1
    exclusive: false
  infra:
    # Rest of cores designated to infra pool
    cpuLists:
      - Cpuset: 0,1,2,3,4,5,6,7
        Socket: 0
      - Cpuset: 20,21,22,23
        Socket: 2
      - Cpuset: 28,29,30,31
        Socket: 3
    exclusive: false


================================================
FILE: sample-configs/topology-aware-policy.cfg
================================================
policy:
  Active: topology-aware
  ReservedResources:
    CPU: 750m
logger:
  Debug: cri-resmgr,resource-manager,cache
dump:
  Config: off:.*,full:((Create)|(Start)|(Run)|(Update)|(Stop)|(Remove)).*


================================================
FILE: scripts/build/docker-build-image
================================================
#!/bin/bash

IMAGE=$1
DOCKERFILE=dockerfiles/cross-build/Dockerfile.${IMAGE%-build}
shift 1

echo "* Building docker images with"
echo "  - Dockerfile: $DOCKERFILE"
echo "  - image name: $IMAGE"
echo "  - options   : $@"

docker build . \
    -f "$DOCKERFILE" -t "$IMAGE" \
    --build-arg "CREATE_USER=$USER" \
    --build-arg USER_UID="$(id -u)" \
    "$@" || exit 1


================================================
FILE: scripts/build/get-buildid
================================================
#!/bin/bash
#
# Script to determine a version string, a buildid as well as related RPM
# and debian package versions. These are determined using the following
# sources in decreasing order of preference:
#
# 1. git metadata:
#    - version: git describe --tags --long --dirty
#    - buildid: git rev-parse --short HEAD
# 2. stored git metadata:
#    - version: git-version
#    - buildid: git-buildid
# 3. directory name:
#    - version: cri-resource-manager-(.*):
#    - buildid: unknown
# 4. date:
#    - version: 0.0.0-$(date +%Y%m%d%H%M)
#    - buildid: unknown
#

PARENT_DIRNAME=cri-resource-manager
VERSION_FILE=version
BUILDID_FILE=buildid

VERSION=""
BUILDID=""
RPM=""
DEB=""

fail() {
    echo "$*" 1>&2
    exit 1
}

log() {
    echo "$*" 1>&2
}

print_usage() {
    local _status=0
    if [ -n "$*" ]; then
        echo "$*"
        _status=1
    fi
    echo "usage $0 [--store[=<DIR>]] [--version] [--buildid] [--rpm] [--deb] [--tar] [--all]"
    exit $_status
}

dotgit_hasrepo() {
    git status >& /dev/null
}

dotgit_version() {
    local _v _id _dirty _count
    if [ -z "$TEST_DESCRIBE" ]; then
        if ! dotgit_hasrepo; then
            return 1
        fi
        _id=$(git rev-parse --short HEAD)
        _dirty=$(git diff --quiet -- ':!go.mod' ':!go.sum' || echo '-dirty')
        _v=$(git describe --tags --long --dirty 2>/dev/null)
    else
        _v="$TEST_DESCRIBE"
        _id="$TEST_REV"
        _dirty=""
    fi
    case "$_v" in
        v*) _v="${_v#v}" ;;
        *)
            _count=$(git rev-list --count HEAD)
            _v="0.0.0-$_count-g$_id$_dirty"
            ;;
    esac
    VERSION="$_v"
    BUILDID="$_id$_dirty"
}

stored_hasdata() {
    if [ ! -f "$OUTDIR/$VERSION_FILE" ] || [ ! -f "$OUTDIR/$BUILDID_FILE" ]; then
        return 1
    fi
    STORED_VERSION=$(cat "$OUTDIR/$VERSION_FILE") && \
        STORED_BUILDID=$(cat "$OUTDIR/$BUILDID_FILE")
}

stored_version() {
    if ! stored_hasdata; then
        return 1
    fi
    VERSION="$STORED_VERSION"
    BUILDID="$STORED_BUILDID"
}

stored_update() {
    if stored_hasdata; then
        if [ "$STORED_VERSION" = "$VERSION" ] && [ "$STORED_BUILDID" = "$BUILDID" ]; then
            return 0
        fi
    fi
    mkdir -p "$OUTDIR" || fail "failed to create $OUTDIR"
    echo "$VERSION" > "$OUTDIR/$VERSION_FILE"
    echo "$BUILDID" > "$OUTDIR/$BUILDID_FILE"
}

parent_version() {
    local _dir
    _dir=$(basename "$(realpath .)")
    case "$_dir" in
        "${PARENT_DIRNAME}"-*)
            VERSION="${_dir##${PARENT_DIRNAME}-}"
            BUILDID=unknown
            return 0
            ;;
    esac
    return 1
}

unknown_version() {
    VERSION="0.0.0-$(date +%Y%m%d%H%M)"
    BUILDID=unknown
}

package_versions() {
    case "$VERSION" in
        [0-9.]**-g[0-9a-f]*)
            local _full="$VERSION"
            local _numeric=${_full%%-*}
            local _cntsha1=${_full#*-}
            local _clean=${_cntsha1%-dirty}
            local _dirty=${_cntsha1#$_clean}; _cntsha1="$_clean"
            local _sha1=${_cntsha1##*-g}
            local _cnt=${_cntsha1%-g*}
            VERSION=$_numeric
            if [ -n "$_cnt" ] && [ "$_cnt" != "0" ]; then
                VERSION="$VERSION-$_cnt-g$_sha1"
            fi
            VERSION=$VERSION$_dirty
            RPM=$(echo "$VERSION" | tr '+-' '_')
            DEB=$VERSION
            ;;
        [0-9.]*)
            RPM=$VERSION
            DEB=$VERSION
            ;;
        *)
            fail "can't parse version $VERSION"
            ;;
    esac
}

print_variables() {
    local _what _var _val
    for _what in $PRINT; do
        case $_what in
            version) [ -n "$SHVAR" ] && _var='gitversion='
                     _val="$VERSION" ;;
            buildid) [ -n "$SHVAR" ] && _var='gitbuildid='
                     _val="$BUILDID" ;;
            rpm)     [ -n "$SHVAR" ] && _var='rpmversion='
                     _val="$RPM" ;;
            deb)     [ -n "$SHVAR" ] && _var='debversion='
                     _val="$DEB" ;;
            tar)     [ -n "$SHVAR" ] && _var='tarversion='
                     _val="$VERSION" ;;
            *)
                print_usage "unknown version/buildid-related tag \"$_what\""
                ;;
        esac
        echo "$_var$_val"
    done
}

#########################
# main script
#
OUTDIR="."
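# Defaults; each of these can be overridden or filled in by the
# command-line options parsed in the loop below.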
STORE="" PRINT="" SHVAR=y TEST_DESCRIBE="" TEST_REV="" while [ "$#" != "0" ]; do case $1 in --help|-h) print_usage ;; --debug) set -x ;; --store=*|-s*) STORE=y out="${1##*=}" if [ "$out" != "$1" ]; then OUTDIR="$out" fi ;; --version|-v) PRINT="$PRINT version" ;; --buildid|-b) PRINT="$PRINT buildid" ;; --rpm) PRINT="$PRINT rpm" ;; --deb) PRINT="$PRINT deb" ;; --tar) PRINT="$PRINT tar" ;; --all) PRINT="version buildid rpm deb tar" ;; --shell*|--sh-syntax*) val="${1##*=}" if [ "$val" != "$1" ]; then case $val in y*|t*) SHVAR=y;; n*|f*) SHVAR="";; esac else SHVAR=y fi ;; --no-shell|--no-sh-syntax) SHVAR="" ;; --test) TEST_DESCRIBE="$2" TEST_REV="$3" shift 2 ;; *) print_usage "unknown option \"$1\"" ;; esac shift done if ! dotgit_version; then if ! stored_version; then if ! parent_version; then unknown_version fi fi fi if [ -z "$STORE" ] && [ -z "$PRINT" ]; then PRINT="version buildid" fi package_versions print_variables if [ -n "$STORE" ]; then stored_update fi ================================================ FILE: scripts/build/update-gh-pages.sh ================================================ #!/bin/bash -e set -o pipefail script=`basename $0` usage () { cat << EOF Usage: $script [-h] [-a] [BUILD_SUBDIR] Options: -h show this help and exit -a amend (with --reset-author) instead of creating a new commit EOF } # Helper function for detecting available versions from the current directory create_versions_js() { _baseurl="/cri-resource-manager" echo -e "function getVersionsMenuItems() {\n return [" # 'stable' is a symlink pointing to the latest version [ -f stable ] && echo " { name: 'stable', url: '$_baseurl/stable' }," for f in `ls -d */ | tr -d / | sed s'/releases//'`; do echo " { name: '$f', url: '$_baseurl/$f' }," done echo -e " ];\n}" } # Helper function for detecting archived releases from the current directory create_releases_js() { echo -e "function getReleaseListItems() {\n return [" for f in `ls -d v*/ | tr -d /`; do echo " { name: '$f', url: '$f' }," done echo -e " ];\n}" } # # Argument parsing # while [ "${1#-}" != "$1" -a -n "$1" ]; do case "$1" in -a|--amend) amend="--amend --reset-author" ;; -h|--help) usage exit 0 ;; *) usage exit 1 ;; esac shift done build_subdir="$1" # Check that no extra args were provided if [ $# -gt 1 ]; then echo "ERROR: unknown arguments: $@" usage exit 1 fi # # Build the documentation # build_dir="_build" echo "Creating new Git worktree at $build_dir" git worktree add "$build_dir" gh-pages # Drop worktree on exit trap "echo 'Removing Git worktree $build_dir'; git worktree remove --force '$build_dir'" EXIT # Parse subdir name from GITHUB_REF release_tag= if [ -z "$build_subdir" ]; then case "$GITHUB_REF" in refs/tags/*) _base_ref=${GITHUB_REF#refs/tags/} release_tag=$_base_ref ;; refs/heads/*) _base_ref=${GITHUB_REF#refs/heads/} ;; *) _base_ref= esac echo "Parsed baseref: '$_base_ref'" case "$GITHUB_REF" in refs/tags/v*) _version=${GITHUB_REF#refs/tags/v} ;; refs/heads/release-*) _version=${GITHUB_REF#refs/heads/release-} ;; *) _version= esac echo "Detected version: '$_version'" _version=`echo -n $_version | sed -nE s'!^([0-9]+\.[0-9]+).*$!\1!p'` # Use version as the subdir build_subdir=${_version:+v$_version} # Fallback to base-ref i.e. 
    if [ -z "$build_subdir" ]; then
        # For master branch we use the name 'devel'
        [ "$_base_ref" = "master" ] && build_subdir=devel || build_subdir=$_base_ref
    fi
fi
# Default to 'devel' if no subdir was given and we couldn't parse
# it
build_subdir=${build_subdir:-devel}

echo "Updating site version subdir: '$build_subdir'"

export SITE_BUILDDIR="$build_dir/$build_subdir"
export VERSIONS_MENU=1
export VERSIONS_MENU_THIS_VERSION=$build_subdir
make html

# Update releases/ subdir
if [ "$release_tag" ]; then
    echo "Building archived docs for release $release_tag"
    export SITE_BUILDDIR="$build_dir/releases/$release_tag"
    make html
fi

# Only update the releases "site" from master
if [ "$GITHUB_REF" = "refs/heads/master" ]; then
    echo "Building releases/"
    sphinx-build docs/releases "$build_dir"/releases
fi

#
# Update gh-pages branch
#
commit_hash=`git describe --dirty --always`

# Switch to work in the gh-pages worktree
pushd "$build_dir"

# Add "const" files we need in root dir
touch .nojekyll

_stable=`(ls -d1 v*/ || :) | sort -n | tail -n1`
if [ -n "$_stable" ]; then
    ln -sfT "$_stable" stable
    redirect_to="stable"
else
    redirect_to=$build_subdir
fi

# Detect existing versions from the gh-pages branch
create_versions_js > versions.js

# Update releases directory
mkdir -p releases
cp versions.js releases/
pushd releases
create_releases_js > releases.js
popd

cat > index.html << EOF
EOF

if [ -z "`git status --short`" ]; then
    echo "No new content, gh-pages branch already up-to-date"
    exit 0
fi

# Create a new commit
commit_msg=`echo -e "Update documentation for $build_subdir\n\nAuto-generated from $commit_hash by '$script'"`

echo "Committing changes..."
# Exclude doctrees dir
git add -- ":!$build_subdir/.doctrees"
git commit $amend -m "$commit_msg"

popd

echo "gh-pages branch successfully updated"


================================================
FILE: scripts/code-generator/boilerplate.go.txt
================================================
// Copyright 2019-2020 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


================================================
FILE: scripts/code-generator/generate-groups.sh
================================================
#!/bin/bash

# This is a helper for running the identically named code-generator script from
# https://github.com/kubernetes/code-generator.

REPO=https://github.com/kubernetes/code-generator
SCRIPT="$(realpath "$0")"
HEADER="${SCRIPT%/*}"/boilerplate.go.txt
TOPDIR=${SCRIPT%/scripts/*}
MODDIR=$TOPDIR
MODURL=$(grep ^module "$TOPDIR"/go.mod | sed 's/^module *//g')
MODULES=${MODULES:-pkg/topology}

fail() {
    echo "error: $*"
    exit 1
}

# Parse $* for --output-base, set $gendir and $repo accordingly.
pick-gen-dir() {
    local _save="" _a
    gendir=$TOPDIR/generate
    for _a in "$@"; do
        case $_a in
            --output-base)
                _save=y;;
            *)
                if [ -n "$_save" ]; then
                    gendir=$_a
                    _save=""
                fi
                ;;
        esac
    done
    repo=$gendir/${REPO##*/}
}

# Set $tag to correspond to $KUBERNETES_VERSION.
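# For example, KUBERNETES_VERSION=v1.25.0 maps to code-generator tag
# v0.25.0 via the v1 -> v0 prefix substitution below.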
pick-git-tag() {
    if [ -z "$KUBERNETES_VERSION" ]; then
        fail "KUBERNETES_VERSION not set, please set it to the desired version to match/use."
    fi
    case $KUBERNETES_VERSION in
        v1.[0-9.]*)
            tag=${KUBERNETES_VERSION/#v1/v0};;
        *)
            fail "Don't know how to convert KUBERNETES_VERSION $KUBERNETES_VERSION to tag."
            ;;
    esac
}

# Clone $REPO as $repo.
git-clone() {
    if [ ! -d "$repo"/.git ]; then
        mkdir -p "$gendir" || fail "failed to clone git repo"
        (cd "$gendir" && git clone $REPO) || fail "failed to clone git repo $REPO"
    else
        (cd "$repo" && git fetch -q origin) || fail "failed to update/fetch git repo $REPO"
    fi
}

# Check out the $tag corresponding to $KUBERNETES_VERSION.
git-switch() {
    (set -e
     cd "$repo"
     git reset -q --hard HEAD 2> /dev/null
     git checkout -q "$tag"
    ) || fail "failed to checkout git tag $tag"
}

# Patch $repo/go.mod with replacement rules from $TOPDIR and add replacement rules for $TOPDIR.
go-mod-patch() {
    (set -e
     cd "$repo"
     grep -A 640 '^replace ' "$TOPDIR"/go.mod | grep -v pkg/topology >> go.mod
     go mod edit -replace="$MODURL=$MODDIR"
     for mod in $MODULES; do
         go mod edit -replace="$MODURL/$mod=$MODDIR/$mod"
     done
    ) || fail "failed to patch go.mod"
}

# Check any previously generated files for $MODURL, bail out if they exist.
check-existing() {
    local _pkg=${3%:*} _ver=${3#*:} _dir
    for _dir in "$gendir/$1" "$gendir/$2/$_pkg/$_ver"; do
        if [ -d "$_dir" ]; then
            fail "$_dir already exists, refusing to overwrite it"
        fi
    done
}

# Run generate
run-generator() {
    (set -e
     cd "$repo"
     ./generate-groups.sh "$@" --go-header-file "$HEADER"
    ) || fail "code generation failed"
}

pick-gen-dir "$@"
pick-git-tag
git-clone
git-switch
go-mod-patch
check-existing "$2" "$3" "$4"
run-generator "$@"


================================================
FILE: scripts/hack/create-webhook-secrets.sh
================================================
#!/bin/sh -e

this=$(realpath "$0")
this_dir=$(dirname "$this")
template_dir=$(realpath "$this_dir/../../cmd/cri-resmgr-webhook/")

outdir="deploy/cri-resmgr-webhook"
outdir_abs="$(pwd)/$outdir"

cat << EOF
***
***
*** WARNING: NOT FOR PRODUCTION USE ***
***
***
EOF

info () {
    echo "[INFO] $1"
}

info "Generating x509 keys..."

mkdir -p "$outdir"

# Create temp workdir and remove it on exit
tmpdir=$(mktemp -d --suffix=.cri-resmgr)
trap 'rm -rf $tmpdir' EXIT
cd "$tmpdir"

# Create a self-signed CA certificate
openssl req -batch -new -newkey rsa:2048 -x509 -sha256 -nodes -days 30 -out ca.crt -keyout ca.key

export cn=cri-resmgr-webhook.cri-resmgr.svc
openssl req -batch -newkey rsa:2048 -nodes -keyout svc.key -out $cn.csr -subj "/CN=cri-resmgr-webhook.cri-resmgr.svc"
openssl x509 -req -in $cn.csr -CA ca.crt -CAkey ca.key -CAcreateserial -sha256 -out svc.crt -days 3650

# Copy artifacts to outdir
cp ca.crt svc.crt svc.key "$outdir_abs"
info "Done"
info "Sample cert and key files successfully generated under '$outdir'"

info "Creating MutatingWebhookConfiguration template"
sed "s/CA_BUNDLE_PLACEHOLDER/$(base64 -w0 < ca.crt)/" "$template_dir/mutating-webhook-config.yaml" > "$outdir_abs/mutating-webhook-config.yaml"

# Print instructions
cat << EOF

Instructions for example deployment
===================================

0. Create cri-resmgr namespace, if it does not exist:

   kubectl create ns cri-resmgr

1. Create Kubernetes secrets with:

   kubectl -n cri-resmgr create secret generic cri-resmgr-webhook-secret \\
       --from-file=$outdir/svc.crt --from-file=$outdir/svc.key

2. Build and publish webhook container:
   make image-webhook IMAGE_REPO=my-image-repo IMAGE_TAG=my-version

   And deploy it:

   sed s'!IMAGE_PLACEHOLDER!my-image-repo/cri-resmgr-webhook:my-version!' cmd/cri-resmgr-webhook/webhook-deployment.yaml | kubectl apply -f -

3. Create MutatingWebhookConfiguration with:

   kubectl apply -f $outdir/mutating-webhook-config.yaml
EOF


================================================
FILE: scripts/hack/go-mod-replace-helper.sh
================================================
#!/bin/bash -e
set -o pipefail

this=`basename $0`

usage () {
    cat << EOF
USAGE: $this REPO_CACHE_DIR VERSION MODULE...

OPTIONS
  -h    show this help and exit

EXAMPLES
  Print replace directives for all k8s.io/* updated to v0.19.4:

  $ sed -n '/replace/,$p' go.mod | grep k8s.io | awk '{print $1}' | \\
      xargs ./scripts/hack/go-mod-replace-helper.sh ../k8s-cache/ v0.19.4
EOF
}

update_cache() {
    local module_base=`basename "$1"`
    local module_cache_dir="$cache_dir/$module_base"

    if [ ! -e "$module_cache_dir" ]; then
        module_repo="https://github.com/kubernetes/$module_base"
        echo "Cloning $module_repo to $module_cache_dir"
        git clone -q --depth=1 "$module_repo" "$module_cache_dir"
    fi

    echo "Updating $1 at $module_cache_dir"
    cd "$module_cache_dir"
    git fetch -q --tags --depth=1
    cd - >/dev/null
}

gomodrev() {
    local module_base=`basename "$1"`
    local module_cache_dir="$cache_dir/$module_base"

    cd "$module_cache_dir"
    # Resolve to a commit
    sha=`git rev-parse "$2"~0`
    short_sha=`git rev-parse --short=12 $sha`
    unix_ts=`git show $sha --format=%ct --date=unix | head -n1`
    gomod_ts=`date -u --date=@$unix_ts +'%Y%m%d%H%M%S'`
    echo "v0.0.0-$gomod_ts-$short_sha"
    cd - >/dev/null
}

while [ "${1#-}" != "$1" -a -n "$1" ]; do
    case "$1" in
        -h|--help)
            usage
            exit 0
            ;;
        *)
            usage
            exit 1
            ;;
    esac
    shift
done

if [ $# -lt 3 ]; then
    usage
    exit 1
fi

cache_dir="$1"
shift
module_version="$1"
shift
module_names="$@"

cat << EOF

UPDATING CACHE
==============
EOF
for m in $@; do
    update_cache $m
done

cat << EOF

GO.MOD REPLACE
==============
EOF
for m in $@; do
    r=`gomodrev $m $module_version`
    echo -e "\t$m v0.0.0 => $m $r"
done


================================================
FILE: scripts/hack/go-mod-tree
================================================
#!/usr/bin/env python3

"""go-mod-tree - inspect go module import hierarchy

Usage: go mod graph | go-mod-tree [options]

Options:
  -h, --help                  print help.

Input:
  -g, --graph FILE            read graph from FILE instead of stdin.

Dependency tree selection: (MODULEs are regular expressions)
  -r, --reverse               print reverse tree: from importees to importers.
  -f, --from MODULE           print tree starting from matching MODULEs.
  -t, --to MODULE             print tree with only branches that end to matching MODULEs.
  -x, --exclude MODULE        exclude matching MODULEs from the graph.
  -s, --shortest-path MODULE  print only a shortest path to matching MODULEs.
  -d, --depth DEPTH           limit printed tree to DEPTH.
Output format:
  -H        hide from line format:
            L: line number
            D: depth
            I: indentation
            R: reference to already printed line
  -I STRING indentation by repeating STRING

Examples:
  - Print full import graph as a tree:
    go mod graph | go-mod-tree
  - Print which of the direct dependencies lead to importing x/net:
    go mod graph | go-mod-tree --to golang.org/x/net --depth 1
  - Print modules directly imported by different versions of x/net:
    go mod graph | go-mod-tree --from golang.org/x/net --depth 1
  - Print modules that directly depend on any version of x/net:
    go mod graph | go-mod-tree --reverse --from golang.org/x/net --depth 1
  - Print shortest import paths to 2010-2019 versions of x/net:
    go mod graph | go-mod-tree --shortest-path .*/x/net@.*201[0-9].*
  - Print full reverse import tree from a specific x/net version:
    go mod graph | go-mod-tree --reverse --from .*20190311183353-d8887717615a
"""

import getopt
import re
import sys

g_command = "go-mod-tree"

opt_fmt = "%(prefix)s%(indent)s%(node)s %(ref)s\n"
opt_indent = ": "
opt_reverse = False
opt_graph = "-"
opt_shortest_path = None
opt_from = None
opt_to = None
opt_exclude = None
opt_depth = float("inf")
opt_hide = ""

def error(msg, exit_status=1):
    """print error message and exit"""
    if msg:
        sys.stderr.write("%s: %s\n" % (g_command, msg))
    if exit_status != None:
        sys.exit(exit_status)

def output(msg):
    try:
        sys.stdout.write(msg)
    except:
        error("broken pipe")

def read_graph(s):
    """read go mod graph output from a string"""
    deps = {}  # {importer: set(importee, ...)}
    for line in s.splitlines():
        if not line:
            continue
        if not " " in line:
            continue
        importer, importee = line.split(" ", 1)
        if not importer in deps:
            deps[importer] = set()
        deps[importer].add(importee)
    return deps

g_lineno = 0

def dump_tree(graph, module, depth=0, already_seen={}, max_depth=opt_depth):
    def dump_line(depth, node):
        global g_lineno
        g_lineno += 1
        if "D" not in opt_hide:
            pp_depth = "D%d" % (depth,)
        else:
            pp_depth = ""
        if "L" not in opt_hide:
            pp_lineno = "L%d" % (g_lineno,)
        else:
            pp_lineno = ""
        if "D" in opt_hide and "L" in opt_hide:
            pp_lineprefix = ""
        else:
            pp_lineprefix = "%-8s" % ((pp_lineno + pp_depth),)
        if "I" in opt_hide:
            pp_indent = ""
        else:
            pp_indent = opt_indent * depth
        pp_ref = ""
        if node in already_seen and "R" not in opt_hide:
            pp_ref = " (see L%(line)sD%(depth)s...)" % already_seen[node]
        output((opt_fmt % {
            'prefix': pp_lineprefix,
            'indent': pp_indent,
            'node': node,
            'ref': pp_ref}))
    if depth > max_depth:
        return
    dump_line(depth, module)
    if module in already_seen:
        return
    already_seen[module] = {"line": g_lineno, "depth": depth}
    for child in sorted(graph.get(module, set())):
        dump_tree(graph, child, depth+1, already_seen, max_depth=max_depth)

def graph_clear(graph):
    """return graph without node keys that have no outgoing edges"""
    new_graph = {}
    for node in graph:
        if graph[node]:
            new_graph[node] = set(graph[node])
    return new_graph

def graph_exclude(graph, exclude_nodes):
    """return graph without nodes in the exclude_nodes set"""
    new_graph = {}
    for node in graph:
        if node not in exclude_nodes:
            new_graph[node] = graph[node] - exclude_nodes
    return graph_clear(new_graph)

def graph_reverse(graph):
    """return reversed graph"""
    new_graph = {}
    for from_node, to_nodes in graph.items():
        for to_node in to_nodes:
            if not to_node in new_graph:
                new_graph[to_node] = set()
            new_graph[to_node].add(from_node)
    return new_graph

def graph_reachable_part(graph, from_nodes):
    """return the part of the graph that is reachable from a set of nodes"""
    new_graph = {}
    stack = list(set(graph.keys()).intersection(from_nodes))
    while stack:
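        # Iterative DFS: pop a node, copy its outgoing edges into the
        # new graph, and queue its children for visiting.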
        node = stack.pop()
        if node in new_graph:
            continue
        new_graph[node] = set()
        for child in graph.get(node, set()):
            new_graph[node].add(child)
            stack.append(child)
    return graph_clear(new_graph)

def graph_from_to(graph, from_nodes, to_nodes):
    """return graph between from_nodes and to_nodes"""
    new_graph = graph
    new_graph = graph_reverse(new_graph)
    new_graph = graph_reachable_part(new_graph, to_nodes)
    new_graph = graph_reverse(new_graph)
    new_graph = graph_reachable_part(new_graph, from_nodes)
    return new_graph

def shortest_path(graph, from_node, to_node):
    """return new graph that contains only a shortest path between nodes"""
    shortest_path = None
    bfs_queue = [(child, [from_node]) for child in sorted(graph.get(from_node, set()))]
    seen = set(from_node)
    while bfs_queue:
        node, history = bfs_queue.pop(0)
        seen.add(node)
        if node == to_node:
            shortest_path = history + [node]
            break
        for child in sorted(graph.get(node, set())):
            if child in seen:
                continue
            bfs_queue.append((child, history + [node]))
    return shortest_path

def graph_add_path(graph, path):
    """add a path to current graph"""
    for n, node in enumerate(path):
        if not node in graph:
            graph[node] = set()
        if n > 0:
            graph[path[n-1]].add(node)
    return graph

def matching_nodes(graph, node_regexp):
    matching = set()
    nodes = set.union(set(graph.keys()), set.union(*graph.values()))
    for node in nodes:
        if re.match(node_regexp, node):
            matching.add(node)
    return sorted(matching)

def root_nodes(graph):
    dest_nodes = set.union(*graph.values())
    src_nodes = set(graph.keys())
    roots = src_nodes - dest_nodes
    return sorted(roots)

if __name__ == "__main__":
    try:
        opts, remainder = getopt.gnu_getopt(
            sys.argv[1:], 'd:f:g:hrs:t:x:H:I:',
            ['depth=', 'exclude=', 'from=', 'graph=', 'help',
             'reverse', 'shortest-path=', 'to='])
    except getopt.GetoptError as e:
        error(str(e))
    for opt, arg in opts:
        if opt in ["-h", "--help"]:
            print(__doc__)
            error(None, exit_status=0)
        elif opt in ["-d", "--depth"]:
            try:
                opt_depth = int(arg)
                if opt_depth <= 0:
                    raise Exception("depth <= 0")
            except:
                error('invalid --depth=%r, positive integer expected' % (arg,))
        elif opt in ["-f", "--from"]:
            opt_from = arg
        elif opt in ["-g", "--graph"]:
            opt_graph = arg
        elif opt in ["-r", "--reverse"]:
            opt_reverse = True
        elif opt in ["-s", "--shortest-path"]:
            opt_shortest_path = arg
        elif opt in ["-t", "--to"]:
            opt_to = arg
        elif opt in ["-H"]:
            opt_hide = arg
        elif opt in ["-I"]:
            opt_indent = arg
        elif opt in ["-x", "--exclude"]:
            opt_exclude = arg
        else:
            error('internal error: option "%s" not handled' % (opt,))
    if len(remainder) > 0:
        error('too many parameters')
    if opt_graph == "-":
        graph_string = sys.stdin.read()
    else:
        try:
            graph_string = open(opt_graph).read()
        except Exception as err:
            error('failed to read graph from file "%s": %s' % (opt_graph, err))
    graph = read_graph(graph_string)
    if opt_exclude:
        exclude_modules = matching_nodes(graph, opt_exclude)
        if not exclude_modules:
            error('no modules matching regular expression --exclude %r' % (opt_exclude,))
        graph = graph_exclude(graph, set(exclude_modules))
    if opt_reverse:
        graph = graph_reverse(graph)
    if opt_from:
        from_modules = matching_nodes(graph, opt_from)
        if not from_modules:
            error('no modules matching regular expression --from %r' % (opt_from,))
    else:
        from_modules = root_nodes(graph)
    if opt_to:
        to_modules = matching_nodes(graph, opt_to)
        if not to_modules:
            error('no modules matching regular expression --to %r' % (opt_to,))
        graph = graph_from_to(graph, set(from_modules), set(to_modules))
        from_modules = set(from_modules).intersection(
            set.union(set(graph.keys()), set.union(*graph.values())))
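    # With --shortest-path, replace the graph with one that contains only
    # a BFS-shortest path from each from-module to each matching module.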
    if opt_shortest_path:
        new_graph = {}
        to_modules = matching_nodes(graph, opt_shortest_path)
        if not to_modules:
            error('no modules matching regular expression --shortest-path %r' % (opt_shortest_path,))
        for from_node in from_modules:
            for to_node in to_modules:
                path = shortest_path(graph, from_node, to_node)
                if path:
                    graph_add_path(new_graph, path)
        graph = new_graph
    for from_node in from_modules:
        dump_tree(graph, from_node, max_depth=opt_depth)


================================================
FILE: scripts/hack/install-protobuf
================================================
#!/usr/bin/env bash

# Copyright The containerd Authors.

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#
# Downloads and installs protobuf
#
set -eu -o pipefail

PROTOBUF_VERSION=3.20.1
GOARCH=$(go env GOARCH)
GOOS=$(go env GOOS)
PROTOBUF_DIR=$(mktemp -d)

case $GOARCH in
arm64)
    wget -O "$PROTOBUF_DIR/protobuf" "https://github.com/protocolbuffers/protobuf/releases/download/v$PROTOBUF_VERSION/protoc-$PROTOBUF_VERSION-linux-aarch_64.zip"
    unzip "$PROTOBUF_DIR/protobuf" -d /usr/local
    ;;
amd64|386)
    if [ "$GOOS" = windows ]; then
        wget -O "$PROTOBUF_DIR/protobuf" "https://github.com/protocolbuffers/protobuf/releases/download/v$PROTOBUF_VERSION/protoc-$PROTOBUF_VERSION-win32.zip"
    elif [ "$GOOS" = linux ]; then
        wget -O "$PROTOBUF_DIR/protobuf" "https://github.com/protocolbuffers/protobuf/releases/download/v$PROTOBUF_VERSION/protoc-$PROTOBUF_VERSION-linux-x86_64.zip"
    fi
    unzip "$PROTOBUF_DIR/protobuf" -d /usr/local
    ;;
ppc64le)
    wget -O "$PROTOBUF_DIR/protobuf" "https://github.com/protocolbuffers/protobuf/releases/download/v$PROTOBUF_VERSION/protoc-$PROTOBUF_VERSION-linux-ppcle_64.zip"
    unzip "$PROTOBUF_DIR/protobuf" -d /usr/local
    ;;
*)
    wget -O "$PROTOBUF_DIR/protobuf" "https://github.com/protocolbuffers/protobuf/releases/download/v$PROTOBUF_VERSION/protobuf-cpp-$PROTOBUF_VERSION.zip"
    unzip "$PROTOBUF_DIR/protobuf" -d /usr/src/protobuf
    cd "/usr/src/protobuf/protobuf-$PROTOBUF_VERSION"
    ./autogen.sh
    ./configure --disable-shared
    make
    make check
    make install
    ldconfig
    ;;
esac
rm -rf "$PROTOBUF_DIR"

# Download status.proto. grpc repos' one seems copied from
# https://github.com/googleapis/googleapis/blob/master/google/rpc/status.proto,
# but we use grpc's since the repos has tags/releases.
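# (The protoc release zips above unpack their include/ tree into
# /usr/local/include; status.proto is dropped next to those well-known
# types so protoc can resolve the "google/rpc/status.proto" import.)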
mkdir -p /usr/local/include/google/rpc
curl \
    -L https://raw.githubusercontent.com/grpc/grpc/v1.45.2/src/proto/grpc/status/status.proto \
    -o /usr/local/include/google/rpc/status.proto


================================================
FILE: scripts/testing/crictl
================================================
#!/bin/sh

RELAY_SOCKET=unix:///var/run/cri-relay.sock

if [ -z "$CRICTL" ]; then
    CRICTL=crictl
fi

sudo $CRICTL -i $RELAY_SOCKET -r $RELAY_SOCKET "$@"


================================================
FILE: scripts/testing/jaeger
================================================
#!/bin/sh

ENVVARS="-e COLLECTOR_ZIPKIN_HTTP_PORT=9411"
PORTS="-p 5775:5775/udp \
       -p 6831:6831/udp \
       -p 6832:6832/udp \
       -p 5778:5778 \
       -p 16686:16686 \
       -p 14268:14268 \
       -p 9411:9411"

if [ "$1" = "--permanent" ]; then
    storage=/tmp/jaeger-trace
    data=$storage/data
    key=$storage/key
    echo "Using $data and $key to store (badger) traces..."
    mkdir -p $storage
    STORAGE="-e SPAN_STORAGE_TYPE=badger \
             -e BADGER_EPHEMERAL=false \
             -e BADGER_DIRECTORY_VALUE=$data \
             -e BADGER_DIRECTORY_KEY=$key \
             -v $storage:$storage"
fi

cmd="docker run $ENVVARS $PORTS $STORAGE jaegertracing/all-in-one:latest"

echo "Running command $cmd..."
$cmd


================================================
FILE: scripts/testing/kube-cgroups
================================================
#!/bin/bash

usage() {
    cat << EOF
Usage: kube-cgroups [options]

Options:
  -h              print this help
  -g CGROUP_DIR   cgroup controller directory, the default is /sys/fs/cgroup
  -E              include empty cgroup files in the output
  -F              print full filenames
  -n NS_REGEXP    print only namespaces matching NS_REGEXP, the default is "default"
  -p POD_REGEXP   print only pods matching POD_REGEXP, the default is "." (any pod)
  -c CNTR_REGEXP  print only containers matching CNTR_REGEXP, the default is "."
  -f FILE_REGEXP  print only cgroup files matching FILE_REGEXP, the default is
                  "cpuset.cpus|cpuset.mems|blkio.throttle.*_device"
EOF
}

error() {
    echo "error: $1" >&2
    exit 1
}

full_filename=0
empty_files=0
ns_regexp="default"  # regexp matching namespaces
pod_regexp="."       # regexp matching any pod name
cntr_regexp="."      # regexp matching any container line
cgfile_regexp="cpuset.cpus|cpuset.mems|blkio.throttle.*_device" # regexp matching any cgroup file
cg_controller_dir=/sys/fs/cgroup

while getopts "hg:EFn:p:c:f:" OPTION; do
    case $OPTION in
        h)
            usage
            exit 0
            ;;
        g)
            cg_controller_dir="$OPTARG"
            ;;
        E)
            empty_files=1
            ;;
        F)
            full_filename=1
            ;;
        n)
            ns_regexp="$OPTARG"
            ;;
        p)
            pod_regexp="$OPTARG"
            ;;
        c)
            cntr_regexp="$OPTARG"
            ;;
        f)
            cgfile_regexp="$OPTARG"
            ;;
        *)
            error "invalid option $OPTION"
            ;;
    esac
done

if [ ! -d "$cg_controller_dir" ]; then
    error "cgroup directory '$cg_controller_dir' does not exist"
fi

kubectl get pods -A | grep -E "$pod_regexp" | while read -r namespace podname rest; do
    [ "$namespace" == "NAMESPACE" ] && continue
    grep -q -E "$ns_regexp" <<< "$namespace" || continue
    kubectl describe pod -n "$namespace" "$podname" | grep -B1 'Container ID:' | while read -r container _ containerid; do
        if [[ "$container" != "Container" ]] && [[ "$container" != "--" ]]; then
            containername="${container%%:*}"
            continue
        fi
        containerID=${containerid#*://}
        if [[ -z "$containerID" ]]; then
            continue
        fi
        grep -q -E "$cntr_regexp" <<< "$containername" || continue
        while read -r cgroupdir; do
            if [[ "$cgroupdir" == *crio-conmon* ]]; then
                continue
            fi
            for filename in "$cgroupdir"/*; do
                if [[ ! -f "$filename" ]]; then
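                    # Skip anything that is not a regular file.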
                    continue
                fi
                filename_nodir="${filename##*/}"
                grep -q -E "$cgfile_regexp" <<< "$filename_nodir" || continue
                if [[ -n "$podname" ]]; then
                    echo "$namespace/$podname:"
                    unset podname
                fi
                [[ -n "$containername" ]] && {
                    echo " $containername:"
                    unset containername
                }
                linecount="$(wc -l < "$filename")"
                if [[ "$linecount" == "0" ]] && [[ "$empty_files" == "0" ]]; then
                    continue
                fi
                if [[ "$full_filename" == "1" ]]; then
                    print_filename="$filename"
                else
                    print_filename="$filename_nodir"
                fi
                if (( "$linecount" <= 1 )); then
                    # print contents of a single-line file after filename
                    echo " $print_filename: $(< "$filename")"
                else
                    # print contents of a multiline file indented
                    echo " $print_filename:"
                    sed "s/^/ /g" < "$filename"
                fi
            done
        done <<< "$(find "$cg_controller_dir" -name "*${containerID}*")"
    done
done


================================================
FILE: scripts/testing/pairwise
================================================
#!/usr/bin/env python3

"""pairwise - print var-value combinations that cover all value pairs

Usage: pairwise VAR=VALUE [VAR=VALUE...]

Example:
  $ pairwise \\
        distro={debian-sid,opensuse,fedora} \\
        k8scni={cilium,weavenet,flannel} \\
        k8scri={crio,containerd} \\
        k8s={1.22.0,1.23.0}
"""

import sys

def error(msg, exit_status=1):
    sys.stderr.write('pairwise: %s\n' % (msg,))
    if exit_status is not None:
        sys.exit(exit_status)

def output(msg):
    sys.stdout.write(msg)

# This program prints an optimized set of value combinations
# that covers all value pairs.

def all_combinations(var_values):
    combinations = [{}]
    for var in var_values:
        new_combinations = []
        for d in combinations:
            for value in var_values[var]:
                new_comb = dict(d)
                new_comb[var] = value
                new_combinations.append(new_comb)
        combinations = new_combinations
    return combinations

def combination_to_triplets(d):
    triplets = set()
    keys = sorted(d.keys())
    for key1_index, key1 in enumerate(keys):
        val1 = d[key1]
        for key2_index, key2 in enumerate(keys[key1_index+1:]):
            val2 = d[key2]
            for key3 in keys[key1_index + key2_index + 2:]:
                val3 = d[key3]
                triplets.add(frozenset(((key1, val1), (key2, val2), (key3, val3))))
    return triplets

def combination_to_pairs(d):
    pairs = set()
    keys = sorted(d.keys())
    for key1_index, key1 in enumerate(keys):
        val1 = d[key1]
        for key2 in keys[key1_index+1:]:
            val2 = d[key2]
            pairs.add(frozenset(((key1, val1), (key2, val2))))
    return pairs

def combination_to_singles(d):
    singles = set()
    for key1 in d.keys():
        val1 = d[key1]
        singles.add(frozenset((key1, val1)))
    return singles

def cover_pairwise(var_values):
    chosen_combinations = []
    covered_pairs = set()
    combination_pairs = {}
    all_triplets = set()
    all_pairs = set()
    all_singles = set()
    combinations = all_combinations(var_values)
    for c in combinations:
        all_triplets = all_triplets.union(combination_to_triplets(c))
        all_pairs = all_pairs.union(combination_to_pairs(c))
        all_singles = all_singles.union(combination_to_singles(c))
    uncovered_triplets = set(all_triplets)
    number_of_triplets = len(uncovered_triplets)
    uncovered_pairs = set(all_pairs)
    uncovered_singles = set(all_singles)
    while uncovered_pairs:
        combination_score = []
        for c in combinations:
            covers_triplets = combination_to_triplets(c)
            covers_pairs = combination_to_pairs(c)
            covers_singles = combination_to_singles(c)
            combination_score.append(
                (len(uncovered_pairs.intersection(covers_pairs)) +
                 len(uncovered_singles.intersection(covers_singles)) +
                 len(uncovered_triplets.intersection(covers_triplets)) / number_of_triplets,
                 c, covers_pairs, covers_singles, covers_triplets))
        best_score, best_comb, best_pairs, best_singles, best_triplets = \
            sorted(combination_score, key=lambda comb_score: comb_score[0])[-1]
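        # Greedy step: keep the highest-scoring combination and remove
        # everything it covers from the uncovered sets.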
        chosen_combinations.append(best_comb)
        uncovered_triplets = uncovered_triplets - best_triplets
        uncovered_pairs = uncovered_pairs - best_pairs
        uncovered_singles = uncovered_singles - best_singles
    return chosen_combinations

if __name__ == "__main__":
    if len(sys.argv) < 2 or "-h" in sys.argv or "--help" in sys.argv:
        output(__doc__)
        error('missing VAR=VALUE...', exit_status=0)
    # construct var_values from command line arguments
    var_values = {}  # {var: list-of-values}
    for var_value in sys.argv[1:]:
        try:
            var, value = var_value.split("=", 1)
        except:
            error('bad argument %r, VAR=VALUE expected' % (var_value,))
        if var not in var_values:
            var_values[var] = []
        var_values[var].append(value)
    for comb in cover_pairwise(var_values):
        var_value_row = []
        for var in sorted(comb.keys()):
            var_value_row.append('%s="%s"' % (var, comb[var]))
        output(" ".join(var_value_row) + "\n")


================================================
FILE: scripts/testing/prometheus
================================================
#!/bin/sh

dir=$(dirname "$0")
cfg=$dir/prometheus.yaml
cmd="docker run -p 9090:9090 \
     -v $cfg:/etc/prometheus/prometheus.yml \
     prom/prometheus --config.file=/etc/prometheus/prometheus.yml $*"

echo "Running command $cmd..."
$cmd


================================================
FILE: scripts/testing/prometheus.yaml
================================================
global:
  scrape_interval: 10s
  external_labels:
    monitor: 'CRI-RM'

scrape_configs:
  - job_name: 'CRI-RM'
    scrape_interval: 10s
    static_configs:
      - targets: ['10.0.0.2:8888']


================================================
FILE: scripts/testing/set-path
================================================
#!/bin/sh
# set -x

dirpart=packages/src/github.com/intel/cri-interceptor

case $(pwd) in
    */$dirpart*)
        ;;
    *)
        echo "Don't know how: I don't see $dirpart in $(pwd)..."
        return 1
        ;;
esac

dir=$(pwd)
kubedir=${dir%%/github.com*}/k8s.io/kubernetes
kubebin=$kubedir/_output/local/bin/linux/amd64

if [ ! -d "$kubebin" ]; then
    echo "*** You don't seem to have a $kubebin directory."
    return 1
fi

if [ ! -x "$kubebin"/kubelet ]; then
    echo "*** You don't seem to have kubelet in $kubebin (done a make WHAT=cmd/kubelet ?)"
    ls -ls "$kubebin"
    return 1
fi

export PATH="$kubebin:$PATH"


================================================
FILE: test/critest/run.sh
================================================
#!/bin/bash

TEST_TITLE="CRI validation tests with critest"

PV='pv -qL'

SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")"
DEMO_LIB_DIR=$(realpath "$SCRIPT_DIR/../../demo/lib")
BIN_DIR=$(realpath "$SCRIPT_DIR/../../bin")
OUTPUT_DIR=${outdir-$SCRIPT_DIR/output}
COMMAND_OUTPUT_DIR=$OUTPUT_DIR/commands

# shellcheck disable=SC1091
# shellcheck source=../../demo/lib/command.bash
source "$DEMO_LIB_DIR"/command.bash
# shellcheck disable=SC1091
# shellcheck source=../../demo/lib/host.bash
source "$DEMO_LIB_DIR"/host.bash
# shellcheck disable=SC1091
# shellcheck source=../../demo/lib/vm.bash
source "$DEMO_LIB_DIR"/vm.bash

usage() {
    echo "$TEST_TITLE"
    echo "Usage: [VAR=VALUE] ./run.sh MODE"
    echo "  MODE:  \"play\" plays the test as a demo."
    echo "         \"test\" runs fast, reports pass or fail."
    echo "  VARs:"
    echo "    tests: space-separated list of cri-resmgr configurations."
    echo "           The default is all *.cfg files in $SCRIPT_DIR."
    echo "    vm: govm virtual machine name."
    echo "        The default is \"crirm-test-critest\"."
    echo "    speed: Demo play speed."
    echo "           The default is 10 (keypresses per second)."
    echo "    cleanup: Level of cleanup after a test run:"
    echo "             0: leave VM running (the default)"
    echo "             1: delete VM"
    echo "             2: stop VM, but do not delete it."
    echo "    outdir: Save output under given directory."
    echo "            The default is \"${SCRIPT_DIR}/output\"."
}

error() {
    (echo ""; echo "error: $1" ) >&2
    exit 1
}

out() {
    if [ -n "$PV" ]; then
        speed=${speed-10}
        echo "$1" | $PV "$speed"
    else
        echo "$1"
    fi
    echo ""
}

screen-create-vm() {
    speed=60 out "### Running the test in VM \"$vm\"."
    host-create-vm "$vm" "$topology"
    if [ -z "$VM_IP" ]; then
        error "creating VM failed"
    fi
    vm-networking
}

screen-install-containerd() {
    speed=60 out "### Installing Containerd to the VM."
    vm-install-cri
    vm-install-containernetworking
}

screen-copy-cri-resmgr() {
    prefix=/usr/local
    host-command "scp \"$BIN_DIR/cri-resmgr\" \"$SCRIPT_DIR/tsl\" $VM_SSH_USER@$VM_IP:" || {
        command-error "copying cri-resmgr failed"
    }
    vm-command "mv cri-resmgr tsl $prefix/bin/" || {
        command-error "installing cri-resmgr to $prefix/bin failed"
    }
    PV="" vm-command "command -v cri-resmgr" >/dev/null
    ( echo "$COMMAND_OUTPUT" | grep -q $prefix/bin/cri-resmgr ) || {
        command-error "\"cri-resmgr\" does not execute $prefix/bin/cri-resmgr on VM"
    }
}

screen-install-critest() {
    speed=60 out "### Installing critest to VM."
    vm-command "apt update && apt install -y golang make socat"
    vm-command "go get -d github.com/kubernetes-sigs/cri-tools"
    CRI_TOOLS_SOURCE_DIR=$(awk '/package.*cri-tools/{print $NF}' <<< "$COMMAND_OUTPUT")
    [ -n "$CRI_TOOLS_SOURCE_DIR" ] || {
        command-error "downloading cri-tools failed"
    }
    vm-command "pushd \"$CRI_TOOLS_SOURCE_DIR\" && make && make install && popd" || {
        command-error "building and installing cri-tools failed"
    }
}

screen-critest-crirm-config() {
    config_file=$1
    cri_endpoint=/var/run/containerd/containerd.sock
    cri_resmgr_endpoint=/var/run/cri-resmgr/cri-resmgr.sock
    host-command "scp $config_file $VM_SSH_USER@$VM_IP:"
    vm-command "rm -rf *.tsl; killall cri-resmgr; systemctl stop containerd; sleep 1; systemctl start containerd; sleep 1; rm -rf /var/lib/cri-resmgr"
    vm-command "cri-resmgr -force-config $config_file -runtime-socket $cri_endpoint -relay-socket $cri_resmgr_endpoint 2>&1 | tsl -uU -F \"%(ts)s cri-resmgr: %(line)s\" -o cri-resmgr.output.tsl" bg
    sleep 5
    vm-command "critest -runtime-endpoint unix://$cri_resmgr_endpoint 2>&1 | tsl -uU -F \"%(ts)s critest: %(line)s\" -o critest.output.tsl"
    vm-command "killall cri-resmgr"
    vm-command-q "cat *.tsl | sort -n | awk '{if (t_start==0) t_start=\$1; \$1=sprintf(\"%.6fs\", \$1-t_start); print;}'" > "$OUTPUT_DIR/test-$config_file.log"
}

screen-critest-containerd() {
    cri_endpoint=/var/run/containerd/containerd.sock
    vm-command "rm -rf *.tsl; critest -runtime-endpoint unix://$cri_endpoint 2>&1 | tsl -uU -F \"%(ts)s critest: %(line)s\" -o critest.output.tsl"
    vm-command-q "cat *.tsl | sort -n | awk '{if (t_start==0) t_start=\$1; \$1=sprintf(\"%.6fs\", \$1-t_start); print;}'" > "$OUTPUT_DIR/test-containerd.log"
}

require_cmd() {
    cmd=$1
    if ! command -v "$cmd" >/dev/null ; then
        error "required command missing \"${cmd}\", make sure it is in PATH"
    fi
}

# Validate parameters
mode=$1

topology=${topology:='[{"cores": 2, "mem": "8G"}]'}
distro=${distro:="ubuntu-20.04"}
cri=${cri:="containerd"}
vm=${vm:="critest-$distro-$cri"}
cleanup=${cleanup-0}

host-set-vm-config "$vm" "$distro" "$cri"

cd "${SCRIPT_DIR}" || error "failed to cd to \"${SCRIPT_DIR}\""

tests=${tests-*.cfg}

if [ "$mode" == "test" ]; then
    PV=
elif [ "$mode" == "play" ] ; then
    speed=${speed-10}
else
    usage
    error "invalid MODE"
fi

# Prepare for test/demo
mkdir -p "$OUTPUT_DIR"
mkdir -p "$COMMAND_OUTPUT_DIR"
rm -f "$COMMAND_OUTPUT_DIR"/0*
( echo x > "$OUTPUT_DIR/x" && rm -f "$OUTPUT_DIR/x" ) || {
    error "output directory outdir=$OUTPUT_DIR is not writable"
}

if [ -z "$VM_IP" ] || [ -z "$VM_SSH_USER" ] || [ -z "$VM_NAME" ]; then
    screen-create-vm
fi

# always copy new version of the binary to VM
screen-copy-cri-resmgr

if ! vm-command-q "dpkg -l | grep -q containerd"; then
    screen-install-containerd
fi

if ! vm-command-q "command -v critest | grep -q critest"; then
    screen-install-critest
fi

# Run test/demo
# 1. Run critest on cri-resmgr with each config file.
for config_file in $tests; do
    screen-critest-crirm-config "$config_file"
done
# 2. Run critest without cri-resmgr for reference.
screen-critest-containerd

# Cleanup
if [ "$cleanup" == "0" ]; then
    echo "The VM with critest, cri-resmgr and containerd is left running. Next steps:"
    vm-print-usage
elif [ "$cleanup" == "1" ]; then
    host-stop-vm $vm
    host-delete-vm $vm
elif [ "$cleanup" == "2" ]; then
    host-stop-vm $vm
fi

# Summarize results
SUMMARY_FILE="$OUTPUT_DIR/summary.txt"
echo -n "" > "$SUMMARY_FILE" || error "cannot write summary to \"$SUMMARY_FILE\""

for testlog in "$OUTPUT_DIR"/test-*.log; do
    {
        echo -n "$(basename "$testlog") "
        awk 'BEGIN{s=0;e=0}/critest: /{if(s==0)s=$1;e=$1}END{printf "(runtime %.2f s): ",e-s}' < "$testlog"
        # remove ansi colors from critest output in the summary
        grep Pending "$testlog" | grep critest: | tail -n 1 | sed -r -e "s/[[:cntrl:]]\[[0-9]+m//g" -e "s/^.* -- //g"
    } >> "$SUMMARY_FILE"
done

exit_status=0

# Declare verdict in test mode
if [ "$mode" == "test" ]; then
    echo "" >> "$SUMMARY_FILE"
    # Test is passed if all critest executions had the same passrate,
    # no matter which cri-resmgr configuration was used.
    if [ "$(awk -F: '/Passed/{print $2}' < "$SUMMARY_FILE" | sort -u | wc -l)" == "1" ]; then
        echo "All critest results are the same." >> "$SUMMARY_FILE"
        echo "Test verdict: PASS" >> "$SUMMARY_FILE"
    else
        echo "Error: critest results are not the same in all configurations." >> "$SUMMARY_FILE"
        echo "Test verdict: FAIL" >> "$SUMMARY_FILE"
        exit_status=1
    fi
fi

echo ""
echo "Summary:"
cat "$SUMMARY_FILE"

exit "$exit_status"


================================================
FILE: test/critest/topology-aware-policy.cfg
================================================
policy:
  Active: topology-aware
  ReservedResources:
    CPU: 750m
logger:
  Debug: cri-resmgr,resource-manager,cache,dump,instrumentation,policy
dump:
  Config: off:.*,full:((Create)|(Start)|(Run)|(Update)|(Stop)|(Remove)).*
instrumentation:
  Sampling: disabled


================================================
FILE: test/critest/tsl
================================================
#!/usr/bin/python3
#
# Copyright 2020 Intel Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""tsl - timestamp lines

Usage: tsl [options]

Options:
  -h, --help     print help.
  -f TIMEFORMAT  use TIMEFORMAT as output timeformat (man strftime).
                 The default format is "%s.%f".
  -F LINEFORMAT  use LINEFORMAT as line output format:
                 - %(ts)s: timestamp
                 - %(line)s: original line
                 The default is "%(ts)s %(line)s".
  -o OUTFILE     write output lines to OUTFILE. Supports many -o's.
                 Special outfiles:
                 - stdout: standard output
                 - stderr: standard error
  -u             unbuffered input: more accurate timestamps, slower throughput.
  -U             unbuffered output: flush after every line, slower throughput.

Examples:
  cmd1 | tsl -u -F "%(ts)s cmd1: %(line)s" > cmd1.tsl &
  cmd2 | tsl -u -F "%(ts)s cmd2: %(line)s" > cmd2.tsl &
  wait
  cat cmd1.tsl cmd2.tsl | sort -n > cmd1_cmd2.output
"""

import getopt
import sys
import datetime

def unbuffered_xreadlines(fileobj):
    """like fileobj.xreadlines() but unbuffered"""
    ln = []
    while True:
        c = fileobj.read(1)
        if not c:
            if ln:
                yield "".join(ln)
            break
        ln.append(c)
        if c == "\n":
            yield "".join(ln)
            ln = []

if __name__ == "__main__":
    opt_timeformat = "%s.%f"  # "%Y-%m-%d %H:%M:%S"
    opt_lineformat = "%(ts)s %(line)s"
    opt_unbuffered_in = False
    opt_unbuffered_out = False
    opt_outfiles = []

    opts, remainder = getopt.gnu_getopt(
        sys.argv[1:], 'hf:F:o:uU', ['help', 'format='])
    for opt, arg in opts:
        if opt in ["-h", "--help"]:
            print(__doc__)
            sys.exit(0)
        elif opt in ["-f", "--format"]:
            opt_timeformat = arg
        elif opt in ["-F"]:
            opt_lineformat = arg
        elif opt in ["-o"]:
            if arg == "stdout":
                opt_outfiles.append(sys.stdout)
            elif arg == "stderr":
                opt_outfiles.append(sys.stderr)
            else:
                opt_outfiles.append(open(arg, "w"))
        elif opt in ["-u"]:
            opt_unbuffered_in = True
        elif opt in ["-U"]:
            opt_unbuffered_out = True

    if not opt_outfiles:
        opt_outfiles.append(sys.stdout)

    if opt_unbuffered_in:
        line_iter = unbuffered_xreadlines(sys.stdin)
    else:
        line_iter = sys.stdin

    for line in line_iter:
        ts = datetime.datetime.now().strftime(opt_timeformat)
        out_line = opt_lineformat % {'ts': ts, 'line': line}
        for outfile in opt_outfiles:
            outfile.write(out_line)
            if opt_unbuffered_out:
                outfile.flush()


================================================
FILE: test/e2e/benchmarks.test-suite/memtier_benchmark/cri-resmgr.cfg
================================================
policy:
  Active: topology-aware
  ReservedResources:
    CPU: 750m
logger:
  Debug: cri-resmgr,resource-manager,cache,policy


================================================
FILE: test/e2e/benchmarks.test-suite/memtier_benchmark/memtier-benchmark-02.yaml.in
================================================
apiVersion: batch/v1
kind: Job
metadata:
  name: memtier-benchmark
spec:
  template:
    metadata:
      annotations:
        cri-resource-manager.intel.com/${AFFINITY}: |+
          memtier-benchmark:
          - scope:
              key: pod/name
              operator: Matches
              values:
              - redis-*
            match:
              key: name
              operator: Equals
              values:
              - redis
            weight: 10
    spec:
      containers:
$(for contnum in $(seq 1 ${CONTCOUNT}); do echo "
      - name: ${NAME}c$(( contnum - 1 ))
        image: redislabs/memtier_benchmark:edge
        imagePullPolicy: IfNotPresent
        args: ['${ARGS// /\', \'}']
        resources:
          requests:
            cpu: ${CPU}
            memory: '${MEM}'
          limits:
            cpu: ${CPULIM}
            memory: '${MEMLIM}'
"; done )
      restartPolicy: Never
================================================
FILE: test/e2e/benchmarks.test-suite/memtier_benchmark/memtier-benchmark.yaml.in
================================================
apiVersion: batch/v1
kind: Job
metadata:
  name: memtier-benchmark
spec:
  template:
    metadata:
      annotations:
        cri-resource-manager.intel.com/${AFFINITY}: |+
          memtier-benchmark:
          - scope:
              key: pod/name
              operator: Matches
              values:
              - redis-*
            match:
              key: name
              operator: Equals
              values:
              - redis
            weight: 10
    spec:
      containers:
      - name: memtier-benchmark
        image: redislabs/memtier_benchmark:edge
        imagePullPolicy: IfNotPresent
        args: ['${ARGS// /\', \'}']
$(if [ "$CPU" != "0" ]; then echo "
        resources:
          requests:
            cpu: ${CPU}
            memory: '${MEM}'
          limits:
            cpu: ${CPULIM}
            memory: '${MEMLIM}'
"; fi)
      restartPolicy: Never


================================================
FILE: test/e2e/benchmarks.test-suite/memtier_benchmark/n4c16/test01-memtier-stress-ng/code.var.sh
================================================
# Redis parameters
REDIS_PASS=abc123xyz

# Background load parameters
STRESS_NG_CPUS=16 # workers per container
STRESS_NG_CONTS=8 # number of containers per pod
STRESS_NG_PODS=2  # number of pods

# BG_* are background loads
# CPU turbo licence level 2 (causes big drop on GHz) cannot be reached with stress-ng, but could be implemented with
# 1. ["avx-turbo", "--test=avx512_vlzcnt_t", "--min-threads=1", "--max-threads=1", "--iters=0"]
# 2. ["avx-turbo", "--test=avx512_vlzcnt_t", "--min-threads=1", "--max-threads=1", "--iters=0"]
# License level observed with:
# sudo perf stat --pid $(pidof avx-turbo) -e core_power.lvl0_turbo_license,core_power.lvl1_turbo_license,core_power.lvl2_turbo_license -- sleep 1
# In the following: "IPC" == Instructions Per Cycle
BG_NOLOAD=""
# BG_AVX_LL0="stress-ng --ipsec-mb $STRESS_NG_CPUS --ipsec-mb-feature avx512" # AVX, causing CPU turbo license level 0
# BG_AVX_LL1="stress-ng --ipsec-mb $STRESS_NG_CPUS --ipsec-mb-feature avx512" # AVX, causing CPU turbo license level 1
BG_SHM="stress-ng --shm $STRESS_NG_CPUS"       # shared memory, memory bound (not causing 100% CPU load), IPC ~0.01-0.19
BG_MEMCPY="stress-ng --memcpy $STRESS_NG_CPUS" # memory bound; IPC =~ 0.15
BG_STREAM="stress-ng --stream $STRESS_NG_CPUS" # IPC =~ 0.49
BG_CPUJMP="stress-ng --cpu $STRESS_NG_CPUS --cpu-method jmp" # IPC ~3.6
BG_CPUALL="stress-ng --cpu $STRESS_NG_CPUS"    # IPC ~1.8

# BM_* are benchmarks
bm_stress_ng_iters=10000
BM_MEMTIER="memtier-benchmark --server=redis-service --authenticate=$REDIS_PASS" # this is special case
# BM_MEMCPY="stress-ng --memcpy 1 --memcpy-ops $bm_stress_ng_iters" # IPC ~0.15
# BM_STREAM="stress-ng --stream 1 --stream-ops $bm_stress_ng_iters" # IPC ~0.49
# BM_JMP="stress-ng --cpu 1 --cpu-method jmp --cpu-ops $bm_stress_ng_iters" # IPC ~3.6
# BM_FFT="stress-ng --cpu 1 --cpu-method fft --cpu-ops $bm_stress_ng_iters" # IPC ~2.3
# BM_AVX_LL0="stress-ng --ipsec-mb 1 --ipsec-mb-feature avx2 --ipsec-mb-ops $bm_stress_ng_iters"
# BM_AVX_LL2="stress-ng --ipsec-mb 1 --ipsec-mb-feature avx512 --ipsec-mb-ops $bm_stress_ng_iters"

# Clean up
vm-command "kubectl delete jobs --all --now; kubectl delete deployment redis; kubectl delete service redis-service; kubectl delete secret redis; kubectl delete pods --all --now; true"

# Setup Redis
wait="" create redis-secret
CPU=4 MEM=32G CPULIM=8 MEMLIM=64G NAME=redis wait="Available" create redis
NAME=redis-service wait="" create redis-service

for bg_cmd in "${!BG_@}"; do
    # Reset counters in order to keep creating pod0...
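    # ("reset counters" is assumed to be a helper of the e2e test framework
    # that restarts pod/container numbering for every background-load round.)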
reset counters benchmark_output_dir="$OUTPUT_DIR/benchmark/$bg_cmd" mkdir -p "$benchmark_output_dir" # Start background noise if [[ "${!bg_cmd}" == "stress-ng "* ]]; then n="$STRESS_NG_PODS" ARGS="${!bg_cmd#stress-ng }" CONTCOUNT="$STRESS_NG_CONTS" CPU=50m MEM=50M CPULIM=$STRESS_NG_CPUS MEMLIM=1G wait_t=240s create stress-ng # Stabilize ( vm-run-until --timeout 60 "sh -c 'uptime; exit 1'" ) || echo "expected timeout" fi for bm_cmd in "${!BM_@}"; do for CPU in 4; do # Run benchmark if [[ "${!bm_cmd}" == "memtier-benchmark "* ]]; then AFFINITY=affinity CPU="$CPU" MEM="16G" CPULIM="$CPU" MEMLIM="24G" NAME=memtier-benchmark ARGS="${!bm_cmd#memtier-benchmark }" wait="Complete" wait_t="10m" create memtier-benchmark memtier_benchmark_pod="$(kubectl get pods | awk '/memtier-benchmark-/{print $1}')" kubectl logs "$memtier_benchmark_pod" | grep -A7 'ALL STATS' | tee "$benchmark_output_dir/$bm_cmd-affinity-cpu-$CPU.txt" kubectl delete jobs --all --now # AFFINITY=anti-affinity CPU="$CPU" MEM="16G" NAME=memtier-benchmark ARGS="${!bm_cmd#memtier-benchmark }" wait="Complete" wait_t="10m" create memtier-benchmark # memtier_benchmark_pod="$(kubectl get pods | awk '/memtier-benchmark-/{print $1}')" # kubectl logs "$memtier_benchmark_pod" | grep -A7 'ALL STATS' | tee "$benchmark_output_dir/$bm_cmd-antiaffinity-cpu-$CPU.txt" # kubectl delete jobs --all --now elif [[ "${!bm_cmd}" == "stress-ng "* ]]; then CPU="$CPU" MEM="200M" CPULIM="$STRESS_NG_CPUS" MEM="400M" NAME=stress-ng-benchmark ARGS="${!bm_cmd#stress-ng }" wait="Complete" wait_t="10m" create stress-ng-benchmark stress_ng_benchmark_pod="$(kubectl get pods | awk '/stress-ng-benchmark-/{print $1}')" kubectl logs "$stress_ng_benchmark_pod" | tee "$benchmark_output_dir/$bm_cmd-cpu-$CPU.txt" kubectl delete jobs --all --now fi done done # Stop background noise ( kubectl delete pods -l e2erole=bgload --now ) done ================================================ FILE: test/e2e/benchmarks.test-suite/memtier_benchmark/n4c16/test01-memtier-stress-ng/post-process.sh ================================================ #!/bin/bash # Usage: VAR=VALUE post-process.sh output-CRICONFIGNAME1 output-CRICONFIGNAME2... # VARs: # normalize=1.. # normalizes plotted values so that the smallest is 1.00 # normalize=0..1 # normalizes plotted to values between 0.0 and 1.0 # # if normalize="", values are not normalized # maxy=MAXY # maximum value on the Y axis # ytrans=log2 # logarithmic Y axis, the default ytrans is 'identity' # save=PREFIX # create PREFIX.svg and PREFIX.csv. The default is 'plot'. normalize="${normalize:-}" maxy="${maxy:-}" ytrans="${ytrans:-identity}" save="${save:-plot}" ( for out_path in "$@"; do ( benchmark_dir=$out_path/benchmark out_dir="$(basename "$out_path")" cd "$benchmark_dir" || exit for bgload in *; do ( cd "$bgload" || exit for memtier_results in BM_MEMTIER-*; do p50latency=\$6 p99latency=\$7 p999latency=\$8 awk "/Totals/{print \"$out_dir $bgload $memtier_results \"$p50latency\" \"$p99latency\" \"$p999latency}" < "$memtier_results" done ); done ); done ) > total-latencies.txt sed -e 's/output-//g' -e 's/BG_//g' -e 's/BM_MEMTIER-//g' -e 's/-cpu-[0-9]*.txt//g' < total-latencies.txt | awk '{print $1" "$2" "$3" "$4" "$5" "$6}' | grep -v ' antiaffinity' > data.txt cat > plot.R < /sys/block/$blkdev/queue/scheduler" vm-command-q "grep '[[]bfq[]]' /sys/block/$blkdev/queue/scheduler" || { error "failed to switch using bfq on /dev/$blkdev" } fi done if [[ "$k8scri" == *"containerd"* ]]; then # Start importing configurations from /etc/containerd/config.d/*.toml. 
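# The commands below are intended to be idempotent: the first sed
# prepends an imports line only when the file has none, and the second
# appends the config.d glob to an already existing imports list.
# Sketch of the intended transformation (illustrative file content):
#
#   imports = ["/a.toml"]   ->   imports = ["/a.toml", "/etc/containerd/config.d/*.toml"]
#
# Note that the second grep below lacks a file argument, so its sed
# fallback likely runs unconditionally.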
vm-command-q "[ -f /etc/containerd/config.toml ] || echo "" > /etc/containerd/config.toml" vm-command-q "grep '^imports' /etc/containerd/config.toml || sed -i '1iimports = [\"/etc/containerd/config.d/*.toml\"]' /etc/containerd/config.toml" vm-command-q "grep -E '^imports.*/etc/containerd/config.d/' || sed -i 's:^\(imports.*\)\]:\1, \"/etc/containerd/config.d/*.toml\"\]:' /etc/containerd/config.toml" # e2e-specific config: tasks-service plugin loads blockio_config_file. vm-pipe-to-file /etc/containerd/config.d/e2e.toml < nsballoon[3] CPUREQ="2" MEMREQ="100M" CPULIM="2" MEMLIM="100M" namespace="e2e-b" CONTCOUNT=2 create balloons-busybox report allowed verify 'cpus["pod4c0"] == cpus["pod4c1"]' \ 'len(cpus["pod4c0"]) == 4' \ 'disjoint_sets(cpus["pod4c0"], cpus["pod3c0"], cpus["pod2c0"], cpus["pod1c0"])' # pod5: new namespace => nsballoon[5] CPUREQ="100m" MEMREQ="100M" CPULIM="100m" MEMLIM="100M" namespace="e2e-c" CONTCOUNT=2 create balloons-busybox report allowed verify 'cpus["pod5c0"] == cpus["pod5c1"]' \ 'len(cpus["pod5c0"]) == 2' \ 'disjoint_sets(cpus["pod5c0"], cpus["pod4c0"], cpus["pod3c0"], cpus["pod2c0"], cpus["pod1c0"])' # pod6: new namespace, but nsballoon[6] cannot be created because all # CPUs are already allocated to balloons. Cannot honor the preference # of spreading different namespaces to different balloon instances # anymore, should fallback to balanced assignment. CPUREQ="100m" MEMREQ="100M" CPULIM="100m" MEMLIM="100M" namespace="e2e-d" CONTCOUNT=2 create balloons-busybox report allowed verify 'cpus["pod6c0"] == cpus["pod6c1"]' cleanup terminate cri-resmgr launch cri-resmgr ================================================ FILE: test/e2e/policies.test-suite/balloons/n4c16/test06-update-configmap/code.var.sh ================================================ # This test verifies that configuration updates via cri-resmgr-agent # are handled properly in the balloons policy. testns=e2e-balloons-test06 cleanup() { vm-command "kubectl delete pods --all --now --wait; \ kubectl delete namespace $testns --now --wait --ignore-not-found || :; \ kubectl delete namespace btype1ns0 --now --wait --ignore-not-found || :" terminate cri-resmgr terminate cri-resmgr-agent vm-command "cri-resmgr -reset-policy; cri-resmgr -reset-config" } apply-configmap() { vm-put-file $(instantiate balloons-configmap.yaml) balloons-configmap.yaml vm-command "cat balloons-configmap.yaml" kubectl apply -f balloons-configmap.yaml } cleanup cri_resmgr_extra_args="-metrics-interval 1s" cri_resmgr_config=fallback launch cri-resmgr launch cri-resmgr-agent kubectl create namespace $testns kubectl create namespace btype1ns0 AVAILABLE_CPU="cpuset:0,4-15" BTYPE2_NAMESPACE0='"*"' BTYPE1_MAXCPUS='0' apply-configmap sleep 3 # pod0 in btype0, annotation CPUREQ=1 MEMREQ="100M" CPULIM=1 MEMLIM="100M" POD_ANNOTATION="balloon.balloons.cri-resource-manager.intel.com: btype0" create balloons-busybox # pod1 in btype1, namespace CPUREQ=1 MEMREQ="100M" CPULIM=1 MEMLIM="100M" namespace="btype1ns0" create balloons-busybox # pod2 in btype2, wildcard namespace CPUREQ=1 MEMREQ="100M" CPULIM=1 MEMLIM="100M" namespace="e2e-balloons-test06" create balloons-busybox vm-command "curl -s $verify_metrics_url" verify-metrics-has-line 'btype0\[0\].*containers=".*pod0:pod0c0' verify-metrics-has-line 'btype1\[0\].*containers=".*pod1:pod1c0' verify-metrics-has-line 'btype2\[0\].*containers=".*pod2:pod2c0' # Remove first two balloon types, change btype2 to match all # namespaces. 
BTYPE0_SKIP=1 BTYPE1_SKIP=1 BTYPE2_NAMESPACE0='"*"' apply-configmap # Note: # pod0 was successfully assigned to and running in balloon of btype0. # Now btype0 was completely removed from the node. # Currently this behavior is undefined. # Possible behaviors: evict pod0, continue assign chain, refuse config... # For now, skip pod0c0 balloon validation: # verify-metrics-has-line '"btype2\[0\]".*pod0:pod0c0' verify-metrics-has-line '"btype2\[0\]".*pod1:pod1c0' verify-metrics-has-line '"btype2\[0\]".*pod2:pod2c0' # Bring back btype0 where pod0 belongs to by annotation. BTYPE1_SKIP=1 BTYPE2_NAMESPACE0='"*"' apply-configmap verify-metrics-has-line '"btype0\[0\]".*pod0:pod0c0' verify-metrics-has-line '"btype2\[0\]".*pod1:pod1c0' verify-metrics-has-line '"btype2\[0\]".*pod2:pod2c0' # Change only CPU classes, no reassigning. verify-metrics-has-line 'btype0\[0\].*pod0:pod0c0.*cpu_class="classA"' verify-metrics-has-line 'btype2\[0\].*pod1:pod1c0.*cpu_class="classC"' verify-metrics-has-line 'btype2\[0\].*pod2:pod2c0.*cpu_class="classC"' BTYPE0_CPUCLASS="classC" BTYPE1_SKIP=1 BTYPE2_CPUCLASS="classB" BTYPE2_NAMESPACE0='"*"' apply-configmap verify-metrics-has-line 'btype0\[0\].*pod0:pod0c0.*cpu_class="classC"' verify-metrics-has-line 'btype2\[0\].*pod1:pod1c0.*cpu_class="classB"' verify-metrics-has-line 'btype2\[0\].*pod2:pod2c0.*cpu_class="classB"' cleanup launch cri-resmgr ================================================ FILE: test/e2e/policies.test-suite/balloons/n4c16/test07-maxballoons/balloons-maxballoons-impossible.cfg ================================================ policy: Active: balloons ReservedResources: CPU: 1 balloons: PinCPU: true PinMemory: true BalloonTypes: - Name: singleton MinCPUs: 2 MaxCPUs: 2 MinBalloons: 1 MaxBalloons: 1 - Name: impossible MinBalloons: 2 MaxBalloons: 1 logger: Debug: policy ================================================ FILE: test/e2e/policies.test-suite/balloons/n4c16/test07-maxballoons/balloons-maxballoons.cfg ================================================ policy: Active: balloons ReservedResources: CPU: 1 balloons: PinCPU: true PinMemory: true BalloonTypes: - Name: singleton MinCPUs: 2 MaxCPUs: 2 MinBalloons: 1 MaxBalloons: 1 - Name: dynamictwo MaxCPUs: 1 MaxBalloons: 2 PreferNewBalloon: true logger: Debug: policy ================================================ FILE: test/e2e/policies.test-suite/balloons/n4c16/test07-maxballoons/code.var.sh ================================================ cleanup() { vm-command "kubectl delete pods --all --now --wait" return 0 } cleanup terminate cri-resmgr cri_resmgr_cfg=${TEST_DIR}/balloons-maxballoons.cfg launch cri-resmgr # pod0: allocate 1500/2000 mCPUs of the singleton balloon CPUREQ="1500m" CPULIM="1500m" POD_ANNOTATION="balloon.balloons.cri-resource-manager.intel.com: singleton" CONTCOUNT=1 create balloons-busybox report allowed verify 'len(cpus["pod0c0"]) == 2' # pod1: allocate the rest 500/2000 mCPUs of the singleton balloon CPUREQ="500m" CPULIM="500m" POD_ANNOTATION="balloon.balloons.cri-resource-manager.intel.com: singleton" CONTCOUNT=1 create balloons-busybox report allowed verify 'cpus["pod0c0"] == cpus["pod1c0"]' # pod2: try to fit in the already full singleton balloon CPUREQ="100m" CPULIM="100m" ( POD_ANNOTATION="balloon.balloons.cri-resource-manager.intel.com: singleton" CONTCOUNT=1 wait_t=5s create balloons-busybox ) && { error "creating pod2 succeeded but was expected to fail with balloon allocation error" } echo "pod2 creation failed with an error as expected" vm-command "kubectl describe pod pod2" if 
! grep -q 'no suitable balloon instance available' <<< "$COMMAND_OUTPUT"; then
    error "could not find 'no suitable balloon instance available' in pod2 description"
fi
vm-command "kubectl delete pod pod2 --now --wait --ignore-not-found"

# pod2: create dynamically the first dynamictwo balloon
CPUREQ="800m" CPULIM="800m" POD_ANNOTATION="balloon.balloons.cri-resource-manager.intel.com: dynamictwo" CONTCOUNT=1 create balloons-busybox
report allowed
verify 'len(cpus["pod2c0"]) == 1'

# pod3: create dynamically the second dynamictwo balloon
CPUREQ="600m" CPULIM="600m" POD_ANNOTATION="balloon.balloons.cri-resource-manager.intel.com: dynamictwo" CONTCOUNT=1 create balloons-busybox
report allowed
verify 'disjoint_sets(cpus["pod2c0"], cpus["pod3c0"])'

# pod4: preferring a new balloon fails, but this fits in the second dynamictwo balloon
CPUREQ="300m" CPULIM="300m" POD_ANNOTATION="balloon.balloons.cri-resource-manager.intel.com: dynamictwo" CONTCOUNT=1 create balloons-busybox
report allowed
verify 'cpus["pod4c0"] == cpus["pod3c0"]'

# pod5: preferring a new balloon fails, and fitting into the existing dynamictwo balloons fails
CPUREQ="300m" CPULIM="300m"
( POD_ANNOTATION="balloon.balloons.cri-resource-manager.intel.com: dynamictwo" CONTCOUNT=1 wait_t=5s create balloons-busybox ) && {
    error "creating pod5 succeeded but was expected to fail with balloon allocation error"
}
vm-command "kubectl describe pod pod5"
if ! grep -q 'no suitable balloon instance available' <<< "$COMMAND_OUTPUT"; then
    error "could not find 'no suitable balloon instance available' in pod5 description"
fi
vm-command "kubectl delete pod pod5 --now --wait --ignore-not-found"

cleanup

# Try starting cri-resmgr with a configuration where MinBalloons and
# MaxBalloons of the same balloon type contradict.
terminate cri-resmgr
( cri_resmgr_cfg=${TEST_DIR}/balloons-maxballoons-impossible.cfg launch cri-resmgr ) && {
    error "starting cri-resmgr succeeded, but was expected to fail due to impossible static balloons"
}
echo "starting cri-resmgr with impossible static balloons configuration failed as expected"
terminate cri-resmgr
launch cri-resmgr

================================================
FILE: test/e2e/policies.test-suite/balloons/n4c16/test08-numa/balloons-numa.cfg
================================================
policy:
  Active: balloons
  AvailableResources:
    CPU: cpuset:0-15
  # Reserve one of our CPUs (cpu15) for kube-system tasks.
  ReservedResources:
    CPU: 1
balloons:
  PinCPU: true
  PinMemory: true
  BalloonTypes:
    - Name: fit-in-numa
      # All (non-system) containers are assigned to this balloon type
      Namespaces:
        - "*"
      # Prevent a balloon from being inflated larger than a NUMA node
      MinCPUs: 0
      MaxCPUs: 4
      AllocatorPriority: 0
      PreferNewBalloons: false

================================================
FILE: test/e2e/policies.test-suite/balloons/n4c16/test08-numa/code.var.sh
================================================
terminate cri-resmgr
cri_resmgr_cfg=${TEST_DIR}/balloons-numa.cfg launch cri-resmgr

# pod0: besteffort, make sure it still gets at least 1 CPU
CPUREQ="" CPULIM="" MEMREQ="" MEMLIM="" CONTCOUNT=1 create balloons-busybox
report allowed
verify 'len(cpus["pod0c0"]) == 1'

# pod1: guaranteed, make sure it gets the CPU it requested.
# The configuration does not prefer creating new balloons,
# so pod0 and pod1 should be placed in the same balloon.
# The sum of their CPU requests is 1, so they should actually
# run on the same CPU.
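# A note on the checks used throughout these scripts: "report allowed"
# records each container's allowed cpuset from the VM, and "verify"
# evaluates its quoted arguments as Python expressions over that data,
# where cpus["pod1c0"] is the set of CPUs container pod1c0 may run on.
# A failing expression fails the test. Hypothetical check (names for
# illustration only):
#
#   verify 'len(cpus["pod1c0"]) == 1' \
#          'cpus["pod0c0"] == cpus["pod1c0"]'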
CPUREQ="1" CPULIM="1" MEMREQ="50M" MEMLIM="50M" CONTCOUNT=1 create balloons-busybox report allowed verify 'len(cpus["pod0c0"]) == 1' \ 'len(cpus["pod1c0"]) == 1' \ 'cpus["pod0c0"] == cpus["pod1c0"]' # pod2: guaranteed, make sure it gets the CPU it requested. CPUREQ="1" CPULIM="1" MEMREQ="50M" MEMLIM="50M" CONTCOUNT=1 create balloons-busybox report allowed verify 'len(cpus["pod0c0"]) == 2' \ 'len(cpus["pod1c0"]) == 2' \ 'len(cpus["pod2c0"]) == 2' \ 'cpus["pod0c0"] == cpus["pod1c0"] == cpus["pod2c0"]' # pod3: guaranteed, make sure it gets the CPU it requested. CPUREQ="1" CPULIM="1" MEMREQ="50M" MEMLIM="50M" CONTCOUNT=1 create balloons-busybox report allowed verify 'len(cpus["pod0c0"]) == 3' \ 'len(cpus["pod1c0"]) == 3' \ 'len(cpus["pod2c0"]) == 3' \ 'len(cpus["pod3c0"]) == 3' \ 'cpus["pod0c0"] == cpus["pod1c0"] == cpus["pod2c0"] == cpus["pod3c0"]' # pod4: guaranteed, fill up a balloon to the MaxCPU CPUREQ="1" CPULIM="1" MEMREQ="50M" MEMLIM="50M" CONTCOUNT=1 create balloons-busybox report allowed verify 'len(cpus["pod0c0"]) == 4' \ 'len(cpus["pod1c0"]) == 4' \ 'len(cpus["pod2c0"]) == 4' \ 'len(cpus["pod3c0"]) == 4' \ 'len(cpus["pod4c0"]) == 4' \ 'cpus["pod0c0"] == cpus["pod1c0"] == cpus["pod2c0"] == cpus["pod3c0"] == cpus["pod4c0"]' # pod5: besteffort, no CPU request, should fit into the full balloon CPUREQ="" CPULIM="" MEMREQ="" MEMLIM="" CONTCOUNT=1 create balloons-busybox report allowed verify 'len(cpus["pod0c0"]) == 4' \ 'len(cpus["pod1c0"]) == 4' \ 'len(cpus["pod2c0"]) == 4' \ 'len(cpus["pod3c0"]) == 4' \ 'len(cpus["pod4c0"]) == 4' \ 'len(cpus["pod5c0"]) == 4' \ 'cpus["pod0c0"] == cpus["pod1c0"] == cpus["pod2c0"] == cpus["pod3c0"] == cpus["pod4c0"] == cpus["pod5c0"]' # pod6: guaranteed, start filling new balloon CPUREQ="1" CPULIM="1" MEMREQ="50M" MEMLIM="50M" CONTCOUNT=1 create balloons-busybox report allowed verify 'len(cpus["pod0c0"]) == 4' \ 'len(cpus["pod1c0"]) == 4' \ 'len(cpus["pod2c0"]) == 4' \ 'len(cpus["pod3c0"]) == 4' \ 'len(cpus["pod4c0"]) == 4' \ 'len(cpus["pod5c0"]) == 4' \ 'len(cpus["pod6c0"]) == 1' \ 'cpus["pod0c0"] == cpus["pod1c0"] == cpus["pod2c0"] == cpus["pod3c0"] == cpus["pod4c0"]' \ 'disjoint_sets(cpus["pod0c0"], cpus["pod6c0"])' # Leave only one guaranteed container to the first balloon. kubectl delete pods pod1 pod2 pod3 --now --wait --ignore-not-found report allowed verify 'len(cpus["pod0c0"]) == 1' \ 'len(cpus["pod4c0"]) == 1' \ 'len(cpus["pod5c0"]) == 1' \ 'len(cpus["pod6c0"]) == 1' \ 'cpus["pod0c0"] == cpus["pod4c0"] == cpus["pod5c0"]' \ 'disjoint_sets(cpus["pod0c0"], cpus["pod6c0"])' # Leave only bestefforts to the first balloon. Make sure they still # have a CPU. 
kubectl delete pods pod4 --now --wait --ignore-not-found report allowed verify 'len(cpus["pod0c0"]) == 1' \ 'len(cpus["pod5c0"]) == 1' \ 'len(cpus["pod6c0"]) == 1' \ 'cpus["pod0c0"] == cpus["pod5c0"]' \ 'disjoint_sets(cpus["pod0c0"], cpus["pod6c0"])' terminate cri-resmgr launch cri-resmgr ================================================ FILE: test/e2e/policies.test-suite/balloons/n4c16/test09-isolated/balloons-isolated.cfg ================================================ policy: Active: balloons ReservedResources: CPU: cpuset:0 balloons: BalloonTypes: - Name: isolated-pods MinCPUs: 0 MaxCPUs: 2 CPUClass: turbo MinBalloons: 2 PreferNewBalloons: true PreferSpreadingPods: false - Name: isolated-ctrs MinCPUs: 1 MaxCPUs: 1 CPUClass: turbo MinBalloons: 2 PreferNewBalloons: true PreferSpreadingPods: true instrumentation: HTTPEndpoint: :8891 PrometheusExport: true logger: Debug: policy Klog: skip_headers: true ================================================ FILE: test/e2e/policies.test-suite/balloons/n4c16/test09-isolated/code.var.sh ================================================ terminate cri-resmgr cri_resmgr_cfg=${TEST_DIR}/balloons-isolated.cfg cri_resmgr_extra_args="-metrics-interval 4s" launch cri-resmgr verify-metrics-has-line 'balloon="isolated-pods\[0\]"' verify-metrics-has-line 'balloon="isolated-pods\[1\]"' verify-metrics-has-no-line 'balloon="isolated-pods\[2\]"' # pod0: besteffort CPUREQ="" CPULIM="" MEMREQ="" MEMLIM="" POD_ANNOTATION="balloon.balloons.cri-resource-manager.intel.com: isolated-pods" CONTCOUNT=2 create balloons-busybox report allowed verify 'len(cpus["pod0c0"]) == 1' \ 'len(cpus["pod0c1"]) == 1' \ 'cpus["pod0c0"] == cpus["pod0c1"]' # Even if the isolated balloon type has PreferNewBalloons=1, adding # this pod0 or pod1 must not create a new balloon because existing # empty balloons should be filled first. 
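# verify-metrics-has-line / verify-metrics-has-no-line (defined in
# balloons/verify.source.sh) poll the Prometheus endpoint with an
# extended regex until it matches (or is absent) or a 10 s timeout
# expires. The same check can be made by hand against the endpoint
# opened by the HTTPEndpoint/PrometheusExport settings in the cfg above:
#
#   curl --silent http://localhost:8891/metrics | grep -E 'balloon="isolated-pods\[2\]"'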
verify-metrics-has-line 'balloon="isolated-pods\[0\]"' verify-metrics-has-line 'balloon="isolated-pods\[1\]"' verify-metrics-has-no-line 'balloon="isolated-pods\[2\]"' # pod1: guaranteed CPUREQ="600m" CPULIM="600m" MEMREQ="100M" MEMLIM="100M" POD_ANNOTATION="balloon.balloons.cri-resource-manager.intel.com: isolated-pods" CONTCOUNT=2 create balloons-busybox report allowed verify 'len(cpus["pod0c0"]) == 1' \ 'len(cpus["pod0c1"]) == 1' \ 'len(cpus["pod1c0"]) == 2' \ 'len(cpus["pod1c1"]) == 2' \ 'cpus["pod0c0"] == cpus["pod0c1"]' \ 'cpus["pod1c0"] == cpus["pod1c1"]' \ 'disjoint_sets(cpus["pod0c0"], cpus["pod1c0"])' verify-metrics-has-line 'balloon="isolated-pods\[0\]"' verify-metrics-has-line 'balloon="isolated-pods\[1\]"' verify-metrics-has-no-line 'balloon="isolated-pods\[2\]"' # pod2: burstable CPUREQ="100m" CPULIM="200m" POD_ANNOTATION="balloon.balloons.cri-resource-manager.intel.com: isolated-pods" CONTCOUNT=2 create balloons-busybox report allowed verify 'len(cpus["pod0c0"]) == 1' \ 'len(cpus["pod0c1"]) == 1' \ 'len(cpus["pod1c0"]) == 2' \ 'len(cpus["pod1c1"]) == 2' \ 'len(cpus["pod2c0"]) == 1' \ 'len(cpus["pod2c1"]) == 1' \ 'cpus["pod0c0"] == cpus["pod0c1"]' \ 'cpus["pod1c0"] == cpus["pod1c1"]' \ 'cpus["pod2c0"] == cpus["pod2c1"]' \ 'disjoint_sets(cpus["pod0c0"], cpus["pod1c0"], cpus["pod2c0"])' verify-metrics-has-line 'balloon="isolated-pods\[0\]"' verify-metrics-has-line 'balloon="isolated-pods\[1\]"' verify-metrics-has-line 'balloon="isolated-pods\[2\]"' verify-metrics-has-no-line 'balloon="isolated-pods\[3\]"' # pod3: isolated containers CPUREQ="" CPULIM="" MEMREQ="" MEMLIM="" POD_ANNOTATION="balloon.balloons.cri-resource-manager.intel.com: isolated-ctrs" CONTCOUNT=4 create balloons-busybox report allowed verify 'len(cpus["pod0c0"]) == 1' \ 'len(cpus["pod0c1"]) == 1' \ 'len(cpus["pod1c0"]) == 2' \ 'len(cpus["pod1c1"]) == 2' \ 'len(cpus["pod2c0"]) == 1' \ 'len(cpus["pod2c1"]) == 1' \ 'len(cpus["pod3c0"]) == 1' \ 'len(cpus["pod3c1"]) == 1' \ 'len(cpus["pod3c2"]) == 1' \ 'len(cpus["pod3c3"]) == 1' \ 'cpus["pod0c0"] == cpus["pod0c1"]' \ 'cpus["pod1c0"] == cpus["pod1c1"]' \ 'cpus["pod2c0"] == cpus["pod2c1"]' \ 'disjoint_sets(cpus["pod0c0"], cpus["pod1c0"], cpus["pod2c0"])' \ 'disjoint_sets(cpus["pod3c0"], cpus["pod3c1"], cpus["pod3c2"], cpus["pod3c3"])' \ 'disjoint_sets(cpus["pod0c0"], cpus["pod1c0"], cpus["pod2c0"], cpus["pod3c0"], cpus["pod3c1"], cpus["pod3c2"], cpus["pod3c3"])' verify-metrics-has-line 'balloon="isolated-pods\[0\]"' verify-metrics-has-line 'balloon="isolated-pods\[1\]"' verify-metrics-has-line 'balloon="isolated-pods\[2\]"' verify-metrics-has-no-line 'balloon="isolated-pods\[3\]"' verify-metrics-has-line 'balloon="isolated-ctrs\[0\]"' verify-metrics-has-line 'balloon="isolated-ctrs\[1\]"' verify-metrics-has-line 'balloon="isolated-ctrs\[2\]"' verify-metrics-has-line 'balloon="isolated-ctrs\[3\]"' terminate cri-resmgr launch cri-resmgr ================================================ FILE: test/e2e/policies.test-suite/balloons/n4c16/test10-allocator-opts/balloons-allocator-opts.cfg ================================================ policy: Active: balloons ReservedResources: CPU: 1 balloons: AllocatorTopologyBalancing: true PreferSpreadOnPhysicalCores: true BalloonTypes: - Name: policydefaults MinCPUs: 2 MinBalloons: 2 - Name: topo1cores0 MinCPUs: 2 MinBalloons: 2 PreferSpreadOnPhysicalCores: false - Name: topo0cores1 AllocatorTopologyBalancing: false PreferSpreadOnPhysicalCores: true - Name: topo0cores0 AllocatorTopologyBalancing: false PreferSpreadOnPhysicalCores: false 
- Name: topo1cores1 AllocatorTopologyBalancing: true PreferSpreadOnPhysicalCores: true instrumentation: HTTPEndpoint: :8891 PrometheusExport: true logger: Debug: policy Klog: skip_headers: true ================================================ FILE: test/e2e/policies.test-suite/balloons/n4c16/test10-allocator-opts/code.var.sh ================================================ cleanup() { vm-command "kubectl delete pods --all --now --wait" return 0 } cleanup # Launch cri-resmgr with wanted metrics update interval and a # configuration that opens the instrumentation http server. terminate cri-resmgr cri_resmgr_cfg=${TEST_DIR}/balloons-allocator-opts.cfg launch cri-resmgr # pod0 in a 2-CPU balloon CPUREQ="100m" MEMREQ="100M" CPULIM="100m" MEMLIM="100M" POD_ANNOTATION="balloon.balloons.cri-resource-manager.intel.com: policydefaults" CONTCOUNT=1 create balloons-busybox report allowed verify 'len(cores["pod0c0"]) == 2' \ 'len(cpus["pod0c0"]) == 2' # pod1 in a 2-CPU balloon CPUREQ="100m" MEMREQ="100M" CPULIM="100m" MEMLIM="100M" POD_ANNOTATION="balloon.balloons.cri-resource-manager.intel.com: topo1cores0" CONTCOUNT=1 create balloons-busybox report allowed verify 'len(cores["pod1c0"]) == 1' \ 'len(cpus["pod1c0"]) == 2' # pod2: container 0 resizes first from 0 to 1, container 2 from 1 to 2 CPUs, # use more cores CPUREQ="1" MEMREQ="100M" CPULIM="1" MEMLIM="100M" POD_ANNOTATION="balloon.balloons.cri-resource-manager.intel.com: topo1cores1" CONTCOUNT=2 create balloons-busybox report allowed verify 'len(cores["pod2c0"]) == 2' \ 'len(cpus["pod2c0"]) == 2' \ 'cpus["pod2c0"] == cpus["pod2c1"]' # pod3: container 0 resizes first from 0 to 1, container 2 from 1 to 2 CPUs, # pack tightly CPUREQ="1" MEMREQ="100M" CPULIM="1" MEMLIM="100M" POD_ANNOTATION="balloon.balloons.cri-resource-manager.intel.com: topo0cores0" CONTCOUNT=2 create balloons-busybox report allowed verify 'len(cores["pod3c0"]) == 1' \ 'len(cpus["pod3c0"]) == 2' \ 'cpus["pod3c0"] == cpus["pod3c1"]' cleanup ================================================ FILE: test/e2e/policies.test-suite/balloons/n4c16/topology.var.json ================================================ [ {"mem": "2G", "cores": 2, "nodes": 2, "packages": 2} ] ================================================ FILE: test/e2e/policies.test-suite/balloons/n4c32/test01-dynamic-baloons/balloons-dynamic.cfg ================================================ policy: Active: balloons ReservedResources: cpu: cpuset:31 balloons: AllocatorTopologyBalancing: true BalloonTypes: - Name: dynamic MaxCPUs: 32 MaxBalloons: 8 PreferNewBalloons: true ShareIdleCpusInSame: numa instrumentation: HTTPEndpoint: :8891 PrometheusExport: true logger: Debug: policy Klog: skip_headers: true ================================================ FILE: test/e2e/policies.test-suite/balloons/n4c32/test01-dynamic-baloons/code.var.sh ================================================ terminate cri-resmgr cri_resmgr_cfg=${TEST_DIR}/balloons-dynamic.cfg cri_resmgr_extra_args="-metrics-interval 4s" launch cri-resmgr # pod0-pod7: create 8 balloons, where each lands on a different NUMA node. # Each balloon (except one that lands on the NUMA node with reserved CPUs) # has 1 shared CPU at the most since a NUMA node has 4 CPUs and a pod is # requesting 1 CPU. Only one of the balloon that using NUMA node with #reserved CPU has 0 shared CPUs. 
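# CPUREQLIM and INITCPUREQLIM hold one "request-limit" entry per
# container; multicontainerpod.yaml.in (below) splits each entry with
# bash parameter expansion. A minimal sketch of the pattern:
#
#   reqlim="100m-200m"
#   echo "${reqlim/-*/}"   # -> 100m (the request)
#   echo "${reqlim/*-}"    # -> 200m (the limit)
#
# A bare entry such as "3" matches neither pattern and is therefore
# used as both request and limit.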
CPUREQLIM="3" INITCPUREQLIM="100m-100m 100m-100m 100m-100m" POD_ANNOTATION="balloon.balloons.cri-resource-manager.intel.com: dynamic" n=8 create multicontainerpod verify-metrics-has-line 'balloon="dynamic\[0\]".*cpus_count="3"*' verify-metrics-has-line 'balloon="dynamic\[1\]".*cpus_count="3"*' verify-metrics-has-line 'balloon="dynamic\[2\]".*cpus_count="3"*' verify-metrics-has-line 'balloon="dynamic\[3\]".*cpus_count="3"*' verify-metrics-has-line 'balloon="dynamic\[4\]".*cpus_count="3"*' verify-metrics-has-line 'balloon="dynamic\[5\]".*cpus_count="3"*' verify-metrics-has-line 'balloon="dynamic\[6\]".*cpus_count="3"*' verify-metrics-has-line 'balloon="dynamic\[7\]".*cpus_count="3"*' verify-metrics-has-no-line 'cpus_count="4"' verify-metrics-has-line 'sharedidlecpus_count="1"' verify-metrics-has-line 'cpus_allowed_count="4"' verify-metrics-has-line 'sharedidlecpus_count="0"' verify-metrics-has-line 'cpus_allowed_count="3"' verify-metrics-has-no-line 'sharedidlecpus_count="2"' verify-metrics-has-no-line 'cpus_allowed_count="5"' verify 'disjoint_sets(nodes["pod0c0"], nodes["pod1c0"], nodes["pod2c0"], nodes["pod3c0"], nodes["pod4c0"], nodes["pod5c0"], nodes["pod6c0"], nodes["pod7c0"])' \ 'len(nodes["pod0c0"]) == len(nodes["pod1c0"]) == len(nodes["pod2c0"]) == \ len(nodes["pod3c0"]) == len(nodes["pod4c0"]) == len(nodes["pod5c0"]) == \ len(nodes["pod6c0"]) == len(nodes["pod7c0"]) == 1' # pod8: Add one more pod with 2 CPUs to inflate over NUMAs nodes, which should cross # the NUMA node boundaries but not the die boundaries. Because two NUMA nodes can offer # 2 CPUs in total. CPUREQLIM="2" POD_ANNOTATION="balloon.balloons.cri-resource-manager.intel.com: dynamic" create multicontainerpod verify-metrics-has-line 'cpus_count="5"' verify-metrics-has-line 'sharedidlecpus="",sharedidlecpus_count="0"' verify-metrics-has-line 'sharedidlecpus_count="1"' verify 'len(nodes["pod8c0"])==2' \ 'len(dies["pod8c0"])==1' \ 'len(packages["pod8c0"])==1' kubectl delete pod pod8 --now --wait --ignore-not-found verify-metrics-has-no-line 'cpus_count="5"' # pod9: Add one more pod with 4 CPUs to inflate over dies, which should cross # the NUMA node boundaries as well as dies boundaries. Since 2 dies under the # same package can offer 4 CPUs, we should not cross the package boundaries. CPUREQLIM="4" POD_ANNOTATION="balloon.balloons.cri-resource-manager.intel.com: dynamic" create multicontainerpod verify 'len(nodes["pod9c0"])==4' \ 'len(dies["pod9c0"])==2' \ 'len(packages["pod9c0"])==1' kubectl delete pod pod9 --now --wait --ignore-not-found verify 'disjoint_sets(nodes["pod0c0"], nodes["pod1c0"], nodes["pod2c0"], nodes["pod3c0"], nodes["pod4c0"], nodes["pod5c0"], nodes["pod6c0"], nodes["pod7c0"])' \ # pod9: Add one more pod with 7 CPUs to inflate over packages, which should cross # NUMA node, dies and package boundaries. At this point, there is no free CPUs # left on the host, so no shared CPUs. CPUREQLIM="6 1" POD_ANNOTATION="balloon.balloons.cri-resource-manager.intel.com: dynamic" create multicontainerpod verify 'len(nodes["pod10c0"])==7' \ 'len(dies["pod10c0"])==4' \ 'len(packages["pod10c0"])==2' verify-metrics-has-line 'sharedidlecpus="",sharedidlecpus_count="0"' verify-metrics-has-no-line 'sharedidlecpus_count="1"' # pod0, pod9 deflate. This should free up 10 CPUs that will cause having # shared CPUs available again. 
kubectl delete pod pod10 --now --wait --ignore-not-found kubectl delete pod pod0 --now --wait --ignore-not-found verify-metrics-has-line 'sharedidlecpus_count="1"' ================================================ FILE: test/e2e/policies.test-suite/balloons/n4c32/test01-dynamic-baloons/multicontainerpod.yaml.in ================================================ apiVersion: v1 kind: Pod metadata: name: ${NAME} $(if [ -n "$POD_ANNOTATION" ]; then echo " annotations: $POD_ANNOTATION "; fi) labels: app: ${NAME} spec: containers: $(contnum=0; for reqlim in ${CPUREQLIM}; do echo " - name: ${NAME}c${contnum} image: busybox imagePullPolicy: IfNotPresent command: - sh - -c - ${WORK}echo ${NAME}c${contnum} \$(sleep inf) $(if [ -n "${reqlim}" ]; then echo " resources: $(if [ -n "${reqlim/-*}" ]; then echo " requests: cpu: ${reqlim/-*/} "; fi) $(if [ -n "${reqlim/*-/}" ]; then echo " limits: cpu: ${reqlim/*-} "; fi) "; fi) "; contnum=$((contnum + 1)); done ) $(if [ -n "$INITCPUREQLIM" ]; then echo " initContainers: $(contnum=0; for initreqlim in ${INITCPUREQLIM}; do echo " - name: ${NAME}c${contnum}-init image: busybox imagePullPolicy: IfNotPresent command: - sh - -c - ${WORK}echo ${NAME}c${contnum}-init \$(sleep 1) $(if [ -n "${initreqlim}" ]; then echo " resources: $(if [ -n "${initreqlim/-*}" ]; then echo " requests: cpu: ${initreqlim/-*/} "; fi) $(if [ -n "${initreqlim/*-/}" ]; then echo " limits: cpu: ${initreqlim/*-} "; fi) "; fi) "; contnum=$((contnum + 1)); done ) "; fi) terminationGracePeriodSeconds: 1 ================================================ FILE: test/e2e/policies.test-suite/balloons/n4c32/topology.var.json ================================================ [ {"mem": "2G", "cores": 2, "nodes": 2, "dies": 2, "packages": 2} ] ================================================ FILE: test/e2e/policies.test-suite/balloons/verify.source.sh ================================================ # Utilities to verify data from metrics verify_metrics_url="http://localhost:8891/metrics" verify-metrics-has-line() { local expected_line="$1" vm-run-until --timeout 10 "echo 'waiting for metrics line: $expected_line' >&2; curl --silent $verify_metrics_url | grep -E '$expected_line'" || { command-error "expected line '$1' missing from the output" } } verify-metrics-has-no-line() { local unexpected_line="$1" vm-run-until --timeout 10 "echo 'checking absense of metrics line: $unexpected_line' >&2; ! curl --silent $verify_metrics_url | grep -Eq '$unexpected_line'" || { command-error "unexpected line '$1' found from the output" } } ================================================ FILE: test/e2e/policies.test-suite/check-correct-policy.source.sh ================================================ # This script does a policy check before the real test code is started. cache_policy="$(vm-command-q "cat /var/lib/cri-resmgr/cache" | jq -r .PolicyName)" cfg_policy=$(awk '/Active:/{print $2}' < "$cri_resmgr_cfg") if [ -n "$cache_policy" ] && [ -n "$cfg_policy" ] && [ "$cache_policy" != "$cfg_policy" ]; then echo "cri-resmgr is been started with policy \"$cache_policy\", switching to \"$cfg_policy\"" terminate cri-resmgr echo "destroying cri-resmgr cache with previous policy" vm-command "rm -rf /var/lib/cri-resmgr" launch cri-resmgr fi ================================================ FILE: test/e2e/policies.test-suite/dynamic-pools/cri-resmgr.cfg ================================================ policy: Active: dynamic-pools # Use only 15 CPUs in total, leave cpu0 for other than Kubernetes # processes. 
AvailableResources: CPU: cpuset:1-15 # Reserve one of our CPUs for kube-system tasks. ReservedResources: CPU: 1 dynamic-pools: PinCPU: true PinMemory: true DynamicPoolTypes: - Name: "pool1" Namespaces: - "pool1" CPUClass: "pool1-cpuclass" - Name: "pool2" Namespaces: - "pool2" CPUClass: "pool2-cpuclass" instrumentation: HTTPEndpoint: :8891 PrometheusExport: true logger: Debug: policy Klog: skip_headers: true cpu: classes: default: minFreq: 800 maxFreq: 2800 pool1-cpuclass: minFreq: 900 maxFreq: 2900 pool2-cpuclass: minFreq: 1000 maxFreq: 3000 ================================================ FILE: test/e2e/policies.test-suite/dynamic-pools/dyp-busybox.yaml.in ================================================ apiVersion: v1 kind: Pod metadata: name: ${NAME} $(if [ -n "$POD_ANNOTATION" ]; then echo " annotations: $POD_ANNOTATION "; fi) labels: app: ${NAME} spec: containers: $(for contnum in $(seq 1 ${CONTCOUNT}); do echo " - name: ${NAME}c$(( contnum - 1 )) image: busybox imagePullPolicy: IfNotPresent command: - sh - -c - ${WORK}echo ${NAME}c$(( contnum - 1 )) \$(sleep inf) $(if [ -n "${CPUREQ}" ]; then echo " resources: requests: cpu: ${CPUREQ} $(if [ -n "${MEMREQ}" ]; then echo " memory: '${MEMREQ}' "; fi) $(if [ -n "${CPULIM}" ]; then echo " limits: cpu: ${CPULIM} $(if [ -n "$MEMLIM" ]; then echo " memory: '${MEMLIM}' "; fi) "; fi) "; fi) "; done ) terminationGracePeriodSeconds: 1 ================================================ FILE: test/e2e/policies.test-suite/dynamic-pools/dyp-configmap.yaml.in ================================================ apiVersion: v1 kind: ConfigMap metadata: name: cri-resmgr-config.default namespace: kube-system data: policy: |+ Active: dynamic-pools AvailableResources: CPU: ${AVAILABLE_CPU:-cpuset:0-15} ReservedResources: CPU: ${RESERVED_CPU:-1} dynamic-pools: PinCPU: ${PINCPU:-true} PinMemory: ${PINMEMORY:-true} DynamicPoolTypes: $([ -n "$DYPTYPE0_SKIP" ] || echo " - Name: dyptype0 AllocatorPriority: ${DYPTYPE0_ALLOCATORPRIORITY:-0} CPUClass: ${DYPTYPE0_CPUCLASS:-classA} ") $([ -n "$DYPTYPE1_SKIP" ] || echo " - Name: dyptype1 Namespaces: - ${DYPTYPE1_NAMESPACE0:-dyptype1ns0} AllocatorPriority: ${DYPTYPE1_ALLOCATORPRIORITY:-1} CPUClass: ${DYPTYPE1_CPUCLASS:-classB} ") $([ -n "$DYPTYPE2_SKIP" ] || echo " - Name: dyptype2 Namespaces: - ${DYPTYPE2_NAMESPACE0:-dyptype2ns0} - ${DYPTYPE2_NAMESPACE1:-dyptype2ns1} AllocatorPriority: ${DYPTYPE2_ALLOCATORPRIORITY:-2} CPUClass: ${DYPTYPE2_CPUCLASS:-classC} ") instrumentation: |+ HTTPEndpoint: :8891 PrometheusExport: true logger: |+ Debug: policy cpu: |+ classes: default: minFreq: ${CPU_DEFAULT_MIN:-800} maxFreq: ${CPU_DEFAULT_MAX:-2800} classA: minFreq: ${CPU_CLASSA_MIN:-900} maxFreq: ${CPU_CLASSA_MAX:-2900} classB: minFreq: ${CPU_CLASSB_MIN:-1000} maxFreq: ${CPU_CLASSB_MAX:-3000} classC: minFreq: ${CPU_CLASSC_MIN:-1100} maxFreq: ${CPU_CLASSC_MAX:-3100} energyPerformancePreference: ${CPU_CLASSC_EPP:-1} ================================================ FILE: test/e2e/policies.test-suite/dynamic-pools/n4c16/test01-basic-placement/code.var.sh ================================================ # Test placing containers with and without annotations to correct dynamic pools # reserved and shared CPUs. cleanup() { vm-command "kubectl delete pods pod0 -n kube-system; kubectl delete pods -n pool1 --all --now; kubectl delete pods --all --now; kubectl delete namespace pool1" return 0 } cleanup terminate cri-resmgr launch cri-resmgr # pod0: run on reserved CPUs. 
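# create instantiates the dyp-busybox.yaml.in template above with the
# variables set on the same command line: CONTCOUNT chooses the number
# of containers and CPUREQ/MEMREQ/CPULIM/MEMLIM fill in the resources
# section; pods are named pod0, pod1, ... in creation order. For
# example, a guaranteed two-container pod:
#
#   CPUREQ="500m" MEMREQ="100M" CPULIM="500m" MEMLIM="100M" CONTCOUNT=2 create dyp-busybox
#
# With AvailableResources cpuset:1-15 and one CPU reserved, the shared
# pool plus all dynamic pools always total 15 - 1 = 14 CPUs, which the
# "== 14" checks below rely on.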
namespace=kube-system CONTCOUNT=2 create dyp-busybox report allowed verify 'cpus["pod0c0"] == cpus["pod0c1"]' \ 'len(cpus["pod0c0"]) == 1' # pod1: run in shared dynamic pool. # We do not add annotations to this pod, and we do not set any # namespace, so this pod is expected to be created to the shared pool. create dyp-busybox report allowed verify 'len(cpus["pod1c0"]) == 14' # The size of each dynamic pool is obtained by adding the requests of the containers in this pool and the CPUs allocated based on cpu utilization, # so the size of each dynamic pool is greater than or equal to the sum of the requests of the containers in the pool. # pod2: run in the pool1. CPUREQ="100m" MEMREQ="100M" CPULIM="100m" MEMLIM="100M" POD_ANNOTATION="dynamic-pool.dynamic-pools.cri-resource-manager.intel.com/pod: pool1" CONTCOUNT=1 create dyp-busybox report allowed verify 'len(cpus["pod2c0"]) >= 1' \ 'len(cpus["pod1c0"]) + len(cpus["pod2c0"]) == 14' \ 'disjoint_sets(cpus["pod2c0"], cpus["pod1c0"])' # pod3: run in the pool1. CPUREQ="1500m" MEMREQ="100M" CPULIM="1500m" MEMLIM="100M" POD_ANNOTATION="dynamic-pool.dynamic-pools.cri-resource-manager.intel.com/pod: pool1" CONTCOUNT=1 create dyp-busybox report allowed verify 'cpus["pod2c0"] == cpus["pod3c0"]' \ 'len(cpus["pod3c0"]) >= 2' \ 'len(cpus["pod1c0"]) + len(cpus["pod3c0"]) == 14' \ 'disjoint_sets(cpus["pod1c0"], cpus["pod3c0"])' # pod4: run in the pool2. CPUREQ="1500m" MEMREQ="100M" CPULIM="1500m" MEMLIM="100M" POD_ANNOTATION="dynamic-pool.dynamic-pools.cri-resource-manager.intel.com/pod: pool2" CONTCOUNT=2 create dyp-busybox report allowed verify 'cpus["pod4c0"] == cpus["pod4c1"]' \ 'len(cpus["pod4c0"]) >= 3' \ 'len(cpus["pod3c0"]) >= 2' \ 'len(cpus["pod1c0"]) + len(cpus["pod3c0"]) + len(cpus["pod4c0"]) == 14' \ 'disjoint_sets(cpus["pod4c0"], cpus["pod3c0"], cpus["pod1c0"])' # pod5: run in the pool1. CPUREQ="1500m" MEMREQ="100M" CPULIM="1500m" MEMLIM="100M" kubectl create namespace "pool1" namespace="pool1" CONTCOUNT=1 create dyp-busybox report allowed verify 'cpus["pod5c0"] == cpus["pod2c0"]'\ 'len(cpus["pod5c0"]) >= 4' \ 'len(cpus["pod4c0"]) >= 3' \ 'len(cpus["pod1c0"]) + len(cpus["pod3c0"]) + len(cpus["pod4c0"]) == 14' cleanup ================================================ FILE: test/e2e/policies.test-suite/dynamic-pools/n4c16/test02-prometheus-metrics/code.var.sh ================================================ # This test verifies prometheus metrics from the dynamic-pools policy. cleanup() { vm-command "kubectl delete pods --all --now" terminate cri-resmgr terminate cri-resmgr-agent vm-command "cri-resmgr -reset-policy; cri-resmgr -reset-config" return 0 } cleanup # Launch cri-resmgr with wanted metrics update interval and a # configuration that opens the instrumentation http server. cri_resmgr_cfg=${TEST_DIR}/dyp-metrics.cfg cri_resmgr_extra_args="-metrics-interval 1s" launch cri-resmgr sleep 10 verify-metrics-has-line 'dynamicPool="shared"' verify-metrics-has-line 'dynamicPool="reserved"' verify-metrics-has-line 'dynamicPool="full-core"' verify-metrics-has-line 'dynamicPool="flex"' verify-metrics-has-line 'dynamicPool="fast-dualcore"' # pod0: run in shared dynamic pool. 
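# The DynamicPools gauge encodes the pool composition in its labels and
# the pool's current CPU count in its value. The check below matches a
# metrics line of the form (abbreviated):
#
#   DynamicPools{containers="pod0:pod0c0,pod0:pod0c1",dynamicPool="shared",...,tot_req_millicpu="200"} 15
#
# i.e. both containers of pod0 are in the shared pool, which currently
# spans 15 CPUs.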
CPUREQ="100m" MEMREQ="100M" CPULIM="100m" MEMLIM="100M" CONTCOUNT=2 create dyp-busybox report allowed verify-metrics-has-line 'dynamicPool="reserved"' verify-metrics-has-line 'dynamicPool="full-core"' verify-metrics-has-line 'dynamicPool="flex"' verify-metrics-has-line 'dynamicPool="fast-dualcore"' verify-metrics-has-line 'DynamicPools{containers="pod0:pod0c0,pod0:pod0c1",cpu_class="",cpus=".*",dynamicPool="shared",dynamicPool_type="shared",mems=".*",tot_limit_millicpu="200",tot_req_millicpu="200"} 15' # pod1: run in fast-dualcore dynamic pool. CPUREQ="200m" MEMREQ="" CPULIM="200m" MEMLIM="" POD_ANNOTATION="dynamic-pool.dynamic-pools.cri-resource-manager.intel.com/pod: fast-dualcore" CONTCOUNT=1 create dyp-busybox report allowed verify-metrics-has-line 'containers="pod1:pod1c0".*dynamicPool="fast-dualcore",dynamicPool_type="fast-dualcore".*tot_req_millicpu="(199|200)"' verify 'len(cpus["pod1c0"]) >= 1' # pod2: run in flex dynamic pool. CPUREQ="3500m" MEMREQ="" CPULIM="3500m" MEMLIM="" POD_ANNOTATION="dynamic-pool.dynamic-pools.cri-resource-manager.intel.com/pod: flex" CONTCOUNT=1 create dyp-busybox report allowed verify-metrics-has-line 'containers="pod2:pod2c0".*dynamicPool="flex",dynamicPool_type="flex"' verify 'len(cpus["pod2c0"]) >= 4' # pod3: run in flex dynamic pool. CPUREQ="1200m" MEMREQ="" CPULIM="1200m" MEMLIM="" POD_ANNOTATION="dynamic-pool.dynamic-pools.cri-resource-manager.intel.com/pod: flex" CONTCOUNT=1 create dyp-busybox report allowed verify-metrics-has-line 'containers="pod2:pod2c0,pod3:pod3c0".*dynamicPool="flex",dynamicPool_type="flex"' verify 'len(cpus["pod2c0"]) >= 5' # Resize flex dynamic pool in metrics. kubectl delete pods --now pod3 verify-metrics-has-line 'containers="pod2:pod2c0".*dynamicPool="flex",dynamicPool_type="flex"' verify 'len(cpus["pod2c0"]) >= 4' kubectl delete pods --now pod2 sleep 5 verify-metrics-has-line 'containers="".*dynamicPool="flex",dynamicPool_type="flex".*0' # Delete all pods in shared dynamic pool. kubectl delete pods --now pod0 # pod4: run in fast-dualcore dynamic pool, all CPUs are allocated to fast-dualcore dynamic pool. CPUREQ="14" MEMREQ="" CPULIM="14" MEMLIM="" POD_ANNOTATION="dynamic-pool.dynamic-pools.cri-resource-manager.intel.com/pod: fast-dualcore" CONTCOUNT=1 create dyp-busybox report allowed verify-metrics-has-line 'containers="pod1:pod1c0,pod4:pod4c0".*dynamicPool="fast-dualcore",dynamicPool_type="fast-dualcore".*15' verify 'len(cpus["pod1c0"]) == 15' cleanup ================================================ FILE: test/e2e/policies.test-suite/dynamic-pools/n4c16/test02-prometheus-metrics/dyp-metrics.cfg ================================================ policy: Active: dynamic-pools AvailableResources: CPU: cpuset:0-15 # Reserve one of our CPUs for kube-system tasks. ReservedResources: CPU: cpuset:0 dynamic-pools: DynamicPoolTypes: - Name: full-core CPUClass: normal - Name: fast-dualcore CPUClass: turbo - Name: flex CPUClass: slow instrumentation: HTTPEndpoint: :8891 PrometheusExport: true logger: Debug: policy Klog: skip_headers: true ================================================ FILE: test/e2e/policies.test-suite/dynamic-pools/n4c16/test03-rebalancing/code.var.sh ================================================ # Re-launch cri-resmgr with the rebalancing parameter in order to # enable rebalancing calls. (See help of the "launch" function for # more options.) 
cleanup() {
    vm-command "kubectl delete pods --all --now"
    return 0
}
cleanup

terminate cri-resmgr
cri_resmgr_extra_args="-metrics-interval 1s -rebalance-interval 2s" launch cri-resmgr
sleep 10

# Create three pods:
# - pod0 to "shared"
# - pod1 to "pool1"
# - pod2 to "pool2"
create dyp-busybox
POD_ANNOTATION="dynamic-pool.dynamic-pools.cri-resource-manager.intel.com/pod: pool1" create dyp-busybox
POD_ANNOTATION="dynamic-pool.dynamic-pools.cri-resource-manager.intel.com/pod: pool2" create dyp-busybox

# Print initial CPU pinning.
report allowed

# Wait at least one rebalancing round.
sleep 3
verify 'len(cpus["pod0c0"]) >= 1'
verify 'len(cpus["pod1c0"]) >= 1'
verify 'len(cpus["pod2c0"]) >= 1'
verify-metrics-has-line 'containers="pod0:pod0c0".*dynamicPool="shared",dynamicPool_type="shared"'
verify-metrics-has-line 'containers="pod1:pod1c0".*dynamicPool="pool1",dynamicPool_type="pool1"'
verify-metrics-has-line 'containers="pod2:pod2c0".*dynamicPool="pool2",dynamicPool_type="pool2"'

# Increase CPU usage of pod1 to 200% (each gzip invocation compressing
# an endless stream burns one full CPU; redirections reconstructed, the
# extraction stripped the original ones).
vm-command "nohup kubectl exec pod1 -- /bin/sh -c 'gzip < /dev/zero > /dev/null' </dev/null >&/dev/null &"
vm-command "nohup kubectl exec pod1 -- /bin/sh -c 'gzip < /dev/zero > /dev/null' </dev/null >&/dev/null &"

# Wait at least one rebalancing round and print CPU pinning.
sleep 10
report allowed

# Now "pool1" has 200% CPU load, "shared" and "pool2" have 0%.
# Verify that the number of CPUs in pool1 is the largest.
verify 'len(cpus["pod1c0"]) > len(cpus["pod0c0"])'
verify 'len(cpus["pod1c0"]) > len(cpus["pod2c0"])'
verify 'len(cpus["pod0c0"]) + len(cpus["pod1c0"]) + len(cpus["pod2c0"]) == 14'

# Remove CPU load from pool1 and put 100% CPU load to pool2.
vm-command "pkill gzip"
vm-command "nohup kubectl exec pod2 -- /bin/sh -c 'gzip < /dev/zero > /dev/null' </dev/null >&/dev/null &"

# Wait at least one rebalancing round and print CPU pinning.
sleep 10
report allowed

# Verify that the number of CPUs in pool2 is the largest.
verify 'len(cpus["pod2c0"]) > len(cpus["pod0c0"])'
verify 'len(cpus["pod2c0"]) > len(cpus["pod1c0"])'
verify 'len(cpus["pod0c0"]) + len(cpus["pod1c0"]) + len(cpus["pod2c0"]) == 14'

# Remove CPU load from pool2 and put 100% CPU load to both pool1 and pool2.
vm-command "pkill gzip"
vm-command "nohup kubectl exec pod1 -- /bin/sh -c 'gzip < /dev/zero > /dev/null' </dev/null >&/dev/null &"
vm-command "nohup kubectl exec pod2 -- /bin/sh -c 'gzip < /dev/zero > /dev/null' </dev/null >&/dev/null &"

# It takes time to reach a balanced state.
sleep 10
report allowed

# Verify that the number of CPUs in pool1 is greater than or equal to 6 and less than or equal to 8.
# Verify that the number of CPUs in pool2 is greater than or equal to 6 and less than or equal to 8.
verify 'len(cpus["pod0c0"]) == 1' verify 'len(cpus["pod1c0"]) >= 6' verify 'len(cpus["pod1c0"]) <= 8' verify 'len(cpus["pod2c0"]) >= 6' verify 'len(cpus["pod2c0"]) <= 8' # Remove CPU load from pool1 and pool2 vm-command "pkill gzip" cleanup ================================================ FILE: test/e2e/policies.test-suite/dynamic-pools/n4c16/test04-reserved/code.var.sh ================================================ terminate cri-resmgr cri_resmgr_cfg=${TEST_DIR}/dyp-reserved.cfg launch cri-resmgr cleanup() { vm-command \ "kubectl delete pod -n kube-system --now pod0 kubectl delete pod -n monitor-mypods --now pod1 kubectl delete pod -n system-logs --now pod2 kubectl delete pod -n kube-system --now pod3 kubectl delete pods --now pod4 pod5 pod6 kubectl delete pod -n kube-system --now pod7 kubectl delete namespace monitor-mypods kubectl delete namespace system-logs kubectl delete namespace my-exact-name" return 0 } cleanup kubectl create namespace monitor-mypods kubectl create namespace system-logs kubectl create namespace my-exact-name # pod0: kube-system CPUREQ="100m" MEMREQ="100M" CPULIM="100m" MEMLIM="100M" namespace=kube-system create dyp-busybox report allowed verify 'cpus["pod0c0"] == {"cpu00", "cpu01", "cpu02"}' # pod1: match first ReservedPoolNamespaces glob, multicontainer CPUREQ="1" MEMREQ="" CPULIM="1" MEMLIM="" namespace=monitor-mypods CONTCOUNT=2 create dyp-busybox report allowed verify 'cpus["pod1c0"] == cpus["pod0c0"]' \ 'cpus["pod1c1"] == cpus["pod0c0"]' # pod2: match last ReservedPoolNamespaces glob, slightly overbook reserved CPU CPUREQ="1" MEMREQ="" CPULIM="1" MEMLIM="" namespace=system-logs create dyp-busybox report allowed verify 'cpus["pod2c0"] == cpus["pod0c0"]' # pod3: force a kube-system pod to full-core dynamic pool using an annotation CPUREQ="2" MEMREQ="" CPULIM="2" MEMLIM="" POD_ANNOTATION="dynamic-pool.dynamic-pools.cri-resource-manager.intel.com/pod: full-core" namespace=kube-system create dyp-busybox report allowed verify 'len(cpus["pod3c0"]) >= 2' \ 'disjoint_sets(cpus["pod0c0"], cpus["pod3c0"])' # pod4: run in shared dynamic pool CPUREQ="2500m" MEMREQ="" CPULIM="2500m" MEMLIM="" create dyp-busybox report allowed verify 'len(cpus["pod4c0"]) >= 3' \ 'disjoint_sets(cpus["pod0c0"], cpus["pod3c0"], cpus["pod4c0"])' # pod5: annotate otherwise a default pod to the reserved CPUs, # severely overbook reserved CPUs CPUREQ="2500m" MEMREQ="" CPULIM="2500m" MEMLIM="" POD_ANNOTATION="dynamic-pool.dynamic-pools.cri-resource-manager.intel.com/pod: reserved" create dyp-busybox report allowed verify 'cpus["pod5c0"] == {"cpu00", "cpu01", "cpu02"}' \ 'disjoint_sets(cpus["pod5c0"], cpus["pod3c0"], cpus["pod4c0"])' cleanup # Now that all pods are deleted, make sure that cpus of reserved and # default dynamic pools are as expected. 
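# ReservedPoolNamespaces in dyp-reserved.cfg (below) lists glob
# patterns; pods in any matching namespace land on the reserved CPUs,
# as pod1 and pod2 above demonstrated. The matching is equivalent to
# shell globbing:
#
#   case monitor-mypods in monitor-*) echo match;; esac   # -> match
#   case system-logs    in *-log*)    echo match;; esac   # -> match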
# pod6: run in shared dynamic pool CPUREQ="999m" MEMREQ="" CPULIM="999m" MEMLIM="" create dyp-busybox report allowed verify 'len(cpus["pod6c0"]) >= 1' # pod7: kube-system CPUREQ="100m" MEMREQ="100M" CPULIM="100m" MEMLIM="100M" namespace=kube-system create dyp-busybox report allowed verify 'cpus["pod7c0"] == {"cpu00", "cpu01", "cpu02"}' cleanup terminate cri-resmgr launch cri-resmgr ================================================ FILE: test/e2e/policies.test-suite/dynamic-pools/n4c16/test04-reserved/dyp-reserved.cfg ================================================ policy: Active: dynamic-pools ReservedResources: CPU: cpuset:0-2 dynamic-pools: PinCPU: true PinMemory: true ReservedPoolNamespaces: - "monitor-*" - "*-log*" DynamicPoolTypes: - Name: reserved Namespaces: - my-exact-name CPUClass: reserved-class - Name: default - Name: full-core CPUClass: turbo logger: Debug: policy Klog: skip_headers: true ================================================ FILE: test/e2e/policies.test-suite/dynamic-pools/n4c16/test05-namespace/code.var.sh ================================================ terminate cri-resmgr cri_resmgr_cfg=${TEST_DIR}/dyp-namespace.cfg launch cri-resmgr cleanup() { vm-command \ "kubectl delete pods -n e2e-a --all --now kubectl delete pods -n e2e-b --all --now kubectl delete pods -n e2e-c --all --now kubectl delete pods -n e2e-d --all --now kubectl delete pods --all --now kubectl delete namespace e2e-a kubectl delete namespace e2e-b kubectl delete namespace e2e-c kubectl delete namespace e2e-d" return 0 } cleanup kubectl create namespace e2e-a kubectl create namespace e2e-b kubectl create namespace e2e-c kubectl create namespace e2e-d # pod0: create in the default namespace, CPUREQ is nil, both containers go to shared dynamic pool. CPUREQ="" CONTCOUNT=2 create dyp-busybox report allowed verify 'cpus["pod0c0"] == cpus["pod0c1"]' \ 'len(cpus["pod0c0"]) == 15' # pod1: create in the e2e-a namespace, CPUREQ is nil, both containers go to shared dynamic pool. CPUREQ="" namespace="e2e-a" CONTCOUNT=2 create dyp-busybox report allowed verify 'cpus["pod1c0"] == cpus["pod1c1"] == cpus["pod0c0"]' \ 'len(cpus["pod1c0"]) == 15' \ # pod2: create in the default namespace, CPUREQ is 2*2, both containers go to nsdyp dynamic pool. CPUREQ="2" MEMREQ="100M" CPULIM="2" MEMLIM="100M" CONTCOUNT=2 create dyp-busybox report allowed verify 'cpus["pod2c0"] == cpus["pod2c1"]' \ 'len(cpus["pod2c0"]) >= 4' \ 'disjoint_sets(cpus["pod2c0"], cpus["pod1c0"])' \ 'disjoint_sets(cpus["pod2c0"], cpus["pod0c0"])' # pod3: create again in the default namespace, CPUREQ is 200m*2, both containers go to nsdyp dynamic pool. CPUREQ="100m" MEMREQ="100M" CPULIM="100m" MEMLIM="100M" CONTCOUNT=2 create dyp-busybox report allowed verify 'cpus["pod3c0"] == cpus["pod3c1"] == cpus["pod2c0"]' \ 'len(cpus["pod3c0"]) >= 5' # pod4: create in the e2e-b namespace, CPUREQ is 2*2, both containers go to nsdyp dynamic pool. CPUREQ="2" MEMREQ="100M" CPULIM="2" MEMLIM="100M" namespace="e2e-b" CONTCOUNT=2 create dyp-busybox report allowed verify 'cpus["pod4c0"] == cpus["pod4c1"] == cpus["pod3c0"] == cpus["pod2c0"]' \ 'len(cpus["pod4c0"]) >= 9' # pod5: create in the e2e-c namespace, CPUREQ is 100m*2, both containers go to nsdyp dynamic pool. 
CPUREQ="100m" MEMREQ="100M" CPULIM="100m" MEMLIM="100M" namespace="e2e-c" CONTCOUNT=2 create dyp-busybox report allowed verify 'cpus["pod5c0"] == cpus["pod5c1"] == cpus["pod4c0"] == cpus["pod3c0"] == cpus["pod2c0"]' \ 'len(cpus["pod5c0"]) >= 9' cleanup terminate cri-resmgr launch cri-resmgr ================================================ FILE: test/e2e/policies.test-suite/dynamic-pools/n4c16/test05-namespace/dyp-namespace.cfg ================================================ policy: Active: dynamic-pools ReservedResources: CPU: 1 dynamic-pools: PinCPU: true PinMemory: true DynamicPoolTypes: - Name: nsdyp Namespaces: - "*" logger: Debug: policy ================================================ FILE: test/e2e/policies.test-suite/dynamic-pools/n4c16/test06-update-configmap/code.var.sh ================================================ # This test verifies that configuration updates via cri-resmgr-agent # are handled properly in the dynamic-pools policy. testns=e2e-dyp-test06 cleanup() { vm-command "kubectl delete pods --all --now; \ kubectl delete pods -n $testns --all --now; \ kubectl delete pods -n dyptype1ns0 --all --now; \ kubectl delete namespace $testns || :; \ kubectl delete namespace dyptype1ns0 || :" terminate cri-resmgr terminate cri-resmgr-agent vm-command "cri-resmgr -reset-policy; cri-resmgr -reset-config" } apply-configmap() { vm-put-file $(instantiate dyp-configmap.yaml) dyp-configmap.yaml vm-command "cat dyp-configmap.yaml" kubectl apply -f dyp-configmap.yaml } cleanup cri_resmgr_extra_args="-metrics-interval 1s" cri_resmgr_config=fallback launch cri-resmgr launch cri-resmgr-agent kubectl create namespace $testns kubectl create namespace dyptype1ns0 AVAILABLE_CPU="cpuset:1,4-15" DYPTYPE2_NAMESPACE0='"*"' apply-configmap sleep 3 # pod0 run in dyptype0, annotation CPUREQ=1 MEMREQ="100M" CPULIM=1 MEMLIM="100M" POD_ANNOTATION="dynamic-pool.dynamic-pools.cri-resource-manager.intel.com/pod: dyptype0" create dyp-busybox # pod1 run in dyptype1, namespace CPUREQ=1 MEMREQ="100M" CPULIM=1 MEMLIM="100M" namespace="dyptype1ns0" create dyp-busybox # pod2 run in dyptype2, wildcard namespace CPUREQ=1 MEMREQ="100M" CPULIM=1 MEMLIM="100M" namespace="e2e-dyp-test06" create dyp-busybox sleep 3 vm-command "curl -s $verify_metrics_url" verify-metrics-has-line 'pod0:pod0c0.*"dyptype0"' verify-metrics-has-line 'pod1:pod1c0.*"dyptype1"' verify-metrics-has-line 'pod2:pod2c0.*"dyptype2"' # Remove first two dynamic pool types, change dyptype2 to match all # namespaces. DYPTYPE0_SKIP=1 DYPTYPE1_SKIP=1 DYPTYPE2_NAMESPACE0='"*"' apply-configmap # Note: # pod0 was successfully assigned to and running in dyptype0 dynamic pool. # Now dyptype0 was completely removed from the node. # Currently this behavior is undefined. # Possible behaviors: evict pod0, continue assign chain, refuse config... # For now, skip pod0c0 dynamic pool validation: # verify-metrics-has-line '"dyptype2".*pod0:pod0c0' verify-metrics-has-line 'pod1:pod1c0.*"dyptype2"' verify-metrics-has-line 'pod2:pod2c0.*"dyptype2"' # Bring back dyptype0 where pod0 belongs to by annotation. DYPTYPE1_SKIP=1 DYPTYPE2_NAMESPACE0='"*"' apply-configmap verify-metrics-has-line 'pod0:pod0c0.*"dyptype0"' verify-metrics-has-line 'pod1:pod1c0.*"dyptype2"' verify-metrics-has-line 'pod2:pod2c0.*"dyptype2"' # Change only CPU classes, no reassigning. 
verify-metrics-has-line 'pod0:pod0c0.*cpu_class="classA".*"dyptype0"' verify-metrics-has-line 'pod1:pod1c0.*cpu_class="classC".*"dyptype2"' verify-metrics-has-line 'pod2:pod2c0.*cpu_class="classC".*"dyptype2"' DYPTYPE0_CPUCLASS="classC" DYPTYPE1_SKIP=1 DYPTYPE2_CPUCLASS="classB" DYPTYPE2_NAMESPACE0='"*"' apply-configmap verify-metrics-has-line 'pod0:pod0c0.*cpu_class="classC".*"dyptype0"' verify-metrics-has-line 'pod1:pod1c0.*cpu_class="classB".*"dyptype2"' verify-metrics-has-line 'pod2:pod2c0.*cpu_class="classB".*"dyptype2"' cleanup launch cri-resmgr ================================================ FILE: test/e2e/policies.test-suite/dynamic-pools/n4c16/test07-numa/code.var.sh ================================================ terminate cri-resmgr cri_resmgr_cfg=${TEST_DIR}/dyp-numa.cfg launch cri-resmgr # pod0: besteffort, go to shared dynamic pool, make sure it still gets at least 1 CPU. CPUREQ="" CPULIM="" MEMREQ="" MEMLIM="" CONTCOUNT=1 create dyp-busybox report allowed verify 'len(cpus["pod0c0"]) == 15' # pod1: guaranteed, go to fit-in-numa dynamic pool, make sure it gets the CPU it requested. CPUREQ="1" CPULIM="1" MEMREQ="50M" MEMLIM="50M" CONTCOUNT=1 create dyp-busybox report allowed verify 'len(cpus["pod0c0"]) >= 1' \ 'len(cpus["pod1c0"]) >= 1' \ 'disjoint_sets(cpus["pod0c0"], cpus["pod1c0"])' # pod2: guaranteed, go to fit-in-numa dynamic pool, make sure it gets the CPU it requested. CPUREQ="1" CPULIM="1" MEMREQ="50M" MEMLIM="50M" CONTCOUNT=1 create dyp-busybox report allowed verify 'len(cpus["pod0c0"]) >= 1' \ 'len(cpus["pod1c0"]) >= 2' \ 'len(cpus["pod2c0"]) >= 2' \ 'cpus["pod1c0"] == cpus["pod2c0"]' \ 'disjoint_sets(cpus["pod0c0"], cpus["pod2c0"])' # pod3: guaranteed, go to fit-in-numa dynamic pool, make sure it gets the CPU it requested. CPUREQ="1" CPULIM="1" MEMREQ="50M" MEMLIM="50M" CONTCOUNT=1 create dyp-busybox report allowed verify 'len(cpus["pod0c0"]) >= 1' \ 'len(cpus["pod1c0"]) >= 3' \ 'len(cpus["pod2c0"]) >= 3' \ 'len(cpus["pod3c0"]) >= 3' \ 'cpus["pod1c0"] == cpus["pod2c0"] == cpus["pod3c0"]' \ 'disjoint_sets(cpus["pod0c0"], cpus["pod3c0"])' # pod4: guaranteed, go to fit-in-numa dynamic pool, make sure it gets the CPU it requested. CPUREQ="1" CPULIM="1" MEMREQ="50M" MEMLIM="50M" CONTCOUNT=1 create dyp-busybox report allowed verify 'len(cpus["pod0c0"]) >= 1' \ 'len(cpus["pod1c0"]) >= 4' \ 'len(cpus["pod2c0"]) >= 4' \ 'len(cpus["pod3c0"]) >= 4' \ 'len(cpus["pod4c0"]) >= 4' \ 'cpus["pod1c0"] == cpus["pod2c0"] == cpus["pod3c0"] == cpus["pod4c0"]' \ 'disjoint_sets(cpus["pod0c0"], cpus["pod4c0"])' # pod5: besteffort, no CPU request, should fit into the shared dynamic pool. CPUREQ="" CPULIM="" MEMREQ="" MEMLIM="" CONTCOUNT=1 create dyp-busybox report allowed verify 'len(cpus["pod0c0"]) >= 1' \ 'len(cpus["pod1c0"]) >= 4' \ 'len(cpus["pod2c0"]) >= 4' \ 'len(cpus["pod3c0"]) >= 4' \ 'len(cpus["pod4c0"]) >= 4' \ 'len(cpus["pod5c0"]) >= 1' \ 'cpus["pod1c0"] == cpus["pod2c0"] == cpus["pod3c0"] == cpus["pod4c0"]' \ 'cpus["pod0c0"] == cpus["pod5c0"]' # Leave only one guaranteed container to the fit-in-numa dynamic pool. kubectl delete pods pod1 pod2 pod3 --now report allowed verify 'len(cpus["pod0c0"]) >= 1' \ 'len(cpus["pod4c0"]) >= 1' \ 'len(cpus["pod5c0"]) >= 1' \ 'cpus["pod0c0"] == cpus["pod5c0"]' \ 'disjoint_sets(cpus["pod0c0"], cpus["pod4c0"])' # Leave only bestefforts to the dynamic pool. 
kubectl delete pods pod4 --now
report allowed
verify 'len(cpus["pod0c0"]) >= 1' \
       'len(cpus["pod5c0"]) >= 1' \
       'cpus["pod0c0"] == cpus["pod5c0"]'

terminate cri-resmgr
launch cri-resmgr

================================================
FILE: test/e2e/policies.test-suite/dynamic-pools/n4c16/test07-numa/dyp-numa.cfg
================================================
policy:
  Active: dynamic-pools
  AvailableResources:
    CPU: cpuset:0-15
  # Reserve one of our CPUs (cpu15) for kube-system tasks.
  ReservedResources:
    CPU: 1
dynamic-pools:
  PinCPU: true
  PinMemory: true
  DynamicPoolTypes:
    - Name: fit-in-numa
      # All (non-system) containers are assigned to this dynamic pool
      # type
      Namespaces:
        - "*"

================================================
FILE: test/e2e/policies.test-suite/dynamic-pools/n4c16/topology.var.json
================================================
[
    {"mem": "2G", "cores": 2, "nodes": 2, "packages": 2}
]

================================================
FILE: test/e2e/policies.test-suite/dynamic-pools/verify.source.sh
================================================
# Utilities to verify data from metrics

verify_metrics_url="http://localhost:8891/metrics"

verify-metrics-has-line() {
    local expected_line="$1"
    vm-run-until --timeout 10 "echo 'waiting for metrics line: $expected_line' >&2; curl --silent $verify_metrics_url | grep -E '$expected_line'" || {
        command-error "expected line '$1' missing from the output"
    }
}

verify-metrics-has-no-line() {
    local unexpected_line="$1"
    vm-run-until --timeout 10 "echo 'checking absence of metrics line: $unexpected_line' >&2; ! curl --silent $verify_metrics_url | grep -Eq '$unexpected_line'" || {
        command-error "unexpected line '$1' found in the output"
    }
}

================================================
FILE: test/e2e/policies.test-suite/podpools/cri-resmgr.cfg
================================================
policy:
  Active: podpools
  # Use 14 CPUs in total.
  AvailableResources:
    CPU: cpuset:2-15
  # One CPU is dedicated for reserved tasks, 13 CPUs left.
  ReservedResources:
    CPU: cpuset:15
podpools:
  PinCPU: true
  PinMemory: true
  Pools:
    # Take 3 CPUs for "singlecpu" podpools, 10 CPUs left.
    - Name: singlecpu
      CPU: 1
      MaxPods: 2
      Instances: 3 CPUs
      # Not defining the pool fill order equals the default:
      # fillOrder: Balanced.
    # Take at most ~6.5 CPUs (= 50% * 13) for "dualcpu" pools.
    # Allocating 2 CPUs per pool allows instantiating 3 pools,
    # that is, 6 CPUs are actually taken.
    # 4 CPUs left.
    # Leftover CPUs will be shared among pods and containers not in
    # pools.
    - Name: dualcpu
      CPU: 2
      MaxPods: 3
      Instances: 50 %
      FillOrder: Packed
logger:
  Debug: cri-resmgr,resource-manager,cache,policy
  Klog:
    skip_headers: true

================================================
FILE: test/e2e/policies.test-suite/podpools/n4c16/podpools-configmap.yaml.in
================================================
apiVersion: v1
kind: ConfigMap
metadata:
  name: cri-resmgr-config.default
  namespace: kube-system
data:
  policy: |+
    Active: podpools
    ReservedResources:
      CPU: 1
    podpools:
      Pools:
        - Name: $NAME
          Instances: $INSTANCES
          CPU: $CPU
          MaxPods: $MAXPODS
  logger: |+
    Debug: resource-manager,cache,policy,memory

================================================
FILE: test/e2e/policies.test-suite/podpools/n4c16/py_consts.var.py
================================================
# This file captures expected CPU allocator behavior when the podpools
# policy is started with the test default cri-resmgr configuration on
# the n4c16 topology.

# cri-resmgr output on constructed pools.
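# Note: on the n4c16 test topology (4 NUMA nodes with 4 CPUs each) CPU N
# belongs to memory node N//4, which is what _add_expected_pool below
# relies on when deriving expected.mems from expected.cpus. For example,
# the pool line "singlecpu[2]{cpus:4, ...}" parses into
#   expected.cpus.singlecpu[2] == {"cpu04"}
#   expected.mems.singlecpu[2] == {"node1"}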
expected_podpools_output = """ podpools policy pools: - pool 0: reserved[0]{cpus:15, mems:3, pods:0/0, containers:0} - pool 1: default[0]{cpus:5,12-14, mems:1,3, pods:0/0, containers:0} - pool 2: singlecpu[0]{cpus:2, mems:0, pods:0/2, containers:0} - pool 3: singlecpu[1]{cpus:3, mems:0, pods:0/2, containers:0} - pool 4: singlecpu[2]{cpus:4, mems:1, pods:0/2, containers:0} - pool 5: dualcpu[0]{cpus:6-7, mems:1, pods:0/3, containers:0} - pool 6: dualcpu[1]{cpus:8-9, mems:2, pods:0/3, containers:0} - pool 7: dualcpu[2]{cpus:10-11, mems:2, pods:0/3, containers:0} """ # 1. Parse expected_podpools_output into # expected.cpus.POOLNAME[INSTANCE] = {"cpuNN", ...} # 2. Calculate memory nodes based on expected.cpus into # expected.mems.POOLNAME[INSTANCE] = {"nodeN", ...} # (do not read these from output in order to verify its correctness) # # As the result: # expected.cpus.singlecpu == [{"cpu02"}, {"cpu03"}, {"cpu04"}] # expected.mems.singlecpu == [{"node0"}, {"node0"}, {"node1"}] import re class expected: class cpus: pass class mems: pass def _add_expected_pool(poolname, poolindex, cpuset): cpus = [] for cpurange in cpuset.split(","): lower_upper = [int(n) for n in cpurange.split("-")] if len(lower_upper) == 1: cpus.append(lower_upper[0]) else: cpus.extend([i for i in range(lower_upper[0], lower_upper[1]+1)]) if not hasattr(expected.cpus, poolname): setattr(expected.cpus, poolname, []) setattr(expected.mems, poolname, []) getattr(expected.cpus, poolname).append(set('cpu%s' % (str(cpu).zfill(2),) for cpu in cpus)) getattr(expected.mems, poolname).append(set("node%s" % (cpu//4,) for cpu in cpus)) for poolname, poolindex, cpuset in re.findall(r': ([a-z]+)\[([0-9]+)\]\{cpus:([0-9,-]+), ', expected_podpools_output): _add_expected_pool(poolname, poolindex, cpuset) ================================================ FILE: test/e2e/policies.test-suite/podpools/n4c16/test01-basic-placement/code.var.sh ================================================ # Test placing containers with and without annotations to correct pools # reserved and shared CPUs. ( kubectl delete pods pod3 -n kube-system --now --wait --ignore-not-found ) || true # pod0: singlecpu out "" out "### Multicontainer pod, all containers run on single CPU" # singlecpu pool has capacity for two pods => 500 mCPU/pod # test with 3 containers per pod => 167 mCPU/container CPUREQ="167m" MEMREQ="" CPULIM="" MEMLIM="" POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: singlecpu" CONTCOUNT=3 create podpools-busybox report allowed verify 'cpus["pod0c0"] == cpus["pod0c1"] == cpus["pod0c2"]' \ 'cpus["pod0c0"] == expected.cpus.singlecpu[0]' \ 'mems["pod0c0"] == expected.mems.singlecpu[0]' # pod1: dualcpu out "" out "### Multicontainer pod, all containers run on two CPUs." POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: dualcpu" CONTCOUNT=3 create podpools-busybox report allowed verify 'cpus["pod1c0"] == cpus["pod1c1"] == cpus["pod1c2"]' \ 'cpus["pod1c0"] == expected.cpus.dualcpu[0]' \ 'mems["pod1c1"] == expected.mems.dualcpu[0]' # pod2: default out "" out "### Multicontainer pod, no annotations. Runs on shared CPUs." CONTCOUNT=3 create podpools-busybox report allowed verify 'cpus["pod2c0"] == cpus["pod2c1"] == cpus["pod2c2"]' \ 'cpus["pod2c0"] == expected.cpus.default[0]' \ 'mems["pod2c2"] == expected.mems.default[0]' # pod3: reserved out "" out "### Multicontainer pod in kube-system namespace. Runs on reserved CPUs." 
namespace=kube-system CONTCOUNT=3 create podpools-busybox
report allowed
verify 'cpus["pod3c0"] == cpus["pod3c1"] == cpus["pod3c2"]' \
       'cpus["pod3c0"] == expected.cpus.reserved[0]' \
       'mems["pod3c0"] == expected.mems.reserved[0]'
kubectl delete pods pod3 -n kube-system --now --wait --ignore-not-found

# pod4: bad pool name
out ""
out "### Single container pod, fallback to the default pool."
POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: non-existing-pool" create podpools-busybox
report allowed
verify 'cpus["pod4c0"] == expected.cpus.default[0]' \
       'mems["pod4c0"] == expected.mems.default[0]'

kubectl delete pods pod0 pod1 pod2 --now --wait --ignore-not-found

================================================
FILE: test/e2e/policies.test-suite/podpools/n4c16/test02-fill-order/code.var.sh
================================================
# Test filling pools with pods in the correct order
# Test only BestEffort containers
CPUREQ="" MEMREQ="" CPULIM="" MEMLIM=""

# pod0..2: balanced filling, every singlecpu pool should have one pod
out "### Filling singlecpu pool in Balanced fill order"
n=3 POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: singlecpu" CONTCOUNT=2 create podpools-busybox
report allowed
verify 'cpus["pod0c0"] == cpus["pod0c1"]' \
       'cpus["pod1c0"] == cpus["pod1c1"]' \
       'cpus["pod2c0"] == cpus["pod2c1"]' \
       'len(cpus["pod0c0"]) == 1' \
       'len(cpus["pod1c0"]) == 1' \
       'len(cpus["pod2c0"]) == 1' \
       'disjoint_sets(cpus["pod0c0"], cpus["pod1c0"], cpus["pod2c0"])'

# pod3..5: balanced filling up to max, every singlecpu pool should have two pods
n=3 POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: singlecpu" CONTCOUNT=2 create podpools-busybox
report allowed
verify 'cpus["pod0c0"] == cpus["pod0c1"]' \
       'cpus["pod1c0"] == cpus["pod1c1"]' \
       'cpus["pod2c0"] == cpus["pod2c1"]' \
       'len(cpus["pod0c0"]) == 1' \
       'len(cpus["pod1c0"]) == 1' \
       'len(cpus["pod2c0"]) == 1' \
       'disjoint_sets(cpus["pod0c0"], cpus["pod1c0"], cpus["pod2c0"])' \
       'cpus["pod3c0"] == cpus["pod3c1"]' \
       'cpus["pod4c0"] == cpus["pod4c1"]' \
       'cpus["pod5c0"] == cpus["pod5c1"]' \
       'len(cpus["pod3c0"]) == 1' \
       'len(cpus["pod4c0"]) == 1' \
       'len(cpus["pod5c0"]) == 1' \
       'disjoint_sets(cpus["pod3c0"], cpus["pod4c0"], cpus["pod5c0"])' \
       'cpus["pod5c0"] == cpus["pod2c0"]' # the last pool should have been filled by pods 2 and 5

# make a little room in the first pool and clear the last pool
kubectl delete pods pod0 pod2 pod5 --now --wait --ignore-not-found
# pod6: Balanced fill order should place this pod in the last pool (it has maximal free space)
POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: singlecpu" CONTCOUNT=1 create podpools-busybox
report allowed
verify 'disjoint_sets(cpus["pod6c0"], set.union(cpus["pod1c0"], cpus["pod3c0"], cpus["pod4c0"]))'

kubectl delete pods --all --now --wait
reset counters

out "### Filling dualcpu pool in Packed fill order"
# pod0..2: should go to the first pool
n=3 POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: dualcpu" CONTCOUNT=1 create podpools-busybox
report allowed
verify 'cpus["pod0c0"] == cpus["pod1c0"] == cpus["pod2c0"]'
# pod3..5: should go to the second pool
n=3 POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: dualcpu" CONTCOUNT=1 create podpools-busybox
report allowed
verify 'cpus["pod0c0"] == cpus["pod1c0"] == cpus["pod2c0"]' \
       'cpus["pod3c0"] == cpus["pod4c0"] == cpus["pod5c0"]' \
       'disjoint_sets(cpus["pod0c0"], cpus["pod3c0"])'

# Deleting two pods from the first pool, one from the last.
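# The deletion below leaves dualcpu[0] with one pod (pod2, two free
# slots) and dualcpu[1] with two pods (pod3 and pod4, one free slot),
# so Packed fill order should reuse dualcpu[1] for pod6, as verified
# below.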
kubectl delete pods pod0 pod1 pod5 --now --wait --ignore-not-found
# pod6: Packed fill order should place this in the last pool (it has minimal free space)
POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: dualcpu" CONTCOUNT=1 create podpools-busybox
report allowed
verify 'cpus["pod3c0"] == cpus["pod4c0"] == cpus["pod6c0"]' \
       'disjoint_sets(cpus["pod2c0"], cpus["pod6c0"])'

================================================
FILE: test/e2e/policies.test-suite/podpools/n4c16/test03-qos/code.var.sh
================================================
# Test pods of all QoS classes in a pool, on reserved and on shared CPUs.
# Verify that CFS CPU shares are set correctly in all cases.

vm-put-file "$HOST_PROJECT_DIR/scripts/testing/kube-cgroups" "/usr/local/bin/kube-cgroups"

verify-cpushare() {
    podXcY=$1
    expected_cgv1=$2
    expected_cgv2=$3
    vm-command "kube-cgroups -n . -c $podXcY -f 'cpu.(shares|weight)\$'"
    CPU_SHARES_WEIGHT=$(echo "$COMMAND_OUTPUT" | awk '/cpu.*:/{print $2}')
    if [ "$CPU_SHARES_WEIGHT" = "$expected_cgv1" ]; then
        echo "verified cpu.shares of $podXcY == $expected_cgv1"
    elif [ "$CPU_SHARES_WEIGHT" = "$expected_cgv2" ]; then
        echo "verified cpu.weight of $podXcY == $expected_cgv2"
    else
        echo "assertion failed when verifying $podXcY: got '$COMMAND_OUTPUT' expected 'cpu.shares=$expected_cgv1' or 'cpu.weight=$expected_cgv2'"
        exit 1
    fi
}

CPUREQ="" MEMREQ="" CPULIM="" MEMLIM="" POD_ANNOTATION=""

out "### Assigning BestEffort, Burstable and Guaranteed pods to the same (dualcpu) pool"
# pod0c0: besteffort
POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: dualcpu" create podpools-busybox
# pod1c0: burstable
POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: dualcpu" CPUREQ=500m create podpools-busybox
# pod2c0: guaranteed
POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: dualcpu" CPUREQ=1 CPULIM=1 MEMREQ=100M MEMLIM=100M create podpools-busybox
report allowed
verify-cpushare pod0c0 2 1
verify-cpushare pod1c0 512 20
verify-cpushare pod2c0 1024 39
kubectl delete pods --all --now --wait
reset counters

out "### Assigning BestEffort, Burstable and Guaranteed pods to shared CPUs"
# pod0c0: besteffort
create podpools-busybox
# pod1c0: burstable
CPUREQ=500m create podpools-busybox
# pod2c0: guaranteed
CPUREQ=1 CPULIM=1 MEMREQ=100M MEMLIM=100M create podpools-busybox
report allowed
verify-cpushare pod0c0 2 1
verify-cpushare pod1c0 512 20
verify-cpushare pod2c0 1024 39
kubectl delete pods --all --now --wait
reset counters

out "### Assigning BestEffort, Burstable and Guaranteed pods to reserved CPUs"
# pod0c0: besteffort
namespace=kube-system create podpools-busybox
# pod1c0: burstable
namespace=kube-system CPUREQ=500m create podpools-busybox
# pod2c0: guaranteed
namespace=kube-system CPUREQ=1 CPULIM=1 MEMREQ=100M MEMLIM=100M create podpools-busybox
report allowed
verify-cpushare pod0c0 2 1
verify-cpushare pod1c0 512 20
verify-cpushare pod2c0 1024 39
kubectl delete pods pod0 pod1 pod2 -n kube-system --now --wait --ignore-not-found
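# Background for the expected verify-cpushare values above -- an
# informal sketch assuming the usual kubelet/runc conversions, not
# something this test itself defines:
#   cgroup v1: cpu.shares = max(2, milliCPU * 1024 / 1000)
#              besteffort -> 2, 500m -> 512, 1000m -> 1024
#   cgroup v2: cpu.weight = 1 + ((cpu.shares - 2) * 9999) / 262142
#              2 -> 1, 512 -> 20, 1024 -> 39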
================================================
FILE: test/e2e/policies.test-suite/podpools/n4c16/test04-overbook-cpus/code.var.sh
================================================
# Test CPU request warnings and errors:
# - Overbooked CPU sets
# - Bad CPU requests: mismatch between pool CPUs per pod and container CPU requests

CRI_RESMGR_OUTPUT="cat cri-resmgr.output.txt"

# pod0: overbook with single burstable pod and container
POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: dualcpu" CPUREQ=2900m CPULIM="" MEMREQ="" MEMLIM="" create podpools-busybox
report allowed
vm-command "$CRI_RESMGR_OUTPUT | grep -E '^E.*overbooked.*(2899|2900)m'" || error "missing overbook warning"
kubectl delete pods --all --now --wait

# pod1: overbook with single burstable pod with two containers
POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: dualcpu" CPUREQ=1050m CPULIM="" MEMREQ="" MEMLIM="" CONTCOUNT=2 create podpools-busybox
report allowed
vm-command "$CRI_RESMGR_OUTPUT | grep -E '^E.*overbooked.*2100m'" || error "missing overbook warning"
kubectl delete pods --all --now --wait

# pod2, pod3: overbook with two guaranteed pods, one container in each pod
n=2 POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: dualcpu" CPUREQ=1001m MEMREQ=100M CPULIM=1001m MEMLIM=100M create podpools-busybox
report allowed
vm-command "$CRI_RESMGR_OUTPUT | grep -E '^E.*overbooked.*2002m'" || error "missing overbook warning"
kubectl delete pods --all --now --wait

# pod4, pod5: no overbooking with exact CPUs guaranteed + besteffort pod
terminate cri-resmgr # restart to clear log
launch cri-resmgr
POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: dualcpu" CPUREQ=1000m CPULIM=1000m MEMREQ=100M MEMLIM=100M CONTCOUNT=2 create podpools-busybox
POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: dualcpu" CPUREQ="" CPULIM="" MEMREQ="" MEMLIM="" create podpools-busybox
report allowed
vm-command "$CRI_RESMGR_OUTPUT | grep -E '^E.*overbooked'" && error "overbook warning with maximum allowed load"
kubectl delete pods --all --now --wait
# podpools logs misaligned CPU requests after pod deletion
vm-command "$CRI_RESMGR_OUTPUT | grep -E '^E.*bad CPU requests:.*pod4.* requested 2000 mCPUs.* 666 mCPUs'" || error "bad CPU request from pod4 expected but not found"
vm-command "$CRI_RESMGR_OUTPUT | grep -E '^E.*bad CPU requests:.*pod5.* requested 0 mCPUs.* 666 mCPUs'" || error "bad CPU request from pod5 expected but not found"

# pod6: request 4 * 167 mCPU, which is almost the required 666 mCPU.
# This should not trigger a bad CPU request error.
POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: dualcpu" CPUREQ=167m CPULIM="" MEMREQ="" MEMLIM="" CONTCOUNT=4 create podpools-busybox
vm-command "$CRI_RESMGR_OUTPUT | grep -E '^E.*bad CPU requests:.*pod6'" && error "pod6 CPU request was ok, but 'bad CPU request' error found"
kubectl delete pods --all --now --wait

================================================
FILE: test/e2e/policies.test-suite/podpools/n4c16/test05-agent-updates-config/code.var.sh
================================================
# Relaunch cri-resmgr so that it will listen to cri-resmgr-agent
cleanup() {
    vm-command "kubectl delete pod -n kube-system pod0 --now --wait --ignore-not-found; kubectl delete pods --all --now --wait; kubectl delete cm -n kube-system cri-resmgr-config.default"
    terminate cri-resmgr
    terminate cri-resmgr-agent
    vm-command "cri-resmgr -reset-policy; cri-resmgr -reset-config"
}
cleanup
cri_resmgr_config=fallback launch cri-resmgr
launch cri-resmgr-agent

# Create a pod in every pod pool in the default config:
# reserved, shared, singlecpu, dualcpu
# pod0: reserved
CPUREQ="" namespace=kube-system create podpools-busybox
# pod1: default
CPUREQ="" create podpools-busybox
# pod2: singlecpu
CPUREQ="1" POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: singlecpu" create podpools-busybox
# pod3, pod4, pod5, pod6: dualcpu (dualcpu 3 pods/pool, packed)
n=4 CPUREQ="1" POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: dualcpu" create podpools-busybox
report allowed
verify "cpus['pod0c0'] == expected.cpus.reserved[0]" \
       "cpus['pod1c0'] == expected.cpus.default[0]" \
       "cpus['pod2c0'] == expected.cpus.singlecpu[0]" \
       "cpus['pod3c0'] == expected.cpus.dualcpu[0]" \
       "cpus['pod4c0'] == expected.cpus.dualcpu[0]" \
       "cpus['pod5c0'] == expected.cpus.dualcpu[0]" \
       "cpus['pod6c0'] == expected.cpus.dualcpu[1]"

echo "Switch to new configuration without singlecpu pools"
vm-put-file $(NAME=dualcpu CPU=2 MAXPODS=2 INSTANCES="100 %" instantiate podpools-configmap.yaml) podpools-dualcpu-configmap.yaml
kubectl apply -f podpools-dualcpu-configmap.yaml
sleep 5
report allowed
verify "cpus['pod0c0'] == expected.cpus.reserved[0]" `# reserved remains the same` \
       "len(cpus['pod1c0']) == 1" `# the default pool has only one CPU` \
       "cpus['pod2c0'] == cpus['pod1c0']" `# no singlecpu pool -> assign to default` \
       `# there are many dualcpu pools (1 out of 2 pods/pool, balanced)` \
       "len(cpus['pod3c0']) == 2" \
       "len(cpus['pod4c0']) == 2" \
       "len(cpus['pod5c0']) == 2" \
       "len(cpus['pod6c0']) == 2" \
       "disjoint_sets(cpus['pod3c0'], cpus['pod4c0'], cpus['pod5c0'], cpus['pod6c0'])"

echo "Negative test: try switching to an invalid configuration, check that assignments have not changed"
vm-put-file $(NAME=borked CPU=130 MAXPODS=2 INSTANCES=1 instantiate podpools-configmap.yaml) podpools-borked-configmap.yaml
kubectl apply -f podpools-borked-configmap.yaml
sleep 5
report allowed
verify "cpus['pod0c0'] == {'cpu15'}" \
       "cpus['pod1c0'] == cpus['pod2c0']" \
       "disjoint_sets(cpus['pod3c0'], cpus['pod4c0'], cpus['pod5c0'], cpus['pod6c0'])"

echo "After the broken reconfiguration attempt, switch to a valid configuration without dualcpu pools"
# This configuration leaves no left-over CPUs for the default pool
# => the default pool will use the same CPUs as the reserved pool.
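# Arithmetic behind the comment above, assuming the 16-CPU n4c16 VM:
# ReservedResources CPU:1 takes one CPU, and "Instances: 100 %" of the
# remaining 15 CPUs at CPU:1 per pool creates 15 singlecpu pools, so no
# CPUs are left over for the default pool.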
vm-put-file $(NAME=singlecpu CPU=1 MAXPODS=1 INSTANCES="100 %" instantiate podpools-configmap.yaml) podpools-dualcpu-configmap.yaml
kubectl apply -f podpools-dualcpu-configmap.yaml
sleep 5
report allowed
verify "cpus['pod0c0'] == expected.cpus.reserved[0]" `# reserved remains the same` \
       "cpus['pod1c0'] == expected.cpus.reserved[0]" `# the default pool equals the reserved pool` \
       "len(cpus['pod2c0']) == 1" `# pod2 in singlecpu[0]` \
       "disjoint_sets(cpus['pod2c0'], expected.cpus.reserved[0])" \
       `# all dualcpu pods end up in the default pool` \
       "cpus['pod3c0'] == cpus['pod4c0'] == cpus['pod5c0'] == cpus['pod6c0']" \
       "cpus['pod3c0'] == expected.cpus.reserved[0]"

echo "Not enough dualcpu pools for all running dualcpu pods, the rest fall back to the default pool"
vm-put-file $(NAME=dualcpu CPU=2 MAXPODS=1 INSTANCES="2" instantiate podpools-configmap.yaml) podpools-dualcpu-configmap.yaml
kubectl apply -f podpools-dualcpu-configmap.yaml
sleep 5
report allowed
pp cpus
verify "cpus['pod0c0'] == expected.cpus.reserved[0]" `# reserved remains the same` \
       "len(cpus['pod1c0']) == 9" `# the default pool` \
       "cpus['pod2c0'] == cpus['pod1c0']" `# no singlecpu pool -> assign to default` \
       `# two dualcpu pods go to dualcpu pools, two to the default pool` \
       "len([c for c in ['pod3c0', 'pod4c0', 'pod5c0', 'pod6c0'] if len(cpus[c])==2]) == 2" \
       "len([c for c in ['pod3c0', 'pod4c0', 'pod5c0', 'pod6c0'] if len(cpus[c])==9]) == 2"

# Clean up agent-delivered configuration setup as it might break tests
# that by default rely on forced configurations.
cleanup
launch cri-resmgr
launch cri-resmgr-agent

================================================
FILE: test/e2e/policies.test-suite/podpools/n4c16/test06-prometheus-metrics/code.var.sh
================================================
# Test reporting Prometheus metrics from podpools

cleanup() {
    vm-command "kubectl get pods -A | grep -E ' pod[0-9]' | while read namespace pod rest; do kubectl -n \$namespace delete pod \$pod --now --wait --ignore-not-found; done"
}

parse-commandoutput-log_pool_cpuset() {
    log_pool_cpuset=$(awk -F 'cpus:|, ' "{print \$2}" <<< "$COMMAND_OUTPUT")
    out "parsed: log_pool_cpuset=$log_pool_cpuset"
}

parse-commandoutput-log_pool_name() {
    log_pool_name=$(awk -F"[ {]*" "{print \$10}" <<< "$COMMAND_OUTPUT")
    out "parsed: log_pool_name=$log_pool_name"
}

verify-log-vs-metrics() {
    local podXcY="$1"
    local cpuUsageMin="$2" # optional
    local cpuUsageMax="$3" # optional
    vm-command "grep 'assigning container $podXcY to pool' cri-resmgr.output.txt"
    parse-commandoutput-log_pool_cpuset
    parse-commandoutput-log_pool_name
    local usageCmd="curl --silent $metrics_url | grep $log_pool_cpuset | grep $podXcY"
    vm-run-until --timeout 10 "$usageCmd" || {
        error "cannot find pod:container $1 and cpuset $log_pool_cpuset in the report"
    }
    if [ -n "$cpuUsageMax" ]; then
        echo "verifying CPU usage $cpuUsageMin < X < $cpuUsageMax"
        vm-run-until --timeout 20 "X=\"\$($usageCmd)\"; echo \"\$X\"; X=\${X##* }; X=\${X%%.*}; echo $cpuUsageMin \< \$X \< $cpuUsageMax; (( $cpuUsageMin < \$X )) && (( \$X < $cpuUsageMax ))"
    fi
}

verify-metrics-has-line() {
    local expected_line="$1"
    out "verifying metrics line syntax..."
    vm-run-until --timeout 10 "echo ' waiting for metrics line: $expected_line' >&2; curl --silent $metrics_url | grep -E '$expected_line'" || {
        command-error "expected line '$1' missing from the output"
    }
}

# Delete left-over test pods from the kube-system namespace
for podX in $(kubectl get pods -n kube-system | awk '/^pod[0-9]/{print $1}'); do
    kubectl delete pods $podX -n kube-system --now --wait --ignore-not-found
done

metrics_url="http://localhost:8891/metrics"

# Launch cri-resmgr with the desired metrics update interval
# and a configuration that opens the instrumentation http server.
terminate cri-resmgr
cri_resmgr_cfg=${TEST_DIR}/podpools-metrics.cfg cri_resmgr_extra_args="-metrics-interval 4s" launch cri-resmgr

# pod0: single container, reserve 400m CPU, but do not use it.
out ""
out "### Idle single-container pod"
CPUREQ="400m" MEMREQ="" CPULIM="400m" MEMLIM="" POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: 400mCPU" CONTCOUNT=1 create podpools-busybox
report allowed
verify-log-vs-metrics pod0:pod0c0 0 20

# pod1: single container, reserve 400m CPU and use it.
# "yes" should show up in top with 40 % CPU consumption.
out ""
out "### Busy single-container pod"
CPUREQ="400m" MEMREQ="" CPULIM="400m" MEMLIM="" POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: 400mCPU" CONTCOUNT=1 WORK='yes>/dev/null & ' create podpools-busybox
report allowed
verify-log-vs-metrics pod1:pod1c0 30 50

out ""
out "### Idle four-container pod"
CPUREQ="100m" CPULIM="100m" POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: 400mCPU" CONTCOUNT=4 create podpools-busybox
report allowed
verify-metrics-has-line 'pool_cpu_usage{CPUs="[0-9]-[0-9]",container_name="pod2:pod2c0,pod2:pod2c1,pod2:pod2c2,pod2:pod2c3",def_name="400mCPU",memory="1",pod_name="pod2",policy="podpools",pool_size="2000",pretty_name="400mCPU\[[0-9]\]"}'
verify-log-vs-metrics pod2:pod2c3 0 20

out ""
out "### Busy four-container pod"
CPUREQ="100m" CPULIM="100m" POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: 400mCPU" CONTCOUNT=4 WORK='yes>/dev/null & ' create podpools-busybox
report allowed
verify-log-vs-metrics pod3:pod3c3 30 50

out ""
out "### Multicontainer pod, no annotations. Runs on shared CPUs."
CPUREQ="" CPULIM="" CONTCOUNT=2 create podpools-busybox
report allowed
vm-command "curl --silent $metrics_url | grep -v ^cgroup_"
verify-log-vs-metrics pod4:pod4c1 0 20

out ""
out "### Multicontainer pod in kube-system namespace. Runs on reserved CPUs."
CPUREQ="" CPULIM="" namespace=kube-system CONTCOUNT=3 create podpools-busybox
report allowed
vm-command "curl --silent $metrics_url | grep -v ^cgroup_"
# There should be kube-apiserver, etcd etc. running on reserved CPUs as well,
# therefore allow a lot of CPU usage even though pod5 itself is not doing anything.
verify-log-vs-metrics pod5:pod5c1 0 100 cleanup ================================================ FILE: test/e2e/policies.test-suite/podpools/n4c16/test06-prometheus-metrics/podpools-metrics.cfg ================================================ policy: Active: podpools ReservedResources: CPU: 1 podpools: Pools: - Name: 400mCPU Instances: 90 % CPU: 2 MaxPods: 5 # (2000m CPUs/pool) / (5 pods/pool) = 400m CPUs/pod instrumentation: HTTPEndpoint: :8891 PrometheusExport: true logger: Debug: resource-manager,cache,policy,memory Klog: skip_headers: true ================================================ FILE: test/e2e/policies.test-suite/podpools/n4c16/test07-custom-default-pool/code.var.sh ================================================ # Launch cri-resmgr with a custom default pool and many highperf # pools. The CPUs in the custom default pool are disjoint from CPUs in # the reserved pool. 100 % of remaining CPUs are allocated to highperf # pools. terminate cri-resmgr cri_resmgr_cfg=${TEST_DIR}/podpools-custom-default.cfg launch cri-resmgr cleanup() { ( kubectl delete pods --all --now --wait ) ( kubectl delete pod -n kube-system pod0c-mysystem --now --wait --ignore-not-found ) ( kubectl delete namespace daemons --now --wait --ignore-not-found ) } cleanup namespace=kube-system NAME=pod0c-mysystem CONTCOUNT=2 create podpools-busybox kubectl create namespace daemons namespace=daemons NAME=pod0c-mydaemon CONTCOUNT=2 create podpools-busybox report allowed verify 'len(cpus["pod0c-mysystemc0"]) == 1' \ 'len(cpus["pod0c-mydaemonc0"]) == 3' \ 'disjoint_sets(cpus["pod0c-mysystemc0"], cpus["pod0c-mydaemonc0"])' NAME=pod1c-highperf POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: highperf" CPUREQ=2 CPULIM=2 MEMREQ="" MEMLIM="" create podpools-busybox NAME=pod2c-highperf POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: highperf" CPUREQ=2 CPULIM=2 MEMREQ="" MEMLIM="" create podpools-busybox NAME=pod3c-highperf POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: highperf" CPUREQ=2 CPULIM=2 MEMREQ="" MEMLIM="" create podpools-busybox NAME=pod4c-highperf POD_ANNOTATION="pool.podpools.cri-resource-manager.intel.com: highperf" CPUREQ=2 CPULIM=2 MEMREQ="" MEMLIM="" create podpools-busybox report allowed verify 'len(cpus["pod1c-highperfc0"]) == 2' \ 'len(cpus["pod2c-highperfc0"]) == 2' \ 'len(cpus["pod3c-highperfc0"]) == 2' \ 'len(cpus["pod4c-highperfc0"]) == 2' \ 'disjoint_sets(cpus["pod1c-highperfc0"], cpus["pod2c-highperfc0"], cpus["pod3c-highperfc0"], cpus["pod4c-highperfc0"])' cleanup vm-command "cat < cri-resmgr.output.txt > cri-resmgr-podpools-single-pool.output.txt" terminate cri-resmgr launch cri-resmgr ================================================ FILE: test/e2e/policies.test-suite/podpools/n4c16/test07-custom-default-pool/podpools-custom-default.cfg ================================================ policy: Active: podpools ReservedResources: CPU: cpuset:0 podpools: Pools: - Name: default CPU: 3 - Name: highperf Instances: 100% CPU: 2 MaxPods: 1 logger: Debug: resource-manager,cache,policy,memory Klog: skip_headers: true ================================================ FILE: test/e2e/policies.test-suite/podpools/n4c16/topology.var.json ================================================ [ {"mem": "2G", "cores": 2, "nodes": 2, "packages": 2} ] ================================================ FILE: test/e2e/policies.test-suite/podpools/podpools-busybox.yaml.in ================================================ apiVersion: v1 kind: Pod metadata: name: ${NAME} $(if [ -n 
"$POD_ANNOTATION" ]; then echo " annotations: $POD_ANNOTATION "; fi) labels: app: ${NAME} spec: containers: $(for contnum in $(seq 1 ${CONTCOUNT}); do echo " - name: ${NAME}c$(( contnum - 1 )) image: busybox imagePullPolicy: IfNotPresent command: - sh - -c - ${WORK}echo ${NAME}c$(( contnum - 1 )) \$(sleep inf) $(if [ -n "${CPUREQ}" ]; then echo " resources: requests: cpu: ${CPUREQ} $(if [ -n "${MEMREQ}" ]; then echo " memory: '${MEMREQ}' "; fi) $(if [ -n "${CPULIM}" ]; then echo " limits: cpu: ${CPULIM} $(if [ -n "$MEMLIM" ]; then echo " memory: '${MEMLIM}' "; fi) "; fi) "; fi) "; done ) terminationGracePeriodSeconds: 1 ================================================ FILE: test/e2e/policies.test-suite/static-pools/README.txt ================================================ # E2E static-pools policy test ## Requirements This test requires containerd v1.4 or later on the VM. Earlier containerd versions fail to mount container images built on top of Clear Linux base image. That includes mounting cri-resmgr-webhook. `cri-resmgr-webhook` image must be present on the host (`make images`). The latest image in `docker images cri-resmgr-webhook` list will be installed and tested on the VM. ================================================ FILE: test/e2e/policies.test-suite/static-pools/cmk-exclusive.yaml.in ================================================ apiVersion: v1 kind: Pod metadata: name: ${NAME} labels: app: ${NAME} spec: terminationGracePeriodSeconds: 1 tolerations: - key: 'cmk' operator: 'Equal' value: 'true' effect: 'NoSchedule' containers: - name: ${NAME}c0 image: busybox imagePullPolicy: IfNotPresent env: $([ -z $STP_POOL ] || echo " - name: STP_POOL value: '${STP_POOL}'") $([ -z $STP_SOCKET_ID ] || echo " - name: STP_SOCKET_ID value: '${STP_SOCKET_ID}'") command: ['sh', '-c'] args: - 'while :; do echo ${NAME}c0 CMK_CPUS_ASSIGNED=\"\$CMK_CPUS_ASSIGNED\"; sleep 1; done' resources: requests: cpu: ${CPU} $([ "$EXCLCORES" = "omit" ] || echo " cmk.intel.com/exclusive-cores: '${EXCLCORES}'") limits: cpu: ${CPU} $([ "$EXCLCORES" = "omit" ] || echo " cmk.intel.com/exclusive-cores: '${EXCLCORES}'") ================================================ FILE: test/e2e/policies.test-suite/static-pools/cmk-isolate.yaml.in ================================================ apiVersion: v1 kind: Pod metadata: name: ${NAME} labels: app: ${NAME} spec: terminationGracePeriodSeconds: 1 tolerations: - key: 'cmk' operator: 'Equal' value: 'true' effect: 'NoSchedule' containers: - name: ${NAME}c0 image: busybox imagePullPolicy: IfNotPresent env: $([ -z $STP_POOL ] || echo " - name: STP_POOL value: '${STP_POOL}'") $([ -z $STP_SOCKET_ID ] || echo " - name: STP_SOCKET_ID value: '${STP_SOCKET_ID}'") $([ "$CMDSPLIT" = "command_all" ] && echo " command: ['cmk', 'isolate' $CMK_ISOLATE, 'sh', '-c', 'while :; do echo ${NAME}c0 ${ECHO_VARS}; sleep 1; done']" [ "$CMDSPLIT" = "command_cmk_sh" ] && echo " command: ['cmk', 'isolate' $CMK_ISOLATE, 'sh', '-c'] args: ['while :; do echo ${NAME}c0 ${ECHO_VARS}; sleep 1; done']" [ "$CMDSPLIT" = "command_cmk" ] && echo " command: ['cmk', 'isolate' $CMK_ISOLATE] args: ['sh', '-c', 'while :; do echo ${NAME}c0 ${ECHO_VARS}; sleep 1; done']") resources: requests: cpu: ${CPU} $([ -z $EXCLCORES ] || echo "cmk.intel.com/exclusive-cores: '${EXCLCORES}'") limits: cpu: ${CPU} $([ -z $EXCLCORES ] || echo "cmk.intel.com/exclusive-cores: '${EXCLCORES}'") ================================================ FILE: test/e2e/policies.test-suite/static-pools/cmk-tolerating-guaranteed.yaml.in 
================================================
apiVersion: v1
kind: Pod
metadata:
  name: ${NAME}
  labels:
    app: ${NAME}
spec:
  tolerations:
    - {'key': 'cmk', 'operator': 'Equal', 'value': 'true', 'effect': 'NoSchedule'}
  containers:
$(for contnum in $(seq 1 ${CONTCOUNT}); do echo "
    - name: ${NAME}c$(( contnum - 1 ))
      image: busybox
      imagePullPolicy: IfNotPresent
      command:
        - sh
        - -c
        - echo ${NAME}c$(( contnum - 1 )) \$(sleep inf)
      resources:
        requests:
          cpu: ${CPU}
          memory: '${MEM}'
        limits:
          cpu: ${CPU}
          memory: '${MEM}'
"; done )
  terminationGracePeriodSeconds: 1

================================================
FILE: test/e2e/policies.test-suite/static-pools/cri-resmgr.cfg
================================================
policy:
  Active: static-pools
  ReservedResources:
    CPU: 750m
static-pools:
  pools:
    shared:
      cpuLists:
        - Cpuset: 0-7
          Socket: 0
        - Cpuset: 8-15
          Socket: 1
      exclusive: false
logger:
  Debug: cri-resmgr,resource-manager,cache,policy,stp
  Klog:
    skip_headers: true

================================================
FILE: test/e2e/policies.test-suite/static-pools/n4c16/cri-resmgr-static-pools.cfg
================================================
policy:
  Active: static-pools
  ReservedResources:
    CPU: 750m
static-pools:
  ConfFilePath: "/etc/cmk/pools.conf"
  LabelNode: true
  TaintNode: true
logger:
  Debug: cri-resmgr,resource-manager,cache,policy,stp
  Klog:
    skip_headers: true

================================================
FILE: test/e2e/policies.test-suite/static-pools/n4c16/py_consts.var.py
================================================
exclusive_cores={'node0/core0', 'node0/core1', 'node2/core0'}
shared_cores={'node1/core2', 'node1/core3'}
infra_cores={'node2/core1', 'node3/core2', 'node3/core3'}

================================================
FILE: test/e2e/policies.test-suite/static-pools/n4c16/test00-node-status/code.var.sh
================================================
# Test that the static-pools policy
# 1. labels the node with cmk.intel.com/cmk-node
# 2. advertises the correct number of exclusive-cores resources
# 3. taints the node

# shellcheck disable=SC2148
cri_resmgr_cfg="$TEST_DIR/../cri-resmgr-static-pools.cfg"
static-pools-relaunch-cri-resmgr

out ""
out "### Verifying that the node has the cmk-node label"
vm-run-until 'kubectl get nodes -o jsonpath="{.items[*].metadata.labels}" | grep \"cmk.intel.com/cmk-node\"\:\"true\"' || error "cmk.intel.com/cmk-node label missing"

out ""
out "### Verifying that the number of exclusive cores on the node matches /etc/cmk/pools.conf"
vm-run-until 'kubectl get nodes -o jsonpath="{.items[*].status.allocatable}" | grep -q \"cmk.intel.com/exclusive-cores\"\:\"3\"' || error "expected 3 allocatable cmk.intel.com/exclusive-cores"

out ""
out "### Creating a pod that should not be scheduled due to node taint"
( wait_t=2s create besteffort ) || {
    echo "failed as expected due to node taint"
}

out ""
out "### Verifying that scheduling a normal pod failed"
vm-command 'kubectl describe pods/pod0 | grep -E "FailedScheduling .*cmk: true"' || {
    error "FailedScheduling expected but not found"
}

================================================
FILE: test/e2e/policies.test-suite/static-pools/n4c16/test01-exclusive-pods/code.var.sh
================================================
# Test that exclusive-cores containers
# 1. run on exclusive cores
# 2. are pinned according to STP_POOL and STP_SOCKET_ID
#    when "cmk isolate" is not used.
# 3. all exclusive cores can be consumed with and without
#    specifying STP_SOCKET_ID.
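# For reference: in the test pools.conf (see
# static-pools/n4c16/vm-files/etc/cmk/pools.conf below), the exclusive
# pool has two CPU lists on socket 0 (cpus 0,1 and 2,3) and one on
# socket 1 (cpus 8,9), i.e. the three exclusive cores listed in
# py_consts.var.py above.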
# shellcheck disable=SC2148
cri_resmgr_cfg="$TEST_DIR/../cri-resmgr-static-pools.cfg"
static-pools-relaunch-cri-resmgr

export STP_POOL=exclusive

out ""
out "### Creating exclusive CMK pod with 1 exclusive core"
CPU=1000m STP_SOCKET_ID=1 EXCLCORES=1 create cmk-exclusive
report allowed
verify 'len(cores["pod0c0"]) == 1' \
       'packages["pod0c0"] == {"package1"}'

out ""
out "### Deleting exclusive CMK pod"
kubectl delete pods --all --now --wait

out ""
out "### Creating exclusive CMK pod with 2 exclusive cores"
CPU=1000m STP_SOCKET_ID=0 EXCLCORES=2 create cmk-exclusive
report allowed
verify 'len(cores["pod1c0"]) == 2' \
       'packages["pod1c0"] == {"package0"}'

out ""
out "### Deleting exclusive CMK pod"
kubectl delete pods --all --now --wait

out ""
out "### Creating two exclusive CMK pods with 1 exclusive core each"
n=2 CPU=1000m STP_SOCKET_ID=0 EXCLCORES=1 create cmk-exclusive
report allowed
verify 'len(cores["pod2c0"]) == 1' \
       'len(cores["pod3c0"]) == 1' \
       'disjoint_sets(cores["pod2c0"], cores["pod3c0"])' \
       'packages["pod2c0"] == packages["pod3c0"] == {"package0"}'

out ""
out "### Creating one more exclusive CMK pod, consuming all exclusive cores"
CPU=1000m STP_SOCKET_ID=1 EXCLCORES=1 create cmk-exclusive
report allowed
verify 'len(cores["pod2c0"]) == 1' \
       'len(cores["pod3c0"]) == 1' \
       'len(cores["pod4c0"]) == 1' \
       'disjoint_sets(cores["pod2c0"], cores["pod3c0"], cores["pod4c0"])' \
       'set.union(cores["pod2c0"], cores["pod3c0"], cores["pod4c0"]) == exclusive_cores'

kubectl delete pods --all --now --wait

out ""
out "### Test consuming all exclusive cores without specifying STP_SOCKET_ID"
n=3 CPU=1000m STP_SOCKET_ID="" EXCLCORES=1 create cmk-exclusive
verify 'len(cores["pod5c0"]) == 1' \
       'len(cores["pod6c0"]) == 1' \
       'len(cores["pod7c0"]) == 1' \
       'disjoint_sets(cores["pod5c0"], cores["pod6c0"], cores["pod7c0"])' \
       'set.union(cores["pod5c0"], cores["pod6c0"], cores["pod7c0"]) == exclusive_cores'

================================================
FILE: test/e2e/policies.test-suite/static-pools/n4c16/test02-pods-without-cmk/code.var.sh
================================================
# Test that normal pods/containers scheduled on a CMK node
# are running in the shared pool, even when the pool does not
# have as many CPUs as requested.

cri_resmgr_cfg="$TEST_DIR/../cri-resmgr-static-pools.cfg"
static-pools-relaunch-cri-resmgr

out ""
out "### Creating a guaranteed pod, 1 CPU, goes to the shared pool"
CPU=1 create cmk-tolerating-guaranteed
report allowed
verify 'cores["pod0c0"].issubset(shared_cores)'

out ""
out "### Creating next guaranteed pod, 2 CPUs, goes to the shared pool"
CPU=2 create cmk-tolerating-guaranteed
report allowed
verify 'cores["pod0c0"].issubset(shared_cores)' \
       'cores["pod1c0"].issubset(shared_cores)'

out ""
out "### Creating next guaranteed pod, 4 CPUs, goes to the shared pool"
CPU=4 create cmk-tolerating-guaranteed
report allowed
verify 'cores["pod0c0"].issubset(shared_cores)' \
       'cores["pod1c0"].issubset(shared_cores)' \
       'cores["pod2c0"].issubset(shared_cores)'

out ""
out "### Creating next guaranteed pod, 6 CPUs, goes to the shared pool"
CPU=6 create cmk-tolerating-guaranteed
report allowed
verify 'cores["pod0c0"].issubset(shared_cores)' \
       'cores["pod1c0"].issubset(shared_cores)' \
       'cores["pod2c0"].issubset(shared_cores)' \
       'cores["pod3c0"].issubset(shared_cores)'

================================================
FILE: test/e2e/policies.test-suite/static-pools/n4c16/test03-cmk-isolate/code.var.sh
================================================
# Test that legacy exclusive-cores containers
# 1. run on exclusive cores
# 2. are pinned according to "cmk isolate" command
#    parameters
# 3. run without "cmk" existing on the image
# 4. all exclusive cores can be consumed

# shellcheck disable=SC2148
cri_resmgr_cfg="$TEST_DIR/../cri-resmgr-static-pools.cfg"
static-pools-relaunch-cri-resmgr

export STP_POOL="" STP_SOCKET_ID=""

export CMK_ISOLATE=", '--conf-dir=/etc/cmk.conf', '--pool=exclusive', '--socket-id=1'"
out ""
out "### Creating pod 'cmk', 'isolate'$CMK_ISOLATE..."
CPU=1000m EXCLCORES=1 CMDSPLIT="command_all" create cmk-isolate
report allowed
verify 'len(cores["pod0c0"]) == 1' \
       'cores["pod0c0"].issubset(exclusive_cores)' \
       'packages["pod0c0"] == {"package1"}'

export CMK_ISOLATE=", '--socket-id=0', '--pool=exclusive'"
out ""
out "### Creating pod 'cmk', 'isolate'$CMK_ISOLATE..."
CPU=2000m EXCLCORES=2 CMDSPLIT="command_cmk_sh" create cmk-isolate
report allowed
verify 'len(cores["pod1c0"]) == 2' \
       'cores["pod1c0"].issubset(exclusive_cores)' \
       'packages["pod1c0"] == {"package0"}'

export CMK_ISOLATE=", '--pool=shared'"
out ""
out "### Creating pod 'cmk', 'isolate'$CMK_ISOLATE..."
CPU=1000m EXCLCORES="" CMDSPLIT="command_cmk" create cmk-isolate
report allowed
verify 'cores["pod2c0"] == shared_cores'

export CMDSPLIT="command_cmk"
export CMK_ISOLATE=", '--conf-dir=/etc/cmk.conf', '--pool=infra'"
out ""
out "### Creating pod 'cmk', 'isolate'$CMK_ISOLATE..."
CPU=1000m EXCLCORES="" create cmk-isolate
report allowed
verify 'cores["pod3c0"] == infra_cores'

out ""
out "### Deleting only exclusive CMK pods, leaving shared/infra running"
kubectl delete pods/pod0 pods/pod1 --now --wait --ignore-not-found

export CMK_ISOLATE=", '--pool=exclusive'"
out ""
out "### Creating 3 exclusive pods 'cmk', 'isolate'$CMK_ISOLATE..."
n=3 CPU=1000m EXCLCORES=1 create cmk-isolate
report allowed
verify 'len(cores["pod4c0"]) == 1' \
       'len(cores["pod5c0"]) == 1' \
       'len(cores["pod6c0"]) == 1' \
       'disjoint_sets(cores["pod4c0"], cores["pod5c0"], cores["pod6c0"])' \
       'cores["pod4c0"].issubset(exclusive_cores)' \
       'cores["pod5c0"].issubset(exclusive_cores)' \
       'cores["pod6c0"].issubset(exclusive_cores)'

================================================
FILE: test/e2e/policies.test-suite/static-pools/n4c16/test04-cmk-isolate-noaffinity/code.var.sh
================================================
# Test that cmk isolate --no-affinity is effective on every pool
# with and without STP_POOL / STP_SOCKET_ID env vars.
# Test that all exclusive cores can be consumed with --no-affinity.

# shellcheck disable=SC2148
cri_resmgr_cfg="$TEST_DIR/../cri-resmgr-static-pools.cfg"
static-pools-relaunch-cri-resmgr

export STP_POOL="" STP_SOCKET_ID="" CMDSPLIT="command_all"
export ECHO_VARS='CMK_CPUS_ASSIGNED="$CMK_CPUS_ASSIGNED" CMK_CPUS_SHARED="$CMK_CPUS_SHARED" CMK_CPUS_INFRA="$CMK_CPUS_INFRA"'

export CMK_ISOLATE=", '--conf-dir=/etc/cmk.conf', '--pool=exclusive', '--socket-id=0', '--no-affinity'"
out ""
out "### Creating no-affinity pod 'cmk', 'isolate'$CMK_ISOLATE..."
CPU=1000m EXCLCORES=1 create cmk-isolate
report allowed
verify 'len(cores["pod0c0"]) == 8'
cpus_assigned="$(kubectl logs pod0 | tail -n 1 | awk '{print $2}')"
cpus_shared="$(kubectl logs pod0 | tail -n 1 | awk '{print $3}')"
cpus_infra="$(kubectl logs pod0 | tail -n 1 | awk '{print $4}')"
[ "$cpus_assigned" == "CMK_CPUS_ASSIGNED=0,1" ] || error "expected CMK_CPUS_ASSIGNED=0,1, got $cpus_assigned"
[ "$cpus_shared" == "CMK_CPUS_SHARED=4-6,7" ] || error "expected CMK_CPUS_SHARED=4-6,7, got $cpus_shared"
[ "$cpus_infra" == "CMK_CPUS_INFRA=10-15" ] || error "expected CMK_CPUS_INFRA=10-15, got $cpus_infra"

export CMK_ISOLATE=", '--conf-dir=/etc/cmk.conf', '--pool=exclusive', '--socket-id=1', '--no-affinity'"
out ""
out "### Creating no-affinity pod 'cmk', 'isolate'$CMK_ISOLATE..."
CPU=1000m EXCLCORES=1 STP_POOL="exclusive" STP_SOCKET_ID="1" create cmk-isolate
report allowed
verify 'len(cores["pod1c0"]) == 8'
cpus_assigned="$(kubectl logs pod1 | tail -n 1 | awk '{print $2}')"
[ "$cpus_assigned" == "CMK_CPUS_ASSIGNED=8,9" ] || error "expected CMK_CPUS_ASSIGNED=8,9, got $cpus_assigned"

export CMK_ISOLATE=", '--conf-dir=/etc/cmk.conf', '--pool=exclusive', '--no-affinity'"
out ""
out "### Creating no-affinity pod 'cmk', 'isolate'$CMK_ISOLATE..."
CPU=1000m EXCLCORES=1 STP_POOL="exclusive" create cmk-isolate
report allowed
verify 'len(cores["pod2c0"]) == 8'
cpus_assigned="$(kubectl logs pod2 | tail -n 1 | awk '{print $2}')"
[ "$cpus_assigned" == "CMK_CPUS_ASSIGNED=2,3" ] || error "expected CMK_CPUS_ASSIGNED=2,3, got $cpus_assigned"

export CMK_ISOLATE=", '--no-affinity', '--pool=shared'"
out ""
out "### Creating no-affinity pod 'cmk', 'isolate'$CMK_ISOLATE..."
CPU=1000m EXCLCORES="" create cmk-isolate
report allowed
verify 'len(cores["pod3c0"]) == 8'
cpus_assigned="$(kubectl logs pod3 | tail -n 1 | awk '{print $2}')"
[ "$cpus_assigned" == "CMK_CPUS_ASSIGNED=4-6,7" ] || error "expected CMK_CPUS_ASSIGNED=4-6,7, got $cpus_assigned"

export CMK_ISOLATE=", '--pool=infra', '--no-affinity'"
out ""
out "### Creating no-affinity pod 'cmk', 'isolate'$CMK_ISOLATE..."
CPU=1000m EXCLCORES="" create cmk-isolate report allowed verify 'len(cores["pod4c0"]) == 8' cpus_assigned="$(kubectl logs pod4 | tail -n 1 | awk '{print $2}')" [ "$cpus_assigned" == "CMK_CPUS_ASSIGNED=10-15" ] || error "expected CMK_CPUS_ASSIGNED=10-15 got $cpus_assigned" ================================================ FILE: test/e2e/policies.test-suite/static-pools/n4c16/test05-negative-tests/code.var.sh ================================================ # shellcheck disable=SC2148 cri_resmgr_cfg="$TEST_DIR/../cri-resmgr-static-pools.cfg" static-pools-relaunch-cri-resmgr export STP_POOL=exclusive errmsg_zero_cores="static-pools: exclusive pool specified but the number of exclusive CPUs requested is 0" errmsg_non_existing_pool="static-pools: non-existent pool" errmsg_not_enough_exclcores="static-pools: not enough free cpu lists" out "" out "### Request cores from non-existing pool" ( CPU=1000m STP_SOCKET_ID=0 EXCLCORES=1 STP_POOL=elusive wait_t=5s create cmk-exclusive ) && error "expected timeout, but pod launched with cores from non-existing pool" vm-run-until "kubectl describe pods/pod0 | grep '$errmsg_non_existing_pool'" || error "cannot find expected error message from pod description" out "Failed as expected" kubectl delete pods --all --now --wait || error "failed to delete pods" out "" out "### Request cores from non-existing socket" ( CPU=1000m STP_SOCKET_ID=2 EXCLCORES=1 wait_t=5s create cmk-exclusive ) && error "expected timeout, but pod launched with cores from non-existing socket" vm-run-until "kubectl describe pods/pod0 | grep '$errmsg_not_enough_exclcores'" || error "cannot find expected error message from pod description" out "Failed as expected" kubectl delete pods --all --now --wait || error "failed to delete pods" out "" out "### Request exclusive pool but do not mention exclusive-cores" ( CPU=1000m STP_SOCKET_ID=0 EXCLCORES='omit' wait_t=5s create cmk-exclusive ) && error "expected timeout, but pod launched without mentioning exclusive cores from the exclusive pool" vm-run-until "kubectl describe pods/pod0 | grep '$errmsg_zero_cores'" || error "cannot find expected error message from pod description" out "Failed as expected" kubectl delete pods --all --now --wait || error "failed to delete pods" out "" out "### Request 0 cores from exclusive pool" ( CPU=1000m STP_SOCKET_ID=0 EXCLCORES=0 wait_t=5s create cmk-exclusive ) && error "expected timeout, but pod launched with 0 cores from the exclusive pool" vm-run-until "kubectl describe pods/pod0 | grep '$errmsg_zero_cores'" || error "cannot find expected error message from pod description" out "Failed as expected" kubectl delete pods --all --now --wait || error "failed to delete pods" out "" out "### Request more cores from socket 0 than available" ( CPU=3000m STP_SOCKET_ID=0 EXCLCORES=3 wait_t=5s create cmk-exclusive ) && error "expected timeout, but pod got too many cores successfully" vm-run-until "kubectl describe pods/pod0 | grep '$errmsg_not_enough_exclcores'" || error "cannot find expected error message from pod description" out "Failed as expected" kubectl delete pods --all --now --wait || error "failed to delete pods" out "" out "### Request more cores from socket 1 than available" ( CPU=1000m STP_SOCKET_ID=1 EXCLCORES=2 wait_t=5s create cmk-exclusive ) && error "expected timeout, but pod got too many cores successfully" vm-run-until "kubectl describe pods/pod0 | grep '$errmsg_not_enough_exclcores'" || error "cannot find expected error message from pod description" out "Failed as expected" kubectl delete pods --all 
--now --wait || error "failed to delete pods" ================================================ FILE: test/e2e/policies.test-suite/static-pools/n4c16/test99-cleanup/code.var.sh ================================================ # This test cleans up static-pools test suite configurations from vm. # Other policy tests can be run after this test on the same vm without # recreating the vm from scratch. static-pools-cleanup ================================================ FILE: test/e2e/policies.test-suite/static-pools/n4c16/topology.var.json ================================================ [ {"mem": "2G", "cores": 2, "nodes": 2, "packages": 2} ] ================================================ FILE: test/e2e/policies.test-suite/static-pools/n4c16/vm-files/etc/cmk/pools.conf ================================================ pools: exclusive: cpuLists: - Cpuset: 8,9 Socket: 1 - Cpuset: 0,1 Socket: 0 - Cpuset: 2,3 Socket: 0 exclusive: true shared: cpuLists: - Cpuset: 4-6,7 Socket: 0 exclusive: false infra: cpuLists: - Cpuset: 10-15 Socket: 1 exclusive: false ================================================ FILE: test/e2e/policies.test-suite/static-pools/static-pools-lib.source.sh ================================================ # shellcheck disable=SC2148 static-pools-relaunch-cri-resmgr() { local webhook_running=0 out "# Relaunching cri-resmgr and agent, launch webhook if not already running" vm-command-q "kubectl get mutatingwebhookconfiguration/cri-resmgr" >& /dev/null && { webhook_running=1 } # cleanup terminate cri-resmgr terminate cri-resmgr-agent vm-command "rm -rf /var/lib/cri-resmgr" extended-resources remove cmk.intel.com/exclusive-cpus >/dev/null # launch again launch cri-resmgr-agent launch cri-resmgr vm-run-until "! kubectl get node | grep NotReady" || error "kubectl node is NotReady after launching cri-resmgr-agent and cri-resmgr" if [ "$webhook_running" == 0 ]; then vm-command-q "[ -f webhook/webhook-deployment.yaml ]" || install cri-resmgr-webhook launch cri-resmgr-webhook fi } static-pools-cleanup() { ( terminate cri-resmgr-agent ) ( uninstall cri-resmgr-webhook ) ( extended-resources remove cmk.intel.com/exclusive-cpus >/dev/null ) ( terminate cri-resmgr ) vm-command 'kubectl taint node $(hostname) cmk=true:NoSchedule-' || true } ================================================ FILE: test/e2e/policies.test-suite/topology-aware/c4pmem4/test01-pmem-node-assigning/code.var.sh ================================================ # Test that CPU-less PMEM nodes are assigned to closest nodes with CPU. # Restart cri-resmgr in order to clear logs and make sure assignment # is successful with installed cri-resmgr. 
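# (Hypothetical manual check: on this c4pmem4 topology "numactl -H" on
# the VM should show empty "cpus" lines for nodes 4-7; those CPU-less
# nodes are the PMEM nodes whose assignment is verified below.)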
terminate cri-resmgr launch cri-resmgr CRI_RESMGR_OUTPUT_COMMAND="cat cri-resmgr.output.txt" echo "Verify PMEM node assignment to CPU-ful nodes" for expected_output in \ "PMEM node #4 assigned to .*#2" \ "PMEM node #5 assigned to .*#3" \ "PMEM node #6 assigned to .*#0" \ "PMEM node #7 assigned to .*#1"; do vm-command "$CRI_RESMGR_OUTPUT_COMMAND | grep -E '$expected_output'" || command-error "expected PMEM assignment not found" done ================================================ FILE: test/e2e/policies.test-suite/topology-aware/c4pmem4/test02-annotation-memory-type/code.var.sh ================================================ # Test that container memory is pinned according to memory-type annotation # pod0c0 runs on node 1, uses only dram # pod0c1 runs on node 2, uses only pmem # pod0c2 runs on node 3, uses dram+pmem # pod0c9 runs on root node (all non-reserved CPUs), # no memory-type restrictions (=> use all memory nodes) MEM=250M MEMTYPEC0=dram MEMTYPEC1=pmem MEMTYPEC2=pmem,dram create memtype-guaranteed report allowed verify 'cpus["pod0c0"] == {"cpu1"}' \ 'mems["pod0c0"] == {"node1"}' \ 'cpus["pod0c1"] == {"cpu2"}' \ 'mems["pod0c1"] == {"node4"}' \ 'cpus["pod0c2"] == {"cpu3"}' \ 'mems["pod0c2"] == {"node3", "node5"}' \ 'cpus["pod0c9"] == {"cpu1", "cpu2", "cpu3"}' \ 'mems["pod0c9"] == {"node0", "node1", "node2", "node3", "node4", "node5", "node6", "node7"}' ================================================ FILE: test/e2e/policies.test-suite/topology-aware/c4pmem4/test02-annotation-memory-type/memtype-guaranteed.yaml.in ================================================ apiVersion: v1 kind: Pod metadata: name: ${NAME} annotations: memory-type.cri-resource-manager.intel.com/container.${NAME}c0: ${MEMTYPEC0} memory-type.cri-resource-manager.intel.com/container.${NAME}c1: ${MEMTYPEC1} memory-type.cri-resource-manager.intel.com/container.${NAME}c2: ${MEMTYPEC2} spec: containers: $(for CONT in 0 1 2; do echo " - name: ${NAME}c${CONT} image: busybox imagePullPolicy: IfNotPresent command: ['sh', '-c', 'echo ${NAME}c${CONT} \$(sleep inf)'] resources: requests: cpu: 500m memory: ${MEM} limits: cpu: 500m memory: ${MEM} "; done) - name: ${NAME}c9 image: busybox imagePullPolicy: IfNotPresent command: ['sh', '-c', 'echo ${NAME}c9 \$(sleep inf)'] resources: requests: cpu: 500m memory: ${MEM} limits: cpu: 500m memory: ${MEM} ================================================ FILE: test/e2e/policies.test-suite/topology-aware/c4pmem4/test02-annotation-memory-type-deprecated-syntax/code.var.sh ================================================ # Test that container memory is pinned according to memory-type annotation # pod0c0 runs on node 1, uses only dram # pod0c1 runs on node 2, uses only pmem # pod0c2 runs on node 3, uses dram+pmem # pod0c9 runs on root node (all non-reserved CPUs), # no memory-type restrictions (=> use all memory nodes) MEM=250M MEMTYPEC0=dram MEMTYPEC1=pmem MEMTYPEC2=pmem,dram create memtype-guaranteed report allowed verify 'cpus["pod0c0"] == {"cpu1"}' \ 'mems["pod0c0"] == {"node1"}' \ 'cpus["pod0c1"] == {"cpu2"}' \ 'mems["pod0c1"] == {"node4"}' \ 'cpus["pod0c2"] == {"cpu3"}' \ 'mems["pod0c2"] == {"node3", "node5"}' \ 'cpus["pod0c9"] == {"cpu1", "cpu2", "cpu3"}' \ 'mems["pod0c9"] == {"node0", "node1", "node2", "node3", "node4", "node5", "node6", "node7"}' ================================================ FILE: test/e2e/policies.test-suite/topology-aware/c4pmem4/test02-annotation-memory-type-deprecated-syntax/memtype-guaranteed.yaml.in ================================================ 
apiVersion: v1 kind: Pod metadata: name: ${NAME} annotations: cri-resource-manager.intel.com/memory-type: | ${NAME}c0: ${MEMTYPEC0} ${NAME}c1: ${MEMTYPEC1} ${NAME}c2: ${MEMTYPEC2} spec: containers: $(for CONT in 0 1 2; do echo " - name: ${NAME}c${CONT} image: busybox imagePullPolicy: IfNotPresent command: ['sh', '-c', 'echo ${NAME}c${CONT} \$(sleep inf)'] resources: requests: cpu: 500m memory: ${MEM} limits: cpu: 500m memory: ${MEM} "; done) - name: ${NAME}c9 image: busybox imagePullPolicy: IfNotPresent command: ['sh', '-c', 'echo ${NAME}c9 \$(sleep inf)'] resources: requests: cpu: 500m memory: ${MEM} limits: cpu: 500m memory: ${MEM} ================================================ FILE: test/e2e/policies.test-suite/topology-aware/c4pmem4/test03-coldstart/bb-coldstart.yaml.in ================================================ apiVersion: v1 kind: Pod metadata: name: ${NAME} annotations: memory-type.cri-resource-manager.intel.com/container.${NAME}c0: dram,pmem cold-start.cri-resource-manager.intel.com/container.${NAME}c0: | duration: ${DURATION} spec: containers: - name: ${NAME}c0 image: busybox imagePullPolicy: IfNotPresent command: - sh - -c - 'cold_alloc=\$(dd if=/dev/zero bs=${COLD_ALLOC_KB}kB count=1 | tr \"\\\0\" \"x\"); sh -c \"paused after cold_alloc \\\$(sleep inf)\"; warm_alloc=\$(dd if=/dev/zero bs=${WARM_ALLOC_KB}kB count=1 | tr \"\\\0\" \"x\"); sh -c \"paused after warm_alloc \\\$(sleep inf)\"; echo ${NAME}c0 \$(sleep inf); # needed for pod resource discovery' resources: requests: cpu: 500m memory: ${MEM} limits: cpu: 500m memory: ${MEM} ================================================ FILE: test/e2e/policies.test-suite/topology-aware/c4pmem4/test03-coldstart/code.var.sh ================================================ # Test that a cold-started pod... # 1. is allowed to allocate memory only from PMEM nodes # during cold period (of length $DURATION). # 2. is restricted from the very beginning of pod execution: # immediately allocated memory blob consumes PMEM from expected node. # 3. is allowed to allocate memory from both PMEM and DRAM after # the cold period. # 4. is no more restricted after $DURATION + 1s has passed in pod: # warm-allocated memory is not taken from PMEM nodes. PMEM_NODES='{"node4", "node5", "node6", "node7"}' # pmem-used returns total MemUsed (allocated) memory on PMEM nodes pmem-used() { local pmem_nodes_shell=${PMEM_NODES//[\" ]/} vm-command "cat /sys/devices/system/node/$pmem_nodes_shell/meminfo | awk '/MemUsed:/{mem+=\$4}END{print mem}'" >/dev/null || command-error "cannot read PMEM usage from node $node" echo "$COMMAND_OUTPUT" } CRI_RESMGR_OUTPUT="cat cri-resmgr.output.txt" PMEM_USED_BEFORE_POD0="$(pmem-used)" DURATION=10s COLD_ALLOC_KB=$((50 * 1024)) WARM_ALLOC_KB=$((100 * 1024)) MEM=1G create bb-coldstart echo "Wait that coldstart period is started for the pod" vm-run-until "$CRI_RESMGR_OUTPUT | grep 'coldstart: triggering coldstart for pod0:pod0c0'" || error "cri-resmgr did not report triggering coldstart period" verify 'cores["pod0c0"] == {"node1/core0"}' \ "mems['pod0c0'] == {'node7'}" echo "Wait that the pod has finished memory allocation during cold period." vm-run-until "pgrep -f '^sh -c paused after cold_alloc'" >/dev/null || error "cold memory allocation timed out" echo "Verify PMEM consumption during cold period." 
# meminfo MemUsed vs dd bytes error margin, use 10% PMEM_ERROR_MARGIN=$((COLD_ALLOC_KB / 10)) sleep 1 PMEM_USED_COLD_POD0="$(pmem-used)" PMEM_COLD_CONSUMED=$(( $PMEM_USED_COLD_POD0 - $PMEM_USED_BEFORE_POD0 )) if (( $PMEM_COLD_CONSUMED + $PMEM_ERROR_MARGIN < $COLD_ALLOC_KB )); then error "pod0 did not allocate ${COLD_ALLOC_KB}kB from PMEM. MemUsed PMEM delta: $PMEM_COLD_CONSUMED" else echo "### Verified: PMEM memory consumed during cold period: $PMEM_COLD_CONSUMED kB, pod script allocated: $COLD_ALLOC_KB kB" fi coldstarts=$(vm-command-q "$CRI_RESMGR_OUTPUT | grep 'finishing coldstart period for pod0:pod0c0' | wc -l") echo "Wait that cri-resmgr finishes coldstart period within 5s + $DURATION." sleep 5s vm-run-until --timeout ${DURATION%s} "[ \$($CRI_RESMGR_OUTPUT | grep 'finishing coldstart period for pod0:pod0c0' | wc -l) -gt $coldstarts ]" || error "cri-resmgr did not report finishing coldstart period within $DURATION" vm-command "$CRI_RESMGR_OUTPUT | grep 'pinning to memory 1,7'" || error "cri-resmgr did not report pinning to expected memory nodes" verify 'cores["pod0c0"] == {"node1/core0"}' \ 'mems["pod0c0"] == {"node1", "node7"}' echo "Let the pod continue from cold_alloc to warm_alloc." vm-command 'kill -9 $(pgrep -f "^sh -c paused after cold_alloc")' echo "Make sure that bb-coldstart finishes allocating memory in warm mode." vm-run-until "pgrep -f '^sh -c paused after warm_alloc'" || error "warm memory allocation timed out" echo "Verify (soft): PMEM consumption after cold period." sleep 1 PMEM_USED_WARM_POD0="$(pmem-used)" PMEM_WARM_CONSUMED=$(( $PMEM_USED_WARM_POD0 - $PMEM_USED_COLD_POD0 )) if (( $PMEM_WARM_CONSUMED > 0 )); then echo "### Verify (soft) failed: pod0 allocated $WARM_ALLOC_KB kB from PMEM. Should have been taken from DRAM." else echo "### Verified (soft): PMEM memory consumption delta during warm period: $PMEM_WARM_CONSUMED kB, pod script allocated: $WARM_ALLOC_KB kB" fi ================================================ FILE: test/e2e/policies.test-suite/topology-aware/c4pmem4/test03-coldstart-deprecated-syntax/bb-coldstart.yaml.in ================================================ apiVersion: v1 kind: Pod metadata: name: ${NAME} annotations: cri-resource-manager.intel.com/memory-type: | ${NAME}c0: dram,pmem cri-resource-manager.intel.com/cold-start: | ${NAME}c0: duration: ${DURATION_S}s spec: containers: - name: ${NAME}c0 image: busybox imagePullPolicy: IfNotPresent command: - sh - -c - 'cold_alloc=\$(dd if=/dev/zero bs=${COLD_ALLOC_KB}kB count=1 | tr \"\\\0\" \"x\"); sh -c \"paused after cold_alloc \\\$(sleep inf)\"; warm_alloc=\$(dd if=/dev/zero bs=${WARM_ALLOC_KB}kB count=1 | tr \"\\\0\" \"x\"); sh -c \"paused after warm_alloc \\\$(sleep inf)\"; echo ${NAME}c0 \$(sleep inf); # needed for pod resource discovery' resources: requests: cpu: 500m memory: ${MEM} limits: cpu: 500m memory: ${MEM} ================================================ FILE: test/e2e/policies.test-suite/topology-aware/c4pmem4/test03-coldstart-deprecated-syntax/code.var.sh ================================================ # Test that a cold-started pod... # 1. is allowed to allocate memory only from PMEM nodes # during cold period (of length $DURATION_S). # 2. is restricted from the very beginning of pod execution: # immediately allocated memory blob consumes PMEM from expected node. # 3. is allowed to allocate memory from both PMEM and DRAM after # the cold period. # 4. is no more restricted after $DURATION_S + 1s has passed in pod: # warm-allocated memory is not taken from PMEM nodes. 
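# (For reference, this variant drives the same scenario as test03-coldstart
# but through the deprecated pod-level annotation syntax shown in
# bb-coldstart.yaml.in above, e.g.
#   cri-resource-manager.intel.com/cold-start: |
#     pod0c0:
#       duration: 10s
# instead of the per-container key
#   cold-start.cri-resource-manager.intel.com/container.pod0c0.)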
PMEM_NODES='{"node4", "node5", "node6", "node7"}' # pmem-used returns total MemUsed (allocated) memory on PMEM nodes pmem-used() { local pmem_nodes_shell=${PMEM_NODES//[\" ]/} vm-command "cat /sys/devices/system/node/$pmem_nodes_shell/meminfo | awk '/MemUsed:/{mem+=\$4}END{print mem}'" >/dev/null || command-error "cannot read PMEM usage from node $node" echo "$COMMAND_OUTPUT" } CRI_RESMGR_OUTPUT="cat cri-resmgr.output.txt" PMEM_USED_BEFORE_POD0="$(pmem-used)" DURATION_S=10 COLD_ALLOC_KB=$((50 * 1024)) WARM_ALLOC_KB=$((100 * 1024)) MEM=1G create bb-coldstart echo "Wait that coldstart period is started for the pod" vm-run-until "$CRI_RESMGR_OUTPUT | grep 'coldstart: triggering coldstart for pod0:pod0c0'" || error "cri-resmgr did not report triggering coldstart period" verify 'cores["pod0c0"] == {"node1/core0"}' \ "mems['pod0c0'] == {'node7'}" echo "Wait that the pod has finished memory allocation during cold period." vm-run-until "pgrep -f '^sh -c paused after cold_alloc'" >/dev/null || error "cold memory allocation timed out" echo "Verify PMEM consumption during cold period." # meminfo MemUsed vs dd bytes error margin, use 10% PMEM_ERROR_MARGIN=$((COLD_ALLOC_KB / 10)) sleep 1 PMEM_USED_COLD_POD0="$(pmem-used)" PMEM_COLD_CONSUMED=$(( $PMEM_USED_COLD_POD0 - $PMEM_USED_BEFORE_POD0 )) if (( $PMEM_COLD_CONSUMED + $PMEM_ERROR_MARGIN < $COLD_ALLOC_KB )); then error "pod0 did not allocate ${COLD_ALLOC_KB}kB from PMEM. MemUsed PMEM delta: $PMEM_COLD_CONSUMED" else echo "### Verified: PMEM memory consumed during cold period: $PMEM_COLD_CONSUMED kB, pod script allocated: $COLD_ALLOC_KB kB" fi coldstarts=$(vm-command-q "$CRI_RESMGR_OUTPUT | grep 'finishing coldstart period for pod0:pod0c0' | wc -l") echo "Wait that cri-resmgr finishes coldstart period within $(($DURATION_S + 10)) seconds." vm-run-until --timeout $((DURATION_S + 10)) "[ \$($CRI_RESMGR_OUTPUT | grep 'finishing coldstart period for pod0:pod0c0' | wc -l) -gt $coldstarts ]" || error "cri-resmgr did not report finishing coldstart period within $DURATION_S seconds" vm-command "$CRI_RESMGR_OUTPUT | grep 'pinning to memory 1,7'" || error "cri-resmgr did not report pinning to expected memory nodes" verify 'cores["pod0c0"] == {"node1/core0"}' \ 'mems["pod0c0"] == {"node1", "node7"}' echo "Let the pod continue from cold_alloc to warm_alloc." vm-command 'kill -9 $(pgrep -f "^sh -c paused after cold_alloc")' echo "Make sure that bb-coldstart finishes allocating memory in warm mode." vm-run-until "pgrep -f '^sh -c paused after warm_alloc'" || error "warm memory allocation timed out" echo "Verify (soft): PMEM consumption after cold period." sleep 1 PMEM_USED_WARM_POD0="$(pmem-used)" PMEM_WARM_CONSUMED=$(( $PMEM_USED_WARM_POD0 - $PMEM_USED_COLD_POD0 )) if (( $PMEM_WARM_CONSUMED > 0 )); then echo "### Verify (soft) failed: pod0 allocated $WARM_ALLOC_KB kB from PMEM. Should have been taken from DRAM." 
else echo "### Verified (soft): PMEM memory consumption delta during warm period: $PMEM_WARM_CONSUMED kB, pod script allocated: $WARM_ALLOC_KB kB" fi ================================================ FILE: test/e2e/policies.test-suite/topology-aware/c4pmem4/test04-dynamic-page-demotion/bb-memload.yaml.in ================================================ apiVersion: v1 kind: Pod metadata: name: ${NAME} annotations: memory-type.cri-resource-manager.intel.com/pod: dram,pmem spec: containers: - name: ${NAME}c0 image: busybox imagePullPolicy: IfNotPresent command: - sh - -c - while :; do dd status=none if=/dev/zero bs=$(( $BSIZE - 7 )) count=1 | tr '\\\0' 'A'; echo ${NAME}c0; done | awk '{r+=1;if(r<${WORN%M}*1024*1024/$BSIZE){worn[r]=\$1;wr+=1;}if($PRINT_WRBYTES_IF)print wr*$BSIZE;}' resources: requests: cpu: ${CPU} memory: $(( ${WORN%M} * 1024 * 1024 / $BSIZE + 100000 ))k limits: cpu: ${CPU} memory: $(( ${WORN%M} * 1024 * 1024 / $BSIZE + 100000 ))k - name: ${NAME}c1 image: busybox imagePullPolicy: IfNotPresent command: - sh - -c - while :; do dd status=none if=/dev/zero bs=$(( $BSIZE - 7 )) count=1 | tr '\\\0' 'A'; echo ${NAME}c1; done | awk '{r+=1;wmrn[r%(${WMRN%M}*1024*1024/$BSIZE)]=\$1;wr+=1;if($PRINT_WRBYTES_IF)print wr*$BSIZE;}' resources: requests: cpu: ${CPU} memory: $(( ${WMRN%M} * 1024 * 1024 / $BSIZE + 100000 ))k limits: cpu: ${CPU} memory: $(( ${WMRN%M} * 1024 * 1024 / $BSIZE + 100000 ))k - name: ${NAME}c2 image: busybox imagePullPolicy: IfNotPresent command: - sh - -c - while :; do dd status=none if=/dev/zero bs=$(( $BSIZE - 7 )) count=1 | tr '\\\0' 'A'; echo ${NAME}c2; done | awk '{r+=1;if (worm[r%(${WORM%M}*1024*1024/$BSIZE)]!=\$1){worm[r%(${WORM%M}*1024*1024/$BSIZE)]=\$1;wr+=1;}if($PRINT_WRBYTES_IF)print wr*$BSIZE;}' resources: requests: cpu: ${CPU} memory: $(( ${WORM%M} * 1024 * 1024 / $BSIZE + 100000 ))k limits: cpu: ${CPU} memory: $(( ${WORM%M} * 1024 * 1024 / $BSIZE + 100000 ))k - name: ${NAME}c3 image: busybox imagePullPolicy: IfNotPresent command: - sh - -c - while :; do dd status=none if=/dev/zero bs=$(( $BSIZE - 7 )) count=1 | tr '\\\0' 'A'; echo ${NAME}c3; done | awk '{r+=1;if (wmrm[r%(${WMRM%M}*1024*1024/$BSIZE)]!=\$1 || length(\$1) > 0){wmrm[r%(${WMRM%M}*1024*1024/$BSIZE)]=\$1;wr+=1;}if($PRINT_WRBYTES_IF)print wr*$BSIZE;}' resources: requests: cpu: ${CPU} memory: $(( ${WMRM%M} * 1024 * 1024 / $BSIZE + 100000 ))k limits: cpu: ${CPU} memory: $(( ${WMRM%M} * 1024 * 1024 / $BSIZE + 100000 ))k ================================================ FILE: test/e2e/policies.test-suite/topology-aware/c4pmem4/test04-dynamic-page-demotion/code.var.sh ================================================ # Test migrating memory pages from DRAM to PMEM. # - Memory pages that are written once and never read # must be migrated to PMEM and must stay there. # - Memory pages that are actively written and read # must not be migrated to PMEM. # - Migration speed is as configured. vm-command "echo 0 > /proc/sys/kernel/numa_balancing || true" # Relaunch cri-resmgr with dynamic page demotion configuration. cri_resmgr_cfg=$TEST_DIR/cri-resmgr-dynamic-page-demotion.cfg terminate cri-resmgr launch cri-resmgr # Different memory usage profiles are implemented with awk # in order to manage with the same busybox image as other tests. # Memory size parameters for the busybox memory load pod: # - BSIZE: Block size in bytes (length of each stored string) # The larger the block the faster the awk goes through its memory. 
# If too large, memory for strings is no more allocated from heap # which makes page tracking harder and breaks this test. # - WORN: Write Once Read Never # - WORM: Write Once Read Many # - WMRN: Write Many Read Never # - WMRM: Write Many Read Many PRINT_WRBYTES_IF="wr%1000==0 && wr<10000" CPU=500m BSIZE=4096 awkmem=2M WORN=$awkmem WORM=$awkmem WMRN=$awkmem WMRM=$awkmem create bb-memload # Calculate page migration speed from cri-resmgr configuration. pages_per_second_per_process="$(awk ' /MaxPageMoveCount:/{mpmc=$2} /PageMoveInterval:/{gsub(/[^0-9]/, "", $2); pmi=$2} END{print mpmc/pmi} ' < "$cri_resmgr_cfg")" # After how many rounds (seconds) first migrations should be visible. first_migrations_visible="$(awk ' /PageScanInterval:/{gsub(/[^0-9]/, "", $2); print $2+8} ' < "$cri_resmgr_cfg")" # Expected migrated number of pages when fully migrated. pages_error_margin=100 fully_migrated_threshold=$(( ${awkmem%M} * 1024 * 1024 / 4096 - pages_error_margin )) # Maximum number of pages in PMEM when not migrated. not_migrated_threshold=$pages_error_margin # Watch memory page locations and validate results. memload_stats="$OUTPUT_DIR/memload-stats.txt" echo -n "" > "$memload_stats" max_rounds=30 round=0 declare -A pmem_pages_prev # number of pages in PMEM in previous round for wxrx in wmrm wmrn worm worn; do pmem_pages_prev[$wxrx]=0 done while (( round < max_rounds )); do vm-command-q ' cat /sys/devices/system/node/node[0-7]/meminfo | awk "/Active:/{a[\$2]=(\$4/1024)}END{s=\"active mem\";for(n=0;n<8;n++){s=sprintf(\"%s N%d=%.0fM\",s,n,a[n])}print s}" for p in $(pidof awk); do awkinfo=$(grep -a -o -E w[om]r[nm] /proc/$p/cmdline | head -n 1) rss=$(awk "/VmRSS:/{print \$2}" < /proc/$p/status); pages=$(echo $(grep -v file= /proc/$p/numa_maps | tr " " "\n" | awk -F= "/N([0-9])/{s[\$1]+=\$2}END{for(n=0;n<8;n++)if (s[\"N\"n]>0)print \"N\"n\"=\"s[\"N\"n]}")) echo "$awkinfo" pid "$p" VmRSS "$rss" kB, "pages:" "$pages" done' | while read line; do echo "round $round $line"; done | tee -a "$memload_stats" echo "validating..." # Check that at least something has migrated after scan period. if (( round > first_migrations_visible )); then grep -q -E 'pages:.*N[4-7]' "$memload_stats" || error "any of the awk processes was not migrated to PMEM in time" fi # Validate PMEM page migration speed. # Allow double the configured speed because stats polling interval > 1s. for wxrx in wmrm wmrn worm worn; do pmem_pages_now="$(grep "round $round $wxrx .*pages:" < "$memload_stats" | awk 'BEGIN{RS=" ";FS="=";pmem=0}/N[4-9]/{pmem+=$2}END{print pmem}')" if (( pmem_pages_now - pmem_pages_prev[$wxrx] > 2 * pages_per_second_per_process )); then error "number of PMEM pages of $wxrx grew too quickly on this round" fi pmem_pages_prev[$wxrx]=$pmem_pages_now done # Check that write-once-read-never (worn) has migrated and stays in PMEM. if (( round > 20 )); then worn_pmem_pages="$(grep "round $round worn .*pages:" < "$memload_stats" | awk 'BEGIN{RS=" ";FS="=";pmem=0}/N[4-9]/{pmem+=$2}END{print pmem}')" if (( worn_pmem_pages < fully_migrated_threshold )); then error "write-once-read-never was expected to end up and stay in PMEM, but only $worn_pmem_pages pages in PMEM." fi fi # Check that write-many-read-many and -read-never (wmrm and wmrn) stay in DRAM. 
for wmrx in wmrm wmrn; do wmrx_pmem_pages="$(grep "round $round $wmrx .*pages:" < "$memload_stats" | awk 'BEGIN{RS=" ";FS="=";pmem=0}/N[4-9]/{pmem+=$2}END{print pmem}')" if (( wmrx_pmem_pages > not_migrated_threshold )); then error "$wmrx was expected to stay in DRAM, but $wmrx_pmem_pages pages migrated to PMEM." fi done sleep 1 >/dev/null round=$(( round + 1 )) done echo "All rounds were good." kubectl delete pods --all --now --wait ================================================ FILE: test/e2e/policies.test-suite/topology-aware/c4pmem4/test04-dynamic-page-demotion/cri-resmgr-dynamic-page-demotion.cfg ================================================ policy: Active: topology-aware ReservedResources: CPU: 250m resource-manager: control: page-migration: PageScanInterval: 10s PageMoveInterval: 1s MaxPageMoveCount: 100 logger: Debug: cri-resmgr,resource-manager,cache,policy Klog: skip_headers: true ================================================ FILE: test/e2e/policies.test-suite/topology-aware/c4pmem4/test04-dynamic-page-demotion-deprecated-syntax/bb-memload.yaml.in ================================================ apiVersion: v1 kind: Pod metadata: name: ${NAME} annotations: cri-resource-manager.intel.com/memory-type: | pod0c0: dram,pmem pod0c1: dram,pmem pod0c2: dram,pmem pod0c3: dram,pmem spec: containers: - name: ${NAME}c0 image: busybox imagePullPolicy: IfNotPresent command: - sh - -c - while :; do dd status=none if=/dev/zero bs=$(( $BSIZE - 7 )) count=1 | tr '\\\0' 'A'; echo ${NAME}c0; done | awk '{r+=1;if(r<${WORN%M}*1024*1024/$BSIZE){worn[r]=\$1;wr+=1;}if($PRINT_WRBYTES_IF)print wr*$BSIZE;}' resources: requests: cpu: ${CPU} memory: $(( ${WORN%M} * 1024 * 1024 / $BSIZE + 100000 ))k limits: cpu: ${CPU} memory: $(( ${WORN%M} * 1024 * 1024 / $BSIZE + 100000 ))k - name: ${NAME}c1 image: busybox imagePullPolicy: IfNotPresent command: - sh - -c - while :; do dd status=none if=/dev/zero bs=$(( $BSIZE - 7 )) count=1 | tr '\\\0' 'A'; echo ${NAME}c1; done | awk '{r+=1;wmrn[r%(${WMRN%M}*1024*1024/$BSIZE)]=\$1;wr+=1;if($PRINT_WRBYTES_IF)print wr*$BSIZE;}' resources: requests: cpu: ${CPU} memory: $(( ${WMRN%M} * 1024 * 1024 / $BSIZE + 100000 ))k limits: cpu: ${CPU} memory: $(( ${WMRN%M} * 1024 * 1024 / $BSIZE + 100000 ))k - name: ${NAME}c2 image: busybox imagePullPolicy: IfNotPresent command: - sh - -c - while :; do dd status=none if=/dev/zero bs=$(( $BSIZE - 7 )) count=1 | tr '\\\0' 'A'; echo ${NAME}c2; done | awk '{r+=1;if (worm[r%(${WORM%M}*1024*1024/$BSIZE)]!=\$1){worm[r%(${WORM%M}*1024*1024/$BSIZE)]=\$1;wr+=1;}if($PRINT_WRBYTES_IF)print wr*$BSIZE;}' resources: requests: cpu: ${CPU} memory: $(( ${WORM%M} * 1024 * 1024 / $BSIZE + 100000 ))k limits: cpu: ${CPU} memory: $(( ${WORM%M} * 1024 * 1024 / $BSIZE + 100000 ))k - name: ${NAME}c3 image: busybox imagePullPolicy: IfNotPresent command: - sh - -c - while :; do dd status=none if=/dev/zero bs=$(( $BSIZE - 7 )) count=1 | tr '\\\0' 'A'; echo ${NAME}c3; done | awk '{r+=1;if (wmrm[r%(${WMRM%M}*1024*1024/$BSIZE)]!=\$1 || length(\$1) > 0){wmrm[r%(${WMRM%M}*1024*1024/$BSIZE)]=\$1;wr+=1;}if($PRINT_WRBYTES_IF)print wr*$BSIZE;}' resources: requests: cpu: ${CPU} memory: $(( ${WMRM%M} * 1024 * 1024 / $BSIZE + 100000 ))k limits: cpu: ${CPU} memory: $(( ${WMRM%M} * 1024 * 1024 / $BSIZE + 100000 ))k ================================================ FILE: test/e2e/policies.test-suite/topology-aware/c4pmem4/test04-dynamic-page-demotion-deprecated-syntax/code.var.sh ================================================ # Test migrating memory pages from DRAM to PMEM. 
# - Memory pages that are written once and never read # must be migrated to PMEM and must stay there. # - Memory pages that are actively written and read # must not be migrated to PMEM. # - Migration speed is as configured. # Relaunch cri-resmgr with dynamic page demotion configuration. cri_resmgr_cfg=$TEST_DIR/cri-resmgr-dynamic-page-demotion.cfg terminate cri-resmgr launch cri-resmgr # Different memory usage profiles are implemented with awk # in order to manage with the same busybox image as other tests. # Memory size parameters for the busybox memory load pod: # - BSIZE: Block size in bytes (length of each stored string) # The larger the block the faster the awk goes through its memory. # If too large, memory for strings is no more allocated from heap # which makes page tracking harder and breaks this test. # - WORN: Write Once Read Never # - WORM: Write Once Read Many # - WMRN: Write Many Read Never # - WMRM: Write Many Read Many PRINT_WRBYTES_IF="wr%1000==0 && wr<10000" CPU=500m BSIZE=4096 awkmem=2M WORN=$awkmem WORM=$awkmem WMRN=$awkmem WMRM=$awkmem create bb-memload # Calculate page migration speed from cri-resmgr configuration. pages_per_second_per_process="$(awk ' /MaxPageMoveCount:/{mpmc=$2} /PageMoveInterval:/{gsub(/[^0-9]/, "", $2); pmi=$2} END{print mpmc/pmi} ' < "$cri_resmgr_cfg")" # After how many rounds (seconds) first migrations should be visible. first_migrations_visible="$(awk ' /PageScanInterval:/{gsub(/[^0-9]/, "", $2); print $2+8} ' < "$cri_resmgr_cfg")" # Expected migrated number of pages when fully migrated. pages_error_margin=100 fully_migrated_threshold=$(( ${awkmem%M} * 1024 * 1024 / 4096 - pages_error_margin )) # Maximum number of pages in PMEM when not migrated. not_migrated_threshold=$pages_error_margin # Watch memory page locations and validate results. memload_stats="$OUTPUT_DIR/memload-stats.txt" echo -n "" > "$memload_stats" max_rounds=30 round=0 declare -A pmem_pages_prev # number of pages in PMEM in previous round for wxrx in wmrm wmrn worm worn; do pmem_pages_prev[$wxrx]=0 done while (( round < max_rounds )); do vm-command-q ' cat /sys/devices/system/node/node[0-7]/meminfo | awk "/Active:/{a[\$2]=(\$4/1024)}END{s=\"active mem\";for(n=0;n<8;n++){s=sprintf(\"%s N%d=%.0fM\",s,n,a[n])}print s}" for p in $(pidof awk); do awkinfo=$(grep -a -o -E w[om]r[nm] /proc/$p/cmdline | head -n 1) rss=$(awk "/VmRSS:/{print \$2}" < /proc/$p/status); pages=$(echo $(grep -v file= /proc/$p/numa_maps | tr " " "\n" | awk -F= "/N([0-9])/{s[\$1]+=\$2}END{for(n=0;n<8;n++)if (s[\"N\"n]>0)print \"N\"n\"=\"s[\"N\"n]}")) echo "$awkinfo" pid "$p" VmRSS "$rss" kB, "pages:" "$pages" done' | while read line; do echo "round $round $line"; done | tee -a "$memload_stats" echo "validating..." # Check that at least something has migrated after scan period. if (( round > first_migrations_visible )); then grep -q -E 'pages:.*N[4-7]' "$memload_stats" || error "any of the awk processes was not migrated to PMEM in time" fi # Validate PMEM page migration speed. # Allow double the configured speed because stats polling interval > 1s. for wxrx in wmrm wmrn worm worn; do pmem_pages_now="$(grep "round $round $wxrx .*pages:" < "$memload_stats" | awk 'BEGIN{RS=" ";FS="=";pmem=0}/N[4-9]/{pmem+=$2}END{print pmem}')" if (( pmem_pages_now - pmem_pages_prev[$wxrx] > 2 * pages_per_second_per_process )); then error "number of PMEM pages of $wxrx grew too quickly on this round" fi pmem_pages_prev[$wxrx]=$pmem_pages_now done # Check that write-once-read-never (worn) has migrated and stays in PMEM. 
if (( round > 20 )); then worn_pmem_pages="$(grep "round $round worn .*pages:" < "$memload_stats" | awk 'BEGIN{RS=" ";FS="=";pmem=0}/N[4-9]/{pmem+=$2}END{print pmem}')" if (( worn_pmem_pages < fully_migrated_threshold )); then error "write-once-read-never was expected to end up and stay in PMEM, but only $worn_pmem_pages pages in PMEM." fi fi # Check that write-many-read-many and -read-never (wmrm and wmrn) stay in DRAM. for wmrx in wmrm wmrn; do wmrx_pmem_pages="$(grep "round $round $wmrx .*pages:" < "$memload_stats" | awk 'BEGIN{RS=" ";FS="=";pmem=0}/N[4-9]/{pmem+=$2}END{print pmem}')" if (( wmrx_pmem_pages > not_migrated_threshold )); then error "$wmrx was expected to stay in DRAM, but $wmrx_pmem_pages pages migrated to PMEM." fi done sleep 1 >/dev/null round=$(( round + 1 )) done echo "All rounds were good." kubectl delete pods --all --now --wait ================================================ FILE: test/e2e/policies.test-suite/topology-aware/c4pmem4/test04-dynamic-page-demotion-deprecated-syntax/cri-resmgr-dynamic-page-demotion.cfg ================================================ policy: Active: topology-aware ReservedResources: CPU: 250m resource-manager: control: page-migration: PageScanInterval: 10s PageMoveInterval: 1s MaxPageMoveCount: 100 logger: Debug: cri-resmgr,resource-manager,cache,policy Klog: skip_headers: true ================================================ FILE: test/e2e/policies.test-suite/topology-aware/c4pmem4/test05-guarantee-memory/code.var.sh ================================================ CRI_RESMGR_OUTPUT="cat cri-resmgr.output.txt | tr -d '\0'" CRI_RESMGR_ROTATE="echo > cri-resmgr.output.txt" podno=0 kubectl delete pod --all --now --wait # account for being done with test for the current pod nextpod () { podno=$((podno+1)) } # print current pod name pod () { echo pod$podno } # print current container name, by default for current pod container () { local _p _c case $# in 0) _p=${podno}; _c=0;; 1) _p=${podno}; _c=$1;; 2) _p=$1; _c=$2;; *) _c=pod${1}c${2}; shift 2 echo ${_c}_INVALID_WITH_EXTRA_${#}_ARGS_$(echo $* | tr -s ' ' '_') return 1 ;; esac case $_p in +*|-*) _p=$((${podno}$_p));; esac echo pod${_p}c${_c} } # rotate cri-resmgr logs rotate_log () { vm-command "$CRI_RESMGR_ROTATE" } ########################################################################### # test #1: squeeze multiple containers in every NUMA node # # We squeeze an increasing number of containers in all NUMA node pools # in a loop. For every iteration we calculate the usable amount of CPU # and memory based on the available number of NUMA nodes and the amount # of CPU and memory per NUMA node. We use a conservative estimate for # the amount of memory available per NUMA node because some of them will # have a sizeable allocation by the kernel. # rotate_log # use conservative estimate for available memory per node PER_NODE_MEM=$((1500+4000)) PER_NODE_CPU=1000 PER_NODE_PMEM=1 NODE_COUNT_TOTAL=4 # All nodes have only a single CPU. Thus, with any (< 1000m) CPU reservation # we'll have one node (#0) fully reserved for kube-system containers. Hence, # our (usable) node count is one less than the total one. NODE_COUNT=$((NODE_COUNT_TOTAL - 1)) for pernode in 2 3 4; do cpu=$(echo "scale=3;0.75*$PER_NODE_CPU/$pernode" | bc | cut -d '.' -f1) mem=$(echo "scale=3;0.75*$PER_NODE_MEM/$pernode" | bc | cut -d '.' 
-f1)
    CPU=${cpu}m MEM=${mem}Mi CONTCOUNT=$((pernode*NODE_COUNT)) create guaranteed
    echo "Verify that any pod's containers were not raised to guarantee memory"
    echo ""
    vm-command "$CRI_RESMGR_OUTPUT | grep upward" && {
        pp mems
        error "Unexpected memset upward expansion detected!"
    }
    echo "Verify that all containers are pinned to a single NUMA node"
    echo ""
    c=0; while [ "$c" -lt "$((pernode*NODE_COUNT))" ]; do
        verify "len(mems['$(container $c)']) == $((1+PER_NODE_PMEM))"
        c=$((c+1))
    done
    kubectl delete pod --all --now --wait
    nextpod
done

###########################################################################
# test #2: negative test for lifting containers upwards.
#
# This test first creates a pod that fits into a single NUMA node. Then
# it creates a pod that allocates a negligible amount of memory from the
# root node (by asking for more CPU than a single NUMA node can provide).
# The allocation of this pod must not cause lifting pod0 containers'
# memory assignment upwards in the pool tree.
#
rotate_log
CPU=200m MEM=100M create guaranteed
report allowed
verify "len(mems['$(container 0)']) == 2"
nextpod
CPU=1200m MEM=100M create guaranteed
report allowed
verify "len(mems['$(container -1 0)']) == 2" \
       "len(mems['$(container 0)']) == 8"
echo "Verify that $(pod)'s containers were not raised to guarantee memory"
echo ""
vm-command "$CRI_RESMGR_OUTPUT | grep upward" && {
    pp mems
    error "Unexpected memset upward expansion detected!"
}
kubectl delete pod $(pod) --now --wait --ignore-not-found
nextpod

###########################################################################
# test #3: positive test for lifting containers upwards.
#
# This test creates two containers which both get their own socket and
# take > 50 % of their socket's mem. Then it reserves a lot of memory
# from the root node to force lifting one of the containers. Every socket
# has 6G PMEM+DRAM, one pod's containers take 5G and the other's take 2G.
# => pessimistic max 7G will not fit to any socket
# => no memory grants can be given to any socket alone.
#
CPU=200m MEM=5G CONTCOUNT=2 create guaranteed
report allowed
verify "len(mems['$(container 0)']) == 2" \
       "len(mems['$(container 1)']) == 2" \
       "mems['$(container 0)'] != mems['$(container 1)']"
nextpod
CPU=1200m MEM=2G create guaranteed
echo "Verify that $(pod)'s containers were raised to guarantee memory"
echo ""
vm-command "$CRI_RESMGR_OUTPUT | grep upward" || {
    error "Expected memset upward expansion not found!"
}
report allowed
pp mems
verify "len(mems['$(container 0)']) == 8" \
       "len(mems['$(container -1 0)']) == 8" \
       "len(mems['$(container -1 1)']) == 8"


================================================
FILE: test/e2e/policies.test-suite/topology-aware/c4pmem4/topology.var.json
================================================
[
    {"mem": "2G", "threads": 1, "cores": 1, "packages": 4},
    {"mem": "4G", "node-dist": {"2": 17}},
    {"mem": "4G", "node-dist": {"3": 17}},
    {"mem": "4G", "node-dist": {"0": 17}},
    {"mem": "4G", "node-dist": {"1": 17}}
]


================================================
FILE: test/e2e/policies.test-suite/topology-aware/cri-resmgr.cfg
================================================
policy:
  Active: topology-aware
  ReservedResources:
    CPU: 750m
logger:
  Debug: cri-resmgr,resource-manager,cache,policy
  Klog:
    skip_headers: true
dump:
  Config: off:.*,full:((Create)|(Start)|(Run)|(Update)|(Stop)|(Remove)).*,off:.*Image.*


================================================
FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test00-basic-placement/code.var.sh
================================================
# pod0: Test that 4 guaranteed containers eligible for isolated CPU allocation
# get evenly spread over NUMA nodes.
CONTCOUNT=4 CPU=1 create guaranteed
report allowed
verify \
    'len(cpus["pod0c0"]) == 1' \
    'len(cpus["pod0c1"]) == 1' \
    'len(cpus["pod0c2"]) == 1' \
    'len(cpus["pod0c3"]) == 1' \
    'disjoint_sets(cpus["pod0c0"], cpus["pod0c1"], cpus["pod0c2"], cpus["pod0c3"])' \
    'disjoint_sets(nodes["pod0c0"], nodes["pod0c1"], nodes["pod0c2"], nodes["pod0c3"])'
kubectl delete pods --all --now --wait

# pod1: Test that 4 guaranteed containers not eligible for isolated CPU allocation
# get evenly spread over NUMA nodes.
CONTCOUNT=4 CPU=3 create guaranteed
report allowed
verify \
    'len(cpus["pod1c0"]) == 3' \
    'len(cpus["pod1c1"]) == 3' \
    'len(cpus["pod1c2"]) == 3' \
    'len(cpus["pod1c3"]) == 3' \
    'disjoint_sets(cpus["pod1c0"], cpus["pod1c1"], cpus["pod1c2"], cpus["pod1c3"])' \
    'disjoint_sets(nodes["pod1c0"], nodes["pod1c1"], nodes["pod1c2"], nodes["pod1c3"])'
kubectl delete pods --all --now --wait

# pod2: Test that 4 burstable containers not eligible for isolated/exclusive CPU allocation
# get evenly spread over NUMA nodes.
CONTCOUNT=4 CPUREQ=2 CPULIM=4 create burstable
report allowed
verify \
    'disjoint_sets(cpus["pod2c0"], cpus["pod2c1"], cpus["pod2c2"], cpus["pod2c3"])' \
    'disjoint_sets(nodes["pod2c0"], nodes["pod2c1"], nodes["pod2c2"], nodes["pod2c3"])'
kubectl delete pods --all --now --wait

# pod3: Test that initContainer resources are freed before launching
# containers: instantiate 5 init containers, each requiring 5 CPUs. If
# the resources of an init container weren't freed before the next init
# container is launched, not all of them could be launched, and the
# real containers could not fit on the node.
ICONTCOUNT=5 ICONTSLEEP=1 CONTCOUNT=2 CPU=5 MEM=100M create guaranteed
report allowed
verify \
    'disjoint_sets(cpus["pod3c0"], cpus["pod3c1"])' \
    'disjoint_sets(nodes["pod3c0"], nodes["pod3c1"])' \
    'disjoint_sets(packages["pod3c0"], packages["pod3c1"])'
kubectl delete pods --all --now --wait

# pod4: Test that with pod colocation enabled containers within a pod get
# colocated (assigned topologically close to each other) as opposed to being
# evenly spread out.
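# (Sketch of the templating below: "instantiate" renders cri-resmgr.cfg.in,
# shown after this script, with the current environment, so setting
# COLOCATE_PODS=true turns the rendered default
#   topology-aware:
#     ColocatePods: false
# into ColocatePods: true before cri-resmgr is relaunched.)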
terminate cri-resmgr cri_resmgr_cfg=$(COLOCATE_PODS=true instantiate cri-resmgr.cfg) launch cri-resmgr CONTCOUNT=4 CPU=100m create guaranteed report allowed verify \ 'cpus["pod4c1"] == cpus["pod4c0"]' \ 'cpus["pod4c2"] == cpus["pod4c0"]' \ 'cpus["pod4c3"] == cpus["pod4c0"]' kubectl delete pods --all --now --wait # pod{5,6,7}: Test that with namespace colocation enabled containers of pods # in the same namespace get colocated (assigned topologically close to each # other) as opposed to being evenly spread out. terminate cri-resmgr cri_resmgr_cfg=$(COLOCATE_NAMESPACES=true instantiate cri-resmgr.cfg) launch cri-resmgr kubectl create namespace test-ns CONTCOUNT=1 CPU=100m namespace=test-ns create guaranteed CONTCOUNT=1 CPU=100m namespace=test-ns create guaranteed CONTCOUNT=2 CPU=100m namespace=test-ns create guaranteed report allowed verify \ 'cpus["pod6c0"] == cpus["pod5c0"]' \ 'cpus["pod7c0"] == cpus["pod5c0"]' \ 'cpus["pod7c1"] == cpus["pod5c0"]' kubectl delete namespace test-ns --now --wait --ignore-not-found # Restore default test configuration, restart cri-resmgr. terminate cri-resmgr cri_resmgr_cfg=$(instantiate cri-resmgr.cfg) launch cri-resmgr ================================================ FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test00-basic-placement/cri-resmgr.cfg.in ================================================ policy: Active: topology-aware ReservedResources: CPU: 750m topology-aware: ColocatePods: $(echo ${COLOCATE_PODS:-false}) ColocateNamespaces: $(echo ${COLOCATE_NAMESPACES:-false}) logger: Debug: cri-resmgr,resource-manager,cache,policy Klog: skip_headers: true dump: Config: off:.*,full:((Create)|(Start)|(Run)|(Update)|(Stop)|(Remove)).*,off:.*Image.* ================================================ FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test01-always-fits/code.var.sh ================================================ # Test that guaranteed and burstable pods get the CPUs they require # when there are enough CPUs available. # pod0, fits in a core CPU=1 create guaranteed report allowed verify \ 'node_ids(nodes["pod0c0"]) == {1}' \ 'cpu_ids(cpus["pod0c0"]) == {4}' # pod1, takes full core - from a different node than pod0 CPU=2 create guaranteed report allowed verify \ 'cpu_ids(cpus["pod0c0"]) == {4}' \ 'node_ids(nodes["pod1c0"]) == {2}' \ 'cpu_ids(cpus["pod1c0"]) == {8, 9}' # pod2, does not fit in a core but fits in a node CPU=3 create guaranteed report allowed verify \ 'len(cpus["pod0c0"]) == 1' \ 'len(cpus["pod1c0"]) == 2' \ 'len(cores["pod1c0"]) == 1' \ 'len(cpus["pod2c0"]) == 3' \ 'len(cores["pod2c0"]) == 2' \ 'len(nodes["pod2c0"]) == 1' \ 'disjoint_sets(cpus["pod0c0"], cpus["pod1c0"], cpus["pod2c0"])' # pod3, tries to fully exhaust the shared subset of a (NUMA node) pool # Currently topology-aware refuses to exhaust even idle shared CPU subsets of # a pool. Therefore such attempts will try to squeeze the container to # another pool at the same level or, if none found, push the container # one level up to the parent pool. # # There is a pending commit to change this behavior to allow exhausting # fully idle subsets (no active shared grants). Once that lands, update # this test accordingly as well. 
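# With the current behavior, the 4-CPU request below thus cannot be satisfied
# from any single (4-CPU) NUMA node pool, and the verify that follows expects
# pod3c0 to end up one level higher in the pool tree, spanning two nodes
# (len(nodes["pod3c0"]) == 2).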
CPU=4 create guaranteed report allowed verify \ 'len(cpus["pod0c0"]) == 1' \ 'len(cpus["pod1c0"]) == 2' \ 'len(cores["pod1c0"]) == 1' \ 'len(cpus["pod2c0"]) == 3' \ 'len(cores["pod2c0"]) == 2' \ 'len(nodes["pod2c0"]) == 1' \ 'len(cpus["pod3c0"]) == 4' \ 'len(cores["pod3c0"]) == 2' \ 'len(nodes["pod3c0"]) == 2' \ 'disjoint_sets(cpus["pod0c0"], cpus["pod1c0"], cpus["pod2c0"], cpus["pod3c0"])' kubectl delete pods --all --now --wait # pod4, fits in a die/package CPU=5 create guaranteed report allowed verify \ 'len(cpus["pod4c0"]) == 5' \ 'len(cores["pod4c0"]) == 3' \ 'len(nodes["pod4c0"]) == 2' \ 'len(dies["pod4c0"]) == 1' # pod5, takes a full die/package # cpu0 is reserved, so allocating 7 CPUs is expected to fill package0/die0 CPU=7 create guaranteed report allowed verify \ 'len(cpus["pod4c0"]) == 5' \ 'len(cores["pod4c0"]) == 3' \ 'len(nodes["pod4c0"]) == 2' \ 'len(dies["pod4c0"]) == 1' \ 'len(cpus["pod5c0"]) == 7' \ 'len(cores["pod5c0"]) == 4' \ 'len(dies["pod5c0"]) == 1' \ 'disjoint_sets(cpus["pod4c0"], cpus["pod5c0"])' kubectl delete pods --all --now --wait # pod6, doesn't fit in a die/package, needs virtual root CPU=9 create guaranteed report allowed verify \ 'len(cpus["pod6c0"]) == 9' \ 'len(packages["pod6c0"]) == 2' kubectl delete pods --all --now --wait reset counters # pod0, burstable containers must get at least the cores they require CPUREQ=3 CPULIM=$(( CPUREQ + 1 )) create burstable report allowed verify \ 'len(cpus["pod0c0"]) >= 2' # pod1 CPUREQ=4 CPULIM=$(( CPUREQ + 1 )) create burstable report allowed verify \ 'len(cpus["pod0c0"]) >= 2' \ 'len(cpus["pod1c0"]) >= 4' # pod2 CPUREQ=5 CPULIM=$(( CPUREQ + 1 )) create burstable report allowed verify \ 'len(cpus["pod0c0"]) >= 2' \ 'len(cpus["pod1c0"]) >= 4' \ 'len(cpus["pod2c0"]) >= 5' kubectl delete pods pod0 pod1 --now --wait --ignore-not-found # pod3 CPUREQ=8 CPULIM=$(( CPUREQ + 1 )) create burstable report allowed verify \ 'len(cpus["pod2c0"]) >= 5' \ 'len(cpus["pod3c0"]) >= 8' kubectl delete pods pod3 --now --wait --ignore-not-found # pod4, pod5 (and existing pod2) take 5 and 4 CPUs. As there are 8 # CPUs/node, pod2 and pod4 have consumed free node # pairs/dies/packages. pod5 will be spread across nodes. CPUREQ=5 CPULIM=$(( CPUREQ + 1 )) create burstable report allowed CPUREQ=4 CPULIM=$(( CPUREQ + 1 )) create burstable report allowed verify \ 'len(cpus["pod2c0"]) >= 5' \ 'len(cpus["pod4c0"]) >= 5' \ 'len(cpus["pod5c0"]) >= 4' ================================================ FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test02-shrink-and-grow-shared/code.var.sh ================================================ # pod0: require 10 out of 16 CPUs with two containers. # Both containers should fit in their own die. (8 CPUs per die.) CPU=5 CONTCOUNT=2 create guaranteed report allowed verify \ 'len(cpus["pod0c0"]) == 5' \ 'len(cpus["pod0c1"]) == 5' \ 'len(nodes["pod0c0"]) == len(nodes["pod0c1"]) == 2' \ 'len(dies["pod0c0"]) == len(dies["pod0c1"]) == 1' \ 'disjoint_sets(cpus["pod0c0"], cpus["pod0c1"])' # pod1: two containers in a besteffort pod. CONTCOUNT=2 create besteffort report allowed verify \ 'len(cpus["pod0c0"]) == 5' \ 'len(cpus["pod0c1"]) == 5' \ 'disjoint_sets(set.union(cpus["pod0c0"], cpus["pod0c1"]))' \ 'len(cpus["pod1c0"]) > 0' \ 'len(cpus["pod1c1"]) > 0' \ 'disjoint_sets( set.union(cpus["pod0c0"], cpus["pod0c1"]), set.union(cpus["pod1c0"], cpus["pod1c1"]))' # Delete pod0 delete pods/pod0 --now report allowed # Next squeeze the besteffort containers to the minimum. 
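# (A rough capacity sketch for the squeeze below, assuming the n4c16
# topology of this test directory, 16 CPUs in 4 nodes, and the 750m
# ReservedResources from cri-resmgr.cfg: cpu0 is reserved, pod2 takes
# 4 x 3 = 12 exclusive CPUs and pod3 one more, which leaves only
# 16 - 1 - 12 - 1 = 2 CPUs in shared pools for pod1's besteffort
# containers.)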
# pod2: 4 guaranteed containers, each requiring 3 CPUs.
CPU=3 CONTCOUNT=4 create guaranteed
report allowed
verify \
    'len(cpus["pod2c0"]) == len(cpus["pod2c1"]) == len(cpus["pod2c2"]) == len(cpus["pod2c3"]) == 3' \
    'disjoint_sets(cpus["pod2c0"], cpus["pod2c1"], cpus["pod2c2"], cpus["pod2c3"])'

# pod3: 1 guaranteed container taking the last non-reserved CPU
# that can be taken from shared pools.
CPU=1 create guaranteed
report allowed
verify \
    'disjoint_sets(
         set.union(cpus["pod1c0"], cpus["pod1c1"]),
         set.union(cpus["pod3c0"], cpus["pod2c0"], cpus["pod2c1"], cpus["pod2c2"], cpus["pod2c3"]))'


================================================
FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test03-simple-affinity/code.var.sh
================================================
# Test that guaranteed and burstable pods get the CPUs they require
# when there are enough CPUs available.

inject-affinities() {
    local var=$1 srcdst src dst hdr line
    shift
    if [ -z "$var" ] || [ -z "${!var}" ]; then
        return 0
    fi
    case "$var" in
        ANTI_*|*_ANTI_*) hdr="cri-resource-manager.intel.com/anti-affinity";;
        *) hdr="cri-resource-manager.intel.com/affinity";;
    esac
    for srcdst in ${!var}; do
        src=${srcdst%:*}
        dst=${srcdst#*:}
        [ -n "$hdr" ] && { echo " $hdr: |"; hdr=""; }
        line="$src: [ ${dst//,/, } ]"
        echo 1>&2 "* [affinity]: injecting affinity '$line'"
        echo " $line"
    done
}

deref_keys() {
    eval "echo \${!$1[@]}"
}

deref_value() {
    eval "echo \${$1[$2]}"
}

inject-annotations() {
    local var=$1 values key value
    shift
    if [ -z "$var" ] || [ -z "${!var}" ]; then
        return 0
    fi
    for key in $(deref_keys ${!var}); do
        value=$(deref_value ${!var} $key)
        line="$key: $value"
        echo 1>&2 "* [annotation]: injecting annotation '$line'"
        echo " $line"
    done
}

# pod0
# 4 containers, no affinities => spread out evenly over NUMA nodes
CONTCOUNT=4 CPU=1 create guaranteed+affinity
report allowed
verify \
    'nodes["pod0c0"] == {"node1"}' \
    'nodes["pod0c1"] == {"node2"}' \
    'nodes["pod0c2"] == {"node3"}' \
    'nodes["pod0c3"] == {"node0"}'
kubectl delete pods --all --now --wait

# pod1
# 4 containers, affinities [0,1], [2,3] => colocate c0,c1 in node1, c2,c3 in node2
CONTCOUNT=4 AFFINITIES="pod1c0:pod1c1 pod1c2:pod1c3" CPU=1 create guaranteed+affinity
report allowed
verify \
    'nodes["pod1c0"] == nodes["pod1c1"] == {"node1"}' \
    'nodes["pod1c2"] == nodes["pod1c3"] == {"node2"}'
kubectl delete pods --all --now --wait

# pod2
# 6 containers, anti-affinities 4:[0,1,2], 5:[0,2,3]
# => don't co-locate 4 with {0,1,2}, or 5 with {0,2,3}
CONTCOUNT=6 ANTI_AFFINITIES="pod2c4:pod2c0,pod2c1,pod2c2 pod2c5:pod2c0,pod2c2,pod2c3" CPU=1 \
    create guaranteed+affinity
report allowed
verify \
    'disjoint_sets(nodes["pod2c4"], nodes["pod2c0"], nodes["pod2c1"], nodes["pod2c2"])' \
    'disjoint_sets(nodes["pod2c5"], nodes["pod2c0"], nodes["pod2c2"], nodes["pod2c3"])'


================================================
FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test03-simple-affinity/guaranteed+affinity.yaml.in
================================================
apiVersion: v1
kind: Pod
metadata:
  name: ${NAME}
  labels:
    app: ${NAME}
  annotations:
$([ -z "$(type -t inject-affinities)" ] || inject-affinities AFFINITIES)
$([ -z "$(type -t inject-affinities)" ] || inject-affinities ANTI_AFFINITIES)
$([ -z "$(type -t inject-annotations)" ] || inject-annotations ANNOTATIONS)
spec:
  containers:
$(for contnum in $(seq 1 ${CONTCOUNT}); do echo "
  - name: ${NAME}c$(( contnum - 1 ))
    image: busybox
    imagePullPolicy: IfNotPresent
    command:
      - sh
      - -c
      - echo ${NAME}c$(( contnum - 1 )) \$(sleep inf)
    resources:
      requests:
        cpu: ${CPU}
        memory: '${MEM}'
      limits:
        cpu: ${CPU}
        memory: '${MEM}'
"; done )
  terminationGracePeriodSeconds: 1


================================================
FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test04-available-resources/code.var.sh
================================================
# Test that AvailableResources are honored.

# Test explicit cpuset in AvailableResources.CPU
terminate cri-resmgr
AVAILABLE_CPU="cpuset:4-7,8-11" cri_resmgr_cfg=$(instantiate cri-resmgr-available-resources.cfg) launch cri-resmgr
# pod0: exclusive CPUs
CPU=3 create guaranteed
verify "cpus['pod0c0'] == {'cpu04', 'cpu05', 'cpu06'}" \
       "mems['pod0c0'] == {'node1'}"
# pod1: shared CPUs
CONTCOUNT=2 CPU=980m create guaranteed
verify "cpus['pod1c0'] == {'cpu08', 'cpu09', 'cpu10'}" \
       "cpus['pod1c1'] == {'cpu08', 'cpu09', 'cpu10'}" \
       "mems['pod1c0'] == {'node2'}" \
       "mems['pod1c1'] == {'node2'}"
kubectl delete pods --all --now --wait
reset counters

# Test cgroup cpuset directory in AvailableResources.CPU
test-and-verify-allowed() {
    # pod0: shared CPUs
    CONTCOUNT=2 CPU=980m create guaranteed
    report allowed
    verify "cpus['pod0c0'] == {'cpu0$1', 'cpu0$2', 'cpu0$3'}" \
           "cpus['pod0c1'] == {'cpu0$4'}"
    # pod1: exclusive CPU
    CPU=1 create guaranteed
    report allowed
    verify "disjoint_sets(cpus['pod1c0'], cpus['pod0c0'])" \
           "disjoint_sets(cpus['pod1c0'], cpus['pod0c1'])"
    kubectl delete pods --all --now --wait
    reset counters
}

if vm-command "[ -d /sys/fs/cgroup/cpuset ]"; then
    # cgroup v1
    CGROUP_CPUSET=/sys/fs/cgroup/cpuset
else
    # cgroup v2
    CGROUP_CPUSET=/sys/fs/cgroup
fi

CRIRM_CGROUP=$CGROUP_CPUSET/cri-resmgr-test-05-1
vm-command "rmdir $CRIRM_CGROUP; mkdir $CRIRM_CGROUP; echo 1-4,11 > $CRIRM_CGROUP/cpuset.cpus"
terminate cri-resmgr
AVAILABLE_CPU="\"$CRIRM_CGROUP\"" cri_resmgr_cfg=$(instantiate cri-resmgr-available-resources.cfg) launch cri-resmgr
test-and-verify-allowed 1 2 3 4
vm-command "rmdir $CRIRM_CGROUP || true"

CRIRM_CGROUP=$CGROUP_CPUSET/cri-resmgr-test-05-2
vm-command "rmdir $CRIRM_CGROUP; mkdir $CRIRM_CGROUP; echo 5-8,11 > $CRIRM_CGROUP/cpuset.cpus"
terminate cri-resmgr
AVAILABLE_CPU="\"${CRIRM_CGROUP#/sys/fs/cgroup/cpuset}\"" cri_resmgr_cfg=$(instantiate cri-resmgr-available-resources.cfg) launch cri-resmgr
test-and-verify-allowed 5 6 7 8
vm-command "rmdir $CRIRM_CGROUP || true"

# cleanup, do not leave weirdly configured cri-resmgr running
terminate cri-resmgr


================================================
FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test04-available-resources/cri-resmgr-available-resources.cfg.in
================================================
policy:
  Active: topology-aware
  AvailableResources:
    cpu: ${AVAILABLE_CPU}
  ReservedResources:
    cpu: cpuset:11
logger:
  Debug: cri-resmgr,resource-manager,cache,policy
  Klog:
    skip_headers: true
dump:
  Config: off:.*,full:((Create)|(Start)|(Run)|(Update)|(Stop)|(Remove)).*,off:.*Image.*


================================================
FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test05-reserved-resources/code.var.sh
================================================
# Test that
# - kube-system containers are pinned on Reserved CPUs.
# - Reserved CPU allocation and releasing works.
# - A pod cannot be launched if reserved CPU capacity is insufficient.

AVAILABLE_CPU="cpuset:4-7,8-13"
cri_resmgr_cfg_orig=$cri_resmgr_cfg

# This script will create pods in the kube-system namespace
# that is not automatically cleaned up by the framework.
# Make sure the namespace is clear when starting the test and clean it up
# if exiting with success. Otherwise leave the pod running for
# debugging in case of a failure.
cleanup-kube-system() {
    ( kubectl delete pods pod0 pod1 pod2 pod3 pod4 pod5 -n kube-system --now --wait --ignore-not-found ) || true
}
cleanup-kube-system

# Test launch failure, Reserved CPUs are not a subset of Available CPUs
terminate cri-resmgr
RESERVED_CPU="cpuset:3,7,11,15" cri_resmgr_cfg=$(instantiate cri-resmgr-reserved.cfg)
( launch cri-resmgr ) && error "unexpected success" || {
    echo "Launch failed as expected"
}

# Test launch failure, there are more reserved CPUs than available CPUs
terminate cri-resmgr
RESERVED_CPU="11" cri_resmgr_cfg=$(instantiate cri-resmgr-reserved.cfg)
( launch cri-resmgr ) && error "unexpected success" || {
    echo "Launch failed as expected"
}

# Test that BestEffort containers are allowed to run on both Reserved
# CPUs when the CPUs are on the same NUMA node.
terminate cri-resmgr
RESERVED_CPU="cpuset:10-11" cri_resmgr_cfg=$(instantiate cri-resmgr-reserved.cfg)
launch cri-resmgr
namespace=kube-system CONTCOUNT=3 create besteffort
report allowed
verify "cpus['pod0c0'] == cpus['pod0c1'] == cpus['pod0c2'] == {'cpu10', 'cpu11'}"
kubectl delete -n kube-system pods pod0 --now --wait --ignore-not-found

# Test that BestEffort containers are pinned to reserved CPUs.
terminate cri-resmgr
RESERVED_CPU="cpuset:7,11" cri_resmgr_cfg=$(instantiate cri-resmgr-reserved.cfg)
launch cri-resmgr
namespace=kube-system CONTCOUNT=4 create besteffort
report allowed
verify "cpus['pod1c0'] == cpus['pod1c1'] == cpus['pod1c2'] == cpus['pod1c3']" \
       "cpus['pod1c0'] == {'cpu07', 'cpu11'}"

# Test that guaranteed kube-system pods are pinned to Reserved CPUs.
namespace=kube-system CPU=200m CONTCOUNT=4 create guaranteed
report allowed
verify "cpus['pod2c0'] == cpus['pod2c1'] == cpus['pod2c2'] == cpus['pod2c3']" \
       "cpus['pod2c0'] == {'cpu07', 'cpu11'}"

# Test requesting more reserved CPUs than are available on a single node
# but that still fit in the node tree.
# pod2 already consumed 4 * 200m of reserved CPUs that have been balanced
# so that at least 200m from both nodes have been consumed. There are
# at most 800m reserved CPUs free on both nodes. Root node still has
# 1200m free. That is, a 1000m-requesting, isolated-looking guaranteed
# pod should fit in because reserved CPUs are not isolated.
#
# Run this twice to make sure allocated reserved CPUs are released correctly.
for pod in pod3 pod4; do
    namespace=kube-system CPU=1 CONTCOUNT=1 create guaranteed
    verify "cpus['${pod}c0'] == {'cpu07', 'cpu11'}"
    kubectl delete -n kube-system pods/$pod --now --wait --ignore-not-found
done

# Test requesting more reserved CPUs than available in the system.
# pod5 is expected to run on shared CPUs.
namespace=kube-system CPU=2 CONTCOUNT=1 create guaranteed
report allowed
verify "cpus['pod5c0'] == {'cpu04', 'cpu05', 'cpu06', 'cpu08', 'cpu09', 'cpu10', 'cpu12', 'cpu13'}"
cleanup-kube-system

# Test that the first available CPUs are reserved when reserving milli CPUs.
# The number of reserved CPUs is the ceiling of the milli CPUs.
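# (Worked example of the rounding below: RESERVED_CPU=2250m rounds up to
# ceil(2250 / 1000) = 3 reserved CPUs, and with AVAILABLE_CPU="cpuset:4-7,8-13"
# the first available CPUs are cpu4, cpu5 and cpu6, which is exactly the set
# the verify below expects for the kube-system besteffort pod.)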
reset counters terminate cri-resmgr RESERVED_CPU="2250m" cri_resmgr_cfg=$(instantiate cri-resmgr-reserved.cfg) launch cri-resmgr namespace=kube-system CPU=2 CONTCOUNT=1 create besteffort verify "cpus['pod0c0'] == {'cpu04', 'cpu05', 'cpu06'}" kubectl delete -n kube-system pods/pod0 --now --wait --ignore-not-found terminate cri-resmgr cri_resmgr_cfg=$cri_resmgr_cfg_orig launch cri-resmgr ================================================ FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test05-reserved-resources/cri-resmgr-reserved.cfg.in ================================================ policy: Active: topology-aware AvailableResources: cpu: ${AVAILABLE_CPU} ReservedResources: cpu: ${RESERVED_CPU} logger: Debug: cri-resmgr,resource-manager,cache,policy Klog: skip_headers: true dump: Config: off:.*,full:((Create)|(Start)|(Run)|(Update)|(Stop)|(Remove)).*,off:.*Image.* ================================================ FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test06-fuzz/code.var.sh ================================================ source $TEST_DIR/codelib.sh || { echo "error importing codelib.sh" exit 1 } # Clean test pods from the kube-system namespace ( kubectl delete pods --now --wait --ignore-not-found -n kube-system $(kubectl get pods -n kube-system | awk '/t[0-9]r[gb][ue]/{print $1}') ) || true # Run generated*.sh test scripts in this directory. genscriptcount=0 for genscript in "$TEST_DIR"/generated*.sh; do if [ ! -f "$genscript" ]; then continue fi ( paralleloutdir="$outdir/parallel$genscriptcount" [ -d "$paralleloutdir" ] && rm -rf "$paralleloutdir" mkdir "$paralleloutdir" OUTPUT_DIR="$paralleloutdir" COMMAND_OUTPUT_DIR="$paralleloutdir/commands" mkdir "$COMMAND_OUTPUT_DIR" source "$genscript" 2>&1 | sed -u -e "s/^/$(basename "$genscript"): /g" ) & genscriptcount=$(( genscriptcount + 1)) done if [[ "$genscriptcount" == "0" ]]; then echo "WARNING:" echo "WARNING: Skipping fuzz tests:" echo "WARNING: - Generated tests not found." echo "WARNING: - Generate a test by running:" echo "WARNING: $TEST_DIR/generate.sh" echo "WARNING: - See test generation options:" echo "WARNING: $TEST_DIR/generate.sh --help" echo "WARNING:" sleep 5 exit 0 fi echo "waiting for $genscriptcount generated tests to finish..." 
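# 'wait' with no arguments blocks until every backgrounded generated*.sh
# subshell started in the loop above has exited; each subshell's output was
# already tagged with its script name via the sed prefix. For example
# (a sketch, using the environment variables documented in generate.sh below):
#   TESTCOUNT=2 STEPS=200 ./generate.sh   # writes generated0.sh, generated1.sh
# after which this test runs both scripts in parallel and converges here.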
wait ================================================ FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test06-fuzz/codelib.sh ================================================ container-exit0() { # Terminate a container by killing the "sleep inf" child process in # echo CONTNAME $(sleep inf) local contname="$1" vm-command "contpid=\$(ps axf | grep -A1 'echo $contname' | grep -v grep | awk '/_ sleep inf/{print \$1}'); kill -KILL \$contpid" } container-signal() { local contname="$1" local signal="$2" vm-command "pkill -$signal -f 'echo $contname'" } ================================================ FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test06-fuzz/fuzz.aal ================================================ language python { max_mem=7500 # maximum memory on VM in MB max_cpu=15000 # maximum CPUs on node in mCPU max_reserved_cpu=1000 # maximum reserved CPUs on node in mCPU class Vars: # namespace for variables in input names def __repr__(self): return "{" + ",".join("%s:%s" % (a, getattr(self, a)) for a in sorted(self.__dict__.keys()) if not a.startswith("_")) + "}\n" def inputvars(input_name): # parse VAR=VALUE's from input_name v = Vars() for word in input_name.split(): keyvalue = word.split("=") if len(keyvalue) == 2: if (keyvalue[1].endswith("m") or keyvalue[1].endswith("M")) and len(keyvalue[1]) > 1 and keyvalue[1][-2] in '0123456789': keyvalue[1] = keyvalue[1][:-1] try: setattr(v, keyvalue[0], int(keyvalue[1])) except: setattr(v, keyvalue[0], keyvalue[1]) return v } variables { mem, cpu, reserved_cpu, pods } initial_state { mem=0 cpu=0 reserved_cpu=0 pods={} } # Create non-reserved CPU pods # On this topology, there is # - 2G mem/numanode, 4G mem/package, 8G mem in total # - 4 CPU/numanode, 8 CPU/package, 16 CPU in total input "NAME=gu0 CONTCOUNT=1 CPU=200m MEM=1500M create guaranteed", "NAME=gu1 CONTCOUNT=2 CPU=1000m MEM=500M create guaranteed", "NAME=gu2 CONTCOUNT=2 CPU=1200m MEM=4500M create guaranteed", "NAME=gu3 CONTCOUNT=3 CPU=2000m MEM=500M create guaranteed", "NAME=gu4 CONTCOUNT=1 CPU=4200m MEM=100M create guaranteed", "NAME=bu0 CONTCOUNT=1 CPU=1200m MEM=50M CPUREQ=900m MEMREQ=49M CPULIM=1200m MEMLIM=50M create burstable", "NAME=bu1 CONTCOUNT=2 CPU=1900m MEM=300M CPUREQ=1800m MEMREQ=299M CPULIM=1900m MEMLIM=300M create burstable", "NAME=be0 CONTCOUNT=1 CPU=0 MEM=0 create besteffort", "NAME=be1 CONTCOUNT=3 CPU=0 MEM=0 create besteffort" { guard { v = inputvars(input_name) return (v.NAME not in pods and (mem + v.MEM * v.CONTCOUNT < max_mem) and (cpu + v.CPU * v.CONTCOUNT < max_cpu)) } body { v = inputvars(input_name) v.namespace = getattr(v, "namespace", "default") mem += v.MEM * v.CONTCOUNT cpu += v.CPU * v.CONTCOUNT pods[v.NAME] = v } } # Create pods to the kube-system namespace input "NAME=rgu0 CONTCOUNT=2 CPU=100m MEM=1000M namespace=kube-system create guaranteed", "NAME=rbu0 CONTCOUNT=1 CPU=100m MEM=100M CPUREQ=99m MEMREQ=99M CPULIM=100m MEMLIM=100M namespace=kube-system create burstable", "NAME=rbe0 CONTCOUNT=2 CPU=0 MEM=0 namespace=kube-system create besteffort" { guard { v = inputvars(input_name) return (v.NAME not in pods and (mem + v.MEM * v.CONTCOUNT < max_mem) and (reserved_cpu + v.CPU * v.CONTCOUNT < max_reserved_cpu)) } body { v = inputvars(input_name) mem += v.MEM * v.CONTCOUNT reserved_cpu += v.CPU * v.CONTCOUNT pods[v.NAME] = v } } # Kill a process in a container # - "echo gu0c1" matches and kills process only in container gu0c1 in pod gu0 # - "echo gu0" matches and kills processes in all containers of pod gu0 input "NAME=gu0 container-exit0 
gu0c0", "NAME=gu1 container-exit0 gu1c0", "NAME=gu2 container-exit0 gu2c0", "NAME=gu3 container-exit0 gu3", "NAME=gu4 container-exit0 gu4c", "NAME=bu0 container-exit0 bu0c0", "NAME=bu1 container-exit0 bu1c0", "NAME=be0 container-exit0 be0c0", "NAME=be1 container-exit0 be0c0", "NAME=rgu0 container-exit0 rgu0c0", "NAME=rbu0 container-exit0 rbu0c0", "NAME=rbe0 container-exit0 rbe0c0" { guard { v = inputvars(input_name) return v.NAME in pods } } # Delete single pod input "NAME=gu0 kubectl delete pod gu0 --now --wait --ignore-not-found", "NAME=gu1 kubectl delete pod gu1 --now --wait --ignore-not-found", "NAME=gu2 kubectl delete pod gu2 --now --wait --ignore-not-found", "NAME=gu3 kubectl delete pod gu3 --now --wait --ignore-not-found", "NAME=gu4 kubectl delete pod gu4 --now --wait --ignore-not-found", "NAME=bu0 kubectl delete pod bu0 --now --wait --ignore-not-found", "NAME=bu1 kubectl delete pod bu1 --now --wait --ignore-not-found", "NAME=be0 kubectl delete pod be0 --now --wait --ignore-not-found", "NAME=be1 kubectl delete pod be1 --now --wait --ignore-not-found", "NAME=rgu0 kubectl delete pod rgu0 -n kube-system --now --wait --ignore-not-found", "NAME=rbu0 kubectl delete pod rbu0 -n kube-system --now --wait --ignore-not-found", "NAME=rbe0 kubectl delete pod rbe0 -n kube-system --now --wait --ignore-not-found" { guard { v = inputvars(input_name) return v.NAME in pods } body { v = inputvars(input_name) p = pods[v.NAME] mem -= p.MEM * p.CONTCOUNT if getattr(p, "namespace", "") == "kube-system": reserved_cpu -= p.CPU * p.CONTCOUNT else: cpu -= p.CPU * p.CONTCOUNT del pods[v.NAME] } } ================================================ FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test06-fuzz/fuzz.fmbt.conf ================================================ model = aal_remote(remote_pyaal --verbose-fmbt-log fuzz.aal) heuristic = mrandom(80,lookahead(1:2),20,random) coverage = perm(2) pass = coverage(10) pass = steps(100) ================================================ FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test06-fuzz/generate.sh ================================================ #!/bin/bash usage() { cat < Number of generated test scripts than run in parallel. MEM= Memory [MB] available for test pods in the system. CPU= Non-reserved CPU [mCPU] available for test pods in the system. RESERVED_CPU= Reserved CPU [mCPU] available for test pods in the system. STEPS= Total number of test steps in all parallel tests. FMBT_IMAGE= Generate the test using fmbt from docker image IMG:TAG. The default is fmbt-cli:latest. EOF exit 0 } if [ -n "$1" ]; then usage fi TESTCOUNT=${TESTCOUNT:-1} MEM=${MEM:-7500} # 950 mCPU taken by the control plane, split the remaining 15050 mCPU # available for test pods to CPU and RESERVED_CPU pods. CPU=${CPU:-14050} RESERVED_CPU=${RESERVED_CPU:-1000} STEPS=${STEPS:-100} FMBT_IMAGE=${FMBT_IMAGE:-"fmbt-cli:latest"} mem_per_test=$(( MEM / TESTCOUNT )) cpu_per_test=$(( CPU / TESTCOUNT )) reserved_cpu_per_test=$(( RESERVED_CPU / TESTCOUNT )) steps_per_test=$(( STEPS / TESTCOUNT )) # Check fmbt Docker image docker run "$FMBT_IMAGE" fmbt --version 2>&1 | grep ^Version: || { echo "error: cannot run fmbt from Docker image '$FMBT_IMAGE'" echo "You can build the image locally by running:" echo "( cd /tmp && git clone --branch devel https://github.com/intel/fmbt && cd fmbt && docker build . 
    exit 1
}

cd "$(dirname "$0")" || {
    echo "cannot cd to the directory of $0"
    exit 1
}

for testnum in $(seq 1 "$TESTCOUNT"); do
    testid=$(( testnum - 1))
    sed -e "s/max_mem=.*/max_mem=${mem_per_test}/" \
        -e "s/max_cpu=.*/max_cpu=${cpu_per_test}/" \
        -e "s/max_reserved_cpu=.*/max_reserved_cpu=${reserved_cpu_per_test}/" \
        < fuzz.aal > tmp.fuzz.aal
    sed -e "s/fuzz\.aal/tmp.fuzz.aal/" \
        -e "s/pass = steps(.*/pass = steps(${steps_per_test})/" \
        < fuzz.fmbt.conf > tmp.fuzz.fmbt.conf
    OUTFILE=generated${testid}.sh
    echo "generating $OUTFILE..."
    docker run -v "$(pwd):/mnt/models" "$FMBT_IMAGE" sh -c 'cd /mnt/models; fmbt tmp.fuzz.fmbt.conf 2>/dev/null | fmbt-log -f STEP\$sn\$as\$al' | grep -v AAL | sed -e 's/^, / /g' -e '/^STEP/! s/\(^.*\)/echo "TESTGEN: \1"/g' -e 's/^STEP\([0-9]*\)i:\(.*\)/echo "TESTGEN: STEP \1"; vm-command "date +%T.%N"; \2; vm-command "date +%T.%N"; kubectl get pods -A/g' | sed "s/\([^a-z0-9]\)\(r\?\)\(gu\|bu\|be\)\([0-9]\)/\1t${testid}\2\3\4/g" > "$OUTFILE"
done


================================================
FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test07-mixed-allocations/code.var.sh
================================================
# Place pod0c0 and pod0c1 to shared pools on separate nodes.
CONTCOUNT=2 CPU=500m create guaranteed
report allowed
verify "len(mems['pod0c0']) == 1" \
       "len(mems['pod0c1']) == 1" \
       "disjoint_sets(mems['pod0c0'], mems['pod0c1'])" \
       "len(cpus['pod0c0']) == 4" \
       "len(cpus['pod0c1']) == 4" \
       "disjoint_sets(cpus['pod0c0'], cpus['pod0c1'])"

# Place pod1c0 to its own node, as there is still one 4-CPU node free.
# The placement of pod1c1 is more interesting:
# - node0 has only 3 CPUs (CPU #0 is reserved)
# - node1, node2 and node3 have containers in their shared pools
# - shared pools with pod0c* containers have more free space than node0
#   => pod1c0 should be placed to either of those
# - because pod1c1 should get one exclusive CPU, either of pod0c0 and
#   pod0c1 should run in a shared pool of only 3 CPUs from now on.
CONTCOUNT=2 CPU=1500m create guaranteed
report allowed
verify `# every container is placed on a single node (no socket, no root)` \
       "[len(mems[c]) for c in mems] == [1] * len(mems)" \
       `# pod1c0 and pod1c1 are on different nodes` \
       "disjoint_sets(mems['pod1c0'], mems['pod1c1'])" \
       `# either of pod0c0 and pod0c1 has only 3 CPUs, the other has 4.` \
       "len(cpus['pod0c0']) == 3 or len(cpus['pod0c1']) == 3" \
       "len(cpus['pod0c0']) == 4 or len(cpus['pod0c1']) == 4" \
       `# pod1c0 and pod1c1 are allowed to use all CPUs on their nodes` \
       "len(cpus['pod1c0']) == 4" \
       "len(cpus['pod1c1']) == 4" \
       `# pod1c1 should have one exclusive CPU on its node` \
       "len(cpus['pod1c1'] - cpus['pod0c0'] - cpus['pod0c1']) == 1"

# Place pod2c0 to node0, as it has the largest free shared pool (3 CPUs).
# Place pod2c1 to the node that has only either pod0c0 or pod0c1,
# while the other one of them already shares a node with pod1c1.
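# (A capacity sketch behind this expectation, assuming the n4c16 topology:
# each NUMA node pool has 4 CPUs. pod2c0 fits in node0 because its 3 CPUs
# are all still unallocated, while pod2c1 can only share the node holding
# just one pod0 container; the node with pod1c1 has already given one of
# its CPUs to pod1c1's exclusive allocation.)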
CONTCOUNT=2 CPU=2400m create guaranteed report allowed verify `# every container is placed on a single node (no socket, no root)` \ "[len(mems[c]) for c in mems] == [1] * len(mems)" \ `# pod1c1 should have kept its own exclusive CPU` \ "len(cpus['pod1c1'] - set.union(*[cpus[c] for c in cpus if c != 'pod1c1'])) == 1" \ `# pod2c0 is the only container in node0, so it happens to have 3 unshared CPUs for now` \ "len(cpus['pod2c0']) == 3" \ "len(cpus['pod2c0'] - set.union(*[cpus[c] for c in cpus if c != 'pod2c0'])) == 3" \ `# pod2c1 shares its node and should not have exclusive CPUs` \ "len(cpus['pod2c1']) == 4" \ "len(cpus['pod2c1'] - set.union(*[cpus[c] for c in cpus if c != 'pod2c1'])) == 0" \ `# pod2c1 should run in the same node as either pod0c0 or pod0c1` \ "mems['pod2c1'] == mems['pod0c0'] or mems['pod2c1'] == mems['pod0c1']" # pod3c0 should get 2 exclusive CPUs and a 400m share from a shared pool. # To get that, annotate the pod to: # - opt-out from shared CPUs (=> opt-in to exclusive CPUs) # - opt-in to isolated CPUs (this should not matter, test opt-out with pod4). # There is only one node where the container fits: the same node as pod1c0. ANNOTATIONS=('prefer-shared-cpus.cri-resource-manager.intel.com/pod: "false"' 'prefer-isolated-cpus.cri-resource-manager.intel.com/pod: "true"') CONTCOUNT=1 CPU=2400m create guaranteed-annotated report allowed verify `# every container is placed on a single node (no socket, no root)` \ "[len(mems[c]) for c in mems] == [1] * len(mems)" \ `# pod3c0 and pod1c0 are placed in the same node` \ "mems['pod3c0'] == mems['pod1c0']" \ `# pod1c0 has 1 exclusive CPU` \ "len(cpus['pod1c0'] - set.union(*[cpus[c] for c in cpus if c != 'pod1c0'])) == 1" \ `# pod3c0 has 2 exclusive CPUs` \ "len(cpus['pod3c0'] - set.union(*[cpus[c] for c in cpus if c != 'pod3c0'])) == 2" # Replace pod3 with pod4. # Test releasing/(re)allocating a mixed pod with exclusive CPUs and # no effect from the isolated preference. # - opt-out from shared CPUs (=> opt-in to exclusive CPUs) # - opt-out from isolated CPUs (this does not affect getting exclusive CPUs) kubectl delete pods pod3 --now --wait --ignore-not-found ANNOTATIONS=('prefer-shared-cpus.cri-resource-manager.intel.com/pod: "false"' 'prefer-isolated-cpus.cri-resource-manager.intel.com/pod: "false"') CONTCOUNT=1 CPU=2400m create guaranteed-annotated report allowed verify `# every container is placed on a single node (no socket, no root)` \ "[len(mems[c]) for c in mems] == [1] * len(mems)" \ `# pod4c0 and pod1c0 are placed in the same node` \ "mems['pod4c0'] == mems['pod1c0']" \ `# pod1c0 has 1 exclusive CPU` \ "len(cpus['pod1c0'] - set.union(*[cpus[c] for c in cpus if c != 'pod1c0'])) == 1" \ `# pod4c0 has 2 exclusive CPUs` \ "len(cpus['pod4c0'] - set.union(*[cpus[c] for c in cpus if c != 'pod4c0'])) == 2" # Replace pod1 with pod5. # pod1 implicitly opted-in to exclusive CPUs due to its 1500 mCPU request. # Now explicitly opt-out of it by opting-in to shared-cpus. kubectl delete pods pod1 --now --wait --ignore-not-found # Make sure that the shared pool size increased correctly after the mixed pod deletion.
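# (Note: verify re-polls failed assertions -- "verify [--retry N]" defaults
#  to three retries with a one-second delay, see run.sh below -- so the
#  check here tolerates the short window in which cpuset changes propagate
#  after the pod1 deletion above.)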
verify `# pod0c0 or pod0c1 shared a node with pod1c1 and had only 3 CPUs` \ "len(cpus['pod0c0']) == 4" \ "len(cpus['pod0c1']) == 4" ANNOTATIONS=('prefer-shared-cpus.cri-resource-manager.intel.com/pod: "true"') CONTCOUNT=2 CPU=1500m create guaranteed-annotated report allowed verify `# every container is placed on a single node (no socket, no root)` \ "[len(mems[c]) for c in mems] == [1] * len(mems)" \ `# pod5c0 should share a node with pod0c0 or pod0c1 and have access to all CPUs` \ "mems['pod5c0'] == mems['pod0c0'] or mems['pod5c0'] == mems['pod0c1']" \ "len(cpus['pod5c0']) == 4" \ "len(cpus['pod0c0']) == 4" \ "len(cpus['pod0c1']) == 4" \ `# pod5c1 should run in a node with pod4c0 (this is where pod1c0 used to be)` \ "mems['pod5c1'] == mems['pod4c0']" \ "len(cpus['pod5c1']) == 2" \ `# pod5c0 and pod5c1 share a node with another container => all their CPUs should be shared` \ "len(cpus['pod5c0'] - set.union(*[cpus[c] for c in cpus if c != 'pod5c0'])) == 0" \ "len(cpus['pod5c1'] - set.union(*[cpus[c] for c in cpus if c != 'pod5c1'])) == 0" ================================================ FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test07-mixed-allocations/guaranteed-annotated.yaml.in ================================================ apiVersion: v1 kind: Pod metadata: name: ${NAME} $(if [ -n "${ANNOTATIONS[0]}" ]; then echo " annotations: $(for annotation in "${ANNOTATIONS[@]}"; do echo " $annotation "; done) "; fi) labels: app: ${NAME} spec: containers: $(for contnum in $(seq 1 ${CONTCOUNT}); do echo " - name: ${NAME}c$(( contnum - 1 )) image: busybox imagePullPolicy: IfNotPresent command: - sh - -c - echo ${NAME}c$(( contnum - 1 )) \$(sleep inf) resources: requests: cpu: ${CPU} memory: '${MEM}' limits: cpu: ${CPU} memory: '${MEM}' "; done ) terminationGracePeriodSeconds: 1 ================================================ FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test08-isolcpus/code.var.sh ================================================ vm-command "grep isolcpus=8,9 /proc/cmdline" || { vm-set-kernel-cmdline "isolcpus=8,9" vm-reboot vm-command "grep isolcpus=8,9 /proc/cmdline" || { error "failed to set isolcpus kernel commandline parameter" } launch cri-resmgr vm-command "systemctl restart kubelet" sleep 1 vm-wait-process --timeout 120 kube-apiserver vm-run-until --timeout 120 "kubectl get node" } CONTCOUNT=1 # pod0: opt-in isolated CPUs ANNOTATIONS='prefer-isolated-cpus.cri-resource-manager.intel.com/pod: "true"' CPU=1 create guaranteed-annotated report allowed verify "cpus['pod0c0'] == {'cpu08'} or cpus['pod0c0'] == {'cpu09'}" \ "mems['pod0c0'] == {'node2'}" # pod1: opt-out isolated CPUs ANNOTATIONS='prefer-isolated-cpus.cri-resource-manager.intel.com/pod: "false"' CPU=1 create guaranteed-annotated report allowed verify "disjoint_sets(cpus['pod1c0'], {'cpu08', 'cpu09'})" # pod2: without annotation CPU=1 guaranteed pod is eligible to run on isolated CPUs ANNOTATIONS='' CPU=1 create guaranteed-annotated report allowed verify "cpus['pod0c0'] == {'cpu08'} or cpus['pod0c0'] == {'cpu09'}" \ "cpus['pod2c0'] == {'cpu08'} or cpus['pod2c0'] == {'cpu09'}" \ "disjoint_sets(cpus['pod0c0'], cpus['pod2c0'])" \ "mems['pod0c0'] == {'node2'}" \ "mems['pod2c0'] == {'node2'}" # free isolated (and all other) cpus kubectl delete pods --all --now --wait # pod3: opt-in isolated CPUs, take all of them ANNOTATIONS='prefer-isolated-cpus.cri-resource-manager.intel.com/pod: "true"' CPU=2000m create guaranteed-annotated report allowed verify "cpus['pod3c0'] == {'cpu08', 'cpu09'}" \ 
"len(cpus['pod3c0']) == 2" # free isolated cpus kubectl delete pods --all --now --wait # pod4: opt-in isolated CPUs but require a fraction more CPUs than there are isolated CPUs ANNOTATIONS=('prefer-isolated-cpus.cri-resource-manager.intel.com/pod: "true"' 'prefer-shared-cpus.cri-resource-manager.intel.com/pod: "false"') CPU=2500m create guaranteed-annotated report allowed verify "'cpu08' in cpus['pod4c0'] and 'cpu09' in cpus['pod4c0']" \ "len(cpus['pod4c0']) == 4" # free isolated cpus kubectl delete pods --all --now --wait # pod5: opt-in isolated CPUs but require a fraction less CPUs than there are isolated CPUs ANNOTATIONS=('prefer-isolated-cpus.cri-resource-manager.intel.com/pod: "true"' 'prefer-shared-cpus.cri-resource-manager.intel.com/pod: "false"') CPU=1500m create guaranteed-annotated report allowed verify "'cpu08' in cpus['pod5c0'] or 'cpu09' in cpus['pod5c0']" \ "'cpu10' in cpus['pod5c0'] and 'cpu11' in cpus['pod5c0']" \ "len(cpus['pod5c0']) == 3" # free isolated cpus kubectl delete pods --all --now --wait # pod6: opt-in isolated CPUs but require a full CPU more than there # are isolated CPUs ANNOTATIONS=('prefer-isolated-cpus.cri-resource-manager.intel.com/pod: "true"' 'prefer-shared-cpus.cri-resource-manager.intel.com/pod: "false"') CPU=3000m create guaranteed-annotated report allowed verify "len(cpus['pod6c0']) == 3" \ "disjoint_sets(cpus['pod6c0'], {'cpu08', 'cpu09'})" \ "len(mems['pod6c0']) == 1" # pod7: sub-core is never eligble for isolated CPUs, even if annotated # to opt-in. ANNOTATIONS=('prefer-isolated-cpus.cri-resource-manager.intel.com/pod: "true"' 'prefer-shared-cpus.cri-resource-manager.intel.com/pod: "false"') CONTCOUNT=4 CPU=200m create guaranteed-annotated report allowed verify "disjoint_sets(set.union(cpus['pod7c0'], cpus['pod7c1'], cpus['pod7c2'], cpus['pod7c3']), {'cpu08', 'cpu09'})" \ "len(cpus['pod7c0']) >= 2" \ "len(cpus['pod7c1']) >= 2" \ "len(cpus['pod7c2']) >= 2" \ "len(cpus['pod7c3']) >= 2" # Cleanup kernel commandline, otherwise isolcpus will affect CPU # pinning and cause false negatives from other tests on this VM. vm-set-kernel-cmdline "" vm-reboot vm-command "grep isolcpus /proc/cmdline" && { error "failed to clean up isolcpus kernel commandline parameter" } echo "isolcpus removed from kernel commandline" launch cri-resmgr vm-command "systemctl restart kubelet" vm-wait-process --timeout 120 kube-apiserver ================================================ FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test08-isolcpus/guaranteed-annotated.yaml.in ================================================ apiVersion: v1 kind: Pod metadata: name: ${NAME} $(if [ -n "${ANNOTATIONS[0]}" ]; then echo " annotations: $(for annotation in "${ANNOTATIONS[@]}"; do echo " $annotation "; done) "; fi) labels: app: ${NAME} spec: containers: $(for contnum in $(seq 1 ${CONTCOUNT}); do echo " - name: ${NAME}c$(( contnum - 1 )) image: busybox imagePullPolicy: IfNotPresent command: - sh - -c - echo ${NAME}c$(( contnum - 1 )) \$(sleep inf) resources: requests: cpu: ${CPU} memory: '${MEM}' limits: cpu: ${CPU} memory: '${MEM}' "; done ) terminationGracePeriodSeconds: 1 ================================================ FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test09-container-exit/code.var.sh ================================================ # Test resource allocation / free on different container exit and # restart scenarios. 
CONTCOUNT=1 CPU=1000m MEM=64M create guaranteed report allowed verify 'len(cpus["pod0c0"]) == 1' \ '"pod0c0" in allocations' out '### Crash and restart pod0c0' vm-command "kubectl get pods pod0" vm-command "set -x; [[ -n \"\$(pgrep -f pod0c0)\" ]] && [[ \"\$(pgrep -f pod0c0 --oldest)\" != \"\$(pgrep -f pod0c0 --newest)\" ]]" || { command-error "There must be separate parent and child 'pod0c0' processes in order to run this test" } out '### Kill the root process in pod0c0. The container should get Restarted.' vm-command "kill -KILL \$(pgrep -f pod0c0 --oldest)" sleep 2 vm-command 'kubectl wait --for=condition=Ready pods/pod0' vm-run-until --timeout 30 "pgrep -f pod0c0 > /dev/null 2>&1" vm-command "kubectl get pods pod0" report allowed verify 'len(cpus["pod0c0"]) == 1' \ '"pod0c0" in allocations' out '### Kill the child process in pod0c0. The root process exits with status 0, the container should get Completed.' vm-command "kubectl get pods pod0" vm-command "ps axf | grep pod0c0; echo newest: \$(pgrep -f pod0c0 --newest)" vm-command "kill -KILL \$(pgrep -f pod0c0 --newest)" sleep 2 vm-command "kubectl get pods pod0" # pod0c0 process is not on vm anymore verify '"pod0c0" not in cpus' # pod0c0 is not allocated any resources on CRI-RM ( verify '"pod0c0" not in allocations' ) || { # pretty-print allocations contents pp allocations error "pod0c0 expected to disappear from allocations" } ================================================ FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test10-additional-reserved-namespaces/code.var.sh ================================================ # Test that # - containers marked in the ReservedPoolNamespaces option are pinned on Reserved CPUs. (kubectl create namespace reserved-test) || true cri_resmgr_cfg_orig=$cri_resmgr_cfg # This script will create pods in the reserved and default namespaces. # Make sure the namespace is clear when starting the test and clean it up # if exiting with success. Otherwise leave the pod running for # debugging in case of a failure.
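# (For orientation: the configuration instantiated below sets
#    ReservedPoolNamespaces: ["reserved-pool", "reserved-*", "foobar"]
#  so whole namespaces are assigned to the reserved pool by exact name or
#  glob pattern, and kube-system pods land on reserved CPUs by default --
#  which is what the pod0 placement on cpu10-cpu11 verifies.)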
cleanup-test-pods() { ( kubectl delete pods pod0 -n kube-system --now --wait --ignore-not-found ) || true ( kubectl delete pods pod1 --now --wait --ignore-not-found ) || true } cleanup-test-pods terminate cri-resmgr AVAILABLE_CPU="cpuset:8-11" RESERVED_CPU="cpuset:10-11" cri_resmgr_cfg=$(instantiate cri-resmgr-reserved-namespaces.cfg) launch cri-resmgr CONTCOUNT=1 namespace=kube-system create besteffort CONTCOUNT=1 create besteffort report allowed verify 'cpus["pod0c0"] == {"cpu10", "cpu11"}' verify 'cpus["pod1c0"] == {"cpu08", "cpu09"}' cleanup-test-pods # Test that # - containers that are namespace-assigned to reserved pools are pinned there # - containers that are annotated to opt-out are pinned elsewhere, and # - containers that are namespace-assigned and annotated to reserved pools are pinned there (kubectl create namespace foobar) || true cleanup-foobar-namespace() { (kubectl delete pods -n foobar --all --now --wait) || true } cleanup-foobar-namespace CONTCOUNT=1 namespace=foobar create besteffort ANN0='prefer-reserved-cpus.cri-resource-manager.intel.com/pod: "false"' CONTCOUNT=1 namespace=foobar create besteffort ANN0='prefer-reserved-cpus.cri-resource-manager.intel.com/pod: "true"' CONTCOUNT=1 namespace=foobar create besteffort report allowed verify 'cpus["pod2c0"] == {"cpu10", "cpu11"}' verify 'cpus["pod3c0"] == {"cpu08", "cpu09"}' verify 'cpus["pod4c0"] == {"cpu10", "cpu11"}' cleanup-foobar-namespace ================================================ FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test10-additional-reserved-namespaces/cri-resmgr-reserved-namespaces.cfg.in ================================================ policy: Active: topology-aware ReservedResources: cpu: ${RESERVED_CPU} AvailableResources: cpu: ${AVAILABLE_CPU} topology-aware: ReservedPoolNamespaces: [\"reserved-pool\",\"reserved-*\",\"foobar\"] logger: Debug: cri-resmgr,resource-manager,cache,policy Klog: skip_headers: true dump: Config: off:.*,full:((Create)|(Start)|(Run)|(Update)|(Stop)|(Remove)).*,off:.*Image.* ================================================ FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test11-reserved-cpu-annotations/code.var.sh ================================================ # Test that # - containers marked in Annotations are pinned on Reserved CPUs.
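# (Annotation scopes exercised below, both taken from this test:
#    prefer-reserved-cpus.cri-resource-manager.intel.com/pod: "true"
#  opts in every container of the pod, whereas
#    prefer-reserved-cpus.cri-resource-manager.intel.com/container.special: "false"
#  targets only a container named "special". pod1's only container is
#  pod1c0, so the latter annotation leaves pod1c0 with the default
#  non-reserved placement that the verify below expects.)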
cri_resmgr_cfg_orig=$cri_resmgr_cfg cleanup-test-pods() { ( kubectl delete pods pod0 --now --wait --ignore-not-found ) || true ( kubectl delete pods pod1 --now --wait --ignore-not-found ) || true } cleanup-test-pods cri_resmgr_cfg_orig=$cri_resmgr_cfg terminate cri-resmgr AVAILABLE_CPU="cpuset:8-11" RESERVED_CPU="cpuset:10-11" cri_resmgr_cfg=$(instantiate cri-resmgr-reserved-annotations.cfg) launch cri-resmgr ANNOTATIONS='prefer-reserved-cpus.cri-resource-manager.intel.com/pod: "true"' CONTCOUNT=1 create reserved-annotated report allowed ANNOTATIONS='prefer-reserved-cpus.cri-resource-manager.intel.com/container.special: "false"' CONTCOUNT=1 create reserved-annotated report allowed verify 'cpus["pod0c0"] == {"cpu10", "cpu11"}' verify 'cpus["pod1c0"] == {"cpu08"}' cleanup-test-pods terminate cri-resmgr cri_resmgr_cfg=$cri_resmgr_cfg_orig launch cri-resmgr ================================================ FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test11-reserved-cpu-annotations/cri-resmgr-reserved-annotations.cfg.in ================================================ policy: Active: topology-aware ReservedResources: cpu: ${RESERVED_CPU} AvailableResources: cpu: ${AVAILABLE_CPU} logger: Debug: cri-resmgr,resource-manager,cache,policy Klog: skip_headers: true dump: Config: off:.*,full:((Create)|(Start)|(Run)|(Update)|(Stop)|(Remove)).*,off:.*Image.* ================================================ FILE: test/e2e/policies.test-suite/topology-aware/n4c16/test11-reserved-cpu-annotations/reserved-annotated.yaml.in ================================================ apiVersion: v1 kind: Pod metadata: name: ${NAME} $(if [ -n "${ANNOTATIONS[0]}" ]; then echo " annotations: $(for annotation in "${ANNOTATIONS[@]}"; do echo " $annotation "; done) "; fi) labels: app: ${NAME} spec: containers: $(for contnum in $(seq 1 ${CONTCOUNT}); do echo " - name: ${NAME}c$(( contnum - 1 )) image: busybox imagePullPolicy: IfNotPresent command: - sh - -c - echo ${NAME}c$(( contnum - 1 )) \$(sleep inf) resources: requests: cpu: ${CPU} memory: '${MEM}' limits: cpu: ${CPU} memory: '${MEM}' "; done ) terminationGracePeriodSeconds: 1 ================================================ FILE: test/e2e/policies.test-suite/topology-aware/n4c16/topology.var.json ================================================ [ {"mem": "2G", "cores": 2, "nodes": 2, "packages": 2} ] ================================================ FILE: test/e2e/run.sh ================================================ #!/bin/bash DEMO_TITLE="Container Runtime End-to-End Testing" DEFAULT_DISTRO="ubuntu-22.04" PV='pv -qL' binsrc=${binsrc-local} SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")" DEMO_LIB_DIR=$(realpath "$SCRIPT_DIR/../../demo/lib") OUTPUT_DIR=${outdir-"$SCRIPT_DIR"/output} COMMAND_OUTPUT_DIR="$OUTPUT_DIR"/commands # shellcheck disable=SC1091 # shellcheck source=../../demo/lib/command.bash source "$DEMO_LIB_DIR"/command.bash # shellcheck disable=SC1091 # shellcheck source=../../demo/lib/host.bash source "$DEMO_LIB_DIR"/host.bash # shellcheck disable=SC1091 # shellcheck source=../../demo/lib/vm.bash source "$DEMO_LIB_DIR"/vm.bash script_source="$(< "$0") $(< "$DEMO_LIB_DIR/host.bash") $(< "$DEMO_LIB_DIR/command.bash") $(< "$DEMO_LIB_DIR/vm.bash")" usage() { echo "$DEMO_TITLE" echo "Usage: [VAR=VALUE] ./run.sh MODE [SCRIPT]" echo " MODE: \"play\" plays the test as a demo." echo " \"record\" plays and records the demo." echo " \"test\" runs fast, reports pass or fail." 
echo " \"debug\" enables k8scri pipe debugging and" echo " copies sources of all *_src VARs (see below) to vm." echo " \"interactive\" launches interactive shell" echo " for running test script commands" echo " (see ./run.sh help script [FUNCTION])." echo " SCRIPT: test script file to run instead of the default test." echo "" echo " VARs:" echo " vm: govm virtual machine name." echo " For non-govm-managed hosts: set VM_IP and VM_SSH_USER, too." echo " 'ssh \$VM_SSH_USER@\$VM_IP sudo id' must not require password." echo " containerd_src:" echo " \"/host/path/to/go/project\": replace vm /usr/bin binaries" echo " from /host/path/to/go/project/bin directory." echo " The default is to use vm OS package manager containerd." echo " crio_src:" echo " \"/host/path/to/go/project\": replace vm /usr/bin binaries" echo " from /host/path/to/go/project/bin directory." echo " Must be set if crio is a part of \$k8scri and the vm distro" echo " does not have (or implement installing) cri-o packages." echo " crirm_src:" echo " \"/host/path/to/go/project\": replace vm /usr/local/bin binaries" echo " from /host/path/to/go/project/bin directory." echo " The default is to use the project of these e2e tests." echo " runc_src:" echo " \"/host/path/to/go/project\": replace vm /usr/bin binaries" echo " from /host/path/to/go/project/bin directory." echo " distro_binaries:" echo " 0: use the normal binaries built for this host (the default)." echo " 1: use binaries cross-built for distros." echo " binsrc: Where to get cri-resmgr to the vm." echo " \"github\": go get from master and build inside vm." echo " \"local\": (the default) copy from \${crirm_src}/bin, or" echo " from \${crirm_src}/binaries/\$distro if \$distro_binaries=1." echo " \"packages/\": use distro packages from this dir" echo " reinstall_:" echo " If 1, stop the daemon (if not runc)," echo " then reinstall and restart it before starting test run." echo " The default is 0." echo " Set containerd_src/crio_src/runc_src to install a local build." echo " reinstall_k8s: if 1, destroy existing k8s cluster and create a new one." echo " reinstall_bootstrap: if 1, run the bootstrap and proxy setup commands." echo " Only available if VM_IP is set when calling the script." echo " reinstall_all: if 1, set all above reinstall_* options to 1." echo " omit_cri_resmgr: if 1, omit checking/installing/starting cri-resmgr." echo " omit_agent: if 1, omit checking/installing/starting cri-resmgr-agent." echo " outdir: Save output under given directory." echo " The default is \"${SCRIPT_DIR}/output\"." echo " speed: Demo play speed." echo " The default is 10 (keypresses per second)." echo " cleanup: Level of cleanup after a test run:" echo " 0: leave vm running (the default)" echo " 1: delete vm" echo " 2: stop vm, but do not delete it." echo " Hook VARs:" echo " on_vm_online: code to be executed when SSH connection to vm works." echo " on_k8s_online: code to be executed when Kubernetes is ready for use." echo " on_verify_fail, on_create_fail: code to be executed in case" echo " verify() or create() fails. Example: go to interactive" echo " mode if a verification fails: on_verify_fail=interactive" echo " on_verify, on_create, on_launch: code to be executed every time" echo " after verify/create/launch function" echo " on_{cri,runc,k8s}_install: code to be executed right after installing" echo " these components." echo "" echo " VM configuration VARs: (effective when vm is not already configured)" echo " topology: JSON to override NUMA node list used in tests." 
echo " See: python3 ${DEMO_LIB_DIR}/topology2qemuopts.py --help" echo " distro: Linux distribution to be / already installed on vm." echo " Supported values: debian-11, debian-12, debian-sid" echo " fedora, opensuse-tumbleweed," echo " opensuse-15.6 (same as opensuse), sles," echo " ubuntu-18.04, ubuntu-20.04, ubuntu-22.04, ubuntu-24.04" echo " If sles: set VM_SLES_REGCODE= to use official packages." echo " cgroups: cgroups version in the VM, v1 or v2. The default is v1." echo " cgroups=v2 is supported only on distro=fedora" echo " k8s: Kubernetes version to be installed on VM creation" echo " The default is the latest available on selected distro." echo " Example: k8s=1.31" echo " k8scri: The container runtime pipe where kubelet connects to." echo " Options are:" echo " \"cri-resmgr|containerd\" cri-resmgr is a proxy to containerd." echo " \"cri-resmgr|crio\" cri-resmgr is a proxy to cri-o." echo " \"containerd\" containerd, no cri-resmgr." echo " \"containerd&cri-resmgr\" containerd, cri-resmgr is an NRI plugin." echo " \"crio\" cri-o, no cri-resmgr." echo " \"crio&cri-resmgr\" cri-o, cri-resmgr is an NRI plugin." echo " The default is \"cri-resmgr|containerd\"." echo " k8scni: The container network interface plugin to install. Options are:" echo " \"cilium\" (the default), \"flannel\", \"weavenet\"." echo " k8smaster: Name of the existing vm whose cluster this vm will join." echo " If empty (default), this vm forms its own single-node cluster." echo " crio_version: Version of cri-o to try to pull in, if cri-o is" echo " not being installed from sources." echo " setup_proxies: Setup proxies even if not using govm based VM." echo " This is only needed if you have set VM_IP and want" echo " the proxy information set in the target host. By default" echo " the proxies are not set if VM_IP is set." echo "" echo " Test input VARs:" echo " cri_resmgr_cfg: configuration file forced to cri-resmgr." echo " cri_resmgr_extra_args: arguments to be added on cri-resmgr" echo " command line when launched" echo " cri_resmgr_agent_extra_args: arguments to be added on" echo " cri-resmgr-agent command line when launched" echo " use_host_images: if \"1\", export images from the host docker" echo " to vm whenever they are available." echo " The default is 0: always pull images from repositories to vm." echo " vm_files: \"serialized\" associative array of files to be created on vm" echo " associative array syntax:" echo " vm_files['/path/file']=file:/path/on/host" echo " ='data:,plain text content'" echo " =data:;base64,ZGF0YQ==" echo " =dir: (creates only /path/file directory)" echo " vm_files['/etc/motd']='data:,hello world'" echo " How to execute run.sh with serialized array:" echo " vm_files=\$(declare -p vm_files) ./run.sh" echo " code: Variable that contains test script code to be run" echo " if SCRIPT is not given." echo " py_consts: Python code that runs always before pyexec in SCRIPT." echo "" echo "Default test input VARs: ./run.sh help defaults" echo "" echo "Create VM 'foo' that runs k8s 1.28 on Debian Sid:" echo "vm=foo distro=debian-sid k8s=1.28 ./run.sh interactive" } error() { (echo ""; echo "error: $1" ) >&2 command-exit-if-not-interactive } out() { if [ -n "$PV" ]; then speed=${speed-10} echo "$1" | $PV "$speed" else echo "$1" fi echo "" } record() { clear out "Recording this screencast..." host-command "asciinema rec -t \"$DEMO_TITLE\" crirm-demo-blockio.cast -c \"./run.sh play\"" } screen-create-vm() { speed=60 out "### Running the test in vm=\"$VM_NAME\"." 
host-create-vm "$vm" "$topology" vm-networking if [ -z "$VM_IP" ]; then error "creating VM failed" fi } screen-install-cri-resmgr() { speed=60 out "### Installing CRI Resource Manager to VM." vm-install-cri-resmgr } screen-launch-cri-resmgr() { speed=60 out "### Launching cri-resmgr with config $cri_resmgr_cfg." if [ "${binsrc#packages}" != "$binsrc" ]; then launch cri-resmgr-systemd else launch cri-resmgr fi } screen-create-singlenode-cluster() { speed=60 out "### Setting up single-node Kubernetes cluster." speed=60 out "### Container runtime parts: $k8scri" vm-create-singlenode-cluster } screen-launch-cri-resmgr-agent() { speed=60 out "### Launching cri-resmgr-agent." speed=60 out "### The agent will make cri-resmgr configurable with ConfigMaps." launch cri-resmgr-agent } get-py-allowed() { topology_dump_file="$OUTPUT_DIR/topology_dump.$VM_NAME" res_allowed_file="$OUTPUT_DIR/res_allowed.$VM_NAME" if ! [ -f "$topology_dump_file" ]; then vm-command "$("$DEMO_LIB_DIR/topology.py" bash_topology_dump)" >/dev/null || { command-error "error fetching topology_dump from $VM_NAME" } echo -e "$COMMAND_OUTPUT" > "$topology_dump_file" fi get-res-allowed "$res_allowed_file" py_allowed=" import re allowed=$("$DEMO_LIB_DIR/topology.py" -t "$topology_dump_file" -r "$res_allowed_file" res_allowed -o json) _branch_pod=[(p, d, n, c, t, cpu, pod.rsplit('/', 1)[0]) for p in allowed for d in allowed[p] for n in allowed[p][d] for c in allowed[p][d][n] for t in allowed[p][d][n][c] for cpu in allowed[p][d][n][c][t] for pod in allowed[p][d][n][c][t][cpu]] # cpu resources allowed for a pod: packages, dies, nodes, cores, threads, cpus = {}, {}, {}, {}, {}, {} # mem resources allowed for a pod: mems = {} for p, d, n, c, t, cpu, pod in _branch_pod: if c == 'mem': # this _branch_pod entry is about memory if not pod in mems: mems[pod] = set() # topology.py can print memory nodes as children of cpu-ful nodes # if distance looks like they are behind the same memory controller. # The thread field, however, is the true node who contains the memory. mems[pod].add(t) continue # this _branch_pod entry is about cpu if not pod in packages: packages[pod] = set() dies[pod] = set() nodes[pod] = set() cores[pod] = set() threads[pod] = set() cpus[pod] = set() packages[pod].add(p) dies[pod].add('%s/%s' % (p, d)) nodes[pod].add(n) cores[pod].add('%s/%s' % (n, c)) threads[pod].add('%s/%s/%s' % (n, c, t)) cpus[pod].add(cpu) def disjoint_sets(*sets): 'set.isdisjoint() for n > 1 sets' s = sets[0] for next in sets[1:]: if not s.isdisjoint(next): return False s = s.union(next) return True def set_ids(str_ids, chars='[a-z]'): num_ids = set() for str_id in str_ids: if '/' in str_id: num_ids.add(tuple(int(re.sub(chars, '', s)) for s in str_id.split('/'))) else: num_ids.add(int(re.sub(chars, '', str_id))) return num_ids package_ids = lambda i: set_ids(i, '[package]') die_ids = lambda i: set_ids(i, '[packagedie]') node_ids = lambda i: set_ids(i, '[node]') core_ids = lambda i: set_ids(i, '[nodecore]') thread_ids = lambda i: set_ids(i, '[nodecorethread]') cpu_ids = lambda i: set_ids(i, '[cpu]') " } get-res-allowed() { local res_allowed_file="$1" local retries=5 while (( retries > 0 )); do # Fetch data and update allowed* variables from the virtual machine vm-command "$("$DEMO_LIB_DIR/topology.py" bash_res_allowed 'pod[0-9]*c[0-9]*')" >/dev/null || { command-error "error fetching res_allowed from $VM_NAME" } echo -e "$COMMAND_OUTPUT" > "$res_allowed_file" # Validate res_allowed_file. 
Retry if there is same container # name with two different sets of allowed CPUs or # memories. This is possible if cpuset.cpus of the cgroup has # been just changed and different processes in the same # container are just going through the change. Or if there are # several pods/containers running with the same name. awk -F "[ /]" '{if (pod[$1]!=0 && pod[$1]!=""$3""$4){print "error: ambiguous allowed resources for name "$1; exit(1)};pod[$1]=""$3""$4}' "$res_allowed_file" && return 0 mv "$res_allowed_file" "$res_allowed_file.retries${retries}" echo " see $res_allowed_file.retries${retries} for more details" retries=$(( retries - 1 )) done error "error: container/process name collision: test environment may need cleanup." } get-py-cache() { # Fetch current cri-resmgr cache from a virtual machine. speed=1000 vm-command "cat \"/var/lib/cri-resmgr/cache\"" >/dev/null 2>&1 || { command-error "fetching cache file failed" } cat > "${OUTPUT_DIR}/cache" <<<"$COMMAND_OUTPUT" py_cache=" import json cache=json.load(open(\"${OUTPUT_DIR}/cache\")) try: allocations=json.loads(cache['PolicyJSON']['allocations']) except KeyError: allocations=None containers=cache['Containers'] pods=cache['Pods'] for _contid in list(containers.keys()): try: _cmd = ' '.join(containers[_contid]['Command']) except: continue # Command may be None # Recognize echo podXcY ; sleep inf -type test pods and make them # easily accessible: containers['pod0c0'], pods['pod0'] if 'echo pod' in _cmd and 'sleep inf' in _cmd: _contname = _cmd.split()[3] # _contname is podXcY _podid = containers[_contid]['PodID'] _podname = pods[_podid]['Name'] # _podname is podX if not allocations is None and _contid in allocations: allocations[_contname] = allocations[_contid] containers[_contname] = containers[_contid] pods[_podname] = pods[_podid] " } resolve-template() { local name="$1" r="" d t shift for d in "$@"; do if [ -z "$d" ] || ! [ -d "$d" ]; then continue fi t="$d/$name.in" if ! [ -e "$t" ]; then continue fi if [ -z "$r" ]; then r="$t" echo 1>&2 "template $name resolved to file $r" else echo 1>&2 "WARNING: template file $r shadows $t" fi done if [ -n "$r" ]; then echo "$r" return 0 fi return 1 } is-hooked() { local hook_code_var hook_code hook_code_var=$1 hook_code="${!hook_code_var}" if [ -n "${hook_code}" ]; then return 0 # logic: if is-hooked xyz; then run-hook xyz; fi fi return 1 } run-hook() { local hook_code_var hook_code hook_code_var=$1 hook_code="${!hook_code_var}" echo "Running hook: $hook_code_var" eval "${hook_code}" } install-files() { # Usage: install-files $(declare -p files_assoc_array) # # Parameter is a serialized associative array with # key: target filepath on VM # value: source URL ("file:", limited "data:" and "dir:" schemes supported) # # Example: build an associative array and install files in the array # files['/path/file1']=file:/hostpath/file # files['/path/file2']=data:,hello # files['/path/file3']=data:;base64,aGVsbG8= # files['/path/dir1']='dir:' # install-files "$(declare -p files)" local -A files eval "files=${1#*=}" local tgt src data for tgt in "${!files[@]}"; do src="${files[$tgt]}" case $src in "data:,"*) data=${src#data:,} ;; "data:;base64,"*) data=$(base64 -d <<< "${src#data:;base64,}") ;; "file:"*) data=$(< "${src#file:}") ;; "dir:") echo -n "Creating on vm: $tgt/... " vm-command-q "mkdir -p \"$tgt\"" || { error "failed to make directory to vm \"$tgt\"" } echo "ok." 
continue ;; *) error "invalid source scheme \"${src}\", expected \"data:,\" \"data:;base64,\", \"file:\" or \"dir:\"" ;; esac echo -n "Writing on vm: $tgt... " vm-write-file "$tgt" "$data" || { error "failed to write to vm file \"$tgt\"" } echo "ok." done } ### Test script helpers install() { # script API # Usage: install TARGET # # Supported TARGETs: # cri-resmgr: install cri-resmgr to VM. # Install latest local build to VM: (the default) # $ install cri-resmgr # Fetch github master to VM, build and install on VM: # $ binsrc=github install cri-resmgr # cri-resmgr-webhook: install cri-resmgr-webhook to VM. # Installs from the latest webhook Docker image on the host. # # Example: # uninstall cri-resmgr # install cri-resmgr # launch cri-resmgr local target="$1" case "$target" in "cri-resmgr") vm-install-cri-resmgr ;; "cri-resmgr-agent") vm-install-cri-resmgr-agent ;; "cri-resmgr-webhook") vm-install-cri-resmgr-webhook ;; *) error "unknown target to install \"$1\"" ;; esac } uninstall() { # script API # Usage: uninstall TARGET # # Supported TARGETs: # cri-resmgr: stop (kill) cri-resmgr and purge all files from VM. # cri-resmgr-webhook: stop cri-resmgr-webhook and delete webhook files from VM. local target="$1" case $target in "cri-resmgr") terminate cri-resmgr terminate cri-resmgr-agent distro-remove-pkg cri-resource-manager vm-command "rm -rf /usr/local/bin/cri-resmgr /usr/bin/cri-resmgr /usr/local/bin/cri-resmgr-agent /usr/bin/cri-resmgr-agent /var/lib/cri-resmgr /etc/cri-resmgr" ;; "cri-resmgr-agent") terminate cri-resmgr-agent vm-command "rm -rf /usr/local/bin/cri-resmgr /usr/bin/cri-resmgr /usr/local/bin/cri-resmgr-agent /usr/bin/cri-resmgr-agent /var/lib/cri-resmgr /etc/cri-resmgr" ;; "cri-resmgr-webhook") terminate cri-resmgr-webhook vm-command "rm -rf webhook" ;; *) error "uninstall: invalid target \"$target\"" ;; esac } launch() { # script API # Usage: launch TARGET # # Supported TARGETs: # cri-resmgr: launch cri-resmgr on VM. Environment variables: # cri_resmgr_cfg: configuration filepath (on host) # cri_resmgr_extra_args: extra arguments on command line # cri_resmgr_config: "force" (default) or "fallback" # k8scri: if the CRI pipe starts with cri-resmgr # this launches cri-resmgr as a proxy, # otherwise as a dynamic NRI plugin. # # cri-resmgr-systemd: # launch cri-resmgr on VM using "systemctl start". # Works when installed with binsrc=packages/. # Environment variables: # cri_resmgr_cfg: configuration filepath (on host) # # cri-resmgr-agent: # launch cri-resmgr-agent on VM. Environment variables: # cri_resmgr_agent_extra_args: extra arguments on command line # # cri-resmgr-webhook: # deploy cri-resmgr-webhook from the image on VM. 
# # Example: # cri_resmgr_cfg=/tmp/topology-aware.cfg launch cri-resmgr local target="$1" local launch_cmd local adjustment_schema="$HOST_PROJECT_DIR/pkg/apis/resmgr/v1alpha1/adjustment-schema.yaml" local cri_resmgr_config_option="-${cri_resmgr_config:-force}-config" local cri_resmgr_mode="" case $target in "cri-resmgr") host-command "$SCP \"$cri_resmgr_cfg\" $VM_SSH_USER@$VM_IP:" || { command-error "copying \"$cri_resmgr_cfg\" to VM failed" } vm-command "cat $(basename "$cri_resmgr_cfg")" if [[ "$k8scri" == cri-resmgr* ]]; then # launch cri-resmgr as the top element in the k8s container runtime stack cri_resmgr_mode="-relay-socket ${cri_resmgr_sock} -runtime-socket $cri_sock -image-socket $cri_sock" else # launch cri-resmgr as an NRI plugin to running container runtime cri_resmgr_mode="-use-nri-plugin" fi launch_cmd="cri-resmgr $cri_resmgr_mode $cri_resmgr_config_option $(basename "$cri_resmgr_cfg") $cri_resmgr_extra_args" vm-command-q "rm -f $cri_resmgr_pidfile" vm-command-q "echo '$launch_cmd' > cri-resmgr.launch.sh ; rm -f cri-resmgr.output.txt" vm-command "$launch_cmd >cri-resmgr.output.txt 2>&1 &" vm-wait-process --timeout 30 --pidfile "$cri_resmgr_pidfile" cri-resmgr vm-command "grep 'FATAL ERROR' cri-resmgr.output.txt" >/dev/null 2>&1 && { command-error "launching cri-resmgr failed with FATAL ERROR" } vm-command "fuser ${cri_resmgr_pidfile}" >/dev/null 2>&1 || { echo "cri-resmgr last output line:" vm-command-q "tail -n 1 cri-resmgr.output.txt" command-error "launching cri-resmgr failed, cannot find cri-resmgr PID" } ;; "cri-resmgr-agent") host-command "$SCP \"$adjustment_schema\" $VM_SSH_USER@$VM_IP:" || command-error "copying \"$adjustment_schema\" to VM failed" vm-command "kubectl delete -f $(basename "$adjustment_schema"); kubectl create -f $(basename "$adjustment_schema")" launch_cmd="NODE_NAME=\$(hostname) cri-resmgr-agent -kubeconfig /root/.kube/config $cri_resmgr_agent_extra_args" vm-command-q "echo '$launch_cmd' >cri-resmgr-agent.launch.sh; rm -f cri-resmgr-agent.output.txt" vm-command "$launch_cmd >cri-resmgr-agent.output.txt 2>&1 &" vm-wait-process --timeout 30 cri-resmgr-agent vm-command "grep 'FATAL ERROR' cri-resmgr-agent.output.txt" >/dev/null 2>&1 && command-error "launching cri-resmgr-agent failed with FATAL ERROR" vm-command "fuser ${cri_resmgr_agent_sock}" >/dev/null 2>&1 || command-error "launching cri-resmgr-agent failed, cannot find cri-resmgr-agent PID" ;; "cri-resmgr-systemd") host-command "$SCP \"$cri_resmgr_cfg\" $VM_SSH_USER@$VM_IP:" || command-error "copying \"$cri_resmgr_cfg\" to VM failed" vm-command "cp \"$(basename "$cri_resmgr_cfg")\" /etc/cri-resmgr/fallback.cfg" vm-command "systemctl daemon-reload ; systemctl start cri-resource-manager" || command-error "systemd failed to start cri-resource-manager" vm-wait-process --timeout 30 cri-resmgr vm-command "systemctl is-active cri-resource-manager" || { vm-command "systemctl status cri-resource-manager" command-error "cri-resource-manager did not become active after systemctl start" } ;; "cri-resmgr-webhook") kubectl apply -f webhook/webhook-deployment.yaml kubectl wait --for=condition=Available -n cri-resmgr deployments/cri-resmgr-webhook || error "cri-resmgr-webhook deployment did not become Available" kubectl apply -f webhook/mutating-webhook-config.yaml ;; *) error "launch: invalid target \"$1\"" ;; esac is-hooked on_launch && run-hook on_launch return 0 } terminate() { # script API # Usage: terminate TARGET # # Supported TARGETs: # cri-resmgr: stop (kill) cri-resmgr. 
# cri-resmgr-agent: stop (kill) cri-resmgr-agent. # cri-resmgr-webhook: delete cri-resmgr-webhook from k8s. local target="$1" case $target in "cri-resmgr") vm-command "fuser --kill ${cri_resmgr_pidfile} 2>/dev/null" ;; "cri-resmgr-agent") vm-command "fuser --kill ${cri_resmgr_agent_sock} 2>/dev/null" ;; "cri-resmgr-webhook") vm-command "kubectl delete -f webhook/mutating-webhook-config.yaml; kubectl delete -f webhook/webhook-deployment.yaml" ;; *) error "terminate: invalid target \"$target\"" ;; esac } sleep() { # script API # Usage: sleep PARAMETERS # # Run sleep PARAMETERS on host. host-command "sleep $*" } extended-resources() { # script API # Usage: extended-resources RESOURCE [VALUE] # # Examples: # extended-resources remove cmk.intel.com/exclusive-cpus # extended-resources add cmk.intel.com/exclusive-cpus 4 local action="$1" local resource="$2" local value="$3" local resource_escaped="${resource/\//~1}" if [ -z "$resource" ]; then error "extended-resource: missing resource" return 1 fi # make sure kubectl proxy is running vm-command-q "ss -ltn | grep -q 127.0.0.1:8001 || { kubectl proxy &>/dev/null "$PYEXEC_STATE_PY" for PYTHONCODE in "$@"; do { echo "from pyexec_state import *" echo -e "$PYTHONCODE" } > "$PYEXEC_PY" PYTHONPATH="$OUTPUT_DIR:$PYTHONPATH:$DEMO_LIB_DIR" python3 "$PYEXEC_PY" 2>&1 | tee "$PYEXEC_LOG" last_exit_status=${PIPESTATUS[0]} if [ "$last_exit_status" != "0" ]; then error "pyexec: non-zero exit status \"$last_exit_status\", see \"$PYEXEC_PY\" and \"$PYEXEC_LOG\"" fi done return "$last_exit_status" } pp() { # script API # Usage: pp EXPR # # Pretty-print the value of Python expression EXPR. pyexec "pp($*)" } report() { # script API # Usage: report [VARIABLE...] # # Updates and reports current value of VARIABLE. # # Supported VARIABLEs: # allocations # allowed # cache # # Example: print cri-resmgr policy allocations. In interactive mode # you may use a pager like less. # report allocations | less local varname for varname in "$@"; do if [ "$varname" == "allocations" ]; then get-py-cache pyexec " import pprint pprint.pprint(allocations) " elif [ "$varname" == "allowed" ]; then get-py-allowed pyexec " import topology print(topology.str_tree(allowed)) " elif [ "$varname" == "cache" ]; then get-py-cache pyexec " import pprint pprint.pprint(cache) " else error "report: unknown variable \"$varname\"" fi done } verify() { # script API # Usage: verify [--retry N] [EXPR...] # # Run python3 -c "assert(EXPR)" to test that every EXPR is True. # Stop immediately after the first failing assertion and fail the test. # # If a verify is expected to fail, failing the whole test can be # prevented by running the verify in a subshell (in parenthesis): # (verify 'False') || echo '...this was expected to fail.' # # --retry N reruns all assertions at most N times before failing # the test. All assertions must hold at the same time for a # successful verification. By default N=3. # # Variables available in EXPRs: # See variables in: help pyexec # # Note that all variables are updated every time verify is called # before evaluating (asserting) expressions. # # Example: require that containers pod0c0 and pod1c0 run on separate NUMA # nodes and that pod0c0 is allowed to run on 4 CPUs: # verify 'set.intersection(nodes["pod0c0"], nodes["pod1c0"]) == set()' \ # 'len(cpus["pod0c0"]) == 4' local retries=3 local poll_delay=1s if [[ "$1" == "--retry" ]]; then retries="$2" shift; shift fi while ! 
_verify "$@"; do if (( retries <= 0 )); then if is-hooked on_verify_fail; then run-hook on_verify_fail else command-exit-if-not-interactive fi return 1 fi out "### Retrying verify at most $retries time(s) after $poll_delay..." sleep "$poll_delay" retries=$(( retries - 1 )) done is-hooked on_verify && run-hook on_verify return 0 } _verify() { get-py-allowed get-py-cache for py_assertion in "$@"; do speed=1000 out "### Verifying assertion '$py_assertion'" ( speed=1000 pyexec " try: import time,sys assert(${py_assertion}) except KeyError as e: print('WARNING: *') print('WARNING: *** KeyError - %s' % str(e)) print('WARNING: *** Your verify expression might have a typo/thinko.') print('WARNING: *') sys.stdout.flush() time.sleep(5) raise e except IndexError as e: print('WARNING: *') print('WARNING: *** IndexError - %s' % str(e)) print('WARNING: *** Your verify expression might have a typo/thinko.') print('WARNING: *') sys.stdout.flush() time.sleep(5) raise e " ) || { out "### The assertion FAILED ### post-mortem debug help begin ### cd $OUTPUT_DIR python3 from pyexec_state import * $py_assertion ### post-mortem debug help end ###" echo "verify: assertion '$py_assertion' failed." >> "$SUMMARY_FILE" return 1 } speed=1000 out "### The assertion holds." done return 0 } kubectl-force-delete() { # script API # Usage: kubectl-force-delete RESOURCE NAME # # Force-deleting a "Terminating" namespace clears finalizers that # have failed to finish. Therefore there may be resources left in the # namespace NAME. Following command prints them. # # kubectl api-resources --verbs=list --namespaced -o name | \ # xargs -n 1 kubectl get --show-kind --ignore-not-found -n NAME # # Example: delete a namespace that is stuck in the "Terminating" phase # # kubectl-force-delete namespace my-namespace if [ -z "$1" ]; then error "missing RESOURCE" return 1 fi if [ -z "$2" ]; then error "missing resource NAME" return 1 fi if [[ "$1" == "namespace" ]] || [[ "$1" == "ns" ]]; then local ns="$2" vm-command " kubectl get namespace $ns -o json > force-delete-ns.json || exit 0 ( grep -E phase.*Terminating force-delete-ns.json || exit 0 tr -d '\n' < force-delete-ns.json \ | sed 's/\"finalizers\": \[[^]]\+\]/\"finalizers\": []/' \ | kubectl replace --raw /api/v1/namespaces/$ns/finalize -f - ) rm -f force-delete-ns.json " else error "unsupported force-delete resource: $1" return 1 fi } kubectl() { # script API # Usage: kubectl parameters # # Runs kubectl command on virtual machine. vm-command "kubectl $*" || { command-error "kubectl $* failed" } } delete() { # script API # Usage: delete PARAMETERS # # Run "kubectl delete PARAMETERS". vm-command "kubectl delete $*" || { command-error "kubectl delete failed" } } instantiate() { # script API # Usage: instantiate FILENAME # # Produces $OUTPUT_DIR/instance/FILENAME. Prints the filename on success. # Uses FILENAME.in as source (resolved from $TEST_DIR, $TOPOLOGY_DIR, ...) local FILENAME="$1" local RESULT="$OUTPUT_DIR/instance/$FILENAME" template_file=$(resolve-template "$FILENAME" "$TEST_DIR" "$TOPOLOGY_DIR" "$POLICY_DIR" "$SCRIPT_DIR") if [ ! 
-f "$template_file" ]; then error "error instantiating \"$FILENAME\": missing template ${template_file}" fi mkdir -p "$(dirname "$RESULT")" 2>/dev/null eval "echo -e \"$(<"${template_file}")\"" | grep -v '^ *$' > "$RESULT" || error "instantiating \"$FILENAME\" failed" echo "$RESULT" } declare -a pulled_images_on_vm create() { # script API # Usage: [VAR=VALUE][n=COUNT] create TEMPLATE_NAME # # Create n instances from TEMPLATE_NAME.yaml.in, copy each of them # from host to vm, kubectl create -f them, and wait for them # becoming Ready. Templates are searched in $TEST_DIR, $TOPOLOGY_DIR, # $POLICY_DIR, and $SCRIPT_DIR in this order of preference. The first # template found is used. # # Parameters: # TEMPLATE_NAME: the name of the template without extension (.yaml.in) # # Optional parameters (VAR=VALUE): # namespace: namespace to which instances are created # wait: condition to be waited for (see kubectl wait --for=condition=). # If empty (""), skip waiting. The default is wait="Ready". # wait_t: wait timeout. The default is wait_t=240s. local template_file template_file=$(resolve-template "$1.yaml" "$TEST_DIR" "$TOPOLOGY_DIR" "$POLICY_DIR" "$SCRIPT_DIR") local namespace_args local template_kind template_kind=$(awk '/kind/{print tolower($2)}' < "$template_file") local wait=${wait-Ready} local wait_t=${wait_t-240s} local images local image local tag local errormsg local default_name=${NAME:-""} if [ -z "$n" ]; then local n=1 fi if [ -n "${namespace:-}" ]; then namespace_args="-n $namespace" else namespace_args="" fi if [ ! -f "$template_file" ]; then error "error creating from template \"$template_file.yaml.in\": template file not found" fi for _ in $(seq 1 $n); do kind_count[$template_kind]=$(( ${kind_count[$template_kind]} + 1 )) if [ -n "$default_name" ]; then local NAME="$default_name" else local NAME="${template_kind}$(( ${kind_count[$template_kind]} - 1 ))" # the first pod is pod0 fi eval "echo -e \"$(<"${template_file}")\"" | grep -v '^ *$' > "$OUTPUT_DIR/$NAME.yaml" host-command "$SCP \"$OUTPUT_DIR/$NAME.yaml\" $VM_SSH_USER@$VM_IP:" || { command-error "copying \"$OUTPUT_DIR/$NAME.yaml\" to VM failed" } vm-command "cat $NAME.yaml" images="$(grep -E '^ *image: .*$' "$OUTPUT_DIR/$NAME.yaml" | sed -E 's/^ *image: *([^ ]*)$/\1/g' | sort -u)" if [ "${#pulled_images_on_vm[@]}" = "0" ]; then # Initialize pulled images available on VM vm-command "crictl -i unix://${k8scri_sock} images" >/dev/null && while read -r image tag _; do if [ "$image" = "IMAGE" ]; then continue fi local notopdir_image="${image#*/}" local norepo_image="${image##*/}" if [ "$tag" = "latest" ]; then pulled_images_on_vm+=("$image") pulled_images_on_vm+=("$notopdir_image") pulled_images_on_vm+=("$norepo_image") fi pulled_images_on_vm+=("$image:$tag") pulled_images_on_vm+=("$notopdir_image:$tag") pulled_images_on_vm+=("$norepo_image:$tag") done <<< "$COMMAND_OUTPUT" fi for image in $images; do if ! [[ " ${pulled_images_on_vm[*]} " == *" ${image} "* ]]; then if [ "$use_host_images" == "1" ] && vm-put-docker-image "$image"; then : # no need to pull the image to vm, it is now imported. else vm-command "crictl -i unix://${k8scri_sock} pull \"$image\"" || { errormsg="pulling image \"$image\" for \"$OUTPUT_DIR/$NAME.yaml\" failed." 
if is-hooked on_create_fail; then echo "$errormsg" run-hook on_create_fail else command-error "$errormsg" fi } fi pulled_images_on_vm+=("$image") fi done vm-command "kubectl create -f $NAME.yaml $namespace_args" || { if is-hooked on_create_fail; then echo "kubectl create error" run-hook on_create_fail else command-error "kubectl create error" fi } if [ "x$wait" != "x" ]; then speed=1000 vm-command "kubectl wait --timeout=${wait_t} --for=condition=${wait} $namespace_args ${template_kind}/$NAME" >/dev/null 2>&1 || { errormsg="waiting for ${template_kind} \"$NAME\" to become ready timed out" if is-hooked on_create_fail; then echo "$errormsg" run-hook on_create_fail else command-error "$errormsg" fi } fi done is-hooked on_create && run-hook on_create return 0 } reset() { # script API # Usage: reset counters # # Resets counters if [ "$1" == "counters" ]; then kind_count[pod]=0 else error "invalid reset \"$1\"" fi } interactive() { # script API # Usage: interactive # # Enter the interactive mode: read next script commands from # the standard input until "exit". echo "Entering the interactive mode until \"exit\"." INTERACTIVE_MODE=$(( INTERACTIVE_MODE + 1 )) # shellcheck disable=SC2162 while read -e -p "run.sh> " -a commands; do if [ "${commands[0]}" == "exit" ]; then break fi eval "${commands[@]}" done INTERACTIVE_MODE=$(( INTERACTIVE_MODE - 1 )) } help() { # script API # Usage: help [FUNCTION|all] # # Print help on all functions or on the FUNCTION available in script. awk -v f="$1" \ '/^[a-z].*script API/{split($1,a,"(");if(f==""||f==a[1]||f=="all"){print "";print a[1]":";l=2}} !/^ #/{l=l-1} /^ #/{if(l>=1){split($0,a,"#"); print " "a[2]; if (f=="") l=0}}' <<<"$script_source" } ### End of user code helpers test-user-code() { vm-command-q "kubectl get pods 2>&1 | grep -q NAME" && vm-command "kubectl delete pods --all --now --wait" ( eval "$code" ) || { TEST_FAILURES="${TEST_FAILURES} test script failed" } } # Validate parameters input_var_names="mode user_script_file distro k8scri k8smaster vm cgroups speed binsrc reinstall_all reinstall_containerd reinstall_crio reinstall_cri_resmgr reinstall_k8s reinstall_oneshot outdir cleanup on_verify_fail on_create_fail on_verify on_create on_launch topology cri_resmgr_cfg cri_resmgr_extra_args cri_resmgr_agent_extra_args code py_consts" INTERACTIVE_MODE=0 mode=$1 user_script_file=$2 distro=${distro:=$DEFAULT_DISTRO} k8s=${k8s:=} k8scri=${k8scri:="cri-resmgr|containerd"} k8smaster=${k8smaster:=} cri_resmgr_pidfile="/var/run/cri-resmgr*.pid" cri_resmgr_sock="/var/run/cri-resmgr/cri-resmgr.sock" cri_resmgr_agent_sock="/var/run/cri-resmgr/cri-resmgr-agent.sock" case "${k8scri}" in "cri-resmgr|containerd") k8scri_sock="${cri_resmgr_sock}" cri_sock="/var/run/containerd/containerd.sock" cri=containerd ;; "cri-resmgr|crio") k8scri_sock="${cri_resmgr_sock}" cri_sock="/var/run/crio/crio.sock" cri=crio ;; "containerd") k8scri_sock="/var/run/containerd/containerd.sock" cri_sock="/var/run/containerd/containerd.sock" cri=containerd omit_cri_resmgr=1 omit_agent=1 ;; "containerd&cri-resmgr") k8scri_sock="/var/run/containerd/containerd.sock" cri_sock="/var/run/containerd/containerd.sock" cri=containerd ;; "crio") k8scri_sock="/var/run/crio/crio.sock" cri_sock="/var/run/crio/crio.sock" cri=crio omit_cri_resmgr=1 omit_agent=1 ;; "crio&cri-resmgr") k8scri_sock="/var/run/crio/crio.sock" cri_sock="/var/run/crio/crio.sock" cri=crio ;; *) error "unsupported k8scri: \"${k8scri}\"" ;; esac distro_binaries=${distro_binaries:=0} containerd_src=${containerd_src:=} 
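# (Sketch of the BIN_DIR selection below: with distro_binaries=1 and, say,
#  distro=ubuntu-22.04, binaries are taken from
#  ${crirm_src}/binaries/ubuntu-22.04, otherwise from ${crirm_src}/bin.)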
crio_src=${crio_src:=} crirm_src=${crirm_src:=$HOST_PROJECT_DIR} runc_src=${runc_src:=} crio_version=${crio_version:=} if [ "$distro_binaries" = "1" ]; then if [ -z "$distro" ]; then error "distro_binaries=1 but distro is not set" fi BIN_DIR=${crirm_src}/binaries/$distro else BIN_DIR=${crirm_src}/bin fi TOPOLOGY_DIR=${TOPOLOGY_DIR:=e2e} vm=${vm:=$(basename ${TOPOLOGY_DIR})-${distro}-${cri}} vm_files=${vm_files:-""} cgroups=${cgroups:-v1} cri_resmgr_cfg=${cri_resmgr_cfg:-"${SCRIPT_DIR}/cri-resmgr-topology-aware.cfg"} cri_resmgr_extra_args=${cri_resmgr_extra_args:-""} cri_resmgr_agent_extra_args=${cri_resmgr_agent_extra_args:-""} cleanup=${cleanup:-0} reinstall_all=${reinstall_all:-0} reinstall_bootstrap=${reinstall_bootstrap:-0} reinstall_containerd=${reinstall_containerd:-0} reinstall_cri_resmgr=${reinstall_cri_resmgr:-0} reinstall_cri_resmgr_agent=${reinstall_cri_resmgr_agent:-0} reinstall_crio=${reinstall_crio:-0} reinstall_k8s=${reinstall_k8s:-0} reinstall_kubeadm=${reinstall_kubeadm:-0} reinstall_kubectl=${reinstall_kubectl:-0} reinstall_kubelet=${reinstall_kubelet:-0} reinstall_oneshot=${reinstall_oneshot:-0} reinstall_runc=${reinstall_runc:-0} if [ "$reinstall_all" == "1" ]; then for reinstall_var in ${!reinstall_*}; do eval "${reinstall_var}=1" done fi if [ "$reinstall_k8s" == "1" ]; then reinstall_kubeadm=1 reinstall_kubectl=1 reinstall_kubelet=1 fi if [ "$reinstall_bootstrap" == "1" ]; then setup_proxies=1 fi omit_agent=${omit_agent:-0} omit_cri_resmgr=${omit_cri_resmgr:-0} use_host_images=${use_host_images:-0} py_consts="${py_consts:-''}" topology=${topology:-'[ {"mem": "1G", "cores": 1, "nodes": 2, "packages": 2, "node-dist": {"4": 28, "5": 28}}, {"nvmem": "8G", "node-dist": {"5": 28, "0": 17}}, {"nvmem": "8G", "node-dist": {"2": 17}} ]'} code=${code:-" CPU=1 create guaranteed # creates pod 0, 1 CPU taken report allowed CPU=2 create guaranteed # creates pod 1, 3 CPUs taken report allowed CPU=3 create guaranteed # creates pod 2, 6 CPUs taken report allowed verify \\ 'len(cpus[\"pod0c0\"]) == 1' \\ 'len(cpus[\"pod1c0\"]) == 2' \\ 'len(cpus[\"pod2c0\"]) == 3' \\ 'len(set.union(cpus[\"pod0c0\"], cpus[\"pod1c0\"], cpus[\"pod2c0\"])) == 6' n=3 create besteffort # creates pods 3, 4 and 5 verify \\ 'set.intersection( set.union(cpus[\"pod0c0\"], cpus[\"pod1c0\"], cpus[\"pod2c0\"]), set.union(cpus[\"pod3c0\"], cpus[\"pod4c0\"], cpus[\"pod5c0\"])) == set()' delete pods pod2 # deletes pod 2, 3 CPUs taken n=2 create besteffort # creates pods 6 and 7 CPU=2 n=2 create guaranteed # creates pod 8 and 9, 7 CPUs taken verify \\ 'len(set.union(cpus[\"pod0c0\"], cpus[\"pod1c0\"], cpus[\"pod8c0\"], cpus[\"pod9c0\"])) == 7' "} warning_delay=${warning_delay:-5} yaml_in_defaults="CPU=1 MEM=100M ISO=true CPUREQ=1 CPULIM=2 MEMREQ=100M MEMLIM=200M CONTCOUNT=1" if [ "$mode" == "help" ]; then if [ "$2" == "defaults" ]; then echo "Test input defaults:" echo "" echo "topology=${topology}" echo "distro=${distro}" echo "k8s=${k8s}" echo "" echo "cri_resmgr_cfg=${cri_resmgr_cfg}" echo "" echo "cri_resmgr_extra_args=${cri_resmgr_extra_args}" echo "" echo -e "code=\"${code}\"" echo "" echo "The defaults to QOSCLASS.yaml.in variables:" echo " ${yaml_in_defaults}" elif [ "$2" == "script" ]; then if [ "x$3" == "x" ]; then help else help "$3" fi elif [ "x$2" == "x" ]; then usage else echo "invalid help page, try:" echo " ./run.sh help" echo " ./run.sh help defaults" echo " ./run.sh help script [FUNCTION|all]" exit 1 fi exit 0 elif [ "$mode" == "play" ]; then speed=${speed-10} elif [ "$mode" == "test" ]; then PV= elif [ 
"$mode" == "debug" ]; then PV= elif [ "$mode" == "interactive" ]; then PV= elif [ "$mode" == "record" ]; then record else usage error "missing valid MODE" exit 1 fi host-require-cmd jq host-require-cmd pv if [ -n "$user_script_file" ]; then if [ ! -f "$user_script_file" ]; then error "cannot find test script file \"$user_script_file\"" fi code=$(<"$user_script_file") fi # Prepare for test/demo mkdir -p "$OUTPUT_DIR" mkdir -p "$COMMAND_OUTPUT_DIR" rm -f "$COMMAND_OUTPUT_DIR"/0* ( echo x > "$OUTPUT_DIR"/x && rm -f "$OUTPUT_DIR"/x ) || { error "output directory outdir=$OUTPUT_DIR is not writable" } SUMMARY_FILE="$OUTPUT_DIR/summary.txt" echo -n "" > "$SUMMARY_FILE" || error "cannot write summary to \"$SUMMARY_FILE\"" ## Save test inputs and defaults for the record mkdir -p "$OUTPUT_DIR/input"; rm -f "$OUTPUT_DIR/input/*" for var in $input_var_names; do if [ -n "${!var}" ]; then echo -e "${!var}" > "$OUTPUT_DIR/input/${var}.var" fi done if [ "$binsrc" == "local" ]; then if [ "$omit_cri_resmgr" != "1" ]; then [ -f "${BIN_DIR}/cri-resmgr" ] || error "missing \"${BIN_DIR}/cri-resmgr\"" fi if [ "$omit_agent" != "1" ]; then [ -f "${BIN_DIR}/cri-resmgr-agent" ] || error "missing \"${BIN_DIR}/cri-resmgr-agent\"" fi fi host-get-vm-config "$vm" || host-set-vm-config "$vm" "$distro" "$cri" if [ -z "$VM_IP" ] || [ -z "$VM_SSH_USER" ]; then screen-create-vm else if [ "$setup_proxies" == "1" ]; then vm-setup-proxies fi if [ "$reinstall_bootstrap" == "1" ]; then vm-bootstrap fi fi is-hooked "on_vm_online" && run-hook "on_vm_online" if [ "$reinstall_oneshot" == "1" ] || ! vm-command-q "[ -f .vm-setup-oneshot ]"; then vm-setup-oneshot vm-command-q "touch .vm-setup-oneshot" fi if [ -n "$vm_files" ]; then install-files "$vm_files" fi if [ "$reinstall_containerd" == "1" ] || [ "$reinstall_crio" == "1" ] || ! vm-command-q "( type -p containerd || type -p crio ) >/dev/null"; then vm-install-cri is-hooked on_cri_install && run-hook on_cri_install fi # runc is installed as a dependency of containerd and crio. # If reinstalling runc is explictly wished for, it is safe to do # only after (re)installing contaienrd/crio. Otherwise # a custom locally built runc may be overridden from packages. if [ "$reinstall_runc" == "1" ] || ! vm-command-q "type -p runc >/dev/null"; then vm-install-runc is-hooked on_runc_install && run-hook on_runc_install fi if [ "$reinstall_k8s" == "1" ] || ! vm-command-q "type -p kubelet >/dev/null"; then vm-install-k8s is-hooked on_k8s_install && run-hook on_k8s_install fi if [ "$reinstall_cri_resmgr" == "1" ]; then uninstall cri-resmgr fi if [ "$reinstall_cri_resmgr_agent" == "1" ]; then uninstall cri-resmgr-agent fi if [[ "$k8scri" == cri-resmgr* ]] || [ -n "$crirm_src" ]; then if [ "$omit_cri_resmgr" != "1" ]; then if ! vm-command-q "type -p cri-resmgr >/dev/null"; then install cri-resmgr fi fi if [ "$omit_agent" != "1" ]; then if ! 
vm-command-q "type -p cri-resmgr-agent >/dev/null"; then install cri-resmgr-agent fi fi fi if [ "$mode" == "debug" ]; then vm-command-q "[ -x /root/go/bin/dlv ]" || vm-install-dlv if [ -d "$crio_src" ]; then vm-dlv-add-src "$crio_src" fi if [ -d "$containerd_src" ]; then vm-dlv-add-src "$containerd_src" fi if [ -d "$crirm_src" ]; then vm-dlv-add-src "$crirm_src" fi if [ -d "$runc_src" ]; then vm-dlv-add-src "$runc_src" fi echo "How to debug cri-resmgr:" echo "- Attach debugger to running cri-resmgr:" echo " ssh $VM_SSH_USER@$VM_IP" echo " sudo /root/go/bin/dlv attach \$(pidof cri-resmgr)" echo "- Relaunch cri-resmgr in debugger:" echo " ssh $VM_SSH_USER@$VM_IP" echo " sudo -i" echo " kill -9 \$(pidof cri-resmgr); /root/go/bin/dlv exec /usr/local/bin/cri-resmgr -- -force-config /home/$VM_SSH_USER/*.cfg" echo "dlv on VM is ready for use" exit 0 fi if [ -n "$containerd_src" ] && [[ "$k8scri" == *containerd* ]]; then vm-check-source-files-changed "$containerd_src" "$containerd_src/bin/containerd" vm-check-running-binary "$containerd_src/bin/containerd" fi if [ -n "$crio_src" ] && [[ "$k8scri" == *crio* ]]; then vm-check-source-files-changed "$crio_src" "$crio_src/bin/crio" vm-check-running-binary "$crio_src/bin/crio" fi # Start cri-resmgr if not already running if [ "$omit_cri_resmgr" != "1" ]; then if ! vm-command-q "fuser ${cri_resmgr_pidfile}" >/dev/null 2>&1; then screen-launch-cri-resmgr fi if [ -n "$crirm_src" ]; then vm-check-source-files-changed "$crirm_src" "$crirm_src/bin/cri-resmgr" vm-check-running-binary "$crirm_src/bin/cri-resmgr" fi fi # Create kubernetes cluster or wait that it is online if [ "$reinstall_k8s" == "1" ]; then vm-destroy-cluster fi if vm-command-q "[ ! -f /var/lib/kubelet/config.yaml ]"; then if [ -n "$k8smaster" ]; then vm-join "$k8smaster" else screen-create-singlenode-cluster fi else # Wait for kube-apiserver to launch (may be down if the VM was just booted) vm-wait-process kube-apiserver fi # Start cri-resmgr-agent if not already running if [ "$omit_agent" != "1" ]; then if ! vm-command-q "fuser ${cri_resmgr_agent_sock}" >/dev/null; then screen-launch-cri-resmgr-agent fi fi is-hooked "on_k8s_online" && run-hook "on_k8s_online" declare -A kind_count # associative arrays for counting created objects, like kind_count[pod]=1 eval "${yaml_in_defaults}" if [ "$mode" == "interactive" ]; then interactive else # Run test/demo TEST_FAILURES="" test-user-code fi # Save logs host-command "$SCP $VM_SSH_USER@$VM_IP:cri-resmgr*.output.txt \"$OUTPUT_DIR/\"" # Cleanup if [ "$cleanup" == "0" ]; then echo "The VM, Kubernetes and cri-resmgr are left running. 
Next steps:" vm-print-usage elif [ "$cleanup" == "1" ]; then host-stop-vm "$vm" host-delete-vm "$vm" elif [ "$cleanup" == "2" ]; then host-stop-vm "$vm" fi # Summarize results exit_status=0 if [ "$mode" == "test" ]; then if [ -n "$TEST_FAILURES" ]; then echo "Test verdict: FAIL" >> "$SUMMARY_FILE" else echo "Test verdict: PASS" >> "$SUMMARY_FILE" fi cat "$SUMMARY_FILE" fi exit $exit_status ================================================ FILE: test/e2e/run_all_configurations.sh ================================================ #!/bin/bash RUN_SH="${0%/*}/run.sh" PAIRWISE="${0%/*}/../../scripts/testing/pairwise" "${PAIRWISE}" \ distro={debian-sid,fedora-40,opensuse-tumbleweed} \ k8scri={containerd,crio,cri-resmgr\|containerd,cri-resmgr\|crio} \ k8scni={cilium,flannel,weavenet} | while read -r env_vars; do eval "export $env_vars" code='create besteffort' # shellcheck disable=SC2154 # ...as it cannot know that pairwise+eval exports distro et. al. vm="config-$distro-${k8scri/|/-}-$k8scni" outdir="output-configs/output-$vm" export code vm outdir govm rm "$vm" >/dev/null 2>&1 mkdir -p "$outdir" "$RUN_SH" test "$outdir/run.sh.output" 2>&1 govm rm "$vm" >/dev/null 2>&1 done ================================================ FILE: test/e2e/run_tests.sh ================================================ #!/bin/bash TESTS_DIR="$1" RUN_SH="${0%/*}/run.sh" DEFAULT_DISTRO="ubuntu-22.04" usage() { echo "Usage: run_tests.sh TESTS_DIR" echo "TESTS_DIR is expected to be structured as POLICY/TOPOLOGY/TEST with files:" echo "POLICY/cri-resmgr.cfg: configuration of cri-resmgr" echo "POLICY/TOPOLOGY/topology.var.json: contents of the topology variable for run.sh" echo "POLICY/TOPOLOGY/TEST/code.var.sh: contents of the code var (that is, test script)" } error() { (echo ""; echo "error: $1" ) >&2 exit 1 } warning() { echo "WARNING: $1" >&2 } export-var-files() { # export ENV_VAR from ENV_VAR.var.* file content local var_file_dir="$1" local var_filepath local var_file_name local var_name for var_filepath in "$var_file_dir"/*.var "$var_file_dir"/*.var.*; do if ! [ -f "$var_filepath" ] || [[ "$var_filepath" == *"~" ]] || [[ "$var_filepath" == *"#"* ]]; then continue fi var_file_name=$(basename "$var_filepath") var_name=${var_file_name%%.var*} if [ "$var_name" == "code" ] || [ "$var_name" == "py_consts" ]; then # append values in code variables echo "exporting $var_name - appending from $var_filepath" export "$var_name"="${!var_name}"" $(< "$var_filepath")" else # creating / replace other variables if [ -z "${!var_name}" ]; then echo "exporting $var_name - creating from $var_filepath" else echo "exporting $var_name - overriding from $var_filepath" fi if [[ "$var_file_name" == *.var.in.* ]]; then export "$var_name"="$(eval "echo -e \"$(<"${var_filepath}")\"")" else export "$var_name"="$(< "$var_filepath")" fi fi done } export-vm-files() { # update and export vm_files associative array from directory content local vm_files_dir="$1" if [ ! 
-d "$vm_files_dir" ]; then return fi if [[ "$vm_files" == *"="* ]] ; then eval "declare -A vm_files_aa=${vm_files#*=}" else declare -A vm_files_aa fi prefix_len=${#vm_files_dir} shopt -s globstar for f in "$vm_files_dir"/**; do file_vm_name=${f:$prefix_len} if [ -z "$file_vm_name" ] || [ "$file_vm_name" == "/" ]; then continue elif [ -f "$f" ]; then if [ -n "${vm_files_aa[$file_vm_name]}" ]; then warning "vm file $file_vm_name: new file \"$f\" overrides \"${vm_files_aa[$file_vm_name]}\"" fi vm_files_aa[$file_vm_name]="file:$(realpath "$f")" fi done # serialize from associative array local serialized_vm_files serialized_vm_files="$(declare -p vm_files_aa)" export vm_files="declare -A vm_files${serialized_vm_files#declare -A vm_files_aa}" } source-source-files() { # Test execution will source *.source.* files before it executes # the real test code. The files will be sourced starting from the # test suite (root) directory and ending up to the test directory, # which enables overriding inherited functions and variables. local src_file_dir="$1" local src_filepath for src_filepath in "$src_file_dir"/*.source "$src_file_dir"/*.source.*; do if ! [ -f "$src_filepath" ] || [[ "$src_filepath" == *"~" ]]; then continue fi echo "sourcing $src_filepath before running test code" source_libs="${source_libs}"" source \"$src_filepath\" " done } export-and-source-dir() { local dir="$1" export-var-files "$dir" export-vm-files "$dir/vm-files" source-source-files "$dir" } if [ -z "$TESTS_DIR" ] || [ "$TESTS_DIR" == "help" ] || [ "$TESTS_DIR" == "--help" ]; then usage error "missing TESTS_DIR" fi if ! [ -d "$TESTS_DIR" ]; then error "bad TESTS_DIR: \"$TESTS_DIR\"" fi # Find TESTS_DIR root by looking for POLICY_DIR/*.cfg. If TESTS_DIR was not the # root dir, then execute tests only under TESTS_DIR. root_dir_glob="*.test-suite" # shellcheck disable=SC2053 if [[ "$(basename "$TESTS_DIR")" == $root_dir_glob ]]; then TESTS_ROOT_DIR="$TESTS_DIR" elif [[ "$(basename "$(realpath "$TESTS_DIR"/..)")" == $root_dir_glob ]]; then TESTS_ROOT_DIR=$(realpath "$TESTS_DIR/..") TESTS_POLICY_FILTER=$(basename "${TESTS_DIR}") elif [[ "$(basename "$(realpath "$TESTS_DIR"/../..)")" == $root_dir_glob ]]; then TESTS_ROOT_DIR=$(realpath "$TESTS_DIR/../..") TESTS_POLICY_FILTER=$(basename "$(dirname "${TESTS_DIR}")") TESTS_TOPOLOGY_FILTER=$(basename "${TESTS_DIR}") elif [[ "$(basename "$(realpath "$TESTS_DIR"/../../..)")" == $root_dir_glob ]]; then TESTS_ROOT_DIR=$(realpath "$TESTS_DIR/../../..") TESTS_POLICY_FILTER=$(basename "$(dirname "$(dirname "${TESTS_DIR}")")") TESTS_TOPOLOGY_FILTER=$(basename "$(dirname "${TESTS_DIR}")") TESTS_TEST_FILTER=$(basename "${TESTS_DIR}") else error "TESTS_DIR=\"$TESTS_DIR\" is invalid tests/policy/topology/test dir: *.cfg not found" fi echo "Running tests matching:" echo " TESTS_ROOT_DIR=$TESTS_ROOT_DIR" echo " TESTS_POLICY_FILTER=$TESTS_POLICY_FILTER" echo " TESTS_TOPOLOGY_FILTER=$TESTS_TOPOLOGY_FILTER" echo " TESTS_TEST_FILTER=$TESTS_TEST_FILTER" cleanup() { rm -rf "$summary_dir" } summary_dir=$(mktemp -d) trap cleanup TERM EXIT QUIT summary_file="$summary_dir/summary.txt" echo -n "" > "$summary_file" export-and-source-dir "$TESTS_ROOT_DIR" for POLICY_DIR in "$TESTS_ROOT_DIR"/*; do if ! [ -d "$POLICY_DIR" ]; then continue fi if ! [[ "$(basename "$POLICY_DIR")" =~ .*"$TESTS_POLICY_FILTER".* ]]; then continue fi # Run exports in subshells so that variables exported for previous # tests do not affect any other tests. ( for CFG_FILE in "$POLICY_DIR"/*.cfg; do if ! 
[ -f "$CFG_FILE" ]; then continue fi export cri_resmgr_cfg=$CFG_FILE done export-and-source-dir "$POLICY_DIR" for TOPOLOGY_DIR in "$POLICY_DIR"/*; do if ! [ -d "$TOPOLOGY_DIR" ]; then continue fi if ! [[ "$(basename "$TOPOLOGY_DIR")" =~ .*"$TESTS_TOPOLOGY_FILTER".* ]]; then continue fi if [ "$(basename "$TOPOLOGY_DIR")" == "vm-files" ]; then continue fi ( distro=${distro:=$DEFAULT_DISTRO} export distro # Create name for the vm. # Needs topology, distro and container runtime stack. k8scri=${k8scri:-"cri-resmgr|containerd"} case "${k8scri}" in "cri-resmgr|containerd") criname=crirm-containerd ;; "cri-resmgr|crio") criname=crirm-crio ;; "containerd") criname=containerd ;; "containerd&cri-resmgr") criname=nrirm-containerd ;; "crio") criname=crio ;; "crio&cri-resmgr") criname=nrirm-crio ;; *) error "unsupported k8scri: \"${k8scri}\"" ;; esac vm="$(basename "$TOPOLOGY_DIR")-${distro}-${criname}" export vm export-and-source-dir "$TOPOLOGY_DIR" for TEST_DIR in "$TOPOLOGY_DIR"/*; do if ! [ -d "$TEST_DIR" ]; then continue fi if ! [[ "$(basename "$TEST_DIR")" =~ .*"$TESTS_TEST_FILTER".* ]]; then continue fi if [ "$(basename "$TEST_DIR")" == "vm-files" ]; then continue fi ( export outdir="$TEST_DIR/output" export-and-source-dir "$TEST_DIR" export code="${source_libs}"" ${code}" mkdir -p "$outdir" echo "Run $(basename "$TEST_DIR")" TEST_DIR=$TEST_DIR TOPOLOGY_DIR=$TOPOLOGY_DIR POLICY_DIR=$POLICY_DIR \ "$RUN_SH" test 2>&1 | tee "$outdir/run.sh.output" test_name="$(basename "$POLICY_DIR")/$(basename "$TOPOLOGY_DIR")/$(basename "$TEST_DIR")" if grep -q "Test verdict: PASS" "$outdir/run.sh.output"; then echo "PASS $test_name" >> "$summary_file" elif grep -q "Test verdict: FAIL" "$outdir/run.sh.output"; then echo "FAIL $test_name" >> "$summary_file" else echo "ERROR $test_name" >> "$summary_file" fi ) done ) done ) done echo "" echo "Tests summary:" cat "$summary_file" if grep -q ERROR "$summary_file" || grep -q FAIL "$summary_file"; then exit 1 fi ================================================ FILE: test/functional/e2e_test.go ================================================ // Copyright 2020 Intel Corporation. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
================================================
FILE: test/functional/e2e_test.go
================================================
// Copyright 2020 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package e2e

import (
    "context"
    "flag"
    "fmt"
    "net"
    "os"
    "path/filepath"
    "testing"
    "time"

    resmgr "github.com/intel/cri-resource-manager/pkg/cri/resource-manager"
    "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/cache"
    "github.com/intel/cri-resource-manager/pkg/cri/resource-manager/kubernetes"
    "github.com/intel/cri-resource-manager/pkg/dump"
    "google.golang.org/grpc"
    criv1 "k8s.io/cri-api/pkg/apis/runtime/v1"

    logger "github.com/intel/cri-resource-manager/pkg/log"
)

const (
    testDir = "/tmp/cri-rm-test"
)

func init() {
    rate := logger.Rate{Limit: logger.Every(1 * time.Minute)}
    logger.SetGrpcLogger("grpc", &rate)

    if err := os.MkdirAll(testDir, 0700); err != nil {
        fmt.Printf("unable to create %q: %+v\n", testDir, err)
    }
}

type testEnv struct {
    t           *testing.T
    handlers    map[string]interface{}
    client      criv1.RuntimeServiceClient
    forceConfig string
    mgr         resmgr.ResourceManager
    cache       cache.Cache
}

func (env *testEnv) Run(name string, testFunction func(context.Context, *testEnv)) {
    t := env.t
    overriddenCriHandlers := env.handlers
    t.Helper()
    t.Run(name, func(t *testing.T) {
        tmpDir, err := os.MkdirTemp(testDir, "requests-")
        if err != nil {
            t.Fatalf("unable to create temp directory: %+v", err)
        }
        defer os.RemoveAll(tmpDir)

        if err := flag.Set("runtime-socket", filepath.Join(tmpDir, "fakecri.sock")); err != nil {
            t.Fatalf("unable to set runtime-socket")
        }
        if err := flag.Set("image-socket", filepath.Join(tmpDir, "fakecri.sock")); err != nil {
            t.Fatalf("unable to set image-socket")
        }
        if err := flag.Set("relay-socket", filepath.Join(tmpDir, "relay.sock")); err != nil {
            t.Fatalf("unable to set relay-socket")
        }
        if err := flag.Set("relay-dir", filepath.Join(tmpDir, "relaystorage")); err != nil {
            t.Fatalf("unable to set relay-dir")
        }
        if err := flag.Set("agent-socket", filepath.Join(tmpDir, "agent.sock")); err != nil {
            t.Fatalf("unable to set agent-socket")
        }
        if err := flag.Set("config-socket", filepath.Join(tmpDir, "config.sock")); err != nil {
            t.Fatalf("unable to set config-socket")
        }
        if err := flag.Set("allow-untested-runtimes", "true"); err != nil {
            t.Fatalf("unable to allow untested runtimes: %v", err)
        }

        if env.forceConfig != "" {
            path := filepath.Join(tmpDir, "forcedconfig.cfg")
            if err := os.WriteFile(path, []byte(env.forceConfig), 0644); err != nil {
                t.Fatalf("failed to create configuration file %s: %v", path, err)
            }
            if err := flag.Set("force-config", path); err != nil {
                t.Fatalf("unable to set force-config")
            }
        }

        flag.Parse()

        fakeCri := newFakeCriServer(t, filepath.Join(tmpDir, "fakecri.sock"), overriddenCriHandlers)
        defer fakeCri.stop()

        resMgr, err := resmgr.NewResourceManager()
        if err != nil {
            t.Fatalf("unable to create resource manager: %+v", err)
        }
        if err := resMgr.Start(); err != nil {
            t.Fatalf("unable to start resource manager: %+v", err)
        }
        defer resMgr.Stop()

        ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
        defer cancel()

        conn, err := grpc.DialContext(ctx, filepath.Join(tmpDir, "relay.sock"),
            grpc.WithInsecure(),
            grpc.WithBlock(),
            grpc.WithContextDialer(func(ctx context.Context, addr string) (net.Conn, error) {
                if deadline, ok := ctx.Deadline(); ok {
                    return net.DialTimeout("unix", addr, time.Until(deadline))
                }
                return net.DialTimeout("unix", addr, 0)
            }),
        )
        if err != nil {
            t.Fatalf("unable to connect to relay: %+v", err)
        }
        defer conn.Close()

        client := criv1.NewRuntimeServiceClient(conn)
        env.client = client
        env.mgr = resMgr
        env.cache = resMgr.GetCache()

        testFunction(ctx, env)

        // Until the pkg/log fixes get merged: wait until pkg/dump is done
        // with logging before we run the next test (and consequently do a
        // reconfig).
        dump.Sync()
    })
}
func TestListPodSandbox(t *testing.T) {
    tcases := []struct {
        name         string
        pods         []*criv1.PodSandbox
        expectedPods int
    }{
        {
            name: "empty",
        },
        {
            name:         "list one pod",
            pods:         []*criv1.PodSandbox{{}},
            expectedPods: 1,
        },
    }
    for _, tc := range tcases {
        criHandlers := map[string]interface{}{
            "ListPodSandbox": func(*fakeCriServer, context.Context, *criv1.ListPodSandboxRequest) (*criv1.ListPodSandboxResponse, error) {
                return &criv1.ListPodSandboxResponse{
                    Items: tc.pods,
                }, nil
            },
        }
        env := &testEnv{
            t:        t,
            handlers: criHandlers,
        }
        env.Run(tc.name, func(ctx context.Context, env *testEnv) {
            t := env.t
            client := env.client
            resp, err := client.ListPodSandbox(ctx, &criv1.ListPodSandboxRequest{})
            if err != nil {
                t.Errorf("Unexpected error: %+v", err)
                return
            }
            if len(resp.Items) != tc.expectedPods {
                t.Errorf("Expected %d pods, got %d", tc.expectedPods, len(resp.Items))
            }
        })
    }
}

func TestListContainers(t *testing.T) {
    tcases := []struct {
        name               string
        containers         []*criv1.Container
        expectedContainers int
    }{
        {
            name: "empty",
        },
        {
            name:               "list one container",
            containers:         []*criv1.Container{{}},
            expectedContainers: 1,
        },
    }
    for _, tc := range tcases {
        criHandlers := map[string]interface{}{
            "ListContainers": func(*fakeCriServer, context.Context, *criv1.ListContainersRequest) (*criv1.ListContainersResponse, error) {
                return &criv1.ListContainersResponse{
                    Containers: tc.containers,
                }, nil
            },
        }
        env := &testEnv{
            t:        t,
            handlers: criHandlers,
        }
        env.Run(tc.name, func(ctx context.Context, env *testEnv) {
            t := env.t
            client := env.client
            resp, err := client.ListContainers(ctx, &criv1.ListContainersRequest{})
            if err != nil {
                t.Errorf("Unexpected error: %+v", err)
                return
            }
            if len(resp.Containers) != tc.expectedContainers {
                t.Errorf("Expected %d containers, got %d", tc.expectedContainers, len(resp.Containers))
            }
        })
    }
}

func TestLingeringPodCleanup(t *testing.T) {
    cfg := `
policy:
  Active: topology-aware
  ReservedResources:
    CPU: 750m
`
    tcases := []struct {
        name         string
        reqs         []*criv1.RunPodSandboxRequest
        expectedPods int
    }{
        {
            name: "create Pod #1",
            reqs: []*criv1.RunPodSandboxRequest{
                createPodRequest("Pod#1", "UID#1", "", nil, nil, ""),
            },
            expectedPods: 1,
        },
        {
            name: "create Pods #1 and #2",
            reqs: []*criv1.RunPodSandboxRequest{
                createPodRequest("Pod#1", "UID#1", "", nil, nil, ""),
                createPodRequest("Pod#2", "UID#2", "", nil, nil, ""),
            },
            expectedPods: 2,
        },
        {
            name: "create Pods #1, #2, and #3",
            reqs: []*criv1.RunPodSandboxRequest{
                createPodRequest("Pod#1", "UID#1", "", nil, nil, ""),
                createPodRequest("Pod#2", "UID#2", "", nil, nil, ""),
                createPodRequest("Pod#3", "UID#3", "", nil, nil, ""),
            },
            expectedPods: 3,
        },
        {
            name: "create Pods #1, #2, #3, #4, '1, '2, '3",
            reqs: []*criv1.RunPodSandboxRequest{
                createPodRequest("Pod#1", "UID#1", "", nil, nil, ""),
                createPodRequest("Pod#2", "UID#2", "", nil, nil, ""),
                createPodRequest("Pod#3", "UID#3", "", nil, nil, ""),
                createPodRequest("Pod#4", "UID#4", "", nil, nil, ""),
                createPodRequest("Pod#1", "UID#1", "", nil, nil, ""),
                createPodRequest("Pod#2", "UID'2", "", nil, nil, ""),
                createPodRequest("Pod#3", "UID'3", "", nil, nil, ""),
                createPodRequest("Pod#1", "UID#1", "", nil, nil, ""),
                createPodRequest("Pod#2", "UID#2", "", nil, nil, ""),
                createPodRequest("Pod#3", "UID#3", "", nil, nil, ""),
                createPodRequest("Pod#1", "UID'1", "", nil, nil, ""),
                createPodRequest("Pod#2", "UID'2", "", nil, nil, ""),
                createPodRequest("Pod#3", "UID'3", "", nil, nil, ""),
                createPodRequest("Pod#4", "UID#4", "", nil, nil, ""),
            },
            expectedPods: 7,
        },
    }
    numPods := 0
    for _, tc := range tcases {
        criHandlers := map[string]interface{}{
            "RunPodSandbox": func(*fakeCriServer, context.Context, *criv1.RunPodSandboxRequest) (*criv1.RunPodSandboxResponse, error) {
                numPods++
                return &criv1.RunPodSandboxResponse{
                    PodSandboxId: fmt.Sprintf("Pod#%d", numPods),
                }, nil
            },
        }
        env := &testEnv{
            t:           t,
            handlers:    criHandlers,
            forceConfig: cfg,
        }
        env.Run(tc.name, func(ctx context.Context, env *testEnv) {
            t := env.t
            client := env.client
            cache := env.cache
            for _, req := range tc.reqs {
                _, err := client.RunPodSandbox(ctx, req)
                if err != nil {
                    t.Errorf("failed to create pod %+v: %v", req, err)
                }
            }
            pods := cache.GetPods()
            if len(pods) != tc.expectedPods {
                t.Errorf("expected %d pods in cache, got %d (%v)", tc.expectedPods, len(pods), pods)
            }
        })
    }
}

func TestLingeringContainerCleanup(t *testing.T) {
    cfg := `
policy:
  Active: topology-aware
  ReservedResources:
    CPU: 750m
`
    type pod struct {
        UID string
        ID  string
        req *criv1.RunPodSandboxRequest
    }
    type container struct {
        pod    string
        name   string
        expect int
        req    *criv1.CreateContainerRequest
        ID     string
    }
    tcases := []struct {
        name       string
        pods       []*criv1.RunPodSandboxRequest
        containers []*container
    }{
        {
            name: "create containers per one pod",
            pods: []*criv1.RunPodSandboxRequest{
                createPodRequest("Pod#1", "UID#1", "", nil, nil, ""),
            },
            containers: []*container{
                {pod: "UID#1", name: "Container#1", expect: 1},
                {pod: "UID#1", name: "Container#2", expect: 2},
            },
        },
        {
            name: "create lingering containers per one pod",
            pods: []*criv1.RunPodSandboxRequest{
                createPodRequest("Pod#1", "UID#1", "", nil, nil, ""),
            },
            containers: []*container{
                {pod: "UID#1", name: "Container#1", expect: 1},
                {pod: "UID#1", name: "Container#2", expect: 2},
                {pod: "UID#1", name: "Container#3", expect: 3},
                {pod: "UID#1", name: "Container#3", expect: 3},
                {pod: "UID#1", name: "Container#2", expect: 3},
                {pod: "UID#1", name: "Container#1", expect: 3},
            },
        },
    }
    numPods := 0
    numContainers := 0
    for _, tc := range tcases {
        criHandlers := map[string]interface{}{
            "RunPodSandbox": func(*fakeCriServer, context.Context, *criv1.RunPodSandboxRequest) (*criv1.RunPodSandboxResponse, error) {
                numPods++
                return &criv1.RunPodSandboxResponse{
                    PodSandboxId: fmt.Sprintf("Pod#%d", numPods),
                }, nil
            },
            "CreateContainer": func(*fakeCriServer, context.Context, *criv1.CreateContainerRequest) (*criv1.CreateContainerResponse, error) {
                numContainers++
                return &criv1.CreateContainerResponse{
                    ContainerId: fmt.Sprintf("Container#%d", numContainers),
                }, nil
            },
        }
        env := &testEnv{
            t:           t,
            handlers:    criHandlers,
            forceConfig: cfg,
        }
        env.Run(tc.name, func(ctx context.Context, env *testEnv) {
            t := env.t
            client := env.client
            cache := env.cache
            pods := map[string]*pod{}
            for _, req := range tc.pods {
                rpl, err := client.RunPodSandbox(ctx, req)
                if err != nil {
                    t.Errorf("failed to create pod %+v: %v", req, err)
                } else {
                    id := rpl.PodSandboxId
                    uid := req.Config.Metadata.Uid
                    pods[uid] = &pod{
                        UID: uid,
                        ID:  id,
                        req: req,
                    }
                }
            }
            for _, c := range tc.containers {
                pod, ok := pods[c.pod]
                if !ok {
                    t.Errorf("failed to find pod by UID %s", c.pod)
                    continue
                }
                c.req = createContainerRequest(pod.ID, c.name, pod.req)
                rpl, err := client.CreateContainer(ctx, c.req)
                if err != nil {
                    t.Errorf("failed to create container %+v: %v", c.req, err)
                } else {
                    c.ID = rpl.ContainerId
                    cached := cache.GetContainers()
                    if len(cached) != c.expect {
                        t.Errorf("pod %s, container %s: expected %d containers in cache, got %d",
                            c.pod, c.name, c.expect, len(cached))
                    }
                }
            }
        })
    }
}

func createPodRequest(name, uid, namespace string, labels, annotations map[string]string, cgroupParent string)
    *criv1.RunPodSandboxRequest {
    if namespace == "" {
        namespace = "default"
    }
    if labels == nil {
        labels = map[string]string{}
    }
    labels[kubernetes.PodUIDLabel] = uid
    return &criv1.RunPodSandboxRequest{
        Config: &criv1.PodSandboxConfig{
            Metadata: &criv1.PodSandboxMetadata{
                Name:      name,
                Uid:       uid,
                Namespace: namespace,
            },
            Labels:      labels,
            Annotations: annotations,
            Linux: &criv1.LinuxPodSandboxConfig{
                CgroupParent: cgroupParent,
            },
        },
    }
}

func createContainerRequest(podID, name string, podReq *criv1.RunPodSandboxRequest) *criv1.CreateContainerRequest {
    return &criv1.CreateContainerRequest{
        PodSandboxId: podID,
        Config: &criv1.ContainerConfig{
            Metadata: &criv1.ContainerMetadata{
                Name: name,
            },
            Linux: &criv1.LinuxContainerConfig{},
        },
        SandboxConfig: podReq.Config,
    }
}

================================================
FILE: test/functional/fake_cri_server_test.go
================================================
// Copyright 2019 Intel Corporation. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package e2e

import (
    "context"
    "fmt"
    "net"
    "os"
    "path/filepath"
    "reflect"
    "runtime"
    "strings"
    "testing"
    "time"

    "github.com/intel/cri-resource-manager/pkg/utils"
    "google.golang.org/grpc"
    criv1 "k8s.io/cri-api/pkg/apis/runtime/v1"
)

const (
    fakeKubeAPIVersion    = "0.1.0"
    fakeRuntimeName       = "fake-CRI-runtime"
    fakeRuntimeVersion    = "v0.0.0"
    fakeRuntimeAPIVersion = "v1"
)

type fakeCriServer struct {
    t            *testing.T
    socket       string
    grpcServer   *grpc.Server
    fakeHandlers map[string]interface{}
}

func newFakeCriServer(t *testing.T, socket string, fakeHandlers map[string]interface{}) *fakeCriServer {
    t.Helper()

    if !filepath.IsAbs(socket) {
        t.Fatalf("invalid socket %q, absolute path expected", socket)
    }
    if err := os.MkdirAll(filepath.Dir(socket), 0700); err != nil {
        t.Fatalf("failed to create directory for socket %q: %v", socket, err)
    }

    srv := &fakeCriServer{
        t:            t,
        socket:       socket,
        grpcServer:   grpc.NewServer(),
        fakeHandlers: fakeHandlers,
    }

    criv1.RegisterRuntimeServiceServer(srv.grpcServer, srv)
    criv1.RegisterImageServiceServer(srv.grpcServer, srv)

    lis, err := net.Listen("unix", socket)
    if err != nil {
        if ls, err := utils.IsListeningSocket(socket); ls || err != nil {
            t.Fatalf("failed to create fake server: socket %s already exists", socket)
        }
        os.Remove(socket)
        lis, err = net.Listen("unix", socket)
        if err != nil {
            t.Fatalf("failed to create fake server on socket %q: %v", socket, err)
        }
    }

    go func() {
        if err := srv.grpcServer.Serve(lis); err != nil {
            fmt.Printf("unable to start gRPC server: %+v\n", err)
        }
    }()

    if err := utils.WaitForServer(socket, time.Second); err != nil {
        t.Fatalf("starting fake CRI server failed: %v", err)
    }

    return srv
}

func (s *fakeCriServer) stop() {
    s.t.Helper()
    s.grpcServer.Stop()
    os.Remove(s.socket)
}
func (s *fakeCriServer) callHandler(ctx context.Context, request interface{}, defaultHandler interface{}) (interface{}, error) {
    var err error

    pc, _, _, _ := runtime.Caller(1)
    nameFull := runtime.FuncForPC(pc).Name()
    nameEnd := filepath.Ext(nameFull)
    name := strings.TrimPrefix(nameEnd, ".")

    handler, found := s.fakeHandlers[name]
    if !found {
        if defaultHandler == nil {
            method := reflect.ValueOf(s).MethodByName(name)
            returnType := method.Type().Out(0)
            return reflect.New(returnType).Elem().Interface(), fmt.Errorf("%s() not implemented", name)
        }
        handler = defaultHandler
    }

    in := make([]reflect.Value, 3)
    in[0] = reflect.ValueOf(s)
    in[1] = reflect.ValueOf(ctx)
    in[2] = reflect.ValueOf(request)
    out := reflect.ValueOf(handler).Call(in)

    if !out[1].IsNil() {
        err = out[1].Interface().(error)
    }

    return out[0].Interface(), err
}

// Implementation of criv1.RuntimeServiceServer

func (s *fakeCriServer) Version(ctx context.Context, req *criv1.VersionRequest) (*criv1.VersionResponse, error) {
    response, err := s.callHandler(ctx, req,
        func(*fakeCriServer, context.Context, *criv1.VersionRequest) (*criv1.VersionResponse, error) {
            return &criv1.VersionResponse{
                Version:           fakeKubeAPIVersion,
                RuntimeName:       fakeRuntimeName,
                RuntimeVersion:    fakeRuntimeVersion,
                RuntimeApiVersion: fakeRuntimeAPIVersion,
            }, nil
        },
    )
    return response.(*criv1.VersionResponse), err
}

func (s *fakeCriServer) RunPodSandbox(ctx context.Context, req *criv1.RunPodSandboxRequest) (*criv1.RunPodSandboxResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.RunPodSandboxResponse), err
}

func (s *fakeCriServer) StopPodSandbox(ctx context.Context, req *criv1.StopPodSandboxRequest) (*criv1.StopPodSandboxResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.StopPodSandboxResponse), err
}

func (s *fakeCriServer) RemovePodSandbox(ctx context.Context, req *criv1.RemovePodSandboxRequest) (*criv1.RemovePodSandboxResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.RemovePodSandboxResponse), err
}

func (s *fakeCriServer) PodSandboxStatus(ctx context.Context, req *criv1.PodSandboxStatusRequest) (*criv1.PodSandboxStatusResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.PodSandboxStatusResponse), err
}

func (s *fakeCriServer) ListPodSandbox(ctx context.Context, req *criv1.ListPodSandboxRequest) (*criv1.ListPodSandboxResponse, error) {
    response, err := s.callHandler(ctx, req,
        func(*fakeCriServer, context.Context, *criv1.ListPodSandboxRequest) (*criv1.ListPodSandboxResponse, error) {
            return &criv1.ListPodSandboxResponse{}, nil
        })
    return response.(*criv1.ListPodSandboxResponse), err
}

func (s *fakeCriServer) CreateContainer(ctx context.Context, req *criv1.CreateContainerRequest) (*criv1.CreateContainerResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.CreateContainerResponse), err
}

func (s *fakeCriServer) StartContainer(ctx context.Context, req *criv1.StartContainerRequest) (*criv1.StartContainerResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.StartContainerResponse), err
}

func (s *fakeCriServer) StopContainer(ctx context.Context, req *criv1.StopContainerRequest) (*criv1.StopContainerResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.StopContainerResponse), err
}

func (s *fakeCriServer) RemoveContainer(ctx context.Context, req *criv1.RemoveContainerRequest) (*criv1.RemoveContainerResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.RemoveContainerResponse), err
}
func (s *fakeCriServer) ListContainers(ctx context.Context, req *criv1.ListContainersRequest) (*criv1.ListContainersResponse, error) {
    response, err := s.callHandler(ctx, req,
        func(*fakeCriServer, context.Context, *criv1.ListContainersRequest) (*criv1.ListContainersResponse, error) {
            return &criv1.ListContainersResponse{}, nil
        })
    return response.(*criv1.ListContainersResponse), err
}

func (s *fakeCriServer) ContainerStatus(ctx context.Context, req *criv1.ContainerStatusRequest) (*criv1.ContainerStatusResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.ContainerStatusResponse), err
}

func (s *fakeCriServer) UpdateContainerResources(ctx context.Context, req *criv1.UpdateContainerResourcesRequest) (*criv1.UpdateContainerResourcesResponse, error) {
    response, err := s.callHandler(ctx, req,
        func(*fakeCriServer, context.Context, *criv1.UpdateContainerResourcesRequest) (*criv1.UpdateContainerResourcesResponse, error) {
            return &criv1.UpdateContainerResourcesResponse{}, nil
        },
    )
    return response.(*criv1.UpdateContainerResourcesResponse), err
}

func (s *fakeCriServer) ReopenContainerLog(ctx context.Context, req *criv1.ReopenContainerLogRequest) (*criv1.ReopenContainerLogResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.ReopenContainerLogResponse), err
}

func (s *fakeCriServer) ExecSync(ctx context.Context, req *criv1.ExecSyncRequest) (*criv1.ExecSyncResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.ExecSyncResponse), err
}

func (s *fakeCriServer) Exec(ctx context.Context, req *criv1.ExecRequest) (*criv1.ExecResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.ExecResponse), err
}

func (s *fakeCriServer) Attach(ctx context.Context, req *criv1.AttachRequest) (*criv1.AttachResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.AttachResponse), err
}

func (s *fakeCriServer) PortForward(ctx context.Context, req *criv1.PortForwardRequest) (*criv1.PortForwardResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.PortForwardResponse), err
}

func (s *fakeCriServer) ContainerStats(ctx context.Context, req *criv1.ContainerStatsRequest) (*criv1.ContainerStatsResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.ContainerStatsResponse), err
}

func (s *fakeCriServer) ListContainerStats(ctx context.Context, req *criv1.ListContainerStatsRequest) (*criv1.ListContainerStatsResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.ListContainerStatsResponse), err
}

func (s *fakeCriServer) PodSandboxStats(ctx context.Context, req *criv1.PodSandboxStatsRequest) (*criv1.PodSandboxStatsResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.PodSandboxStatsResponse), err
}

func (s *fakeCriServer) ListPodSandboxStats(ctx context.Context, req *criv1.ListPodSandboxStatsRequest) (*criv1.ListPodSandboxStatsResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.ListPodSandboxStatsResponse), err
}

func (s *fakeCriServer) UpdateRuntimeConfig(ctx context.Context, req *criv1.UpdateRuntimeConfigRequest) (*criv1.UpdateRuntimeConfigResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.UpdateRuntimeConfigResponse), err
}

func (s *fakeCriServer) Status(ctx context.Context, req *criv1.StatusRequest) (*criv1.StatusResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.StatusResponse), err
}

func (s *fakeCriServer) CheckpointContainer(ctx context.Context, req *criv1.CheckpointContainerRequest) (*criv1.CheckpointContainerResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.CheckpointContainerResponse), err
}

func (s *fakeCriServer) GetContainerEvents(_ *criv1.GetEventsRequest, _ criv1.RuntimeService_GetContainerEventsServer) error {
    return nil
}

func (s *fakeCriServer) ListMetricDescriptors(ctx context.Context, req *criv1.ListMetricDescriptorsRequest) (*criv1.ListMetricDescriptorsResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.ListMetricDescriptorsResponse), err
}

func (s *fakeCriServer) ListPodSandboxMetrics(ctx context.Context, req *criv1.ListPodSandboxMetricsRequest) (*criv1.ListPodSandboxMetricsResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.ListPodSandboxMetricsResponse), err
}

func (s *fakeCriServer) RuntimeConfig(ctx context.Context, req *criv1.RuntimeConfigRequest) (*criv1.RuntimeConfigResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.RuntimeConfigResponse), err
}

// Implementation of criv1.ImageServiceServer

func (s *fakeCriServer) ListImages(ctx context.Context, req *criv1.ListImagesRequest) (*criv1.ListImagesResponse, error) {
    response, err := s.callHandler(ctx, req,
        func(*fakeCriServer, context.Context, *criv1.ListImagesRequest) (*criv1.ListImagesResponse, error) {
            return &criv1.ListImagesResponse{}, nil
        },
    )
    return response.(*criv1.ListImagesResponse), err
}

func (s *fakeCriServer) ImageStatus(ctx context.Context, req *criv1.ImageStatusRequest) (*criv1.ImageStatusResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.ImageStatusResponse), err
}

func (s *fakeCriServer) PullImage(ctx context.Context, req *criv1.PullImageRequest) (*criv1.PullImageResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.PullImageResponse), err
}

func (s *fakeCriServer) RemoveImage(ctx context.Context, req *criv1.RemoveImageRequest) (*criv1.RemoveImageResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.RemoveImageResponse), err
}

func (s *fakeCriServer) ImageFsInfo(ctx context.Context, req *criv1.ImageFsInfoRequest) (*criv1.ImageFsInfoResponse, error) {
    response, err := s.callHandler(ctx, req, nil)
    return response.(*criv1.ImageFsInfoResponse), err
}